diff --git a/.nojekyll b/.nojekyll
new file mode 100644
index 00000000..e69de29b
diff --git a/cache.json b/cache.json
new file mode 100644
index 00000000..a4c744e4
--- /dev/null
+++ b/cache.json
@@ -0,0 +1 @@
+{"2024-08-12T00:00:00Z":{"Computation and Language":[{"id":"http://arxiv.org/abs/2311.09755v2","updated":"2024-08-12T17:57:00Z","published":"2023-11-16T10:30:00Z","title":"On the Impact of Calibration Data in Post-training Quantization and\n Pruning","summary":" Quantization and pruning form the foundation of compression for neural\nnetworks, enabling efficient inference for large language models (LLMs).\nRecently, various quantization and pruning techniques have demonstrated\nremarkable performance in a post-training setting. They rely upon calibration\ndata, a small set of unlabeled examples that are used to generate layer\nactivations. However, no prior work has systematically investigated how the\ncalibration data impacts the effectiveness of model compression methods. In\nthis paper, we present the first extensive empirical study on the effect of\ncalibration data upon LLM performance. We trial a variety of quantization and\npruning methods, datasets, tasks, and models. Surprisingly, we find substantial\nvariations in downstream task performance, contrasting existing work that\nsuggests a greater level of robustness to the calibration data. Finally, we\nmake a series of recommendations for the effective use of calibration data in\nLLM quantization and pruning.\n","authors":["Miles Williams","Nikolaos Aletras"],"pdf_url":"https://arxiv.org/pdf/2311.09755v2.pdf","comment":"ACL 2024"},{"id":"http://arxiv.org/abs/2402.00798v4","updated":"2024-08-12T17:54:32Z","published":"2024-02-01T17:30:50Z","title":"Formal-LLM: Integrating Formal Language and Natural Language for\n Controllable LLM-based Agents","summary":" Recent advancements on Large Language Models (LLMs) enable AI Agents to\nautomatically generate and execute multi-step plans to solve complex tasks.\nHowever, since LLM's content generation process is hardly controllable, current\nLLM-based agents frequently generate invalid or non-executable plans, which\njeopardizes the performance of the generated plans and corrupts users' trust in\nLLM-based agents. In response, this paper proposes a novel \"Formal-LLM\"\nframework for LLM-based agents by integrating the expressiveness of natural\nlanguage and the precision of formal language. Specifically, the framework\nallows agent developers to express their requirements or constraints for the\nplanning process as an automaton. A stack-based LLM plan generation process is\nthen conducted under the supervision of the automaton to ensure that the\ngenerated plan satisfies the constraints, making the planning process\ncontrollable. We conduct experiments on both benchmark tasks and practical\nreal-life tasks, and our framework achieves over 50% overall performance\nincrease, which validates the feasibility and effectiveness of employing\nFormal-LLM to guide the plan generation of agents, preventing the agents from\ngenerating invalid and unsuccessful plans. Further, more controllable LLM-based\nagents can facilitate the broader utilization of LLM in application scenarios\nwhere high validity of planning is essential. 
The source code of this work is\navailable at https://github.com/agiresearch/Formal-LLM.\n","authors":["Zelong Li","Wenyue Hua","Hao Wang","He Zhu","Yongfeng Zhang"],"pdf_url":"https://arxiv.org/pdf/2402.00798v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.17012v2","updated":"2024-08-12T17:53:13Z","published":"2023-09-29T06:53:10Z","title":"Benchmarking Cognitive Biases in Large Language Models as Evaluators","summary":" Large Language Models (LLMs) have recently been shown to be effective as\nautomatic evaluators with simple prompting and in-context learning. In this\nwork, we assemble 15 LLMs of four different size ranges and evaluate their\noutput responses by preference ranking from the other LLMs as evaluators, such\nas System Star is better than System Square. We then evaluate the quality of\nranking outputs introducing the Cognitive Bias Benchmark for LLMs as Evaluators\n(CoBBLEr), a benchmark to measure six different cognitive biases in LLM\nevaluation outputs, such as the Egocentric bias where a model prefers to rank\nits own outputs highly in evaluation. We find that LLMs are biased text quality\nevaluators, exhibiting strong indications on our bias benchmark (average of 40%\nof comparisons across all models) within each of their evaluations that\nquestion their robustness as evaluators. Furthermore, we examine the\ncorrelation between human and machine preferences and calculate the average\nRank-Biased Overlap (RBO) score to be 49.6%, indicating that machine\npreferences are misaligned with humans. According to our findings, LLMs may\nstill be unable to be utilized for automatic annotation aligned with human\npreferences. Our project page is at: https://minnesotanlp.github.io/cobbler.\n","authors":["Ryan Koo","Minhwa Lee","Vipul Raheja","Jong Inn Park","Zae Myung Kim","Dongyeop Kang"],"pdf_url":"https://arxiv.org/pdf/2309.17012v2.pdf","comment":"Published at 2024. 29 pages, 9 figures, 14 tables"},{"id":"http://arxiv.org/abs/2408.06335v1","updated":"2024-08-12T17:52:11Z","published":"2024-08-12T17:52:11Z","title":"LOLgorithm: Integrating Semantic,Syntactic and Contextual Elements for\n Humor Classification","summary":" This paper explores humor detection through a linguistic lens, prioritizing\nsyntactic, semantic, and contextual features over computational methods in\nNatural Language Processing. We categorize features into syntactic, semantic,\nand contextual dimensions, including lexicons, structural statistics, Word2Vec,\nWordNet, and phonetic style. Our proposed model, Colbert, utilizes BERT\nembeddings and parallel hidden layers to capture sentence congruity. By\ncombining syntactic, semantic, and contextual features, we train Colbert for\nhumor detection. Feature engineering examines essential syntactic and semantic\nfeatures alongside BERT embeddings. SHAP interpretations and decision trees\nidentify influential features, revealing that a holistic approach improves\nhumor detection accuracy on unseen data. 
Integrating linguistic cues from\ndifferent dimensions enhances the model's ability to understand humor\ncomplexity beyond traditional computational methods.\n","authors":["Tanisha Khurana","Kaushik Pillalamarri","Vikram Pande","Munindar Singh"],"pdf_url":"https://arxiv.org/pdf/2408.06335v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.06333v1","updated":"2024-08-12T17:50:02Z","published":"2024-08-12T17:50:02Z","title":"FastFiD: Improve Inference Efficiency of Open Domain Question Answering\n via Sentence Selection","summary":" Open Domain Question Answering (ODQA) has been advancing rapidly in recent\ntimes, driven by significant developments in dense passage retrieval and\npretrained language models. Current models typically incorporate the FiD\nframework, which is composed by a neural retriever alongside an encoder-decoder\nneural reader. In the answer generation process, the retriever will retrieve\nnumerous passages (around 100 for instance), each of which is then individually\nencoded by the encoder. Subsequently, the decoder makes predictions based on\nthese encoded passages. Nevertheless, this framework can be relatively\ntime-consuming, particularly due to the extensive length of the gathered\npassages. To address this, we introduce FastFiD in this paper, a novel approach\nthat executes sentence selection on the encoded passages. This aids in\nretaining valuable sentences while reducing the context length required for\ngenerating answers. Experiments on three commonly used datasets (Natural\nQuestions, TriviaQA and ASQA) demonstrate that our method can enhance the\ninference speed by 2.3X-5.7X, while simultaneously maintaining the model's\nperformance. Moreover, an in-depth analysis of the model's attention reveals\nthat the selected sentences indeed hold a substantial contribution towards the\nfinal answer. The codes are publicly available at\nhttps://github.com/thunlp/FastFiD.\n","authors":["Yufei Huang","Xu Han","Maosong Sun"],"pdf_url":"https://arxiv.org/pdf/2408.06333v1.pdf","comment":"ACL 2024 Main Conference"},{"id":"http://arxiv.org/abs/2408.06332v1","updated":"2024-08-12T17:48:55Z","published":"2024-08-12T17:48:55Z","title":"Animate, or Inanimate, That is the Question for Large Language Models","summary":" The cognitive essence of humans is deeply intertwined with the concept of\nanimacy, which plays an essential role in shaping their memory, vision, and\nmulti-layered language understanding. Although animacy appears in language via\nnuanced constraints on verbs and adjectives, it is also learned and refined\nthrough extralinguistic information. Similarly, we assume that the LLMs'\nlimited abilities to understand natural language when processing animacy are\nmotivated by the fact that these models are trained exclusively on text.\n Hence, the question this paper aims to answer arises: can LLMs, in their\ndigital wisdom, process animacy in a similar way to what humans would do? We\nthen propose a systematic analysis via prompting approaches. In particular, we\nprobe different LLMs by prompting them using animate, inanimate, usual, and\nstranger contexts. 
Results reveal that, although LLMs have been trained\npredominantly on textual data, they exhibit human-like behavior when faced with\ntypical animate and inanimate entities in alignment with earlier studies.\nHence, LLMs can adapt to understand unconventional situations by recognizing\noddities as animated without needing to interface with unspoken cognitive\ntriggers humans rely on to break down animations.\n","authors":["Leonardo Ranaldi","Giulia Pucci","Fabio Massimo Zanzotto"],"pdf_url":"https://arxiv.org/pdf/2408.06332v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.06327v1","updated":"2024-08-12T17:44:17Z","published":"2024-08-12T17:44:17Z","title":"VisualAgentBench: Towards Large Multimodal Models as Visual Foundation\n Agents","summary":" Large Multimodal Models (LMMs) have ushered in a new era in artificial\nintelligence, merging capabilities in both language and vision to form highly\ncapable Visual Foundation Agents. These agents are postulated to excel across a\nmyriad of tasks, potentially approaching general artificial intelligence.\nHowever, existing benchmarks fail to sufficiently challenge or showcase the\nfull potential of LMMs in complex, real-world environments. To address this\ngap, we introduce VisualAgentBench (VAB), a comprehensive and pioneering\nbenchmark specifically designed to train and evaluate LMMs as visual foundation\nagents across diverse scenarios, including Embodied, Graphical User Interface,\nand Visual Design, with tasks formulated to probe the depth of LMMs'\nunderstanding and interaction capabilities. Through rigorous testing across\nnine proprietary LMM APIs and eight open models, we demonstrate the\nconsiderable yet still developing agent capabilities of these models.\nAdditionally, VAB constructs a trajectory training set constructed through\nhybrid methods including Program-based Solvers, LMM Agent Bootstrapping, and\nHuman Demonstrations, promoting substantial performance improvements in LMMs\nthrough behavior cloning. Our work not only aims to benchmark existing models\nbut also provides a solid foundation for future development into visual\nfoundation agents. Code, train \\& test data, and part of fine-tuned open LMMs\nare available at \\url{https://github.com/THUDM/VisualAgentBench}.\n","authors":["Xiao Liu","Tianjie Zhang","Yu Gu","Iat Long Iong","Yifan Xu","Xixuan Song","Shudan Zhang","Hanyu Lai","Xinyi Liu","Hanlin Zhao","Jiadai Sun","Xinyue Yang","Yu Yang","Zehan Qi","Shuntian Yao","Xueqiao Sun","Siyi Cheng","Qinkai Zheng","Hao Yu","Hanchen Zhang","Wenyi Hong","Ming Ding","Lihang Pan","Xiaotao Gu","Aohan Zeng","Zhengxiao Du","Chan Hee Song","Yu Su","Yuxiao Dong","Jie Tang"],"pdf_url":"https://arxiv.org/pdf/2408.06327v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.18957v2","updated":"2024-08-12T17:20:35Z","published":"2024-03-27T19:02:13Z","title":"Moderating Illicit Online Image Promotion for Unsafe User-Generated\n Content Games Using Large Vision-Language Models","summary":" Online user generated content games (UGCGs) are increasingly popular among\nchildren and adolescents for social interaction and more creative online\nentertainment. However, they pose a heightened risk of exposure to explicit\ncontent, raising growing concerns for the online safety of children and\nadolescents. Despite these concerns, few studies have addressed the issue of\nillicit image-based promotions of unsafe UGCGs on social media, which can\ninadvertently attract young users. 
This challenge arises from the difficulty of\nobtaining comprehensive training data for UGCG images and the unique nature of\nthese images, which differ from traditional unsafe content. In this work, we\ntake the first step towards studying the threat of illicit promotions of unsafe\nUGCGs. We collect a real-world dataset comprising 2,924 images that display\ndiverse sexually explicit and violent content used to promote UGCGs by their\ngame creators. Our in-depth studies reveal a new understanding of this problem\nand the urgent need for automatically flagging illicit UGCG promotions. We\nadditionally create a cutting-edge system, UGCG-Guard, designed to aid social\nmedia platforms in effectively identifying images used for illicit UGCG\npromotions. This system leverages recently introduced large vision-language\nmodels (VLMs) and employs a novel conditional prompting strategy for zero-shot\ndomain adaptation, along with chain-of-thought (CoT) reasoning for contextual\nidentification. UGCG-Guard achieves outstanding results, with an accuracy rate\nof 94% in detecting these images used for the illicit promotion of such games\nin real-world scenarios.\n","authors":["Keyan Guo","Ayush Utkarsh","Wenbo Ding","Isabelle Ondracek","Ziming Zhao","Guo Freeman","Nishant Vishwamitra","Hongxin Hu"],"pdf_url":"https://arxiv.org/pdf/2403.18957v2.pdf","comment":"To Appear in the 33rd USENIX Security Symposium, August 14-16, 2024"},{"id":"http://arxiv.org/abs/2408.06303v1","updated":"2024-08-12T17:15:02Z","published":"2024-08-12T17:15:02Z","title":"Long-Form Answers to Visual Questions from Blind and Low Vision People","summary":" Vision language models can now generate long-form answers to questions about\nimages - long-form visual question answers (LFVQA). We contribute VizWiz-LF, a\ndataset of long-form answers to visual questions posed by blind and low vision\n(BLV) users. VizWiz-LF contains 4.2k long-form answers to 600 visual questions,\ncollected from human expert describers and six VQA models. We develop and\nannotate functional roles of sentences of LFVQA and demonstrate that long-form\nanswers contain information beyond the question answer such as explanations and\nsuggestions. We further conduct automatic and human evaluations with BLV and\nsighted people to evaluate long-form answers. BLV people perceive both\nhuman-written and generated long-form answers to be plausible, but generated\nanswers often hallucinate incorrect visual details, especially for unanswerable\nvisual questions (e.g., blurry or irrelevant images). To reduce hallucinations,\nwe evaluate the ability of VQA models to abstain from answering unanswerable\nquestions across multiple prompting strategies.\n","authors":["Mina Huh","Fangyuan Xu","Yi-Hao Peng","Chongyan Chen","Hansika Murugu","Danna Gurari","Eunsol Choi","Amy Pavel"],"pdf_url":"https://arxiv.org/pdf/2408.06303v1.pdf","comment":"COLM 2024"},{"id":"http://arxiv.org/abs/2403.09040v2","updated":"2024-08-12T17:12:04Z","published":"2024-03-14T02:26:31Z","title":"RAGGED: Towards Informed Design of Retrieval Augmented Generation\n Systems","summary":" Retrieval-augmented generation (RAG) can significantly improve the\nperformance of language models (LMs) by providing additional context for tasks\nsuch as document-based question answering (DBQA). However, the effectiveness of\nRAG is highly dependent on its configuration. To systematically find the\noptimal configuration, we introduce RAGGED, a framework for analyzing RAG\nconfigurations across various DBQA tasks. 
Using the framework, we discover\ndistinct LM behaviors in response to varying context quantities, context\nqualities, and retrievers. For instance, while some models are robust to noisy\ncontexts, monotonically performing better with more contexts, others are more\nnoise-sensitive and can effectively use only a few contexts before declining in\nperformance. This framework also provides a deeper analysis of these\ndifferences by evaluating the LMs' sensitivity to signal and noise under\nspecific context quality conditions. Using RAGGED, researchers and\npractitioners can derive actionable insights about how to optimally configure\ntheir RAG systems for their specific question-answering tasks.\n","authors":["Jennifer Hsia","Afreen Shaikh","Zhiruo Wang","Graham Neubig"],"pdf_url":"https://arxiv.org/pdf/2403.09040v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.06292v1","updated":"2024-08-12T16:58:11Z","published":"2024-08-12T16:58:11Z","title":"The AI Scientist: Towards Fully Automated Open-Ended Scientific\n Discovery","summary":" One of the grand challenges of artificial general intelligence is developing\nagents capable of conducting scientific research and discovering new knowledge.\nWhile frontier models have already been used as aids to human scientists, e.g.\nfor brainstorming ideas, writing code, or prediction tasks, they still conduct\nonly a small part of the scientific process. This paper presents the first\ncomprehensive framework for fully automatic scientific discovery, enabling\nfrontier large language models to perform research independently and\ncommunicate their findings. We introduce The AI Scientist, which generates\nnovel research ideas, writes code, executes experiments, visualizes results,\ndescribes its findings by writing a full scientific paper, and then runs a\nsimulated review process for evaluation. In principle, this process can be\nrepeated to iteratively develop ideas in an open-ended fashion, acting like the\nhuman scientific community. We demonstrate its versatility by applying it to\nthree distinct subfields of machine learning: diffusion modeling,\ntransformer-based language modeling, and learning dynamics. Each idea is\nimplemented and developed into a full paper at a cost of less than $15 per\npaper. To evaluate the generated papers, we design and validate an automated\nreviewer, which we show achieves near-human performance in evaluating paper\nscores. The AI Scientist can produce papers that exceed the acceptance\nthreshold at a top machine learning conference as judged by our automated\nreviewer. This approach signifies the beginning of a new era in scientific\ndiscovery in machine learning: bringing the transformative benefits of AI\nagents to the entire research process of AI itself, and taking us closer to a\nworld where endless affordable creativity and innovation can be unleashed on\nthe world's most challenging problems. Our code is open-sourced at\nhttps://github.com/SakanaAI/AI-Scientist\n","authors":["Chris Lu","Cong Lu","Robert Tjarko Lange","Jakob Foerster","Jeff Clune","David Ha"],"pdf_url":"https://arxiv.org/pdf/2408.06292v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.06285v1","updated":"2024-08-12T16:49:22Z","published":"2024-08-12T16:49:22Z","title":"Synthetic Patient-Physician Dialogue Generation from Clinical Notes\n Using LLM","summary":" Medical dialogue systems (MDS) enhance patient-physician communication,\nimprove healthcare accessibility, and reduce costs. 
However, acquiring suitable\ndata to train these systems poses significant challenges. Privacy concerns\nprevent the use of real conversations, necessitating synthetic alternatives.\nSynthetic dialogue generation from publicly available clinical notes offers a\npromising solution to this issue, providing realistic data while safeguarding\nprivacy. Our approach, SynDial, uses a single LLM iteratively with zero-shot\nprompting and a feedback loop to generate and refine high-quality synthetic\ndialogues. The feedback consists of weighted evaluation scores for similarity\nand extractiveness. The iterative process ensures dialogues meet predefined\nthresholds, achieving superior extractiveness as a result of the feedback loop.\nAdditionally, evaluation shows that the generated dialogues excel in factuality\nmetric compared to the baselines and has comparable diversity scores with GPT4.\n","authors":["Trisha Das","Dina Albassam","Jimeng Sun"],"pdf_url":"https://arxiv.org/pdf/2408.06285v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.06281v1","updated":"2024-08-12T16:43:09Z","published":"2024-08-12T16:43:09Z","title":"MovieSum: An Abstractive Summarization Dataset for Movie Screenplays","summary":" Movie screenplay summarization is challenging, as it requires an\nunderstanding of long input contexts and various elements unique to movies.\nLarge language models have shown significant advancements in document\nsummarization, but they often struggle with processing long input contexts.\nFurthermore, while television transcripts have received attention in recent\nstudies, movie screenplay summarization remains underexplored. To stimulate\nresearch in this area, we present a new dataset, MovieSum, for abstractive\nsummarization of movie screenplays. This dataset comprises 2200 movie\nscreenplays accompanied by their Wikipedia plot summaries. We manually\nformatted the movie screenplays to represent their structural elements.\nCompared to existing datasets, MovieSum possesses several distinctive features:\n(1) It includes movie screenplays, which are longer than scripts of TV\nepisodes. (2) It is twice the size of previous movie screenplay datasets. (3)\nIt provides metadata with IMDb IDs to facilitate access to additional external\nknowledge. We also show the results of recently released large language models\napplied to summarization on our dataset to provide a detailed baseline.\n","authors":["Rohit Saxena","Frank Keller"],"pdf_url":"https://arxiv.org/pdf/2408.06281v1.pdf","comment":"ACL 2024 Findings"},{"id":"http://arxiv.org/abs/2408.06276v1","updated":"2024-08-12T16:39:03Z","published":"2024-08-12T16:39:03Z","title":"Review-driven Personalized Preference Reasoning with Large Language\n Models for Recommendation","summary":" Recent advancements in Large Language Models (LLMs) have demonstrated\nexceptional performance across a wide range of tasks, generating significant\ninterest in their application to recommendation systems. However, existing\nmethods have not fully capitalized on the potential of LLMs, often constrained\nby limited input information or failing to fully utilize their advanced\nreasoning capabilities. To address these limitations, we introduce EXP3RT, a\nnovel LLM-based recommender designed to leverage rich preference information\ncontained in user and item reviews. 
EXP3RT is basically fine-tuned through\ndistillation from a teacher LLM to perform three key tasks in order: EXP3RT\nfirst extracts and encapsulates essential subjective preferences from raw\nreviews, aggregates and summarizes them according to specific criteria to\ncreate user and item profiles. It then generates detailed step-by-step\nreasoning followed by predicted rating, i.e., reasoning-enhanced rating\nprediction, by considering both subjective and objective information from\nuser/item profiles and item descriptions. This personalized preference\nreasoning from EXP3RT enhances rating prediction accuracy and also provides\nfaithful and reasonable explanations for recommendation. Extensive experiments\nshow that EXP3RT outperforms existing methods on both rating prediction and\ncandidate item reranking for top-k recommendation, while significantly\nenhancing the explainability of recommendation systems.\n","authors":["Jieyong Kim","Hyunseo Kim","Hyunjin Cho","SeongKu Kang","Buru Chang","Jinyoung Yeo","Dongha Lee"],"pdf_url":"https://arxiv.org/pdf/2408.06276v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.06273v1","updated":"2024-08-12T16:34:56Z","published":"2024-08-12T16:34:56Z","title":"FuxiTranyu: A Multilingual Large Language Model Trained with Balanced\n Data","summary":" Large language models (LLMs) have demonstrated prowess in a wide range of\ntasks. However, many LLMs exhibit significant performance discrepancies between\nhigh- and low-resource languages. To mitigate this challenge, we present\nFuxiTranyu, an open-source multilingual LLM, which is designed to satisfy the\nneed of the research community for balanced and high-performing multilingual\ncapabilities. FuxiTranyu-8B, the base model with 8 billion parameters, is\ntrained from scratch on a meticulously balanced multilingual data repository\nthat contains 600 billion tokens covering 43 natural languages and 16\nprogramming languages. In addition to the base model, we also develop two\ninstruction-tuned models: FuxiTranyu-8B-SFT that is fine-tuned on a diverse\nmultilingual instruction dataset, and FuxiTranyu-8B-DPO that is further refined\nwith DPO on a preference dataset for enhanced alignment ability. Extensive\nexperiments on a wide range of multilingual benchmarks demonstrate the\ncompetitive performance of FuxiTranyu against existing multilingual LLMs, e.g.,\nBLOOM-7B, PolyLM-13B, Llama-2-Chat-7B and Mistral-7B-Instruct. Interpretability\nanalyses at both the neuron and representation level suggest that FuxiTranyu is\nable to learn consistent multilingual representations across different\nlanguages. To promote further research into multilingual LLMs and their working\nmechanisms, we release both the base and instruction-tuned FuxiTranyu models\ntogether with 58 pretraining checkpoints at HuggingFace and Github.\n","authors":["Haoran Sun","Renren Jin","Shaoyang Xu","Leiyu Pan"," Supryadi","Menglong Cui","Jiangcun Dui","Yikun Lei","Lei Yang","Ling Shi","Juesi Xiao","Shaolin Zhu","Deyi Xiong"],"pdf_url":"https://arxiv.org/pdf/2408.06273v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.06266v1","updated":"2024-08-12T16:24:51Z","published":"2024-08-12T16:24:51Z","title":"Anchored Preference Optimization and Contrastive Revisions: Addressing\n Underspecification in Alignment","summary":" Large Language Models (LLMs) are often aligned using contrastive alignment\nobjectives and preference pair datasets. 
The interaction between model, paired\ndata, and objective makes alignment a complicated procedure, sometimes\nproducing subpar results. We study this and find that (i) preference data gives\na better learning signal when the underlying responses are contrastive, and\n(ii) alignment objectives lead to better performance when they specify more\ncontrol over the model during training. Based on these insights, we introduce\nContrastive Learning from AI Revisions (CLAIR), a data-creation method which\nleads to more contrastive preference pairs, and Anchored Preference\nOptimization (APO), a controllable and more stable alignment objective. We\nalign Llama-3-8B-Instruct using various comparable datasets and alignment\nobjectives and measure MixEval-Hard scores, which correlate highly with human\njudgments. The CLAIR preferences lead to the strongest performance out of all\ndatasets, and APO consistently outperforms less controllable objectives. Our\nbest model, trained on 32K CLAIR preferences with APO, improves\nLlama-3-8B-Instruct by 7.65%, closing the gap with GPT4-turbo by 45%. Our code\nis available at https://github.com/ContextualAI/CLAIR_and_APO.\n","authors":["Karel D'Oosterlinck","Winnie Xu","Chris Develder","Thomas Demeester","Amanpreet Singh","Christopher Potts","Douwe Kiela","Shikib Mehri"],"pdf_url":"https://arxiv.org/pdf/2408.06266v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.06259v1","updated":"2024-08-12T16:15:32Z","published":"2024-08-12T16:15:32Z","title":"Context-aware Visual Storytelling with Visual Prefix Tuning and\n Contrastive Learning","summary":" Visual storytelling systems generate multi-sentence stories from image\nsequences. In this task, capturing contextual information and bridging visual\nvariation bring additional challenges. We propose a simple yet effective\nframework that leverages the generalization capabilities of pretrained\nfoundation models, only training a lightweight vision-language mapping network\nto connect modalities, while incorporating context to enhance coherence. We\nintroduce a multimodal contrastive objective that also improves visual\nrelevance and story informativeness. Extensive experimental results, across\nboth automatic metrics and human evaluations, demonstrate that the stories\ngenerated by our framework are diverse, coherent, informative, and interesting.\n","authors":["Yingjin Song","Denis Paperno","Albert Gatt"],"pdf_url":"https://arxiv.org/pdf/2408.06259v1.pdf","comment":"18 pages, 12 figures, accepted by INLG 2024"},{"id":"http://arxiv.org/abs/2407.12451v2","updated":"2024-08-12T15:44:26Z","published":"2024-07-17T09:59:52Z","title":"Across Platforms and Languages: Dutch Influencers and Legal Disclosures\n on Instagram, YouTube and TikTok","summary":" Content monetization on social media fuels a growing influencer economy.\nInfluencer marketing remains largely undisclosed or inappropriately disclosed\non social media. Non-disclosure issues have become a priority for national and\nsupranational authorities worldwide, who are starting to impose increasingly\nharsher sanctions on them. This paper proposes a transparent methodology for\nmeasuring whether and how influencers comply with disclosures based on legal\nstandards. We introduce a novel distinction between disclosures that are\nlegally sufficient (green) and legally insufficient (yellow). 
We apply this\nmethodology to an original dataset reflecting the content of 150 Dutch\ninfluencers publicly registered with the Dutch Media Authority based on\nrecently introduced registration obligations. The dataset consists of 292,315\nposts and is multi-language (English and Dutch) and cross-platform (Instagram,\nYouTube and TikTok). We find that influencer marketing remains generally\nunderdisclosed on social media, and that bigger influencers are not necessarily\nmore compliant with disclosure standards.\n","authors":["Haoyang Gui","Thales Bertaglia","Catalina Goanta","Sybe de Vries","Gerasimos Spanakis"],"pdf_url":"https://arxiv.org/pdf/2407.12451v2.pdf","comment":"Accept for publication at the 16th International Conference on\n Advances in Social Networks Analysis and Mining - ASONAM-2024"},{"id":"http://arxiv.org/abs/2408.06227v1","updated":"2024-08-12T15:28:51Z","published":"2024-08-12T15:28:51Z","title":"FLEURS-R: A Restored Multilingual Speech Corpus for Generation Tasks","summary":" This paper introduces FLEURS-R, a speech restoration applied version of the\nFew-shot Learning Evaluation of Universal Representations of Speech (FLEURS)\ncorpus. FLEURS-R maintains an N-way parallel speech corpus in 102 languages as\nFLEURS, with improved audio quality and fidelity by applying the speech\nrestoration model Miipher. The aim of FLEURS-R is to advance speech technology\nin more languages and catalyze research including text-to-speech (TTS) and\nother speech generation tasks in low-resource languages. Comprehensive\nevaluations with the restored speech and TTS baseline models trained from the\nnew corpus show that the new corpus obtained significantly improved speech\nquality while maintaining the semantic contents of the speech. The corpus is\npublicly released via Hugging Face.\n","authors":["Min Ma","Yuma Koizumi","Shigeki Karita","Heiga Zen","Jason Riesa","Haruko Ishikawa","Michiel Bacchiani"],"pdf_url":"https://arxiv.org/pdf/2408.06227v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.06223v1","updated":"2024-08-12T15:24:50Z","published":"2024-08-12T15:24:50Z","title":"On Effects of Steering Latent Representation for Large Language Model\n Unlearning","summary":" Representation Misdirection for Unlearning (RMU), which steers model\nrepresentation in the intermediate layer to a target random representation, is\nan effective method for large language model (LLM) unlearning. Despite its high\nperformance, the underlying cause and explanation remain underexplored. In this\npaper, we first theoretically demonstrate that steering forget representations\nin the intermediate layer reduces token confidence, causing LLMs to generate\nwrong or nonsense responses. Second, we investigate how the coefficient\ninfluences the alignment of forget-sample representations with the random\ndirection and hint at the optimal coefficient values for effective unlearning\nacross different network layers. Third, we show that RMU unlearned models are\nrobust against adversarial jailbreak attacks. Last, our empirical analysis\nshows that RMU is less effective when applied to the middle and later layers in\nLLMs. 
To resolve this drawback, we propose Adaptive RMU -- a simple yet\neffective alternative method that makes unlearning effective with most layers.\nExtensive experiments demonstrate that Adaptive RMU significantly improves the\nunlearning performance compared to prior art while incurring no additional\ncomputational cost.\n","authors":["Dang Huu-Tien","Trung-Tin Pham","Hoang Thanh-Tung","Naoya Inoue"],"pdf_url":"https://arxiv.org/pdf/2408.06223v1.pdf","comment":"15 pages, 5 figures, 8 tables"},{"id":"http://arxiv.org/abs/2406.05930v2","updated":"2024-08-12T15:10:00Z","published":"2024-06-09T22:46:41Z","title":"Semisupervised Neural Proto-Language Reconstruction","summary":" Existing work implementing comparative reconstruction of ancestral languages\n(proto-languages) has usually required full supervision. However, historical\nreconstruction models are only of practical value if they can be trained with a\nlimited amount of labeled data. We propose a semisupervised historical\nreconstruction task in which the model is trained on only a small amount of\nlabeled data (cognate sets with proto-forms) and a large amount of unlabeled\ndata (cognate sets without proto-forms). We propose a neural architecture for\ncomparative reconstruction (DPD-BiReconstructor) incorporating an essential\ninsight from linguists' comparative method: that reconstructed words should not\nonly be reconstructable from their daughter words, but also deterministically\ntransformable back into their daughter words. We show that this architecture is\nable to leverage unlabeled cognate sets to outperform strong semisupervised\nbaselines on this novel task.\n","authors":["Liang Lu","Peirong Xie","David R. Mortensen"],"pdf_url":"https://arxiv.org/pdf/2406.05930v2.pdf","comment":"Accepted to ACL 2024; v2: correct typo"},{"id":"http://arxiv.org/abs/2401.04343v2","updated":"2024-08-12T15:07:50Z","published":"2024-01-09T03:53:59Z","title":"Private Fine-tuning of Large Language Models with Zeroth-order\n Optimization","summary":" Differentially private stochastic gradient descent (DP-SGD) allows models to\nbe trained in a privacy-preserving manner, but has proven difficult to scale to\nthe era of foundation models. We introduce DP-ZO, a private fine-tuning\nframework for large language models by privatizing zeroth order optimization\nmethods. A key insight into the design of our method is that the direction of\nthe gradient in the zeroth-order optimization we use is random and the only\ninformation from training data is the step size, i.e., a scalar. Therefore, we\nonly need to privatize the scalar step size, which is memory-efficient. DP-ZO\nprovides a strong privacy-utility trade-off across different tasks, and model\nsizes that are comparable to DP-SGD in $(\\varepsilon,\\delta)$-DP. Notably,\nDP-ZO possesses significant advantages over DP-SGD in memory efficiency, and\nobtains higher utility in $\\varepsilon$-DP when using the Laplace mechanism.\n","authors":["Xinyu Tang","Ashwinee Panda","Milad Nasr","Saeed Mahloujifar","Prateek Mittal"],"pdf_url":"https://arxiv.org/pdf/2401.04343v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.06195v1","updated":"2024-08-12T14:42:13Z","published":"2024-08-12T14:42:13Z","title":"Mutual Reasoning Makes Smaller LLMs Stronger Problem-Solvers","summary":" This paper introduces rStar, a self-play mutual reasoning approach that\nsignificantly improves reasoning capabilities of small language models (SLMs)\nwithout fine-tuning or superior models. 
rStar decouples reasoning into a\nself-play mutual generation-discrimination process. First, a target SLM\naugments the Monte Carlo Tree Search (MCTS) with a rich set of human-like\nreasoning actions to construct higher quality reasoning trajectories. Next,\nanother SLM, with capabilities similar to the target SLM, acts as a\ndiscriminator to verify each trajectory generated by the target SLM. The\nmutually agreed reasoning trajectories are considered mutual consistent, thus\nare more likely to be correct. Extensive experiments across five SLMs\ndemonstrate rStar can effectively solve diverse reasoning problems, including\nGSM8K, GSM-Hard, MATH, SVAMP, and StrategyQA. Remarkably, rStar boosts GSM8K\naccuracy from 12.51% to 63.91% for LLaMA2-7B, from 36.46% to 81.88% for\nMistral-7B, from 74.53% to 91.13% for LLaMA3-8B-Instruct. Code will be\navailable at https://github.com/zhentingqi/rStar.\n","authors":["Zhenting Qi","Mingyuan Ma","Jiahang Xu","Li Lyna Zhang","Fan Yang","Mao Yang"],"pdf_url":"https://arxiv.org/pdf/2408.06195v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.06186v1","updated":"2024-08-12T14:34:06Z","published":"2024-08-12T14:34:06Z","title":"Improving Structural Diversity of Blackbox LLMs via\n Chain-of-Specification Prompting","summary":" The capability to generate diverse text is a key challenge facing large\nlanguage models (LLMs). Thus far, diversity has been studied via metrics such\nas $n$-gram diversity or diversity of BERT embeddings. However, for these kinds\nof diversity, the user has little control over the dimensions along which\ndiversity is considered. For example, in the poetry domain, one might desire\ndiversity in terms of rhyme and meter, whereas in the code domain, one might\ndesire diversity in terms of the kinds of expressions used to solve a problem.\nWe propose a diversity metric called structural diversity, where the user\nprovides a mapping from generated text to features capturing the kinds of\ndiversity that they care about. In addition, we propose a novel strategy called\nchain-of-specification (CoS) prompting for improving diversity by first having\nthe LLM generate a specification encoding one instance of structural features,\nand then prompting the LLM to generate text that satisfies these features;\nnotably, our strategy works with blackbox LLMs. In our experiments, we show\nthat for structural diversity in the poetry and code domains, CoS significantly\nimproves diversity compared to several baselines.\n","authors":["Halley Young","Yimeng Zeng","Jacob Gardner","Osbert Bastani"],"pdf_url":"https://arxiv.org/pdf/2408.06186v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03745v3","updated":"2024-08-12T14:13:15Z","published":"2024-04-04T18:34:32Z","title":"Fakes of Varying Shades: How Warning Affects Human Perception and\n Engagement Regarding LLM Hallucinations","summary":" The widespread adoption and transformative effects of large language models\n(LLMs) have sparked concerns regarding their capacity to produce inaccurate and\nfictitious content, referred to as `hallucinations'. Given the potential risks\nassociated with hallucinations, humans should be able to identify them. This\nresearch aims to understand the human perception of LLM hallucinations by\nsystematically varying the degree of hallucination (genuine, minor\nhallucination, major hallucination) and examining its interaction with warning\n(i.e., a warning of potential inaccuracies: absent vs. present). 
Participants\n(N=419) from Prolific rated the perceived accuracy and engaged with content\n(e.g., like, dislike, share) in a Q/A format. Participants ranked content as\ntruthful in the order of genuine, minor hallucination, and major hallucination,\nand user engagement behaviors mirrored this pattern. More importantly, we\nobserved that warning improved the detection of hallucination without\nsignificantly affecting the perceived truthfulness of genuine content. We\nconclude by offering insights for future tools to aid human detection of\nhallucinations. All survey materials, demographic questions, and post-session\nquestions are available at:\nhttps://github.com/MahjabinNahar/fakes-of-varying-shades-survey-materials\n","authors":["Mahjabin Nahar","Haeseung Seo","Eun-Ju Lee","Aiping Xiong","Dongwon Lee"],"pdf_url":"https://arxiv.org/pdf/2404.03745v3.pdf","comment":"Accepted at COLM 2024"},{"id":"http://arxiv.org/abs/2408.04660v2","updated":"2024-08-12T14:12:23Z","published":"2024-08-05T20:01:10Z","title":"XMainframe: A Large Language Model for Mainframe Modernization","summary":" Mainframe operating systems, despite their inception in the 1940s, continue\nto support critical sectors like finance and government. However, these systems\nare often viewed as outdated, requiring extensive maintenance and\nmodernization. Addressing this challenge necessitates innovative tools that can\nunderstand and interact with legacy codebases. To this end, we introduce\nXMainframe, a state-of-the-art large language model (LLM) specifically designed\nwith knowledge of mainframe legacy systems and COBOL codebases. Our solution\ninvolves the creation of an extensive data collection pipeline to produce\nhigh-quality training datasets, enhancing XMainframe's performance in this\nspecialized domain. Additionally, we present MainframeBench, a comprehensive\nbenchmark for assessing mainframe knowledge, including multiple-choice\nquestions, question answering, and COBOL code summarization. Our empirical\nevaluations demonstrate that XMainframe consistently outperforms existing\nstate-of-the-art LLMs across these tasks. Specifically, XMainframe achieves 30%\nhigher accuracy than DeepSeek-Coder on multiple-choice questions, doubles the\nBLEU score of Mixtral-Instruct 8x7B on question answering, and scores six times\nhigher than GPT-3.5 on COBOL summarization. Our work highlights the potential\nof XMainframe to drive significant advancements in managing and modernizing\nlegacy systems, thereby enhancing productivity and saving time for software\ndevelopers.\n","authors":["Anh T. V. Dau","Hieu Trung Dao","Anh Tuan Nguyen","Hieu Trung Tran","Phong X. Nguyen","Nghi D. Q. Bui"],"pdf_url":"https://arxiv.org/pdf/2408.04660v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.10620v2","updated":"2024-08-12T14:07:32Z","published":"2024-05-17T08:33:27Z","title":"MC-GPT: Empowering Vision-and-Language Navigation with Memory Map and\n Reasoning Chains","summary":" In the Vision-and-Language Navigation (VLN) task, the agent is required to\nnavigate to a destination following a natural language instruction. While\nlearning-based approaches have been a major solution to the task, they suffer\nfrom high training costs and lack of interpretability. Recently, Large Language\nModels (LLMs) have emerged as a promising tool for VLN due to their strong\ngeneralization capabilities. However, existing LLM-based methods face\nlimitations in memory construction and diversity of navigation strategies. 
To\naddress these challenges, we propose a suite of techniques. Firstly, we\nintroduce a method to maintain a topological map that stores navigation\nhistory, retaining information about viewpoints, objects, and their spatial\nrelationships. This map also serves as a global action space. Additionally, we\npresent a Navigation Chain of Thoughts module, leveraging human navigation\nexamples to enrich navigation strategy diversity. Finally, we establish a\npipeline that integrates navigational memory and strategies with perception and\naction prediction modules. Experimental results on the REVERIE and R2R datasets\nshow that our method effectively enhances the navigation ability of the LLM and\nimproves the interpretability of navigation reasoning.\n","authors":["Zhaohuan Zhan","Lisha Yu","Sijie Yu","Guang Tan"],"pdf_url":"https://arxiv.org/pdf/2405.10620v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.00487v2","updated":"2024-08-12T14:06:48Z","published":"2024-06-29T16:34:23Z","title":"It's Morphing Time: Unleashing the Potential of Multiple LLMs via\n Multi-objective Optimization","summary":" In this paper, we introduce a novel approach for large language model merging\nvia black-box multi-objective optimization algorithms. The goal of model\nmerging is to combine multiple models, each excelling in different tasks, into\na single model that outperforms any of the individual source models. However,\nmodel merging faces two significant challenges: First, existing methods rely\nheavily on human intuition and customized strategies to tackle multiple tasks.\nSecond, it's difficult to search for the great model merging configuration in\nlimited evaluations. To address these challenges, we propose a multi-objective\noptimization based model merging method named MM-MO. The proposed method can\nautomatically search merging configurations for multiple tasks with\nmulti-objective optimization algorithms. Moreover, to obtain high-quality model\nmerging configurations within a limited number of evaluation iterations, we\nhave made several improvements to multi-objective Bayesian optimization\nspecifically for model merging scenarios. First, we introduced a weak-to-strong\nmethod to improve the acquisition strategy. Second, we employed Fisher\ninformation to select configurations, further increasing the chances of\ndiscovering superior model merging configurations. Third, we designed a\nsparsity metric as an additional optimization objective to enhance the model's\ngeneralization performance across different tasks. We conducted comprehensive\nexperiments with other mainstream model merging methods, demonstrating that our\nmethod consistently outperforms them. Moreover, performance improvements are\nobserved even on the tasks not explicitly targeted as optimization objectives,\nindicating that our method enhances the overall potential of the model. ...\n","authors":["Bingdong Li","Zixiang Di","Yanting Yang","Hong Qian","Peng Yang","Hao Hao","Ke Tang","Aimin Zhou"],"pdf_url":"https://arxiv.org/pdf/2407.00487v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.06150v1","updated":"2024-08-12T13:44:24Z","published":"2024-08-12T13:44:24Z","title":"LipidBERT: A Lipid Language Model Pre-trained on METiS de novo Lipid\n Library","summary":" In this study, we generate and maintain a database of 10 million virtual\nlipids through METiS's in-house de novo lipid generation algorithms and lipid\nvirtual screening techniques. 
These virtual lipids serve as a corpus for\npre-training, lipid representation learning, and downstream task knowledge\ntransfer, culminating in state-of-the-art LNP property prediction performance.\nWe propose LipidBERT, a BERT-like model pre-trained with the Masked Language\nModel (MLM) and various secondary tasks. Additionally, we compare the\nperformance of embeddings generated by LipidBERT and PhatGPT, our GPT-like\nlipid generation model, on downstream tasks. The proposed bilingual LipidBERT\nmodel operates in two languages: the language of ionizable lipid pre-training,\nusing in-house dry-lab lipid structures, and the language of LNP fine-tuning,\nutilizing in-house LNP wet-lab data. This dual capability positions LipidBERT\nas a key AI-based filter for future screening tasks, including new versions of\nMETiS de novo lipid libraries and, more importantly, candidates for in vivo\ntesting for organ-targeting LNPs. To the best of our knowledge, this is the\nfirst successful demonstration of the capability of a pre-trained language\nmodel on virtual lipids and its effectiveness in downstream tasks using wet-lab\ndata. This work showcases the clever utilization of METiS's in-house de novo\nlipid library as well as the power of dry-wet lab integration.\n","authors":["Tianhao Yu","Cai Yao","Zhuorui Sun","Feng Shi","Lin Zhang","Kangjie Lyu","Xuan Bai","Andong Liu","Xicheng Zhang","Jiali Zou","Wenshou Wang","Chris Lai","Kai Wang"],"pdf_url":"https://arxiv.org/pdf/2408.06150v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.06142v1","updated":"2024-08-12T13:37:31Z","published":"2024-08-12T13:37:31Z","title":"Med42-v2: A Suite of Clinical LLMs","summary":" Med42-v2 introduces a suite of clinical large language models (LLMs) designed\nto address the limitations of generic models in healthcare settings. These\nmodels are built on Llama3 architecture and fine-tuned using specialized\nclinical data. They underwent multi-stage preference alignment to effectively\nrespond to natural prompts. While generic models are often preference-aligned\nto avoid answering clinical queries as a precaution, Med42-v2 is specifically\ntrained to overcome this limitation, enabling its use in clinical settings.\nMed42-v2 models demonstrate superior performance compared to the original\nLlama3 models in both 8B and 70B parameter configurations and GPT-4 across\nvarious medical benchmarks. These LLMs are developed to understand clinical\nqueries, perform reasoning tasks, and provide valuable assistance in clinical\nenvironments. The models are now publicly available at\n\\href{https://huggingface.co/m42-health}{https://huggingface.co/m42-health}.\n","authors":["Clément Christophe","Praveen K Kanithi","Tathagata Raha","Shadab Khan","Marco AF Pimentel"],"pdf_url":"https://arxiv.org/pdf/2408.06142v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.04655v2","updated":"2024-08-12T13:20:36Z","published":"2024-08-05T11:27:51Z","title":"Strong and weak alignment of large language models with human values","summary":" Minimizing negative impacts of Artificial Intelligent (AI) systems on human\nsocieties without human supervision requires them to be able to align with\nhuman values. However, most current work only addresses this issue from a\ntechnical point of view, e.g., improving current methods relying on\nreinforcement learning from human feedback, neglecting what it means and is\nrequired for alignment to occur. Here, we propose to distinguish strong and\nweak value alignment. 
Strong alignment requires cognitive abilities (either\nhuman-like or different from humans) such as understanding and reasoning about\nagents' intentions and their ability to causally produce desired effects. We\nargue that this is required for AI systems like large language models (LLMs) to\nbe able to recognize situations presenting a risk that human values may be\nflouted. To illustrate this distinction, we present a series of prompts showing\nChatGPT's, Gemini's and Copilot's failures to recognize some of these\nsituations. We moreover analyze word embeddings to show that the nearest\nneighbors of some human values in LLMs differ from humans' semantic\nrepresentations. We then propose a new thought experiment that we call \"the\nChinese room with a word transition dictionary\", in extension of John Searle's\nfamous proposal. We finally mention current promising research directions\ntowards a weak alignment, which could produce statistically satisfying answers\nin a number of common situations, however so far without ensuring any truth\nvalue.\n","authors":["Mehdi Khamassi","Marceau Nahon","Raja Chatila"],"pdf_url":"https://arxiv.org/pdf/2408.04655v2.pdf","comment":"Accepted for publication in Scientific Reports, special issue on AI\n alignment"},{"id":"http://arxiv.org/abs/2403.14402v2","updated":"2024-08-12T13:16:48Z","published":"2024-03-21T13:52:17Z","title":"XLAVS-R: Cross-Lingual Audio-Visual Speech Representation Learning for\n Noise-Robust Speech Perception","summary":" Speech recognition and translation systems perform poorly on noisy inputs,\nwhich are frequent in realistic environments. Augmenting these systems with\nvisual signals has the potential to improve robustness to noise. However,\naudio-visual (AV) data is only available in limited amounts and for fewer\nlanguages than audio-only resources. To address this gap, we present XLAVS-R, a\ncross-lingual audio-visual speech representation model for noise-robust speech\nrecognition and translation in over 100 languages. It is designed to maximize\nthe benefits of limited multilingual AV pre-training data, by building on top\nof audio-only multilingual pre-training and simplifying existing pre-training\nschemes. Extensive evaluation on the MuAViC benchmark shows the strength of\nXLAVS-R on downstream audio-visual speech recognition and translation tasks,\nwhere it outperforms the previous state of the art by up to 18.5% WER and 4.7\nBLEU given noisy AV inputs, and enables strong zero-shot audio-visual ability\nwith audio-only fine-tuning.\n","authors":["HyoJung Han","Mohamed Anwar","Juan Pino","Wei-Ning Hsu","Marine Carpuat","Bowen Shi","Changhan Wang"],"pdf_url":"https://arxiv.org/pdf/2403.14402v2.pdf","comment":"ACL2024"},{"id":"http://arxiv.org/abs/2408.06124v1","updated":"2024-08-12T13:07:34Z","published":"2024-08-12T13:07:34Z","title":"Utilize Transformers for translating Wikipedia category names","summary":" On Wikipedia, articles are categorized to aid readers in navigating content\nefficiently. The manual creation of new categories can be laborious and\ntime-intensive. To tackle this issue, we built language models to translate\nWikipedia categories from English to Vietnamese with a dataset containing\n15,000 English-Vietnamese category pairs. Subsequently, small to medium-scale\nTransformer pre-trained models with a sequence-to-sequence architecture were\nfine-tuned for category translation. 
The experiments revealed that\nOPUS-MT-en-vi surpassed other models, attaining the highest performance with a\nBLEU score of 0.73, despite its smaller model storage. We expect our paper to\nbe an alternative solution for translation tasks with limited computer\nresources.\n","authors":["Hoang-Thang Ta","Quoc Thang La"],"pdf_url":"https://arxiv.org/pdf/2408.06124v1.pdf","comment":"5 pages, 1 figure"},{"id":"http://arxiv.org/abs/2408.06120v1","updated":"2024-08-12T13:02:31Z","published":"2024-08-12T13:02:31Z","title":"How ChatGPT Changed the Media's Narratives on AI: A Semi-Automated\n Narrative Analysis Through Frame Semantics","summary":" The recent explosion of attention to AI is arguably one of the biggest in the\ntechnology's media coverage. To investigate the effects it has on the\ndiscourse, we perform a mixed-method frame semantics-based analysis on a\ndataset of more than 49,000 sentences collected from 5846 news articles that\nmention AI. The dataset covers the twelve-month period centred around the\nlaunch of OpenAI's chatbot ChatGPT and is collected from the most visited\nopen-access English-language news publishers. Our findings indicate that during\nthe half year succeeding the launch, media attention rose\ntenfold$\\unicode{x2014}$from already historically high levels. During this\nperiod, discourse has become increasingly centred around experts and political\nleaders, and AI has become more closely associated with dangers and risks. A\ndeeper review of the data also suggests a qualitative shift in the types of\nthreat AI is thought to represent, as well as the anthropomorphic qualities\nascribed to it.\n","authors":["Igor Ryazanov","Carl Öhman","Johanna Björklund"],"pdf_url":"https://arxiv.org/pdf/2408.06120v1.pdf","comment":"18 pages, 6 figures and 2 appendices (5 pages)"},{"id":"http://arxiv.org/abs/2407.11046v3","updated":"2024-08-12T12:41:57Z","published":"2024-07-08T12:32:10Z","title":"A Survey on LoRA of Large Language Models","summary":" Low-Rank Adaptation~(LoRA), which updates the dense neural network layers\nwith pluggable low-rank matrices, is one of the best performed parameter\nefficient fine-tuning paradigms. Furthermore, it has significant advantages in\ncross-task generalization and privacy-preserving. Hence, LoRA has gained much\nattention recently, and the number of related literature demonstrates\nexponential growth. It is necessary to conduct a comprehensive overview of the\ncurrent progress on LoRA. This survey categorizes and reviews the progress from\nthe perspectives of (1) downstream adaptation improving variants that improve\nLoRA's performance on downstream tasks; (2) cross-task generalization methods\nthat mix multiple LoRA plugins to achieve cross-task generalization; (3)\nefficiency-improving methods that boost the computation-efficiency of LoRA; (4)\ndata privacy-preserving methods that use LoRA in federated learning; (5)\napplication. Besides, this survey also discusses the future directions in this\nfield. 
At last, we provide a Github\npage~\\footnote{\\href{https://github.com/ZJU-LLMs/Awesome-LoRAs.git}{https://github.com/ZJU-LLMs/Awesome-LoRAs.git}}\nfor readers to check the updates and initiate discussions on this survey paper.\n","authors":["Yuren Mao","Yuhang Ge","Yijiang Fan","Wenyi Xu","Yu Mi","Zhonghao Hu","Yunjun Gao"],"pdf_url":"https://arxiv.org/pdf/2407.11046v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.07032v2","updated":"2024-08-12T12:11:52Z","published":"2023-11-13T02:31:16Z","title":"ExpNote: Black-box Large Language Models are Better Task Solvers with\n Experience Notebook","summary":" Black-box Large Language Models (LLMs) have shown great power in solving\nvarious tasks and are considered general problem solvers. However, LLMs still\nfail in many specific tasks although they understand the task instruction. In this\npaper, we focus on the problem of boosting the ability of black-box LLMs to\nsolve downstream tasks. We propose ExpNote, an automated framework to help LLMs\nbetter adapt to unfamiliar tasks through reflecting and noting experiences from\ntraining data and retrieving them from external memory during testing. We\nevaluate ExpNote on multiple tasks and the experimental results demonstrate\nthat the proposed method significantly improves the performance of black-box\nLLMs. The data and code are available at\nhttps://github.com/forangel2014/ExpNote\n","authors":["Wangtao Sun","Xuanqing Yu","Shizhu He","Jun Zhao","Kang Liu"],"pdf_url":"https://arxiv.org/pdf/2311.07032v2.pdf","comment":"EMNLP 2023 findings"},{"id":"http://arxiv.org/abs/2408.06087v1","updated":"2024-08-12T12:04:14Z","published":"2024-08-12T12:04:14Z","title":"Building Decision Making Models Through Language Model Regime","summary":" We propose a novel approach for decision making problems leveraging the\ngeneralization capabilities of large language models (LLMs). Traditional\nmethods such as expert systems, planning algorithms, and reinforcement learning\noften exhibit limited generalization, typically requiring the training of new\nmodels for each unique task. In contrast, LLMs demonstrate remarkable success\nin generalizing across varied language tasks, inspiring a new strategy for\ntraining decision making models. Our approach, referred to as \"Learning then\nUsing\" (LTU), entails a two-stage process. Initially, the \\textit{learning}\nphase develops a robust foundational decision making model by integrating\ndiverse knowledge from various domains and decision making contexts. The\nsubsequent \\textit{using} phase refines this foundation model for specific\ndecision making scenarios. Distinct from other studies that employ LLMs for\ndecision making through supervised learning, our LTU method embraces a\nversatile training methodology that combines broad pre-training with targeted\nfine-tuning. Experiments in e-commerce domains such as advertising and search\noptimization have shown that the LTU approach outperforms traditional supervised\nlearning regimes in decision making capabilities and generalization. The LTU\napproach is the first practical training architecture for both single-step and\nmulti-step decision making tasks combined with LLMs, which can be applied\nbeyond game and robot domains. 
It provides a robust and adaptable framework for\ndecision making, enhances the effectiveness and flexibility of various systems\nin tackling various challenges.\n","authors":["Yu Zhang","Haoxiang Liu","Feijun Jiang","Weihua Luo","Kaifu Zhang"],"pdf_url":"https://arxiv.org/pdf/2408.06087v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.06065v1","updated":"2024-08-12T11:32:34Z","published":"2024-08-12T11:32:34Z","title":"An Investigation Into Explainable Audio Hate Speech Detection","summary":" Research on hate speech has predominantly revolved around detection and\ninterpretation from textual inputs, leaving verbal content largely unexplored.\nWhile there has been limited exploration into hate speech detection within\nverbal acoustic speech inputs, the aspect of interpretability has been\noverlooked. Therefore, we introduce a new task of explainable audio hate speech\ndetection. Specifically, we aim to identify the precise time intervals,\nreferred to as audio frame-level rationales, which serve as evidence for hate\nspeech classification. Towards this end, we propose two different approaches:\ncascading and End-to-End (E2E). The cascading approach initially converts audio\nto transcripts, identifies hate speech within these transcripts, and\nsubsequently locates the corresponding audio time frames. Conversely, the E2E\napproach processes audio utterances directly, which allows it to pinpoint hate\nspeech within specific time frames. Additionally, due to the lack of\nexplainable audio hate speech datasets that include audio frame-level\nrationales, we curated a synthetic audio dataset to train our models. We\nfurther validated these models on actual human speech utterances and found that\nthe E2E approach outperforms the cascading method in terms of the audio frame\nIntersection over Union (IoU) metric. Furthermore, we observed that including\nframe-level rationales significantly enhances hate speech detection accuracy\nfor the E2E approach.\n \\textbf{Disclaimer} The reader may encounter content of an offensive or\nhateful nature. However, given the nature of the work, this cannot be avoided.\n","authors":["Jinmyeong An","Wonjun Lee","Yejin Jeon","Jungseul Ok","Yunsu Kim","Gary Geunbae Lee"],"pdf_url":"https://arxiv.org/pdf/2408.06065v1.pdf","comment":"Accepted to SIGDIAL 2024"},{"id":"http://arxiv.org/abs/2408.06062v1","updated":"2024-08-12T11:23:24Z","published":"2024-08-12T11:23:24Z","title":"On Tables with Numbers, with Numbers","summary":" This paper is a critical reflection on the epistemic culture of contemporary\ncomputational linguistics, framed in the context of its growing obsession with\ntables with numbers. We argue against tables with numbers on the basis of their\nepistemic irrelevance, their environmental impact, their role in enabling and\nexacerbating social inequalities, and their deep ties to commercial\napplications and profit-driven research. We substantiate our arguments with\nempirical evidence drawn from a meta-analysis of computational linguistics\nresearch over the last decade.\n","authors":["Konstantinos Kogkalidis","Stergios Chatzikyriakidis"],"pdf_url":"https://arxiv.org/pdf/2408.06062v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.06061v1","updated":"2024-08-12T11:21:40Z","published":"2024-08-12T11:21:40Z","title":"Quantum Algorithms for Compositional Text Processing","summary":" Quantum computing and AI have found a fruitful intersection in the field of\nnatural language processing. 
We focus on the recently proposed DisCoCirc\nframework for natural language, and propose a quantum adaptation, QDisCoCirc.\nThis is motivated by a compositional approach to rendering AI interpretable:\nthe behavior of the whole can be understood in terms of the behavior of parts,\nand the way they are put together. For the model-native primitive operation of\ntext similarity, we derive quantum algorithms for fault-tolerant quantum\ncomputers to solve the task of question-answering within QDisCoCirc, and show\nthat this is BQP-hard; note that we do not consider the complexity of\nquestion-answering in other natural language processing models. Assuming\nwidely-held conjectures, implementing the proposed model classically would\nrequire super-polynomial resources. Therefore, it could provide a meaningful\ndemonstration of the power of practical quantum processors. The model\nconstruction builds on previous work in compositional quantum natural language\nprocessing. Word embeddings are encoded as parameterized quantum circuits, and\ncompositionality here means that the quantum circuits compose according to the\nlinguistic structure of the text. We outline a method for evaluating the model\non near-term quantum processors, and elsewhere we report on a recent\nimplementation of this on quantum hardware. In addition, we adapt a quantum\nalgorithm for the closest vector problem to obtain a Grover-like speedup in the\nfault-tolerant regime for our model. This provides an unconditional quadratic\nspeedup over any classical algorithm in certain circumstances, which we will\nverify empirically in future work.\n","authors":["Tuomas Laakkonen","Konstantinos Meichanetzidis","Bob Coecke"],"pdf_url":"https://arxiv.org/pdf/2408.06061v1.pdf","comment":"In Proceedings QPL 2024, arXiv:2408.05113"},{"id":"http://arxiv.org/abs/2403.06765v3","updated":"2024-08-12T10:55:38Z","published":"2024-03-11T14:35:45Z","title":"ConspEmoLLM: Conspiracy Theory Detection Using an Emotion-Based Large\n Language Model","summary":" The internet has brought both benefits and harms to society. A prime example\nof the latter is misinformation, including conspiracy theories, which flood the\nweb. Recent advances in natural language processing, particularly the emergence\nof large language models (LLMs), have improved the prospects of accurate\nmisinformation detection. However, most LLM-based approaches to conspiracy\ntheory detection focus only on binary classification and fail to account for\nthe important relationship between misinformation and affective features (i.e.,\nsentiment and emotions). Driven by a comprehensive analysis of conspiracy text\nthat reveals its distinctive affective features, we propose ConspEmoLLM, the\nfirst open-source LLM that integrates affective information and is able to\nperform diverse tasks relating to conspiracy theories. These tasks include not\nonly conspiracy theory detection, but also classification of theory type and\ndetection of related discussion (e.g., opinions towards theories). ConspEmoLLM\nis fine-tuned based on an emotion-oriented LLM using our novel ConDID dataset,\nwhich includes five tasks to support LLM instruction tuning and evaluation. We\ndemonstrate that when applied to these tasks, ConspEmoLLM largely outperforms\nseveral open-source general domain LLMs and ChatGPT, as well as an LLM that has\nbeen fine-tuned using ConDID, but which does not use affective features. 
This\nproject will be released on https://github.com/lzw108/ConspEmoLLM/.\n","authors":["Zhiwei Liu","Boyang Liu","Paul Thompson","Kailai Yang","Sophia Ananiadou"],"pdf_url":"https://arxiv.org/pdf/2403.06765v3.pdf","comment":"Work in progress"},{"id":"http://arxiv.org/abs/2408.06044v1","updated":"2024-08-12T10:26:39Z","published":"2024-08-12T10:26:39Z","title":"DiagESC: Dialogue Synthesis for Integrating Depression Diagnosis into\n Emotional Support Conversation","summary":" Dialogue systems for mental health care aim to provide appropriate support to\nindividuals experiencing mental distress. While extensive research has been\nconducted to deliver adequate emotional support, existing studies cannot\nidentify individuals who require professional medical intervention and cannot\noffer suitable guidance. We introduce the Diagnostic Emotional Support\nConversation task for an advanced mental health management system. We develop\nthe DESC dataset to assess depression symptoms while maintaining user\nexperience by utilizing task-specific utterance generation prompts and a strict\nfiltering algorithm. Evaluations by professional psychological counselors\nindicate that DESC has a superior ability to diagnose depression than existing\ndata. Additionally, conversational quality evaluation reveals that DESC\nmaintains fluent, consistent, and coherent dialogues.\n","authors":["Seungyeon Seo","Gary Geunbae Lee"],"pdf_url":"https://arxiv.org/pdf/2408.06044v1.pdf","comment":"Accepted by SIGDIAL 2024"},{"id":"http://arxiv.org/abs/2408.06043v1","updated":"2024-08-12T10:21:09Z","published":"2024-08-12T10:21:09Z","title":"Enhancing Dialogue Speech Recognition with Robust Contextual Awareness\n via Noise Representation Learning","summary":" Recent dialogue systems rely on turn-based spoken interactions, requiring\naccurate Automatic Speech Recognition (ASR). Errors in ASR can significantly\nimpact downstream dialogue tasks. To address this, using dialogue context from\nuser and agent interactions for transcribing subsequent utterances has been\nproposed. This method incorporates the transcription of the user's speech and\nthe agent's response as model input, using the accumulated context generated by\neach turn. However, this context is susceptible to ASR errors because it is\ngenerated by the ASR model in an auto-regressive fashion. Such noisy context\ncan further degrade the benefits of context input, resulting in suboptimal ASR\nperformance. In this paper, we introduce Context Noise Representation Learning\n(CNRL) to enhance robustness against noisy context, ultimately improving\ndialogue speech recognition accuracy. To maximize the advantage of context\nawareness, our approach includes decoder pre-training using text-based dialogue\ndata and noise representation learning for a context encoder. Based on the\nevaluation of speech dialogues, our method shows superior results compared to\nbaselines. 
Furthermore, the strength of our approach is highlighted in noisy\nenvironments where user speech is barely audible due to real-world noise,\nrelying on contextual information to transcribe the input accurately.\n","authors":["Wonjun Lee","San Kim","Gary Geunbae Lee"],"pdf_url":"https://arxiv.org/pdf/2408.06043v1.pdf","comment":"11 pages, 2 figures, Accepted to SIGDIAL2024"},{"id":"http://arxiv.org/abs/2408.06040v1","updated":"2024-08-12T10:15:13Z","published":"2024-08-12T10:15:13Z","title":"ARPA: A Novel Hybrid Model for Advancing Visual Word Disambiguation\n Using Large Language Models and Transformers","summary":" In the rapidly evolving fields of natural language processing and computer\nvision, Visual Word Sense Disambiguation (VWSD) stands as a critical, yet\nchallenging task. The quest for models that can seamlessly integrate and\ninterpret multimodal data is more pressing than ever. Imagine a system that can\nunderstand language with the depth and nuance of human cognition, while\nsimultaneously interpreting the rich visual context of the world around it.\n We present ARPA, an architecture that fuses the unparalleled contextual\nunderstanding of large language models with the advanced feature extraction\ncapabilities of transformers, which then pass through a custom Graph Neural\nNetwork (GNN) layer to learn intricate relationships and subtle nuances within\nthe data. This innovative architecture not only sets a new benchmark in visual\nword disambiguation but also introduces a versatile framework poised to\ntransform how linguistic and visual data interact by harnessing the synergistic\nstrengths of its components, ensuring robust performance even in the most\ncomplex disambiguation scenarios. Through a series of experiments and\ncomparative analysis, we reveal the substantial advantages of our model,\nunderscoring its potential to redefine standards in the field. Beyond its\narchitectural prowess, our architecture excels through experimental\nenrichments, including sophisticated data augmentation and multi-modal training\ntechniques.\n ARPA's introduction marks a significant milestone in visual word\ndisambiguation, offering a compelling solution that bridges the gap between\nlinguistic and visual modalities. We invite researchers and practitioners to\nexplore the capabilities of our model, envisioning a future where such hybrid\nmodels drive unprecedented advancements in artificial intelligence.\n","authors":["Aristi Papastavrou","Maria Lymperaiou","Giorgos Stamou"],"pdf_url":"https://arxiv.org/pdf/2408.06040v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.08618v2","updated":"2024-08-12T09:50:12Z","published":"2024-07-11T15:56:02Z","title":"Tamil Language Computing: the Present and the Future","summary":" This paper delves into the text processing aspects of Language Computing,\nwhich enables computers to understand, interpret, and generate human language.\nFocusing on tasks such as speech recognition, machine translation, sentiment\nanalysis, text summarization, and language modelling, language computing\nintegrates disciplines including linguistics, computer science, and cognitive\npsychology to create meaningful human-computer interactions. Recent\nadvancements in deep learning have made computers more accessible and capable\nof independent learning and adaptation. In examining the landscape of language\ncomputing, the paper emphasises foundational work like encoding, where Tamil\ntransitioned from ASCII to Unicode, enhancing digital communication. 
It\ndiscusses the development of computational resources, including raw data,\ndictionaries, glossaries, annotated data, and computational grammars, necessary\nfor effective language processing. The challenges of linguistic annotation, the\ncreation of treebanks, and the training of large language models are also\ncovered, emphasising the need for high-quality, annotated data and advanced\nlanguage models. The paper underscores the importance of building practical\napplications for languages like Tamil to address everyday communication needs,\nhighlighting gaps in current technology. It calls for increased research\ncollaboration, digitization of historical texts, and fostering digital usage to\nensure the comprehensive development of Tamil language processing, ultimately\nenhancing global communication and access to digital services.\n","authors":["Kengatharaiyer Sarveswaran"],"pdf_url":"https://arxiv.org/pdf/2407.08618v2.pdf","comment":"11 pages, This is the write-up of the address delivered at the 30th\n Annual Sessions of the Jaffna Science Association, held from March 29-31,\n 2023, at the University of Jaffna, Sri Lanka"},{"id":"http://arxiv.org/abs/2406.05510v2","updated":"2024-08-12T09:31:30Z","published":"2024-06-08T16:19:18Z","title":"Representation Learning with Conditional Information Flow Maximization","summary":" This paper proposes an information-theoretic representation learning\nframework, named conditional information flow maximization, to extract\nnoise-invariant sufficient representations for the input data and target task.\nIt promotes the learned representations have good feature uniformity and\nsufficient predictive ability, which can enhance the generalization of\npre-trained language models (PLMs) for the target task. Firstly, an information\nflow maximization principle is proposed to learn more sufficient\nrepresentations for the input and target by simultaneously maximizing both\ninput-representation and representation-label mutual information. Unlike the\ninformation bottleneck, we handle the input-representation information in an\nopposite way to avoid the over-compression issue of latent representations.\nBesides, to mitigate the negative effect of potential redundant features from\nthe input, we design a conditional information minimization principle to\neliminate negative redundant features while preserve noise-invariant features.\nExperiments on 13 language understanding benchmarks demonstrate that our method\neffectively improves the performance of PLMs for classification and regression.\nExtensive experiments show that the learned representations are more\nsufficient, robust and transferable.\n","authors":["Dou Hu","Lingwei Wei","Wei Zhou","Songlin Hu"],"pdf_url":"https://arxiv.org/pdf/2406.05510v2.pdf","comment":"16 pages, accepted to ACL 2024 (main conference), the code is\n available at https://github.com/zerohd4869/CIFM"},{"id":"http://arxiv.org/abs/2404.12059v2","updated":"2024-08-12T09:30:25Z","published":"2024-04-18T10:17:04Z","title":"Unsupervised Parsing by Searching for Frequent Word Sequences among\n Sentences with Equivalent Predicate-Argument Structures","summary":" Unsupervised constituency parsing focuses on identifying word sequences that\nform a syntactic unit (i.e., constituents) in target sentences. 
Linguists\nidentify the constituent by evaluating a set of Predicate-Argument Structure\n(PAS) equivalent sentences where we find the constituent appears more\nfrequently than non-constituents (i.e., the constituent corresponds to a\nfrequent word sequence within the sentence set). However, such frequency\ninformation is unavailable in previous parsing methods that identify the\nconstituent by observing sentences with diverse PAS. In this study, we\nempirically show that constituents correspond to frequent word sequences in the\nPAS-equivalent sentence set. We propose a frequency-based parser span-overlap\nthat (1) computes the span-overlap score as the word sequence's frequency in\nthe PAS-equivalent sentence set and (2) identifies the constituent structure by\nfinding a constituent tree with the maximum span-overlap score. The parser\nachieves state-of-the-art level parsing accuracy, outperforming existing\nunsupervised parsers in eight out of ten languages. Additionally, we discover a\nmultilingual phenomenon: participant-denoting constituents tend to have higher\nspan-overlap scores than equal-length event-denoting constituents, meaning that\nthe former tend to appear more frequently in the PAS-equivalent sentence set\nthan the latter. The phenomenon indicates a statistical difference between the\ntwo constituent types, laying the foundation for future labeled unsupervised\nparsing research.\n","authors":["Junjie Chen","Xiangheng He","Danushka Bollegala","Yusuke Miyao"],"pdf_url":"https://arxiv.org/pdf/2404.12059v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.06022v1","updated":"2024-08-12T09:21:41Z","published":"2024-08-12T09:21:41Z","title":"Controlling Surprisal in Music Generation via Information Content Curve\n Matching","summary":" In recent years, the quality and public interest in music generation systems\nhave grown, encouraging research into various ways to control these systems. We\npropose a novel method for controlling surprisal in music generation using\nsequence models. To achieve this goal, we define a metric called Instantaneous\nInformation Content (IIC). The IIC serves as a proxy function for the perceived\nmusical surprisal (as estimated from a probabilistic model) and can be\ncalculated at any point within a music piece. This enables the comparison of\nsurprisal across different musical content even if the musical events occur in\nirregular time intervals. We use beam search to generate musical material whose\nIIC curve closely approximates a given target IIC. We experimentally show that\nthe IIC correlates with harmonic and rhythmic complexity and note density. The\ncorrelation decreases with the length of the musical context used for\nestimating the IIC. Finally, we conduct a qualitative user study to test if\nhuman listeners can identify the IIC curves that have been used as targets when\ngenerating the respective musical material. We provide code for creating IIC\ninterpolations and IIC visualizations on https://github.com/muthissar/iic.\n","authors":["Mathias Rose Bjare","Stefan Lattner","Gerhard Widmer"],"pdf_url":"https://arxiv.org/pdf/2408.06022v1.pdf","comment":"8 pages, 4 figures, 2 tables, accepted at the 25th Int. 
Society for\n Music Information Retrieval Conf., San Francisco, USA, 2024"},{"id":"http://arxiv.org/abs/2405.15604v2","updated":"2024-08-12T08:30:46Z","published":"2024-05-24T14:38:11Z","title":"Text Generation: A Systematic Literature Review of Tasks, Evaluation,\n and Challenges","summary":" Text generation has become more accessible than ever, and the increasing\ninterest in these systems, especially those using large language models, has\nspurred an increasing number of related publications. We provide a systematic\nliterature review comprising 244 selected papers between 2017 and 2024. This\nreview categorizes works in text generation into five main tasks: open-ended\ntext generation, summarization, translation, paraphrasing, and question\nanswering. For each task, we review their relevant characteristics, sub-tasks,\nand specific challenges (e.g., missing datasets for multi-document\nsummarization, coherence in story generation, and complex reasoning for\nquestion answering). Additionally, we assess current approaches for evaluating\ntext generation systems and ascertain problems with current metrics. Our\ninvestigation shows nine prominent challenges common to all tasks and sub-tasks\nin recent text generation publications: bias, reasoning, hallucinations,\nmisuse, privacy, interpretability, transparency, datasets, and computing. We\nprovide a detailed analysis of these challenges, their potential solutions, and\nwhich gaps still require further engagement from the community. This systematic\nliterature review targets two main audiences: early career researchers in\nnatural language processing looking for an overview of the field and promising\nresearch directions, as well as experienced researchers seeking a detailed view\nof tasks, evaluation methodologies, open challenges, and recent mitigation\nstrategies.\n","authors":["Jonas Becker","Jan Philip Wahle","Bela Gipp","Terry Ruas"],"pdf_url":"https://arxiv.org/pdf/2405.15604v2.pdf","comment":"35 pages, 2 figures, 2 tables, Under review"},{"id":"http://arxiv.org/abs/2403.15740v2","updated":"2024-08-12T08:21:32Z","published":"2024-03-23T06:36:32Z","title":"Protecting Copyrighted Material with Unique Identifiers in Large\n Language Model Training","summary":" A major public concern regarding the training of large language models (LLMs)\nis whether they abuse copyrighted online text. Previous membership inference\nmethods may be misled by similar examples in vast amounts of training data.\nAdditionally, these methods are often too complex for general users to\nunderstand and use, making them centralized and lacking in transparency and\ntrustworthiness. To address these issues, we propose an alternative\n\\textit{insert-and-detection} methodology, advocating that web users and\ncontent platforms employ \\textbf{\\textit{unique identifiers}} for reliable and\nindependent membership inference. Users and platforms can create their own\nidentifiers, embed them in copyrighted text, and independently detect them in\nfuture LLMs. As an initial demonstration, we introduce \\textit{ghost\nsentences}, a primitive form of unique identifiers, consisting primarily of\npassphrases made up of random words. By embedding one ghost sentence in a few\ncopyrighted texts, users can detect its membership using a perplexity test and\na \\textit{user-friendly} last-$k$ words test. The perplexity test is based on\nthe fact that LLMs trained on natural language should exhibit high perplexity\nwhen encountering unnatural passphrases. 
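The perplexity test can be illustrated in a few lines; the sketch below scores a natural sentence against a random-word passphrase under a small causal LM (gpt2 is a freely available stand-in for the LLaMA-scale models used in the paper, and both example strings are invented).

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2").eval()

def perplexity(text: str) -> float:
    ids = tokenizer(text, return_tensors="pt").input_ids
    with torch.no_grad():
        loss = model(ids, labels=ids).loss  # mean next-token cross-entropy
    return torch.exp(loss).item()

natural = "The committee will meet again next Tuesday to review the budget."
passphrase = "lantern oboe nebula crumpet vortex saffron"  # unnatural random-word identifier
print(perplexity(natural), perplexity(passphrase))  # the passphrase should score much higher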
As the repetition increases, users can\nleverage the verbatim memorization ability of LLMs to perform a last-$k$ words\ntest by chatting with LLMs without writing any code. Both tests offer rigorous\nstatistical guarantees for membership inference. For LLaMA-13B, a perplexity\ntest on 30 ghost sentences with an average of 7 repetitions in 148K examples\nyields a 0.891 ROC AUC. For the last-$k$ words test with OpenLLaMA-3B, 11 out\nof 16 users, with an average of 24 examples each, successfully identify their\ndata from 1.8M examples.\n","authors":["Shuai Zhao","Linchao Zhu","Ruijie Quan","Yi Yang"],"pdf_url":"https://arxiv.org/pdf/2403.15740v2.pdf","comment":"Preprint, work in progress"},{"id":"http://arxiv.org/abs/2408.05977v1","updated":"2024-08-12T08:05:30Z","published":"2024-08-12T08:05:30Z","title":"The Language of Trauma: Modeling Traumatic Event Descriptions Across\n Domains with Explainable AI","summary":" Psychological trauma can manifest following various distressing events and is\ncaptured in diverse online contexts. However, studies traditionally focus on a\nsingle aspect of trauma, often neglecting the transferability of findings\nacross different scenarios. We address this gap by training language models\nwith progressing complexity on trauma-related datasets, including\ngenocide-related court data, a Reddit dataset on post-traumatic stress disorder\n(PTSD), counseling conversations, and Incel forum posts. Our results show that\nthe fine-tuned RoBERTa model excels in predicting traumatic events across\ndomains, slightly outperforming large language models like GPT-4. Additionally,\nSLALOM-feature scores and conceptual explanations effectively differentiate and\ncluster trauma-related language, highlighting different trauma aspects and\nidentifying sexual abuse and experiences related to death as a common traumatic\nevent across all datasets. This transferability is crucial as it allows for the\ndevelopment of tools to enhance trauma detection and intervention in diverse\npopulations and settings.\n","authors":["Miriam Schirmer","Tobias Leemann","Gjergji Kasneci","Jürgen Pfeffer","David Jurgens"],"pdf_url":"https://arxiv.org/pdf/2408.05977v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.04975v2","updated":"2024-08-12T07:54:04Z","published":"2024-08-09T09:56:30Z","title":"reCSE: Portable Reshaping Features for Sentence Embedding in\n Self-supervised Contrastive Learning","summary":" We propose reCSE, a self supervised contrastive learning sentence\nrepresentation framework based on feature reshaping. This framework is\ndifferent from the current advanced models that use discrete data augmentation\nmethods, but instead reshapes the input features of the original sentence,\naggregates the global information of each token in the sentence, and alleviates\nthe common problems of representation polarity and GPU memory consumption\nlinear increase in current advanced models. In addition, our reCSE has achieved\ncompetitive performance in semantic similarity tasks. And the experiment proves\nthat our proposed feature reshaping method has strong universality, which can\nbe transplanted to other self supervised contrastive learning frameworks and\nenhance their representation ability, even achieving state-of-the-art\nperformance. 
Our code is available at https://github.com/heavenhellchen/reCSE.\n","authors":["Fufangchen Zhao","Gao Jian","Danfeng Yan"],"pdf_url":"https://arxiv.org/pdf/2408.04975v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.04889v2","updated":"2024-08-12T07:14:27Z","published":"2024-04-07T09:10:47Z","title":"Ethos and Pathos in Online Group Discussions: Corpora for Polarisation\n Issues in Social Media","summary":" Growing polarisation in society caught the attention of the scientific\ncommunity as well as news media, which devote special issues to this\nphenomenon. At the same time, digitalisation of social interactions requires to\nrevise concepts from social science regarding establishment of trust, which is\na key feature of all human interactions, and group polarisation, as well as new\ncomputational tools to process large quantities of available data. Existing\nmethods seem insufficient to tackle the problem fully, thus, we propose to\napproach the problem by investigating rhetorical strategies employed by\nindividuals in polarising discussions online. To this end, we develop\nmulti-topic and multi-platform corpora with manual annotation of appeals to\nethos and pathos, two modes of persuasion in Aristotelian rhetoric. It can be\nemployed for training language models to advance the study of communication\nstrategies online on a large scale. With the use of computational methods, our\ncorpora allows an investigation of recurring patterns in polarising exchanges\nacross topics of discussion and media platforms, and conduct both quantitative\nand qualitative analyses of language structures leading to and engaged in\npolarisation.\n","authors":["Ewelina Gajewska","Katarzyna Budzynska","Barbara Konat","Marcin Koszowy","Konrad Kiljan","Maciej Uberna","He Zhang"],"pdf_url":"https://arxiv.org/pdf/2404.04889v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.05948v1","updated":"2024-08-12T06:48:43Z","published":"2024-08-12T06:48:43Z","title":"ConvKGYarn: Spinning Configurable and Scalable Conversational Knowledge\n Graph QA datasets with Large Language Models","summary":" The rapid advancement of Large Language Models (LLMs) and conversational\nassistants necessitates dynamic, scalable, and configurable conversational\ndatasets for training and evaluation. These datasets must accommodate diverse\nuser interaction modes, including text and voice, each presenting unique\nmodeling challenges. Knowledge Graphs (KGs), with their structured and evolving\nnature, offer an ideal foundation for current and precise knowledge. Although\nhuman-curated KG-based conversational datasets exist, they struggle to keep\npace with the rapidly changing user information needs. We present ConvKGYarn, a\nscalable method for generating up-to-date and configurable conversational KGQA\ndatasets. Qualitative psychometric analyses confirm our method can generate\nhigh-quality datasets rivaling a popular conversational KGQA dataset while\noffering it at scale and covering a wide range of human-interaction\nconfigurations. We showcase its utility by testing LLMs on diverse\nconversations - exploring model behavior on conversational KGQA sets with\ndifferent configurations grounded in the same KG fact set. 
Our results\nhighlight the ability of ConvKGYarn to improve KGQA foundations and evaluate\nparametric knowledge of LLMs, thus offering a robust solution to the constantly\nevolving landscape of conversational assistants.\n","authors":["Ronak Pradeep","Daniel Lee","Ali Mousavi","Jeff Pound","Yisi Sang","Jimmy Lin","Ihab Ilyas","Saloni Potdar","Mostafa Arefiyan","Yunyao Li"],"pdf_url":"https://arxiv.org/pdf/2408.05948v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.20613v2","updated":"2024-08-12T06:36:10Z","published":"2024-05-31T04:05:09Z","title":"FineRadScore: A Radiology Report Line-by-Line Evaluation Technique\n Generating Corrections with Severity Scores","summary":" The current gold standard for evaluating generated chest x-ray (CXR) reports\nis through radiologist annotations. However, this process can be extremely\ntime-consuming and costly, especially when evaluating large numbers of reports.\nIn this work, we present FineRadScore, a Large Language Model (LLM)-based\nautomated evaluation metric for generated CXR reports. Given a candidate report\nand a ground-truth report, FineRadScore gives the minimum number of\nline-by-line corrections required to go from the candidate to the ground-truth\nreport. Additionally, FineRadScore provides an error severity rating with each\ncorrection and generates comments explaining why the correction was needed. We\ndemonstrate that FineRadScore's corrections and error severity scores align\nwith radiologist opinions. We also show that, when used to judge the quality of\nthe report as a whole, FineRadScore aligns with radiologists as well as current\nstate-of-the-art automated CXR evaluation metrics. Finally, we analyze\nFineRadScore's shortcomings to provide suggestions for future improvements.\n","authors":["Alyssa Huang","Oishi Banerjee","Kay Wu","Eduardo Pontes Reis","Pranav Rajpurkar"],"pdf_url":"https://arxiv.org/pdf/2405.20613v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.06363v3","updated":"2024-08-12T06:17:21Z","published":"2023-12-11T13:11:04Z","title":"MMICT: Boosting Multi-Modal Fine-Tuning with In-Context Examples","summary":" Although In-Context Learning (ICL) brings remarkable performance gains to\nLarge Language Models (LLMs), the improvements remain lower than fine-tuning on\ndownstream tasks. This paper introduces Multi-Modal In-Context Tuning (MMICT),\na novel multi-modal fine-tuning paradigm that boosts multi-modal fine-tuning by\nfully leveraging the promising ICL capability of multi-modal LLMs (MM-LLMs). We\npropose the Multi-Modal Hub (M-Hub), a unified module that captures various\nmulti-modal features according to different inputs and objectives. Based on\nM-Hub, MMICT enables MM-LLMs to learn from in-context visual-guided textual\nfeatures and subsequently generate outputs conditioned on the textual-guided\nvisual features. Moreover, leveraging the flexibility of M-Hub, we design a\nvariety of in-context demonstrations. Extensive experiments on a diverse range\nof downstream multi-modal tasks demonstrate that MMICT significantly\noutperforms traditional fine-tuning strategy and the vanilla ICT method that\ndirectly takes the concatenation of all information from different modalities\nas input. 
Our implementation is available at:\nhttps://github.com/KDEGroup/MMICT.\n","authors":["Tao Chen","Enwei Zhang","Yuting Gao","Ke Li","Xing Sun","Yan Zhang","Hui Li","Rongrong Ji"],"pdf_url":"https://arxiv.org/pdf/2312.06363v3.pdf","comment":"TOMM 2024"},{"id":"http://arxiv.org/abs/2403.12968v2","updated":"2024-08-12T04:48:11Z","published":"2024-03-19T17:59:56Z","title":"LLMLingua-2: Data Distillation for Efficient and Faithful Task-Agnostic\n Prompt Compression","summary":" This paper focuses on task-agnostic prompt compression for better\ngeneralizability and efficiency. Considering the redundancy in natural\nlanguage, existing approaches compress prompts by removing tokens or lexical\nunits according to their information entropy obtained from a causal language\nmodel such as LLaMa-7B. The challenge is that information entropy may be a\nsuboptimal compression metric: (i) it only leverages unidirectional context and\nmay fail to capture all essential information needed for prompt compression;\n(ii) it is not aligned with the prompt compression objective.\n To address these issues, we propose a data distillation procedure to derive\nknowledge from an LLM to compress prompts without losing crucial information,\nand meantime, introduce an extractive text compression dataset. We formulate\nprompt compression as a token classification problem to guarantee the\nfaithfulness of the compressed prompt to the original one, and use a\nTransformer encoder as the base architecture to capture all essential\ninformation for prompt compression from the full bidirectional context. Our\napproach leads to lower latency by explicitly learning the compression\nobjective with smaller models such as XLM-RoBERTa-large and mBERT.\n We evaluate our method on both in-domain and out-of-domain datasets,\nincluding MeetingBank, LongBench, ZeroScrolls, GSM8K, and BBH. Despite its\nsmall size, our model shows significant performance gains over strong baselines\nand demonstrates robust generalization ability across different LLMs.\nAdditionally, our model is 3x-6x faster than existing prompt compression\nmethods, while accelerating the end-to-end latency by 1.6x-2.9x with\ncompression ratios of 2x-5x. Our code is available at\nhttps://aka.ms/LLMLingua-2.\n","authors":["Zhuoshi Pan","Qianhui Wu","Huiqiang Jiang","Menglin Xia","Xufang Luo","Jue Zhang","Qingwei Lin","Victor Rühle","Yuqing Yang","Chin-Yew Lin","H. Vicky Zhao","Lili Qiu","Dongmei Zhang"],"pdf_url":"https://arxiv.org/pdf/2403.12968v2.pdf","comment":"Accepted at Findings of ACL 2024"},{"id":"http://arxiv.org/abs/2310.06839v2","updated":"2024-08-12T03:53:35Z","published":"2023-10-10T17:59:58Z","title":"LongLLMLingua: Accelerating and Enhancing LLMs in Long Context Scenarios\n via Prompt Compression","summary":" In long context scenarios, large language models (LLMs) face three main\nchallenges: higher computational cost, performance reduction, and position\nbias. Research indicates that LLM performance hinges on the density and\nposition of key information in the input prompt. Inspired by these findings, we\npropose LongLLMLingua for prompt compression towards improving LLMs' perception\nof the key information to simultaneously address the three challenges. Our\nextensive evaluation across various long context scenarios demonstrates that\nLongLLMLingua not only enhances performance but also significantly reduces\ncosts and latency. 
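For intuition about the information-entropy style of prompt compression that these papers start from (and that LLMLingua-2 replaces with a learned token classifier), the sketch below keeps only the tokens with the highest self-information under a small causal LM; gpt2 and the 0.5 keep ratio are illustrative stand-ins rather than the papers' actual models or settings.

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")
lm = AutoModelForCausalLM.from_pretrained("gpt2").eval()

def compress(prompt: str, keep_ratio: float = 0.5) -> str:
    ids = tokenizer(prompt, return_tensors="pt").input_ids
    with torch.no_grad():
        logits = lm(ids).logits
    # self-information of each token given its prefix: -log p(x_t | x_<t)
    log_probs = torch.log_softmax(logits[0, :-1], dim=-1)
    info = -log_probs.gather(1, ids[0, 1:, None]).squeeze(1)
    k = max(1, int(keep_ratio * info.numel()))
    keep = torch.topk(info, k).indices.sort().values  # most informative tokens, original order
    return tokenizer.decode(torch.cat([ids[0, :1], ids[0, 1:][keep]]))

print(compress("Please answer the question using only the passage provided below.", 0.5))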
For instance, in the NaturalQuestions benchmark,\nLongLLMLingua boosts performance by up to 21.4% with around 4x fewer tokens in\nGPT-3.5-Turbo, leading to substantial cost savings. It achieves a 94.0% cost\nreduction in the LooGLE benchmark. Moreover, when compressing prompts of about\n10k tokens at ratios of 2x-6x, LongLLMLingua can accelerate end-to-end latency\nby 1.4x-2.6x. Our code is available at https://aka.ms/LongLLMLingua.\n","authors":["Huiqiang Jiang","Qianhui Wu","Xufang Luo","Dongsheng Li","Chin-Yew Lin","Yuqing Yang","Lili Qiu"],"pdf_url":"https://arxiv.org/pdf/2310.06839v2.pdf","comment":"Accepted at ACL 2024"},{"id":"http://arxiv.org/abs/2408.05911v1","updated":"2024-08-12T03:52:11Z","published":"2024-08-12T03:52:11Z","title":"A New Pipeline For Generating Instruction Dataset via RAG and Self\n Fine-Tuning","summary":" With the rapid development of large language models in recent years, there\nhas been an increasing demand for domain-specific Agents that can cater to the\nunique needs of enterprises and organizations. Unlike general models, which\nstrive for broad coverage, these specialized Agents rely on focused datasets\ntailored to their intended applications. This research proposes a pipeline that\nleverages the power of LLMs and the Retrieval-Augmented Generation (RAG)\nframework to construct high-quality instruction datasets for fine-tuning on\nspecific domains using custom document collections. By ingesting\ndomain-specific documents, the pipeline generates relevant and contextually\nappropriate instructions, thus effectively creating a comprehensive dataset for\nfine-tuning LLMs on the target domain. This approach overcomes the limitations\nof traditional dataset creation methods, which often rely on manual curation or\nweb-scraping techniques that may introduce noise and irrelevant data. Notably,\nour pipeline offers a dynamic solution that can quickly adapt to updates or\nmodifications in the domain-specific document collection, eliminating the need\nfor complete retraining. Additionally, it addresses the challenge of data\nscarcity by enabling the generation of instruction datasets from a limited set\nof initial documents, rendering it suitable for unpopular or specialized\ndomains where comprehensive datasets are scarce. As a case study, we apply this\napproach to the domain of psychiatry, a field requiring specialized knowledge\nand sensitive handling of patient information. The resulting fine-tuned LLM\ndemonstrates the viability of the proposed approach and underscores\nits potential for widespread adoption across various industries and domains\nwhere tailored, accurate, and contextually relevant language models are\nindispensable.\n","authors":["Chih-Wei Song","Yu-Kai Lee","Yin-Te Tsai"],"pdf_url":"https://arxiv.org/pdf/2408.05911v1.pdf","comment":"5 pages, SCA 2024: The 7th IEEE International Workshop on Smart\n Computing & Applications"},{"id":"http://arxiv.org/abs/2212.14548v4","updated":"2024-08-12T03:44:48Z","published":"2022-12-30T05:03:15Z","title":"How would Stance Detection Techniques Evolve after the Launch of\n ChatGPT?","summary":" Stance detection refers to the task of extracting the standpoint (Favor,\nAgainst or Neither) towards a target in given texts. Such research gains\nincreasing attention with the proliferation of social media content. The\nconventional framework of handling stance detection is converting it into text\nclassification tasks. 
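Casting stance detection as a prompted classification call, as the abstract describes, can be sketched roughly as follows; the openai client usage is a generic example, the model name is a placeholder rather than the specific ChatGPT version the paper evaluated, an API key is assumed to be configured, and the example text and target are invented.

from openai import OpenAI  # assumes the openai package and an OPENAI_API_KEY in the environment

client = OpenAI()

def stance(text: str, target: str) -> str:
    prompt = (
        f"Text: {text}\n"
        f"Target: {target}\n"
        "What is the stance of the text towards the target? "
        "Answer with exactly one word: Favor, Against, or Neither."
    )
    response = client.chat.completions.create(
        model="gpt-4o-mini",  # placeholder model name
        messages=[{"role": "user", "content": prompt}],
    )
    return response.choices[0].message.content.strip()

print(stance("Wind farms ruin the landscape and endanger birds.", "renewable energy"))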
Deep learning models have already replaced rule-based\nmodels and traditional machine learning models in solving such problems.\nCurrent deep neural networks are facing two main challenges which are\ninsufficient labeled data and information in social media posts and the\nunexplainable nature of deep learning models. A new pre-trained language model\nchatGPT was launched on Nov 30, 2022. For the stance detection tasks, our\nexperiments show that ChatGPT can achieve SOTA or similar performance for\ncommonly used datasets including SemEval-2016 and P-Stance. At the same time,\nChatGPT can provide explanation for its own prediction, which is beyond the\ncapability of any existing model. The explanations for the cases it cannot\nprovide classification results are especially useful. ChatGPT has the potential\nto be the best AI model for stance detection tasks in NLP, or at least change\nthe research paradigm of this field. ChatGPT also opens up the possibility of\nbuilding explanatory AI for stance detection.\n","authors":["Bowen Zhang","Daijun Ding","Liwen Jing","Genan Dai","Nan Yin"],"pdf_url":"https://arxiv.org/pdf/2212.14548v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.01038v2","updated":"2024-08-12T03:44:13Z","published":"2024-08-02T06:21:36Z","title":"UNER: A Unified Prediction Head for Named Entity Recognition in\n Visually-rich Documents","summary":" The recognition of named entities in visually-rich documents (VrD-NER) plays\na critical role in various real-world scenarios and applications. However, the\nresearch in VrD-NER faces three major challenges: complex document layouts,\nincorrect reading orders, and unsuitable task formulations. To address these\nchallenges, we propose a query-aware entity extraction head, namely UNER, to\ncollaborate with existing multi-modal document transformers to develop more\nrobust VrD-NER models. The UNER head considers the VrD-NER task as a\ncombination of sequence labeling and reading order prediction, effectively\naddressing the issues of discontinuous entities in documents. Experimental\nevaluations on diverse datasets demonstrate the effectiveness of UNER in\nimproving entity extraction performance. Moreover, the UNER head enables a\nsupervised pre-training stage on various VrD-NER datasets to enhance the\ndocument transformer backbones and exhibits substantial knowledge transfer from\nthe pre-training stage to the fine-tuning stage. By incorporating universal\nlayout understanding, a pre-trained UNER-based model demonstrates significant\nadvantages in few-shot and cross-linguistic scenarios and exhibits zero-shot\nentity extraction abilities.\n","authors":["Yi Tu","Chong Zhang","Ya Guo","Huan Chen","Jinyang Tang","Huijia Zhu","Qi Zhang"],"pdf_url":"https://arxiv.org/pdf/2408.01038v2.pdf","comment":"accepted by ACM Multimedia 2024"},{"id":"http://arxiv.org/abs/2408.05906v1","updated":"2024-08-12T03:32:53Z","published":"2024-08-12T03:32:53Z","title":"AdTEC: A Unified Benchmark for Evaluating Text Quality in Search Engine\n Advertising","summary":" With the increase in the more fluent ad texts automatically created by\nnatural language generation technology, it is in the high demand to verify the\nquality of these creatives in a real-world setting. We propose AdTEC, the first\npublic benchmark to evaluate ad texts in multiple aspects from the perspective\nof practical advertising operations. 
Our contributions are: (i) Defining five\ntasks for evaluating the quality of ad texts and building a dataset based on\nthe actual operational experience of advertising agencies, which is typically\nkept in-house. (ii) Validating the performance of existing pre-trained language\nmodels (PLMs) and human evaluators on the dataset. (iii) Analyzing the\ncharacteristics of the benchmark and outlining its open challenges. The results show\nthat while PLMs have already reached the practical usage level in several\ntasks, humans still outperform them in certain domains, implying that there is\nsignificant room for improvement in those areas.\n","authors":["Peinan Zhang","Yusuke Sakai","Masato Mita","Hiroki Ouchi","Taro Watanabe"],"pdf_url":"https://arxiv.org/pdf/2408.05906v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.19687v2","updated":"2024-08-12T02:51:59Z","published":"2024-07-29T04:10:13Z","title":"Efficiently and Effectively: A Two-stage Approach to Balance Plaintext\n and Encrypted Text for Traffic Classification","summary":" Encrypted traffic classification is the task of identifying the application\nor service associated with encrypted network traffic. One effective approach\nfor this task is to use deep learning methods to encode the raw traffic bytes\ndirectly and automatically extract features for classification (byte-based\nmodels). However, current byte-based models input raw traffic bytes, whether\nplaintext or encrypted text, for automated feature extraction, neglecting the\ndistinct impacts of plaintext and encrypted text on downstream tasks.\nAdditionally, these models primarily focus on improving classification\naccuracy, with little emphasis on the efficiency of models. In this paper, for\nthe first time, we analyze the impact of plaintext and encrypted text on the\nmodel's effectiveness and efficiency. Based on our observations and findings,\nwe propose a two-phase approach to balance the trade-off between plaintext and\nencrypted text in traffic classification. Specifically, Stage one is to\nDetermine whether the Plain text is enough to be accurately Classified (DPC)\nusing the proposed DPC Selector. This stage quickly identifies samples that can\nbe classified using plaintext, leveraging explicit byte features in plaintext\nto enhance the model's efficiency. Stage two aims to adaptively make a\nclassification with the result from stage one. This stage incorporates\nencrypted text information for samples that cannot be classified using\nplaintext alone, ensuring the model's effectiveness on traffic classification\ntasks. Experiments on two datasets demonstrate that our proposed model achieves\nstate-of-the-art results in both effectiveness and efficiency.\n","authors":["Wei Peng"],"pdf_url":"https://arxiv.org/pdf/2407.19687v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.18312v3","updated":"2024-08-12T02:43:32Z","published":"2024-06-26T12:51:37Z","title":"AI-native Memory: A Pathway from LLMs Towards AGI","summary":" Large language models (LLMs) have shown the world sparks of\nartificial general intelligence (AGI). One opinion, especially from some\nstartups working on LLMs, argues that an LLM with nearly unlimited context\nlength can realize AGI. 
However, they might be too optimistic about the\nlong-context capability of (existing) LLMs -- (1) Recent literature has shown\nthat their effective context length is significantly smaller than their claimed\ncontext length; and (2) Our reasoning-in-a-haystack experiments further\ndemonstrate that simultaneously finding the relevant information from a long\ncontext and conducting (simple) reasoning is nearly impossible. In this paper,\nwe envision a pathway from LLMs to AGI through the integration of\n\\emph{memory}. We believe that AGI should be a system where LLMs serve as core\nprocessors. In addition to raw data, the memory in this system would store a\nlarge number of important conclusions derived from reasoning processes.\nCompared with retrieval-augmented generation (RAG) that merely processing raw\ndata, this approach not only connects semantically related information closer,\nbut also simplifies complex inferences at the time of querying. As an\nintermediate stage, the memory will likely be in the form of natural language\ndescriptions, which can be directly consumed by users too. Ultimately, every\nagent/person should have its own large personal model, a deep neural network\nmodel (thus \\emph{AI-native}) that parameterizes and compresses all types of\nmemory, even the ones cannot be described by natural languages. Finally, we\ndiscuss the significant potential of AI-native memory as the transformative\ninfrastructure for (proactive) engagement, personalization, distribution, and\nsocial in the AGI era, as well as the incurred privacy and security challenges\nwith preliminary solutions.\n","authors":["Jingbo Shang","Zai Zheng","Jiale Wei","Xiang Ying","Felix Tao","Mindverse Team"],"pdf_url":"https://arxiv.org/pdf/2406.18312v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.05894v1","updated":"2024-08-12T02:16:47Z","published":"2024-08-12T02:16:47Z","title":"GlyphPattern: An Abstract Pattern Recognition for Vision-Language Models","summary":" Vision-Language Models (VLMs) building upon the foundation of powerful large\nlanguage models have made rapid progress in reasoning across visual and textual\ndata. While VLMs perform well on vision tasks that they are trained on, our\nresults highlight key challenges in abstract pattern recognition. We present\nGlyphPattern, a 954 item dataset that pairs 318 human-written descriptions of\nvisual patterns from 40 writing systems with three visual presentation styles.\n GlyphPattern evaluates abstract pattern recognition in VLMs, requiring models\nto understand and judge natural language descriptions of visual patterns.\nGlyphPattern patterns are drawn from a large-scale cognitive science\ninvestigation of human writing systems; as a result, they are rich in spatial\nreference and compositionality. Our experiments show that GlyphPattern is\nchallenging for state-of-the-art VLMs (GPT-4o achieves only 55% accuracy), with\nmarginal gains from few-shot prompting. 
Our detailed error analysis reveals\nchallenges at multiple levels, including visual processing, natural language\nunderstanding, and pattern generalization.\n","authors":["Zixuan Wu","Yoolim Kim","Carolyn Jane Anderson"],"pdf_url":"https://arxiv.org/pdf/2408.05894v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.00958v2","updated":"2024-08-12T02:08:03Z","published":"2024-07-01T04:29:35Z","title":"Universal Approximation Theory: The basic theory for large language\n models","summary":" Language models have emerged as a critical area of focus in artificial\nintelligence, particularly with the introduction of groundbreaking innovations\nlike ChatGPT. Large-scale Transformer networks have quickly become the leading\napproach for advancing natural language processing algorithms. Built on the\nTransformer architecture, these models enable interactions that closely mimic\nhuman communication and, equipped with extensive knowledge, can even assist in\nguiding human tasks. Despite their impressive capabilities and growing\ncomplexity, a key question remains-the theoretical foundations of large\nlanguage models (LLMs). What makes Transformer so effective for powering\nintelligent language applications, such as translation and coding? What\nunderlies LLMs' ability for In-Context Learning (ICL)? How does the LoRA scheme\nenhance the fine-tuning of LLMs? And what supports the practicality of pruning\nLLMs? To address these critical questions and explore the technological\nstrategies within LLMs, we leverage the Universal Approximation Theory (UAT) to\noffer a theoretical backdrop, shedding light on the mechanisms that underpin\nthese advancements.\n","authors":["Wei Wang","Qing Li"],"pdf_url":"https://arxiv.org/pdf/2407.00958v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.01528v3","updated":"2024-08-12T01:44:26Z","published":"2024-02-02T16:15:24Z","title":"Decoding Speculative Decoding","summary":" Speculative Decoding is a widely used technique to speed up inference for\nLarge Language Models (LLMs) without sacrificing quality. When performing\ninference, speculative decoding uses a smaller draft model to generate\nspeculative tokens and then uses the target LLM to verify those draft tokens.\nThe speedup provided by speculative decoding heavily depends on the choice of\nthe draft model. In this work, we perform a detailed study comprising over 350\nexperiments with LLaMA-65B and OPT-66B using speculative decoding and delineate\nthe factors that affect the performance gain provided by speculative decoding.\nOur experiments indicate that the performance of speculative decoding depends\nheavily on the latency of the draft model, and the draft model's capability in\nlanguage modeling does not correlate strongly with its performance in\nspeculative decoding. Based on these insights we explore a new design space for\ndraft models and design hardware-efficient draft models for speculative\ndecoding. 
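A minimal greedy-acceptance sketch of the draft-then-verify loop behind speculative decoding is given below; distilgpt2 and gpt2 stand in for the much larger draft and target models studied in the paper, and the full rejection-sampling acceptance rule used in production systems is simplified to exact agreement on greedy tokens.

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")
draft = AutoModelForCausalLM.from_pretrained("distilgpt2").eval()   # small, fast draft model
target = AutoModelForCausalLM.from_pretrained("gpt2").eval()        # larger target model

@torch.no_grad()
def speculative_step(ids: torch.Tensor, k: int = 4) -> torch.Tensor:
    # 1) The draft model proposes k tokens greedily.
    proposal = ids
    for _ in range(k):
        nxt = draft(proposal).logits[0, -1].argmax()
        proposal = torch.cat([proposal, nxt.view(1, 1)], dim=1)
    # 2) The target model scores the whole proposal in a single forward pass.
    tgt = target(proposal).logits[0].argmax(dim=-1)
    # 3) Accept drafted tokens while they match the target's own greedy choice.
    accepted = ids
    for i in range(k):
        pos = ids.shape[1] + i
        if proposal[0, pos] == tgt[pos - 1]:
            accepted = proposal[:, : pos + 1]
        else:
            break
    # 4) Always append the target's next token after the accepted prefix.
    return torch.cat([accepted, tgt[accepted.shape[1] - 1].view(1, 1)], dim=1)

ids = tokenizer("Speculative decoding speeds up inference by", return_tensors="pt").input_ids
print(tokenizer.decode(speculative_step(ids)[0]))

The speed-up comes from step 2: however many drafted tokens end up accepted, the target model runs only once per step, which is consistent with the abstract's finding that draft-model latency dominates the achievable gain.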
Our newly designed draft model for LLaMA-65B can provide 111% higher\nthroughput than existing draft models and can generalize further to the LLaMA-2\nmodel family and supervised fine-tuned models.\n","authors":["Minghao Yan","Saurabh Agarwal","Shivaram Venkataraman"],"pdf_url":"https://arxiv.org/pdf/2402.01528v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.05882v1","updated":"2024-08-12T00:46:39Z","published":"2024-08-12T00:46:39Z","title":"Creating Arabic LLM Prompts at Scale","summary":" The debut of chatGPT and BARD has popularized instruction following text\ngeneration using LLMs, where a user can interrogate an LLM using natural\nlanguage requests and obtain natural language answers that match their\nrequests. Training LLMs to respond in this manner requires a large number of\nworked-out examples of user requests (aka prompts) with corresponding gold\nresponses. In this paper, we introduce two methods for creating such prompts\nfor Arabic cheaply and quickly. The first method entails automatically\ntranslating existing prompt datasets from English, such as PromptSource and\nSuper-NaturalInstructions, and then using machine translation quality\nestimation to retain high quality translations only. The second method involves\ncreating natural language prompts on top of existing Arabic NLP datasets. Using\nthese two methods we were able to create more than 67.4 million Arabic prompts\nthat cover a variety of tasks including summarization, headline generation,\ngrammar checking, open/closed question answering, creative writing, etc. We\nshow that fine-tuning an open 7 billion parameter large language model, namely\nbase Qwen2 7B, enables it to outperform a state-of-the-art 70 billion parameter\ninstruction tuned model, namely Llama3 70B, in handling Arabic prompts.\n","authors":["Abdelrahman El-Sheikh","Ahmed Elmogtaba","Kareem Darwish","Muhammad Elmallah","Ashraf Elneima","Hassan Sawaf"],"pdf_url":"https://arxiv.org/pdf/2408.05882v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.12874v2","updated":"2024-08-12T00:38:22Z","published":"2024-07-16T04:41:58Z","title":"SELF-GUIDE: Better Task-Specific Instruction Following via\n Self-Synthetic Finetuning","summary":" Large language models (LLMs) hold the promise of solving diverse tasks when\nprovided with appropriate natural language prompts. However, prompting often\nleads models to make predictions with lower accuracy compared to finetuning a\nmodel with ample training data. On the other hand, while finetuning LLMs on\ntask-specific data generally improves their performance, abundant annotated\ndatasets are not available for all tasks. Previous work has explored generating\ntask-specific data from state-of-the-art LLMs and using this data to finetune\nsmaller models, but this approach requires access to a language model other\nthan the one being trained, which introduces cost, scalability challenges, and\nlegal hurdles associated with continuously relying on more powerful LLMs. In\nresponse to these, we propose SELF-GUIDE, a multi-stage mechanism in which we\nsynthesize task-specific input-output pairs from the student LLM, then use\nthese input-output pairs to finetune the student LLM itself. In our empirical\nevaluation of the Natural Instructions V2 benchmark, we find that SELF-GUIDE\nimproves the performance of the LLM by a substantial margin. Specifically, we\nreport an absolute improvement of approximately 15% for classification tasks\nand 18% for generation tasks in the benchmark's metrics. 
This sheds light on\nthe promise of self-synthesized data guiding LLMs towards becoming\ntask-specific experts without any external learning signals.\n","authors":["Chenyang Zhao","Xueying Jia","Vijay Viswanathan","Tongshuang Wu","Graham Neubig"],"pdf_url":"https://arxiv.org/pdf/2407.12874v2.pdf","comment":"Accepted by COLM 2024"},{"id":"http://arxiv.org/abs/2402.14798v3","updated":"2024-08-12T23:47:48Z","published":"2024-02-22T18:55:17Z","title":"Enhancing Systematic Decompositional Natural Language Inference Using\n Informal Logic","summary":" Recent language models enable new opportunities for structured reasoning with\ntext, such as the construction of intuitive, proof-like textual entailment\ntrees without relying on brittle formal logic. However, progress in this\ndirection has been hampered by a long-standing lack of a clear protocol for\ndetermining what valid compositional entailment is. This absence causes noisy\ndatasets and limited performance gains by modern neuro-symbolic engines. To\naddress these problems, we formulate a consistent and theoretically grounded\napproach to annotating decompositional entailment and evaluate its impact on\nLLM-based textual inference. We find that our new dataset, RDTE (Recognizing\nDecompositional Textual Entailment), has a substantially higher internal\nconsistency (+9%) than prior decompositional entailment datasets. We also find\nthat training an RDTE-oriented entailment classifier via knowledge distillation\nand employing it in an entailment tree reasoning engine significantly improves\nboth accuracy and proof quality, illustrating the practical benefit of this\nadvance for textual inference.\n","authors":["Nathaniel Weir","Kate Sanders","Orion Weller","Shreya Sharma","Dongwei Jiang","Zhengping Jiang","Bhavana Dalvi Mishra","Oyvind Tafjord","Peter Jansen","Peter Clark","Benjamin Van Durme"],"pdf_url":"https://arxiv.org/pdf/2402.14798v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2209.07662v5","updated":"2024-08-12T23:42:17Z","published":"2022-09-16T00:54:44Z","title":"NELLIE: A Neuro-Symbolic Inference Engine for Grounded, Compositional,\n and Explainable Reasoning","summary":" Our goal is a modern approach to answering questions via systematic reasoning\nwhere answers are supported by human interpretable proof trees grounded in an\nNL corpus of authoritative facts. Such a system would help alleviate the\nchallenges of interpretability and hallucination with modern LMs, and the lack\nof grounding of current explanation methods (e.g., Chain-of-Thought). This\npaper proposes a new take on Prolog-based inference engines, where we replace\nhandcrafted rules with a combination of neural language modeling, guided\ngeneration, and semiparametric dense retrieval. Our implementation, NELLIE, is\nthe first system to demonstrate fully interpretable, end-to-end grounded QA as\nentailment tree proof search, going beyond earlier work explaining\nknown-to-be-true facts from text. In experiments, NELLIE outperforms a\nsimilar-sized state-of-the-art reasoner [Tafjord et al., 2022] while producing\nknowledge-grounded explanations. We also find NELLIE can exploit both\nsemi-structured and NL text corpora to guide reasoning. 
Together these suggest\na new way to jointly reap the benefits of both modern neural methods and\ntraditional symbolic reasoning.\n","authors":["Nathaniel Weir","Peter Clark","Benjamin Van Durme"],"pdf_url":"https://arxiv.org/pdf/2209.07662v5.pdf","comment":"Published at IJCAI 2024"},{"id":"http://arxiv.org/abs/2407.21050v2","updated":"2024-08-12T23:38:35Z","published":"2024-07-23T04:05:48Z","title":"Artificial Intelligence in Extracting Diagnostic Data from Dental\n Records","summary":" This research addresses the issue of missing structured data in dental\nrecords by extracting diagnostic information from unstructured text. The\nupdated periodontology classification system's complexity has increased\nincomplete or missing structured diagnoses. To tackle this, we use advanced AI\nand NLP methods, leveraging GPT-4 to generate synthetic notes for fine-tuning a\nRoBERTa model. This significantly enhances the model's ability to understand\nmedical and dental language. We evaluated the model using 120 randomly selected\nclinical notes from two datasets, demonstrating its improved diagnostic\nextraction accuracy. The results showed high accuracy in diagnosing periodontal\nstatus, stage, and grade, with Site 1 scoring 0.99 and Site 2 scoring 0.98. In\nthe subtype category, Site 2 achieved perfect scores, outperforming Site 1.\nThis method enhances extraction accuracy and broadens its use across dental\ncontexts. The study underscores AI and NLP's transformative impact on\nhealthcare delivery and management. Integrating AI and NLP technologies\nenhances documentation and simplifies administrative tasks by precisely\nextracting complex clinical information. This approach effectively addresses\nchallenges in dental diagnostics. Using synthetic training data from LLMs\noptimizes the training process, improving accuracy and efficiency in\nidentifying periodontal diagnoses from clinical notes. This innovative method\nholds promise for broader healthcare applications, potentially improving\npatient care quality.\n","authors":["Yao-Shun Chuang","Chun-Teh Lee","Oluwabunmi Tokede","Guo-Hao Lin","Ryan Brandon","Trung Duong Tran","Xiaoqian Jiang","Muhammad F. Walji"],"pdf_url":"https://arxiv.org/pdf/2407.21050v2.pdf","comment":"11 pages, 2 tables, 3 figures, under review"},{"id":"http://arxiv.org/abs/2408.06527v1","updated":"2024-08-12T23:19:02Z","published":"2024-08-12T23:19:02Z","title":"Chain-of-Strategy Planning with LLMs: Aligning the Generation of\n Psychotherapy Dialogue with Strategy in Motivational Interviewing","summary":" Recent advancements in large language models (LLMs) have shown promise in\ngenerating psychotherapeutic dialogues, especially in Motivational Interviewing\n(MI). However, how to employ strategies, a set of motivational interviewing\n(MI) skills, to generate therapeutic-adherent conversations with explainability\nis underexplored. We propose an approach called strategy-aware dialogue\ngeneration with Chain-of-Strategy (CoS) planning, which first predicts MI\nstrategies as reasoning and utilizes these strategies to guide the subsequent\ndialogue generation. It brings the potential for controllable and explainable\ngeneration in psychotherapy by aligning the generated MI dialogues with\ntherapeutic strategies. Extensive experiments including automatic and human\nevaluations are conducted to validate the effectiveness of the MI strategy. 
Our\nfindings demonstrate the potential of LLMs in producing strategically aligned\ndialogues and suggest directions for practical applications in\npsychotherapeutic settings.\n","authors":["Xin Sun","Xiao Tang","Abdallah El Ali","Zhuying Li","Xiaoyu Shen","Pengjie Ren","Jan de Wit","Jiahuan Pei","Jos A. Bosch"],"pdf_url":"https://arxiv.org/pdf/2408.06527v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.21070v2","updated":"2024-08-12T23:07:53Z","published":"2024-07-28T18:33:58Z","title":"Occam's Razor and Bender and Koller's Octopus","summary":" We discuss the teaching of the discussion surrounding Bender and Koller's\nprominent ACL 2020 paper, \"Climbing toward NLU: on meaning form, and\nunderstanding in the age of data\" \\cite{bender2020climbing}. We present what we\nunderstand to be the main contentions of the paper, and then recommend that the\nstudents engage with the natural counter-arguments to the claims in the paper.\nWe attach teaching materials that we use to facilitate teaching this topic to\nundergraduate students.\n","authors":["Michael Guerzhoy"],"pdf_url":"https://arxiv.org/pdf/2407.21070v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.06520v1","updated":"2024-08-12T22:40:01Z","published":"2024-08-12T22:40:01Z","title":"Hierarchical in-Context Reinforcement Learning with Hindsight Modular\n Reflections for Planning","summary":" Large Language Models (LLMs) have demonstrated remarkable abilities in\nvarious language tasks, making them promising candidates for decision-making in\nrobotics. Inspired by Hierarchical Reinforcement Learning (HRL), we propose\nHierarchical in-Context Reinforcement Learning (HCRL), a novel framework that\ndecomposes complex tasks into sub-tasks using an LLM-based high-level policy,\nin which a complex task is decomposed into sub-tasks by a high-level policy\non-the-fly. The sub-tasks, defined by goals, are assigned to the low-level\npolicy to complete. Once the LLM agent determines that the goal is finished, a\nnew goal will be proposed. To improve the agent's performance in multi-episode\nexecution, we propose Hindsight Modular Reflection (HMR), where, instead of\nreflecting on the full trajectory, we replace the task objective with\nintermediate goals and let the agent reflect on shorter trajectories to improve\nreflection efficiency. We evaluate the decision-making ability of the proposed\nHCRL in three benchmark environments--ALFWorld, Webshop, and HotpotQA. Results\nshow that HCRL can achieve 9%, 42%, and 10% performance improvement in 5\nepisodes of execution over strong in-context learning baselines.\n","authors":["Chuanneng Sun","Songjun Huang","Dario Pompili"],"pdf_url":"https://arxiv.org/pdf/2408.06520v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.06518v1","updated":"2024-08-12T22:30:55Z","published":"2024-08-12T22:30:55Z","title":"Does Liking Yellow Imply Driving a School Bus? Semantic Leakage in\n Language Models","summary":" Despite their wide adoption, the biases and unintended behaviors of language\nmodels remain poorly understood. In this paper, we identify and characterize a\nphenomenon never discussed before, which we call semantic leakage, where models\nleak irrelevant information from the prompt into the generation in unexpected\nways. We propose an evaluation setting to detect semantic leakage both by\nhumans and automatically, curate a diverse test suite for diagnosing this\nbehavior, and measure significant semantic leakage in 13 flagship models. 
We\nalso show that models exhibit semantic leakage in languages besides English and\nacross different settings and generation scenarios. This discovery highlights\nyet another type of bias in language models that affects their generation\npatterns and behavior.\n","authors":["Hila Gonen","Terra Blevins","Alisa Liu","Luke Zettlemoyer","Noah A. Smith"],"pdf_url":"https://arxiv.org/pdf/2408.06518v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.15401v3","updated":"2024-08-12T21:46:16Z","published":"2024-02-19T17:58:41Z","title":"Large Language Model for Mental Health: A Systematic Review","summary":" Large language models (LLMs) have attracted significant attention for\npotential applications in digital health, while their application in mental\nhealth is subject to ongoing debate. This systematic review aims to evaluate\nthe usage of LLMs in mental health, focusing on their strengths and limitations\nin early screening, digital interventions, and clinical applications. Adhering\nto PRISMA guidelines, we searched PubMed, IEEE Xplore, Scopus, JMIR, and ACM\nusing keywords: 'mental health OR mental illness OR mental disorder OR\npsychiatry' AND 'large language models'. We included articles published between\nJanuary 1, 2017, and April 30, 2024, excluding non-English articles. 30\narticles were evaluated, which included research on mental health conditions\nand suicidal ideation detection through text (n=15), usage of LLMs for mental\nhealth conversational agents (CAs) (n=7), and other applications and\nevaluations of LLMs in mental health (n=18). LLMs exhibit substantial\neffectiveness in detecting mental health issues and providing accessible,\nde-stigmatized eHealth services. However, the current risks associated with the\nclinical use might surpass their benefits. The study identifies several\nsignificant issues: the lack of multilingual datasets annotated by experts,\nconcerns about the accuracy and reliability of the content generated,\nchallenges in interpretability due to the 'black box' nature of LLMs, and\npersistent ethical dilemmas. These include the lack of a clear ethical\nframework, concerns about data privacy, and the potential for over-reliance on\nLLMs by both therapists and patients, which could compromise traditional\nmedical practice. Despite these issues, the rapid development of LLMs\nunderscores their potential as new clinical aids, emphasizing the need for\ncontinued research and development in this area.\n","authors":["Zhijun Guo","Alvina Lai","Johan Hilge Thygesen","Joseph Farrington","Thomas Keen","Kezhi Li"],"pdf_url":"https://arxiv.org/pdf/2403.15401v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.06855v4","updated":"2024-08-12T21:40:04Z","published":"2024-01-12T19:02:48Z","title":"Fine-grained Hallucination Detection and Editing for Language Models","summary":" Large language models (LMs) are prone to generate factual errors, which are\noften called hallucinations. In this paper, we introduce a comprehensive\ntaxonomy of hallucinations and argue that hallucinations manifest in diverse\nforms, each requiring varying degrees of careful assessments to verify\nfactuality. We propose a novel task of automatic fine-grained hallucination\ndetection and construct a new evaluation benchmark, FavaBench, that includes\nabout one thousand fine-grained human judgments on three LM outputs across\nvarious domains. 
Our analysis reveals that ChatGPT and Llama2-Chat (70B, 7B)\nexhibit diverse types of hallucinations in the majority of their outputs in\ninformation-seeking scenarios. We train FAVA, a retrieval-augmented LM by\ncarefully creating synthetic data to detect and correct fine-grained\nhallucinations. On our benchmark, our automatic and human evaluations show that\nFAVA significantly outperforms ChatGPT and GPT-4 on fine-grained hallucination\ndetection, and edits suggested by FAVA improve the factuality of LM-generated\ntext.\n","authors":["Abhika Mishra","Akari Asai","Vidhisha Balachandran","Yizhong Wang","Graham Neubig","Yulia Tsvetkov","Hannaneh Hajishirzi"],"pdf_url":"https://arxiv.org/pdf/2401.06855v4.pdf","comment":"Our code, data, and demo are available at\n https://fine-grained-hallucination.github.io. Published as a conference paper\n at COLM 2024"},{"id":"http://arxiv.org/abs/2408.06484v1","updated":"2024-08-12T20:40:46Z","published":"2024-08-12T20:40:46Z","title":"Cross-Lingual Conversational Speech Summarization with Large Language\n Models","summary":" Cross-lingual conversational speech summarization is an important problem,\nbut suffers from a dearth of resources. While transcriptions exist for a number\nof languages, translated conversational speech is rare and datasets containing\nsummaries are non-existent. We build upon the existing Fisher and Callhome\nSpanish-English Speech Translation corpus by supplementing the translations\nwith summaries. The summaries are generated using GPT-4 from the reference\ntranslations and are treated as ground truth. The task is to generate similar\nsummaries in the presence of transcription and translation errors. We build a\nbaseline cascade-based system using open-source speech recognition and machine\ntranslation models. We test a range of LLMs for summarization and analyze the\nimpact of transcription and translation errors. Adapting the Mistral-7B model\nfor this task performs significantly better than off-the-shelf models and\nmatches the performance of GPT-4.\n","authors":["Max Nelson","Shannon Wotherspoon","Francis Keith","William Hartmann","Matthew Snover"],"pdf_url":"https://arxiv.org/pdf/2408.06484v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.06474v1","updated":"2024-08-12T20:19:27Z","published":"2024-08-12T20:19:27Z","title":"TOGGL: Transcribing Overlapping Speech with Staggered Labeling","summary":" Transcribing the speech of multiple overlapping speakers typically requires\nseparating the audio into multiple streams and recognizing each one\nindependently. More recent work jointly separates and transcribes, but requires\na separate decoding component for each speaker. We propose the TOGGL model to\nsimultaneously transcribe the speech of multiple speakers. The TOGGL model uses\nspecial output tokens to attribute the speech to each speaker with only a\nsingle decoder. Our approach generalizes beyond two speakers, even when trained\nonly on two-speaker data. We demonstrate superior performance compared to\ncompeting approaches on a conversational speech dataset. 
Our approach also\nimproves performance on single-speaker audio.\n","authors":["Chak-Fai Li","William Hartmann","Matthew Snover"],"pdf_url":"https://arxiv.org/pdf/2408.06474v1.pdf","comment":"5 pages"},{"id":"http://arxiv.org/abs/2310.17143v3","updated":"2024-08-12T20:15:49Z","published":"2023-10-26T04:35:00Z","title":"Techniques for supercharging academic writing with generative AI","summary":" Academic writing is an indispensable yet laborious part of the research\nenterprise. This Perspective maps out principles and methods for using\ngenerative artificial intelligence (AI), specifically large language models\n(LLMs), to elevate the quality and efficiency of academic writing. We introduce\na human-AI collaborative framework that delineates the rationale (why), process\n(how), and nature (what) of AI engagement in writing. The framework pinpoints\nboth short-term and long-term reasons for engagement and their underlying\nmechanisms (e.g., cognitive offloading and imaginative stimulation). It reveals\nthe role of AI throughout the writing process, conceptualized through a\ntwo-stage model for human-AI collaborative writing, and the nature of AI\nassistance in writing, represented through a model of writing-assistance types\nand levels. Building on this framework, we describe effective prompting\ntechniques for incorporating AI into the writing routine (outlining, drafting,\nand editing) as well as strategies for maintaining rigorous scholarship,\nadhering to varied journal policies, and avoiding overreliance on AI.\nUltimately, the prudent integration of AI into academic writing can ease the\ncommunication burden, empower authors, accelerate discovery, and promote\ndiversity in science.\n","authors":["Zhicheng Lin"],"pdf_url":"https://arxiv.org/pdf/2310.17143v3.pdf","comment":"14 pages, 2 figures, 1 table, 1 box"},{"id":"http://arxiv.org/abs/2404.10306v5","updated":"2024-08-12T19:37:42Z","published":"2024-04-16T06:27:39Z","title":"Balancing Speciality and Versatility: a Coarse to Fine Framework for\n Supervised Fine-tuning Large Language Model","summary":" Aligned Large Language Models (LLMs) showcase remarkable versatility, capable\nof handling diverse real-world tasks. Meanwhile, aligned LLMs are also expected\nto exhibit speciality, excelling in specific applications. However, fine-tuning\nwith extra data, a common practice to gain speciality, often leads to\ncatastrophic forgetting (CF) of previously acquired versatility, hindering the\nmodel's performance across diverse tasks. In response to this challenge, we\npropose CoFiTune, a coarse to fine framework in an attempt to strike the\nbalance between speciality and versatility. At the coarse-grained level, an\nempirical tree-search algorithm is utilized to pinpoint and update specific\nmodules that are crucial for speciality, while keeping other parameters frozen;\nat the fine-grained level, a soft-masking mechanism regulates the update to the\nLLMs, mitigating the CF issue without harming speciality. In an overall\nevaluation of both speciality and versatility, CoFiTune consistently\noutperforms baseline methods across diverse tasks and model scales. Compared to\nthe full-parameter SFT, CoFiTune leads to about 14% versatility improvement and\nmarginal speciality loss on a 13B model. Lastly, based on further analysis, we\nprovide a speculative insight into the information forwarding process in LLMs,\nwhich helps explain the effectiveness of the proposed method. 
The code is\navailable at https://github.com/rattlesnakey/CoFiTune.\n","authors":["Hengyuan Zhang","Yanru Wu","Dawei Li","Sak Yang","Rui Zhao","Yong Jiang","Fei Tan"],"pdf_url":"https://arxiv.org/pdf/2404.10306v5.pdf","comment":"43 pages, 10 figures, accepted by ACL 2024"},{"id":"http://arxiv.org/abs/2406.07546v2","updated":"2024-08-12T19:33:52Z","published":"2024-06-11T17:59:48Z","title":"Commonsense-T2I Challenge: Can Text-to-Image Generation Models\n Understand Commonsense?","summary":" We present a novel task and benchmark for evaluating the ability of\ntext-to-image(T2I) generation models to produce images that align with\ncommonsense in real life, which we call Commonsense-T2I. Given two adversarial\ntext prompts containing an identical set of action words with minor\ndifferences, such as \"a lightbulb without electricity\" v.s. \"a lightbulb with\nelectricity\", we evaluate whether T2I models can conduct visual-commonsense\nreasoning, e.g. produce images that fit \"the lightbulb is unlit\" vs. \"the\nlightbulb is lit\" correspondingly. Commonsense-T2I presents an adversarial\nchallenge, providing pairwise text prompts along with expected outputs. The\ndataset is carefully hand-curated by experts and annotated with fine-grained\nlabels, such as commonsense type and likelihood of the expected outputs, to\nassist analyzing model behavior. We benchmark a variety of state-of-the-art\n(sota) T2I models and surprisingly find that, there is still a large gap\nbetween image synthesis and real life photos--even the DALL-E 3 model could\nonly achieve 48.92% on Commonsense-T2I, and the stable diffusion XL model only\nachieves 24.92% accuracy. Our experiments show that GPT-enriched prompts cannot\nsolve this challenge, and we include a detailed analysis about possible reasons\nfor such deficiency. We aim for Commonsense-T2I to serve as a high-quality\nevaluation benchmark for T2I commonsense checking, fostering advancements in\nreal life image generation.\n","authors":["Xingyu Fu","Muyu He","Yujie Lu","William Yang Wang","Dan Roth"],"pdf_url":"https://arxiv.org/pdf/2406.07546v2.pdf","comment":"COLM 2024, Project Url: https://zeyofu.github.io/CommonsenseT2I/"},{"id":"http://arxiv.org/abs/2408.06458v1","updated":"2024-08-12T19:18:05Z","published":"2024-08-12T19:18:05Z","title":"Towards Autonomous Agents: Adaptive-planning, Reasoning, and Acting in\n Language Models","summary":" We propose a novel in-context learning algorithm for building autonomous\ndecision-making language agents. The language agent continuously attempts to\nsolve the same task by self-correcting each time the task fails. Our selected\nlanguage agent demonstrates the ability to solve tasks in a text-based game\nenvironment. Our results show that the gemma-2-9b-it language model, using our\nproposed method, can successfully complete two of six tasks that failed in the\nfirst attempt. This highlights the effectiveness of our approach in enhancing\nthe problem-solving capabilities of a single language model through\nself-correction, paving the way for more advanced autonomous agents. 
The code\nis publicly available at\nhttps://github.com/YenCheHsiao/AutonomousLLMAgentwithAdaptingPlanning.\n","authors":["Yen-Che Hsiao","Abhishek Dutta"],"pdf_url":"https://arxiv.org/pdf/2408.06458v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.06450v1","updated":"2024-08-12T18:59:13Z","published":"2024-08-12T18:59:13Z","title":"Evaluating Language Models for Efficient Code Generation","summary":" We introduce Differential Performance Evaluation (DPE), a framework designed\nto reliably evaluate Large Language Models (LLMs) for efficient code\ngeneration. Traditional coding benchmarks often fail to provide reliable\ninsights into code efficiency, due to their reliance on simplistic test inputs\nand the absence of effective compound metrics. DPE addresses these issues by\nfocusing on efficiency-demanding programming tasks and establishing an\ninsightful compound metric for performance evaluation. DPE operates in two\nphases: To curate efficiency datasets, it selects efficiency-demanding tasks\nfrom existing coding benchmarks and generates computationally expensive inputs\nto stress the efficiency of LLM solutions. To assess the code efficiency, DPE\nprofiles the new solution and compares it globally against a set of reference\nsolutions that exhibit distinct efficiency levels, where the matched level\ndefines its efficiency score. As a proof of concept, we use DPE to create\nEvalPerf, a benchmark with 121 performance-challenging coding tasks. Our\ncomprehensive evaluation draws interesting findings on the efficiency impact of\nmodel sizes, instruction tuning, and prompting. For example, while the scaling\nlaw fails to account for code efficiency, general instruction tuning benefits\nboth code correctness and efficiency. We also evaluate the evaluation by\nexamining the effectiveness of DPE, showing that EvalPerf is reliable and\nconvenient to use even across platforms.\n","authors":["Jiawei Liu","Songrun Xie","Junhao Wang","Yuxiang Wei","Yifeng Ding","Lingming Zhang"],"pdf_url":"https://arxiv.org/pdf/2408.06450v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.15286v2","updated":"2024-08-12T18:42:22Z","published":"2024-07-21T22:50:11Z","title":"Intrinsic Self-correction for Enhanced Morality: An Analysis of Internal\n Mechanisms and the Superficial Hypothesis","summary":" Large Language Models (LLMs) are capable of producing content that\nperpetuates stereotypes, discrimination, and toxicity. The recently proposed\nmoral self-correction is a computationally efficient method for reducing\nharmful content in the responses of LLMs. However, the process of how injecting\nself-correction instructions can modify the behavior of LLMs remains\nunder-explored. In this paper, we explore the effectiveness of moral\nself-correction by answering three research questions: (1) In what scenarios\ndoes moral self-correction work? (2) What are the internal mechanisms of LLMs,\ne.g., hidden states, that are influenced by moral self-correction instructions?\n(3) Is intrinsic moral self-correction actually superficial? We argue that\nself-correction can help LLMs find a shortcut to more morally correct output,\nrather than truly reducing the immorality stored in hidden states. 
Through\nempirical investigation with tasks of language generation and multi-choice\nquestion answering, we conclude: (i) LLMs exhibit good performance across both\ntasks, and self-correction instructions are particularly beneficial when the\ncorrect answer is already top-ranked; (ii) The morality levels in intermediate\nhidden states are strong indicators as to whether one instruction would be more\neffective than another; (iii) Based on our analysis of intermediate hidden\nstates and task case studies of self-correction behaviors, we are first to\npropose the hypothesis that intrinsic moral self-correction is in fact\nsuperficial.\n","authors":["Guangliang Liu","Haitao Mao","Jiliang Tang","Kristen Marie Johnson"],"pdf_url":"https://arxiv.org/pdf/2407.15286v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.06423v1","updated":"2024-08-12T18:01:50Z","published":"2024-08-12T18:01:50Z","title":"Evaluating Language Models on Entity Disambiguation in Tables","summary":" Tables are crucial containers of information, but understanding their meaning\nmay be challenging. Indeed, recently, there has been a focus on Semantic Table\nInterpretation (STI), i.e., the task that involves the semantic annotation of\ntabular data to disambiguate their meaning. Over the years, there has been a\nsurge in interest in data-driven approaches based on deep learning that have\nincreasingly been combined with heuristic-based approaches. In the last period,\nthe advent of Large Language Models (LLMs) has led to a new category of\napproaches for table annotation. The interest in this research field,\ncharacterised by multiple challenges, has led to a proliferation of approaches\nemploying different techniques. However, these approaches have not been\nconsistently evaluated on a common ground, making evaluation and comparison\ndifficult. This work proposes an extensive evaluation of four state-of-the-art\n(SOTA) approaches - Alligator (formerly s-elBat), Dagobah, TURL, and\nTableLlama; the first two belong to the family of heuristic-based algorithms,\nwhile the others are respectively encoder-only and decoder-only LLMs. The\nprimary objective is to measure the ability of these approaches to solve the\nentity disambiguation task, with the ultimate aim of charting new research\npaths in the field.\n","authors":["Federico Belotti","Fabio Dadda","Marco Cremaschi","Roberto Avogadro","Riccardo Pozzi","Matteo Palmonari"],"pdf_url":"https://arxiv.org/pdf/2408.06423v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.05976v1","updated":"2024-08-12T08:05:30Z","published":"2024-08-12T08:05:30Z","title":"Global-to-Local Support Spectrums for Language Model Explainability","summary":" Existing sample-based methods, like influence functions and representer\npoints, measure the importance of a training point by approximating the effect\nof its removal from training. As such, they are skewed towards outliers and\npoints that are very close to the decision boundaries. The explanations\nprovided by these methods are often static and not specific enough for\ndifferent test points. In this paper, we propose a method to generate an\nexplanation in the form of support spectrums which are based on two main ideas:\nthe support sets and a global-to-local importance measure. The support set is\nthe set of training points, in the predicted class, that ``lie in between'' the\ntest point and training points in the other classes. They indicate how well the\ntest point can be distinguished from the points not in the predicted class. 
The\nglobal-to-local importance measure is obtained by decoupling existing methods\ninto the global and local components which are then used to select the points\nin the support set. Using this method, we are able to generate explanations\nthat are tailored to specific test points. In the experiments, we show the\neffectiveness of the method in image classification and text generation tasks.\n","authors":["Lucas Agussurja","Xinyang Lu","Bryan Kian Hsiang Low"],"pdf_url":"https://arxiv.org/pdf/2408.05976v1.pdf","comment":null}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2408.06336v1","updated":"2024-08-12T17:52:29Z","published":"2024-08-12T17:52:29Z","title":"Moo-ving Beyond Tradition: Revolutionizing Cattle Behavioural\n Phenotyping with Pose Estimation Techniques","summary":" The cattle industry has been a major contributor to the economy of many\ncountries, including the US and Canada. The integration of Artificial\nIntelligence (AI) has revolutionized this sector, mirroring its transformative\nimpact across all industries by enabling scalable and automated monitoring and\nintervention practices. AI has also introduced tools and methods that automate\nmany tasks previously performed by human labor with the help of computer\nvision, including health inspections. Among these methods, pose estimation has\na special place; pose estimation is the process of finding the position of\njoints in an image of animals. Analyzing the pose of animal subjects enables\nprecise identification and tracking of the animal's movement and the movements\nof its body parts. By summarizing the video and imagery data into movement and\njoint location using pose estimation and then analyzing this information, we\ncan address the scalability challenge in cattle management, focusing on health\nmonitoring, behavioural phenotyping and welfare concerns. Our study reviews\nrecent advancements in pose estimation methodologies, their applicability in\nimproving the cattle industry, existing challenges, and gaps in this field.\nFurthermore, we propose an initiative to enhance open science frameworks within\nthis field of study by launching a platform designed to connect industry and\nacademia.\n","authors":["Navid Ghassemi","Ali Goldani","Ian Q. Whishaw","Majid H. Mohajerani"],"pdf_url":"https://arxiv.org/pdf/2408.06336v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.06328v1","updated":"2024-08-12T17:44:33Z","published":"2024-08-12T17:44:33Z","title":"HeLiMOS: A Dataset for Moving Object Segmentation in 3D Point Clouds\n From Heterogeneous LiDAR Sensors","summary":" Moving object segmentation (MOS) using a 3D light detection and ranging\n(LiDAR) sensor is crucial for scene understanding and identification of moving\nobjects. Despite the availability of various types of 3D LiDAR sensors in the\nmarket, MOS research still predominantly focuses on 3D point clouds from\nmechanically spinning omnidirectional LiDAR sensors. Thus, we are, for example,\nlacking a dataset with MOS labels for point clouds from solid-state LiDAR\nsensors which have irregular scanning patterns. In this paper, we present a\nlabeled dataset, called \\textit{HeLiMOS}, that enables to test MOS approaches\non four heterogeneous LiDAR sensors, including two solid-state LiDAR sensors.\nFurthermore, we introduce a novel automatic labeling method to substantially\nreduce the labeling effort required from human annotators. 
To this end, our\nframework exploits an instance-aware static map building approach and\ntracking-based false label filtering. Finally, we provide experimental results\nregarding the performance of commonly used state-of-the-art MOS approaches on\nHeLiMOS that suggest a new direction for a sensor-agnostic MOS, which generally\nworks regardless of the type of LiDAR sensors used to capture 3D point clouds.\nOur dataset is available at https://sites.google.com/view/helimos.\n","authors":["Hyungtae Lim","Seoyeon Jang","Benedikt Mersch","Jens Behley","Hyun Myung","Cyrill Stachniss"],"pdf_url":"https://arxiv.org/pdf/2408.06328v1.pdf","comment":"Proc. IEEE/RSJ Int. Conf. Intell. Robot. Syst. (IROS) 2024"},{"id":"http://arxiv.org/abs/2408.06327v1","updated":"2024-08-12T17:44:17Z","published":"2024-08-12T17:44:17Z","title":"VisualAgentBench: Towards Large Multimodal Models as Visual Foundation\n Agents","summary":" Large Multimodal Models (LMMs) have ushered in a new era in artificial\nintelligence, merging capabilities in both language and vision to form highly\ncapable Visual Foundation Agents. These agents are postulated to excel across a\nmyriad of tasks, potentially approaching general artificial intelligence.\nHowever, existing benchmarks fail to sufficiently challenge or showcase the\nfull potential of LMMs in complex, real-world environments. To address this\ngap, we introduce VisualAgentBench (VAB), a comprehensive and pioneering\nbenchmark specifically designed to train and evaluate LMMs as visual foundation\nagents across diverse scenarios, including Embodied, Graphical User Interface,\nand Visual Design, with tasks formulated to probe the depth of LMMs'\nunderstanding and interaction capabilities. Through rigorous testing across\nnine proprietary LMM APIs and eight open models, we demonstrate the\nconsiderable yet still developing agent capabilities of these models.\nAdditionally, VAB constructs a trajectory training set constructed through\nhybrid methods including Program-based Solvers, LMM Agent Bootstrapping, and\nHuman Demonstrations, promoting substantial performance improvements in LMMs\nthrough behavior cloning. Our work not only aims to benchmark existing models\nbut also provides a solid foundation for future development into visual\nfoundation agents. Code, train \\& test data, and part of fine-tuned open LMMs\nare available at \\url{https://github.com/THUDM/VisualAgentBench}.\n","authors":["Xiao Liu","Tianjie Zhang","Yu Gu","Iat Long Iong","Yifan Xu","Xixuan Song","Shudan Zhang","Hanyu Lai","Xinyi Liu","Hanlin Zhao","Jiadai Sun","Xinyue Yang","Yu Yang","Zehan Qi","Shuntian Yao","Xueqiao Sun","Siyi Cheng","Qinkai Zheng","Hao Yu","Hanchen Zhang","Wenyi Hong","Ming Ding","Lihang Pan","Xiaotao Gu","Aohan Zeng","Zhengxiao Du","Chan Hee Song","Yu Su","Yuxiao Dong","Jie Tang"],"pdf_url":"https://arxiv.org/pdf/2408.06327v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.06321v1","updated":"2024-08-12T17:42:46Z","published":"2024-08-12T17:42:46Z","title":"EqNIO: Subequivariant Neural Inertial Odometry","summary":" Presently, neural networks are widely employed to accurately estimate 2D\ndisplacements and associated uncertainties from Inertial Measurement Unit (IMU)\ndata that can be integrated into stochastic filter networks like the Extended\nKalman Filter (EKF) as measurements and uncertainties for the update step in\nthe filter. However, such neural approaches overlook symmetry which is a\ncrucial inductive bias for model generalization. 
This oversight is notable\nbecause (i) physical laws adhere to symmetry principles when considering the\ngravity axis, meaning there exists the same transformation for both the\nphysical entity and the resulting trajectory, and (ii) displacements should\nremain equivariant to frame transformations when the inertial frame changes. To\naddress this, we propose a subequivariant framework by: (i) deriving\nfundamental layers such as linear and nonlinear layers for a subequivariant\nnetwork, designed to handle sequences of vectors and scalars, (ii) employing\nthe subequivariant network to predict an equivariant frame for the sequence of\ninertial measurements. This predicted frame can then be utilized for extracting\ninvariant features through projection, which are integrated with arbitrary\nnetwork architectures, (iii) transforming the invariant output by frame\ntransformation to obtain equivariant displacements and covariances. We\ndemonstrate the effectiveness and generalization of our Equivariant Framework\non a filter-based approach with TLIO architecture for TLIO and Aria datasets,\nand an end-to-end deep learning approach with RONIN architecture for RONIN,\nRIDI and OxIOD datasets.\n","authors":["Royina Karegoudra Jayanth","Yinshuang Xu","Ziyun Wang","Evangelos Chatzipantazis","Daniel Gehrig","Kostas Daniilidis"],"pdf_url":"https://arxiv.org/pdf/2408.06321v1.pdf","comment":"26 pages"},{"id":"http://arxiv.org/abs/2408.06305v1","updated":"2024-08-12T17:17:35Z","published":"2024-08-12T17:17:35Z","title":"From SAM to SAM 2: Exploring Improvements in Meta's Segment Anything\n Model","summary":" The Segment Anything Model (SAM), introduced to the computer vision community\nby Meta in April 2023, is a groundbreaking tool that allows automated\nsegmentation of objects in images based on prompts such as text, clicks, or\nbounding boxes. SAM excels in zero-shot performance, segmenting unseen objects\nwithout additional training, stimulated by a large dataset of over one billion\nimage masks. SAM 2 expands this functionality to video, leveraging memory from\npreceding and subsequent frames to generate accurate segmentation across entire\nvideos, enabling near real-time performance. This comparison shows how SAM has\nevolved to meet the growing need for precise and efficient segmentation in\nvarious applications. The study suggests that future advancements in models\nlike SAM will be crucial for improving computer vision technology.\n","authors":["Athulya Sundaresan Geetha","Muhammad Hussain"],"pdf_url":"https://arxiv.org/pdf/2408.06305v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.06303v1","updated":"2024-08-12T17:15:02Z","published":"2024-08-12T17:15:02Z","title":"Long-Form Answers to Visual Questions from Blind and Low Vision People","summary":" Vision language models can now generate long-form answers to questions about\nimages - long-form visual question answers (LFVQA). We contribute VizWiz-LF, a\ndataset of long-form answers to visual questions posed by blind and low vision\n(BLV) users. VizWiz-LF contains 4.2k long-form answers to 600 visual questions,\ncollected from human expert describers and six VQA models. We develop and\nannotate functional roles of sentences of LFVQA and demonstrate that long-form\nanswers contain information beyond the question answer such as explanations and\nsuggestions. We further conduct automatic and human evaluations with BLV and\nsighted people to evaluate long-form answers. 
BLV people perceive both\nhuman-written and generated long-form answers to be plausible, but generated\nanswers often hallucinate incorrect visual details, especially for unanswerable\nvisual questions (e.g., blurry or irrelevant images). To reduce hallucinations,\nwe evaluate the ability of VQA models to abstain from answering unanswerable\nquestions across multiple prompting strategies.\n","authors":["Mina Huh","Fangyuan Xu","Yi-Hao Peng","Chongyan Chen","Hansika Murugu","Danna Gurari","Eunsol Choi","Amy Pavel"],"pdf_url":"https://arxiv.org/pdf/2408.06303v1.pdf","comment":"COLM 2024"},{"id":"http://arxiv.org/abs/2408.06302v1","updated":"2024-08-12T17:14:41Z","published":"2024-08-12T17:14:41Z","title":"Finding Patterns in Ambiguity: Interpretable Stress Testing in the\n Decision~Boundary","summary":" The increasing use of deep learning across various domains highlights the\nimportance of understanding the decision-making processes of these black-box\nmodels. Recent research focusing on the decision boundaries of deep\nclassifiers, relies on generated synthetic instances in areas of low\nconfidence, uncovering samples that challenge both models and humans. We\npropose a novel approach to enhance the interpretability of deep binary\nclassifiers by selecting representative samples from the decision boundary -\nprototypes - and applying post-model explanation algorithms. We evaluate the\neffectiveness of our approach through 2D visualizations and GradientSHAP\nanalysis. Our experiments demonstrate the potential of the proposed method,\nrevealing distinct and compact clusters and diverse prototypes that capture\nessential features that lead to low-confidence decisions. By offering a more\naggregated view of deep classifiers' decision boundaries, our work contributes\nto the responsible development and deployment of reliable machine learning\nsystems.\n","authors":["Inês Gomes","Luís F. Teixeira","Jan N. van Rijn","Carlos Soares","André Restivo","Luís Cunha","Moisés Santos"],"pdf_url":"https://arxiv.org/pdf/2408.06302v1.pdf","comment":"To be published in the Responsible Generative AI workshop at CVPR"},{"id":"http://arxiv.org/abs/2309.00616v5","updated":"2024-08-12T16:58:33Z","published":"2023-09-01T17:59:56Z","title":"OpenIns3D: Snap and Lookup for 3D Open-vocabulary Instance Segmentation","summary":" In this work, we introduce OpenIns3D, a new 3D-input-only framework for 3D\nopen-vocabulary scene understanding. The OpenIns3D framework employs a\n\"Mask-Snap-Lookup\" scheme. The \"Mask\" module learns class-agnostic mask\nproposals in 3D point clouds, the \"Snap\" module generates synthetic scene-level\nimages at multiple scales and leverages 2D vision-language models to extract\ninteresting objects, and the \"Lookup\" module searches through the outcomes of\n\"Snap\" to assign category names to the proposed masks. This approach, yet\nsimple, achieves state-of-the-art performance across a wide range of 3D\nopen-vocabulary tasks, including recognition, object detection, and instance\nsegmentation, on both indoor and outdoor datasets. Moreover, OpenIns3D\nfacilitates effortless switching between different 2D detectors without\nrequiring retraining. When integrated with powerful 2D open-world models, it\nachieves excellent results in scene understanding tasks. Furthermore, when\ncombined with LLM-powered 2D models, OpenIns3D exhibits an impressive\ncapability to comprehend and process highly complex text queries that demand\nintricate reasoning and real-world knowledge. 
Project page:\nhttps://zheninghuang.github.io/OpenIns3D/\n","authors":["Zhening Huang","Xiaoyang Wu","Xi Chen","Hengshuang Zhao","Lei Zhu","Joan Lasenby"],"pdf_url":"https://arxiv.org/pdf/2309.00616v5.pdf","comment":"ECCV 2024. Project page: https://zheninghuang.github.io/OpenIns3D/"},{"id":"http://arxiv.org/abs/2311.17693v3","updated":"2024-08-12T16:52:09Z","published":"2023-11-29T15:00:06Z","title":"Toward a Surgeon-in-the-Loop Ophthalmic Robotic Apprentice using\n Reinforcement and Imitation Learning","summary":" Robot-assisted surgical systems have demonstrated significant potential in\nenhancing surgical precision and minimizing human errors. However, existing\nsystems cannot accommodate individual surgeons' unique preferences and\nrequirements. Additionally, they primarily focus on general surgeries (e.g.,\nlaparoscopy) and are unsuitable for highly precise microsurgeries, such as\nophthalmic procedures. Thus, we propose an image-guided approach for\nsurgeon-centered autonomous agents that can adapt to the individual surgeon's\nskill level and preferred surgical techniques during ophthalmic cataract\nsurgery. Our approach trains reinforcement and imitation learning agents\nsimultaneously using curriculum learning approaches guided by image data to\nperform all tasks of the incision phase of cataract surgery. By integrating the\nsurgeon's actions and preferences into the training process, our approach\nenables the robot to implicitly learn and adapt to the individual surgeon's\nunique techniques through surgeon-in-the-loop demonstrations. This results in a\nmore intuitive and personalized surgical experience for the surgeon while\nensuring consistent performance for the autonomous robotic apprentice. We\ndefine and evaluate the effectiveness of our approach in a simulated\nenvironment using our proposed metrics and highlight the trade-off between a\ngeneric agent and a surgeon-centered adapted agent. Finally, our approach has\nthe potential to extend to other ophthalmic and microsurgical procedures,\nopening the door to a new generation of surgeon-in-the-loop autonomous surgical\nrobots. We provide an open-source simulation framework for future development\nand reproducibility at\nhttps://github.com/amrgomaaelhady/CataractAdaptSurgRobot.\n","authors":["Amr Gomaa","Bilal Mahdy","Niko Kleer","Antonio Krüger"],"pdf_url":"https://arxiv.org/pdf/2311.17693v3.pdf","comment":"Accepted at IROS'24"},{"id":"http://arxiv.org/abs/2408.06286v1","updated":"2024-08-12T16:49:22Z","published":"2024-08-12T16:49:22Z","title":"Mipmap-GS: Let Gaussians Deform with Scale-specific Mipmap for\n Anti-aliasing Rendering","summary":" 3D Gaussian Splatting (3DGS) has attracted great attention in novel view\nsynthesis because of its superior rendering efficiency and high fidelity.\nHowever, the trained Gaussians suffer from severe zooming degradation due to\nnon-adjustable representation derived from single-scale training. Though some\nmethods attempt to tackle this problem via post-processing techniques such as\nselective rendering or filtering techniques towards primitives, the\nscale-specific information is not involved in Gaussians. In this paper, we\npropose a unified optimization method to make Gaussians adaptive for arbitrary\nscales by self-adjusting the primitive properties (e.g., color, shape and size)\nand distribution (e.g., position). 
Inspired by the mipmap technique, we design\npseudo ground-truth for the target scale and propose a scale-consistency\nguidance loss to inject scale information into 3D Gaussians. Our method is a\nplug-in module, applicable for any 3DGS models to solve the zoom-in and\nzoom-out aliasing. Extensive experiments demonstrate the effectiveness of our\nmethod. Notably, our method outperforms 3DGS in PSNR by an average of 9.25 dB\nfor zoom-in and 10.40 dB for zoom-out on the NeRF Synthetic dataset.\n","authors":["Jiameng Li","Yue Shi","Jiezhang Cao","Bingbing Ni","Wenjun Zhang","Kai Zhang","Luc Van Gool"],"pdf_url":"https://arxiv.org/pdf/2408.06286v1.pdf","comment":"9 pages"},{"id":"http://arxiv.org/abs/2403.08042v2","updated":"2024-08-12T16:37:31Z","published":"2024-03-12T19:34:50Z","title":"CT evaluation of 2D and 3D holistic deep learning methods for the\n volumetric segmentation of airway lesions","summary":" This research embarked on a comparative exploration of the holistic\nsegmentation capabilities of Convolutional Neural Networks (CNNs) in both 2D\nand 3D formats, focusing on cystic fibrosis (CF) lesions. The study utilized\ndata from two CF reference centers, covering five major CF structural changes.\nInitially, it compared the 2D and 3D models, highlighting the 3D model's\nsuperior capability in capturing complex features like mucus plugs and\nconsolidations. To improve the 2D model's performance, a loss adapted to fine\nstructures segmentation was implemented and evaluated, significantly enhancing\nits accuracy, though not surpassing the 3D model's performance. The models\nunderwent further validation through external evaluation against pulmonary\nfunction tests (PFTs), confirming the robustness of the findings. Moreover,\nthis study went beyond comparing metrics; it also included comprehensive\nassessments of the models' interpretability and reliability, providing valuable\ninsights for their clinical application.\n","authors":["Amel Imene Hadj Bouzid","Baudouin Denis de Senneville","Fabien Baldacci","Pascal Desbarats","Patrick Berger","Ilyes Benlala","Gaël Dournes"],"pdf_url":"https://arxiv.org/pdf/2403.08042v2.pdf","comment":"6 pages, 3 figures, 2 tables, IEEE International Symposium on\n Biomedical Imaging (ISBI) 2024"},{"id":"http://arxiv.org/abs/2408.06259v1","updated":"2024-08-12T16:15:32Z","published":"2024-08-12T16:15:32Z","title":"Context-aware Visual Storytelling with Visual Prefix Tuning and\n Contrastive Learning","summary":" Visual storytelling systems generate multi-sentence stories from image\nsequences. In this task, capturing contextual information and bridging visual\nvariation bring additional challenges. We propose a simple yet effective\nframework that leverages the generalization capabilities of pretrained\nfoundation models, only training a lightweight vision-language mapping network\nto connect modalities, while incorporating context to enhance coherence. We\nintroduce a multimodal contrastive objective that also improves visual\nrelevance and story informativeness. 
Extensive experimental results, across\nboth automatic metrics and human evaluations, demonstrate that the stories\ngenerated by our framework are diverse, coherent, informative, and interesting.\n","authors":["Yingjin Song","Denis Paperno","Albert Gatt"],"pdf_url":"https://arxiv.org/pdf/2408.06259v1.pdf","comment":"18 pages, 12 figures, accepted by INLG 2024"},{"id":"http://arxiv.org/abs/2408.06248v1","updated":"2024-08-12T16:00:17Z","published":"2024-08-12T16:00:17Z","title":"Rethinking Video with a Universal Event-Based Representation","summary":" Traditionally, video is structured as a sequence of discrete image frames.\nRecently, however, a novel video sensing paradigm has emerged which eschews\nvideo frames entirely. These \"event\" sensors aim to mimic the human vision\nsystem with asynchronous sensing, where each pixel has an independent, sparse\ndata stream. While these cameras enable high-speed and high-dynamic-range\nsensing, researchers often revert to a framed representation of the event data\nfor existing applications, or build bespoke applications for a particular\ncamera's event data type. At the same time, classical video systems have\nsignificant computational redundancy at the application layer, since pixel\nsamples are repeated across frames in the uncompressed domain.\n To address the shortcomings of existing systems, I introduce Address,\nDecimation, {\\Delta}t Event Representation (AD{\\Delta}ER, pronounced \"adder\"),\na novel intermediate video representation and system framework. The framework\ntranscodes a variety of framed and event camera sources into a single\nevent-based representation, which supports source-modeled lossy compression and\nbackward compatibility with traditional frame-based applications. I demonstrate\nthat AD{\\Delta}ER achieves state-of-the-art application speed and compression\nperformance for scenes with high temporal redundancy. Crucially, I describe how\nAD{\\Delta}ER unlocks an entirely new control mechanism for computer vision:\napplication speed can correlate with both the scene content and the level of\nlossy compression. Finally, I discuss the implications for event-based video on\nlarge-scale video surveillance and resource-constrained sensing.\n","authors":["Andrew Freeman"],"pdf_url":"https://arxiv.org/pdf/2408.06248v1.pdf","comment":"137 pages. PhD dissertation at the University of North Carolina,\n Chapel Hill"},{"id":"http://arxiv.org/abs/2408.06245v1","updated":"2024-08-12T15:54:46Z","published":"2024-08-12T15:54:46Z","title":"Latent Disentanglement for Low Light Image Enhancement","summary":" Many learning-based low-light image enhancement (LLIE) algorithms are based\non the Retinex theory. However, the Retinex-based decomposition techniques in\nsuch models introduce corruptions which limit their enhancement performance. In\nthis paper, we propose a Latent Disentangle-based Enhancement Network (LDE-Net)\nfor low light vision tasks. The latent disentanglement module disentangles the\ninput image in latent space such that no corruption remains in the disentangled\nContent and Illumination components. For LLIE task, we design a Content-Aware\nEmbedding (CAE) module that utilizes Content features to direct the enhancement\nof the Illumination component. For downstream tasks (e.g. nighttime UAV\ntracking and low-light object detection), we develop an effective light-weight\nenhancer based on the latent disentanglement framework. 
Comprehensive\nquantitative and qualitative experiments demonstrate that our LDE-Net\nsignificantly outperforms state-of-the-art methods on various LLIE benchmarks.\nIn addition, the great results obtained by applying our framework on the\ndownstream tasks also demonstrate the usefulness of our latent disentanglement\ndesign.\n","authors":["Zhihao Zheng","Mooi Choo Chuah"],"pdf_url":"https://arxiv.org/pdf/2408.06245v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.06244v1","updated":"2024-08-12T15:53:24Z","published":"2024-08-12T15:53:24Z","title":"3D Reconstruction of Protein Structures from Multi-view AFM Images using\n Neural Radiance Fields (NeRFs)","summary":" Recent advancements in deep learning for predicting 3D protein structures\nhave shown promise, particularly when leveraging inputs like protein sequences\nand Cryo-Electron microscopy (Cryo-EM) images. However, these techniques often\nfall short when predicting the structures of protein complexes (PCs), which\ninvolve multiple proteins. In our study, we investigate using atomic force\nmicroscopy (AFM) combined with deep learning to predict the 3D structures of\nPCs. AFM generates height maps that depict the PCs in various random\norientations, providing a rich information for training a neural network to\npredict the 3D structures. We then employ the pre-trained UpFusion model (which\nutilizes a conditional diffusion model for synthesizing novel views) to train\nan instance-specific NeRF model for 3D reconstruction. The performance of\nUpFusion is evaluated through zero-shot predictions of 3D protein structures\nusing AFM images. The challenge, however, lies in the time-intensive and\nimpractical nature of collecting actual AFM images. To address this, we use a\nvirtual AFM imaging process that transforms a `PDB' protein file into\nmulti-view 2D virtual AFM images via volume rendering techniques. We\nextensively validate the UpFusion architecture using both virtual and actual\nmulti-view AFM images. Our results include a comparison of structures predicted\nwith varying numbers of views and different sets of views. This novel approach\nholds significant potential for enhancing the accuracy of protein complex\nstructure predictions with further fine-tuning of the UpFusion network.\n","authors":["Jaydeep Rade","Ethan Herron","Soumik Sarkar","Anwesha Sarkar","Adarsh Krishnamurthy"],"pdf_url":"https://arxiv.org/pdf/2408.06244v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.06235v1","updated":"2024-08-12T15:38:51Z","published":"2024-08-12T15:38:51Z","title":"Correlation Weighted Prototype-based Self-Supervised One-Shot\n Segmentation of Medical Images","summary":" Medical image segmentation is one of the domains where sufficient annotated\ndata is not available. This necessitates the application of low-data frameworks\nlike few-shot learning. Contemporary prototype-based frameworks often do not\naccount for the variation in features within the support and query images,\ngiving rise to a large variance in prototype alignment. In this work, we adopt\na prototype-based self-supervised one-way one-shot learning framework using\npseudo-labels generated from superpixels to learn the semantic segmentation\ntask itself. We use a correlation-based probability score to generate a dynamic\nprototype for each query pixel from the bag of prototypes obtained from the\nsupport feature map. This weighting scheme helps to give a higher weightage to\ncontextually related prototypes. 
We also propose a quadrant masking strategy in\nthe downstream segmentation task by utilizing prior domain information to\ndiscard unwanted false positives. We present extensive experimentations and\nevaluations on abdominal CT and MR datasets to show that the proposed simple\nbut potent framework performs at par with the state-of-the-art methods.\n","authors":["Siladittya Manna","Saumik Bhattacharya","Umapada Pal"],"pdf_url":"https://arxiv.org/pdf/2408.06235v1.pdf","comment":"Accepted to ICPR 2024"},{"id":"http://arxiv.org/abs/2312.06573v2","updated":"2024-08-12T14:52:49Z","published":"2023-12-11T17:58:06Z","title":"ControlNet-XS: Rethinking the Control of Text-to-Image Diffusion Models\n as Feedback-Control Systems","summary":" The field of image synthesis has made tremendous strides forward in the last\nyears. Besides defining the desired output image with text-prompts, an\nintuitive approach is to additionally use spatial guidance in form of an image,\nsuch as a depth map. In state-of-the-art approaches, this guidance is realized\nby a separate controlling model that controls a pre-trained image generation\nnetwork, such as a latent diffusion model. Understanding this process from a\ncontrol system perspective shows that it forms a feedback-control system, where\nthe control module receives a feedback signal from the generation process and\nsends a corrective signal back. When analysing existing systems, we observe\nthat the feedback signals are timely sparse and have a small number of bits. As\na consequence, there can be long delays between newly generated features and\nthe respective corrective signals for these features. It is known that this\ndelay is the most unwanted aspect of any control system. In this work, we take\nan existing controlling network (ControlNet) and change the communication\nbetween the controlling network and the generation process to be of\nhigh-frequency and with large-bandwidth. By doing so, we are able to\nconsiderably improve the quality of the generated images, as well as the\nfidelity of the control. Also, the controlling network needs noticeably fewer\nparameters and hence is about twice as fast during inference and training time.\nAnother benefit of small-sized models is that they help to democratise our\nfield and are likely easier to understand. We call our proposed network\nControlNet-XS. When comparing with the state-of-the-art approaches, we\noutperform them for pixel-level guidance, such as depth, canny-edges, and\nsemantic segmentation, and are on a par for loose keypoint-guidance of human\nposes. All code and pre-trained models will be made publicly available.\n","authors":["Denis Zavadski","Johann-Friedrich Feiden","Carsten Rother"],"pdf_url":"https://arxiv.org/pdf/2312.06573v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.14499v2","updated":"2024-08-12T14:50:01Z","published":"2024-07-19T17:50:11Z","title":"Discover-then-Name: Task-Agnostic Concept Bottlenecks via Automated\n Concept Discovery","summary":" Concept Bottleneck Models (CBMs) have recently been proposed to address the\n'black-box' problem of deep neural networks, by first mapping images to a\nhuman-understandable concept space and then linearly combining concepts for\nclassification. Such models typically require first coming up with a set of\nconcepts relevant to the task and then aligning the representations of a\nfeature extractor to map to these concepts. 
However, even with powerful\nfoundational feature extractors like CLIP, there are no guarantees that the\nspecified concepts are detectable. In this work, we leverage recent advances in\nmechanistic interpretability and propose a novel CBM approach -- called\nDiscover-then-Name-CBM (DN-CBM) -- that inverts the typical paradigm: instead\nof pre-selecting concepts based on the downstream classification task, we use\nsparse autoencoders to first discover concepts learnt by the model, and then\nname them and train linear probes for classification. Our concept extraction\nstrategy is efficient, since it is agnostic to the downstream task, and uses\nconcepts already known to the model. We perform a comprehensive evaluation\nacross multiple datasets and CLIP architectures and show that our method yields\nsemantically meaningful concepts, assigns appropriate names to them that make\nthem easy to interpret, and yields performant and interpretable CBMs. Code\navailable at https://github.com/neuroexplicit-saar/discover-then-name.\n","authors":["Sukrut Rao","Sweta Mahajan","Moritz Böhle","Bernt Schiele"],"pdf_url":"https://arxiv.org/pdf/2407.14499v2.pdf","comment":"40 pages, 21 figures, 6 tables, European Conference on Computer\n Vision (ECCV) 2024"},{"id":"http://arxiv.org/abs/2404.05205v2","updated":"2024-08-12T14:42:48Z","published":"2024-04-08T05:18:39Z","title":"A secure and private ensemble matcher using multi-vault obfuscated\n templates","summary":" Generative AI has revolutionized modern machine learning by providing\nunprecedented realism, diversity, and efficiency in data generation. This\ntechnology holds immense potential for biometrics, including for securing\nsensitive and personally identifiable information. Given the irrevocability of\nbiometric samples and mounting privacy concerns, biometric template security\nand secure matching are among the most sought-after features of modern\nbiometric systems. This paper proposes a novel obfuscation method using\nGenerative AI to enhance biometric template security. Our approach utilizes\nsynthetic facial images generated by a Generative Adversarial Network (GAN) as\n\"random chaff points\" within a secure vault system. Our method creates n\nsub-templates from the original template, each obfuscated with m GAN chaff\npoints. During verification, s closest vectors to the biometric query are\nretrieved from each vault and combined to generate hash values, which are then\ncompared with the stored hash value. Thus, our method safeguards user\nidentities during the training and deployment phases by employing the\nGAN-generated synthetic images. Our protocol was tested using the AT&T, GT, and\nLFW face datasets, achieving ROC areas under the curve of 0.99, 0.99, and 0.90,\nrespectively. 
Our results demonstrate that the proposed method can maintain\nhigh accuracy and reasonable computational complexity comparable to that of\nunprotected template methods while significantly enhancing security and\nprivacy, underscoring the potential of Generative AI in developing proactive\ndefensive strategies for biometric systems.\n","authors":["Babak Poorebrahim Gilkalaye","Shubhabrata Mukherjee","Reza Derakhshani"],"pdf_url":"https://arxiv.org/pdf/2404.05205v2.pdf","comment":"This paper has been accepted in IJCB 2024 Special Session, Generative\n AI for Futuristic Biometrics"},{"id":"http://arxiv.org/abs/2408.06190v1","updated":"2024-08-12T14:40:38Z","published":"2024-08-12T14:40:38Z","title":"FruitNeRF: A Unified Neural Radiance Field based Fruit Counting\n Framework","summary":" We introduce FruitNeRF, a unified novel fruit counting framework that\nleverages state-of-the-art view synthesis methods to count any fruit type\ndirectly in 3D. Our framework takes an unordered set of posed images captured\nby a monocular camera and segments fruit in each image. To make our system\nindependent of the fruit type, we employ a foundation model that generates\nbinary segmentation masks for any fruit. Utilizing both modalities, RGB and\nsemantic, we train a semantic neural radiance field. Through uniform volume\nsampling of the implicit Fruit Field, we obtain fruit-only point clouds. By\napplying cascaded clustering on the extracted point cloud, our approach\nachieves a precise fruit count. The use of neural radiance fields provides\nsignificant advantages over conventional methods such as object tracking or\noptical flow, as the counting itself is lifted into 3D. Our method prevents\ndouble counting fruit and avoids counting irrelevant fruit. We evaluate our\nmethodology using both real-world and synthetic datasets. The real-world\ndataset consists of three apple trees with manually counted ground truths, a\nbenchmark apple dataset with one row and ground truth fruit location, while the\nsynthetic dataset comprises various fruit types including apple, plum, lemon,\npear, peach, and mango. Additionally, we assess the performance of fruit\ncounting using the foundation model compared to a U-Net.\n","authors":["Lukas Meyer","Andreas Gilson","Ute Schmidt","Marc Stamminger"],"pdf_url":"https://arxiv.org/pdf/2408.06190v1.pdf","comment":"Project Page: https://meyerls.github.io/fruit_nerf/"},{"id":"http://arxiv.org/abs/2407.17229v3","updated":"2024-08-12T14:28:42Z","published":"2024-07-24T12:32:24Z","title":"LPGen: Enhancing High-Fidelity Landscape Painting Generation through\n Diffusion Model","summary":" Generating landscape paintings expands the possibilities of artistic\ncreativity and imagination. Traditional landscape painting methods involve\nusing ink or colored ink on rice paper, which requires substantial time and\neffort. These methods are susceptible to errors and inconsistencies and lack\nprecise control over lines and colors. This paper presents LPGen, a\nhigh-fidelity, controllable model for landscape painting generation,\nintroducing a novel multi-modal framework that integrates image prompts into\nthe diffusion model. We extract edges and contours by computing Canny edges\nfrom the target landscape image. These, along with natural language text\nprompts and drawing style references, are fed into the latent diffusion model\nas conditions. We implement a decoupled cross-attention strategy to ensure\ncompatibility between image and text prompts, facilitating multi-modal image\ngeneration. 
A decoder generates the final image. Quantitative and qualitative\nanalyses demonstrate that our method outperforms existing approaches in\nlandscape painting generation and exceeds the current state-of-the-art. The\nLPGen network effectively controls the composition and color of landscape\npaintings, generates more accurate images, and supports further research in\ndeep learning-based landscape painting generation.\n","authors":["Wanggong Yang","Xiaona Wang","Yingrui Qiu","Yifei Zhao"],"pdf_url":"https://arxiv.org/pdf/2407.17229v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.06170v1","updated":"2024-08-12T14:16:10Z","published":"2024-08-12T14:16:10Z","title":"Zero-shot 3D Segmentation of Abdominal Organs in CT Scans Using Segment\n Anything Model 2: Adapting Video Tracking Capabilities for 3D Medical Imaging","summary":" Purpose: This study aimed to evaluate the zero-shot performance of Segment\nAnything Model 2 (SAM 2) in 3D segmentation of abdominal organs in CT scans,\nleveraging its video tracking capabilities for volumetric medical imaging.\nMaterials and Methods: Using a subset of the TotalSegmentator CT dataset\n(n=123) from 8 different institutions, we assessed SAM 2's ability to segment 8\nabdominal organs. Segmentation was initiated from three different Z-coordinate\nlevels (caudal, mid, and cranial levels) of each organ. Performance was\nmeasured using the Dice similarity coefficient (DSC). We also analyzed organ\nvolumes to contextualize the results. Results: As a zero-shot approach, larger\norgans with clear boundaries demonstrated high segmentation performance, with\nmean(median) DSCs as follows: liver 0.821(0.898), left kidney 0.870(0.921),\nright kidney 0.862(0.935), and spleen 0.891(0.932). Smaller or less defined\nstructures showed lower performance: gallbladder 0.531(0.590), pancreas\n0.361(0.359), and adrenal glands 0.203-0.308(0.109-0.231). Significant\ndifferences in DSC were observed depending on the starting initial slice of\nsegmentation for different organs. A moderate positive correlation was observed\nbetween volume size and DSCs (Spearman's rs = 0.731, P <.001 at caudal-level).\nDSCs exhibited high variability within organs, ranging from near 0 to almost\n1.0, indicating substantial inconsistency in segmentation performance between\nscans. Conclusion: SAM 2 demonstrated promising zero-shot performance in\nsegmenting certain abdominal organs in CT scans, particularly larger organs\nwith clear boundaries. The model's ability to segment previously unseen targets\nwithout additional training highlights its potential for cross-domain\ngeneralization in medical imaging. However, improvements are needed for smaller\nand less defined structures.\n","authors":["Yosuke Yamagishi","Shouhei Hanaoka","Tomohiro Kikuchi","Takahiro Nakao","Yuta Nakamura","Yukihiro Nomura","Soichiro Miki","Takeharu Yoshikawa","Osamu Abe"],"pdf_url":"https://arxiv.org/pdf/2408.06170v1.pdf","comment":"16 pages, 6 figures (including 1 supplemental figure), 3 tables"},{"id":"http://arxiv.org/abs/2408.06167v1","updated":"2024-08-12T14:13:08Z","published":"2024-08-12T14:13:08Z","title":"Blind-Match: Efficient Homomorphic Encryption-Based 1:N Matching for\n Privacy-Preserving Biometric Identification","summary":" We present Blind-Match, a novel biometric identification system that\nleverages homomorphic encryption (HE) for efficient and privacy-preserving 1:N\nmatching. 
Blind-Match introduces a HE-optimized cosine similarity computation\nmethod, where the key idea is to divide the feature vector into smaller parts\nfor processing rather than computing the entire vector at once. By optimizing\nthe number of these parts, Blind-Match minimizes execution time while ensuring\ndata privacy through HE. Blind-Match achieves superior performance compared to\nstate-of-the-art methods across various biometric datasets. On the LFW face\ndataset, Blind-Match attains a 99.63% Rank-1 accuracy with a 128-dimensional\nfeature vector, demonstrating its robustness in face recognition tasks. For\nfingerprint identification, Blind-Match achieves a remarkable 99.55% Rank-1\naccuracy on the PolyU dataset, even with a compact 16-dimensional feature\nvector, significantly outperforming the state-of-the-art method, Blind-Touch,\nwhich achieves only 59.17%. Furthermore, Blind-Match showcases practical\nefficiency in large-scale biometric identification scenarios, such as Naver\nCloud's FaceSign, by processing 6,144 biometric samples in 0.74 seconds using a\n128-dimensional feature vector.\n","authors":["Hyunmin Choi","Jiwon Kim","Chiyoung Song","Simon S. Woo","Hyoungshick Kim"],"pdf_url":"https://arxiv.org/pdf/2408.06167v1.pdf","comment":"Accepted to CIKM 2024 (Applied Research Track)"},{"id":"http://arxiv.org/abs/2405.10620v2","updated":"2024-08-12T14:07:32Z","published":"2024-05-17T08:33:27Z","title":"MC-GPT: Empowering Vision-and-Language Navigation with Memory Map and\n Reasoning Chains","summary":" In the Vision-and-Language Navigation (VLN) task, the agent is required to\nnavigate to a destination following a natural language instruction. While\nlearning-based approaches have been a major solution to the task, they suffer\nfrom high training costs and lack of interpretability. Recently, Large Language\nModels (LLMs) have emerged as a promising tool for VLN due to their strong\ngeneralization capabilities. However, existing LLM-based methods face\nlimitations in memory construction and diversity of navigation strategies. To\naddress these challenges, we propose a suite of techniques. Firstly, we\nintroduce a method to maintain a topological map that stores navigation\nhistory, retaining information about viewpoints, objects, and their spatial\nrelationships. This map also serves as a global action space. Additionally, we\npresent a Navigation Chain of Thoughts module, leveraging human navigation\nexamples to enrich navigation strategy diversity. Finally, we establish a\npipeline that integrates navigational memory and strategies with perception and\naction prediction modules. Experimental results on the REVERIE and R2R datasets\nshow that our method effectively enhances the navigation ability of the LLM and\nimproves the interpretability of navigation reasoning.\n","authors":["Zhaohuan Zhan","Lisha Yu","Sijie Yu","Guang Tan"],"pdf_url":"https://arxiv.org/pdf/2405.10620v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.06163v1","updated":"2024-08-12T14:03:17Z","published":"2024-08-12T14:03:17Z","title":"ACCELERATION: Sequentially-scanning DECT Imaging Using High Temporal\n Resolution Image Reconstruction And Temporal Extrapolation","summary":" Dual-energy computed tomography (DECT) has been widely used to obtain\nquantitative elemental composition of imaged subjects for personalized and\nprecise medical diagnosis. 
Compared with existing high-end DECT leveraging\nadvanced X-ray source and/or detector technologies, the use of the\nsequentially-scanning data acquisition scheme to implement DECT may make\nbroader impact on clinical practice because this scheme requires no specialized\nhardware designs. However, since the concentration of iodinated contrast agent\nin the imaged subject varies over time, sequentially-scanned data sets acquired\nat two tube potentials are temporally inconsistent. As existing material\ndecomposition approaches for DECT assume that the data sets acquired at two\ntube potentials are temporally consistent, the violation of this assumption\nresults in inaccurate quantification accuracy of iodine concentration. In this\nwork, we developed a technique to achieve sequentially-scanning DECT imaging\nusing high temporal resolution image reconstruction and temporal extrapolation,\nACCELERATION in short, to address the technical challenge induced by temporal\ninconsistency of sequentially-scanned data sets and improve iodine\nquantification accuracy in sequentially-scanning DECT. ACCELERATION has been\nvalidated and evaluated using numerical simulation data sets generated from\nclinical human subject exams. Results demonstrated the improvement of iodine\nquantification accuracy using ACCELERATION.\n","authors":["Qiaoxin Li","Dong Liang","Yinsheng Li"],"pdf_url":"https://arxiv.org/pdf/2408.06163v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.06158v1","updated":"2024-08-12T13:55:46Z","published":"2024-08-12T13:55:46Z","title":"OmniCLIP: Adapting CLIP for Video Recognition with Spatial-Temporal\n Omni-Scale Feature Learning","summary":" Recent Vision-Language Models (VLMs) \\textit{e.g.} CLIP have made great\nprogress in video recognition. Despite the improvement brought by the strong\nvisual backbone in extracting spatial features, CLIP still falls short in\ncapturing and integrating spatial-temporal features which is essential for\nvideo recognition. In this paper, we propose OmniCLIP, a framework that adapts\nCLIP for video recognition by focusing on learning comprehensive features\nencompassing spatial, temporal, and dynamic spatial-temporal scales, which we\nrefer to as omni-scale features. This is achieved through the design of\nspatial-temporal blocks that include parallel temporal adapters (PTA), enabling\nefficient temporal modeling. Additionally, we introduce a self-prompt generator\n(SPG) module to capture dynamic object spatial features. The synergy between\nPTA and SPG allows OmniCLIP to discern varying spatial information across\nframes and assess object scales over time. We have conducted extensive\nexperiments in supervised video recognition, few-shot video recognition, and\nzero-shot recognition tasks. The results demonstrate the effectiveness of our\nmethod, especially with OmniCLIP achieving a top-1 accuracy of 74.30\\% on\nHMDB51 in a 16-shot setting, surpassing the recent MotionPrompt approach even\nwith full training data. 
The code is available at\n\url{https://github.com/XiaoBuL/OmniCLIP}.\n","authors":["Mushui Liu","Bozheng Li","Yunlong Yu"],"pdf_url":"https://arxiv.org/pdf/2408.06158v1.pdf","comment":"ECAI-2024"},{"id":"http://arxiv.org/abs/2408.06157v1","updated":"2024-08-12T13:53:40Z","published":"2024-08-12T13:53:40Z","title":"Novel View Synthesis from a Single Image with Pretrained Diffusion\n Guidance","summary":" Recent 3D novel view synthesis (NVS) methods are limited to\nsingle-object-centric scenes generated from new viewpoints and struggle with\ncomplex environments. They often require extensive 3D data for training,\nlacking generalization beyond the training distribution. Conversely, 3D-free\nmethods can generate text-controlled views of complex, in-the-wild scenes using\na pretrained stable diffusion model without tedious fine-tuning, but lack\ncamera control. In this paper, we introduce HawkI++, a method capable of\ngenerating camera-controlled viewpoints from a single input image. HawkI++\nexcels in handling complex and diverse scenes without additional 3D data or\nextensive training. It leverages widely available pretrained NVS models for\nweak guidance, integrating this knowledge into a 3D-free view synthesis\napproach to achieve the desired results efficiently. Our experimental results\ndemonstrate that HawkI++ outperforms existing models in both qualitative and\nquantitative evaluations, providing high-fidelity and consistent novel view\nsynthesis at desired camera angles across a wide variety of scenes.\n","authors":["Taewon Kang","Divya Kothandaraman","Dinesh Manocha","Ming C. Lin"],"pdf_url":"https://arxiv.org/pdf/2408.06157v1.pdf","comment":"6 pages, 7 figures"},{"id":"http://arxiv.org/abs/2312.03154v2","updated":"2024-08-12T13:53:17Z","published":"2023-12-05T21:41:17Z","title":"ViscoNet: Bridging and Harmonizing Visual and Textual Conditioning for\n ControlNet","summary":" This paper introduces ViscoNet, a novel one-branch-adapter architecture for\nconcurrent spatial and visual conditioning. Our lightweight model requires\norders of magnitude fewer trainable parameters and a far smaller dataset than\nthe current state-of-the-art IP-Adapter. However, our method successfully\npreserves the generative power of the frozen text-to-image (T2I) backbone.\nNotably, it excels in addressing mode collapse, a pervasive issue previously\noverlooked. Our novel architecture demonstrates outstanding capabilities in\nachieving a harmonious visual-text balance, unlocking unparalleled versatility\nin various human image generation tasks, including pose re-targeting, virtual\ntry-on, stylization, person re-identification, and textile transfer. Demo and\ncode are available from the project page https://soon-yau.github.io/visconet/ .\n","authors":["Soon Yau Cheong","Armin Mustafa","Andrew Gilbert"],"pdf_url":"https://arxiv.org/pdf/2312.03154v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.06152v1","updated":"2024-08-12T13:48:06Z","published":"2024-08-12T13:48:06Z","title":"Palantir: Towards Efficient Super Resolution for Ultra-high-definition\n Live Streaming","summary":" Neural enhancement through super-resolution deep neural networks opens up new\npossibilities for ultra-high-definition live streaming over existing encoding\nand networking infrastructure. Yet, the heavy SR DNN inference overhead leads\nto severe deployment challenges. 
To reduce the overhead, existing systems\npropose to apply DNN-based SR only on selected anchor frames while upscaling\nnon-anchor frames via the lightweight reusing-based SR approach. However,\nframe-level scheduling is coarse-grained and fails to deliver optimal\nefficiency. In this work, we propose Palantir, the first neural-enhanced UHD\nlive streaming system with fine-grained patch-level scheduling. In the\npresented solutions, two novel techniques are incorporated to make good\nscheduling decisions for inference overhead optimization and reduce the\nscheduling latency. Firstly, under the guidance of our pioneering and\ntheoretical analysis, Palantir constructs a directed acyclic graph (DAG) for\nlightweight yet accurate quality estimation under any possible anchor patch\nset. Secondly, to further optimize the scheduling latency, Palantir improves\nparallelizability by refactoring the computation subprocedure of the estimation\nprocess into a sparse matrix-matrix multiplication operation. The evaluation\nresults suggest that Palantir incurs a negligible scheduling latency accounting\nfor less than 5.7% of the end-to-end latency requirement. When compared to the\nstate-of-the-art real-time frame-level scheduling strategy, Palantir reduces\nthe energy overhead of SR-integrated mobile clients by 38.1% at most (and 22.4%\non average) and the monetary costs of cloud-based SR by 80.1% at most (and\n38.4% on average).\n","authors":["Xinqi Jin","Zhui Zhu","Xikai Sun","Fan Dang","Jiangchuan Liu","Jingao Xu","Kebin Liu","Xinlei Chen","Yunhao Liu"],"pdf_url":"https://arxiv.org/pdf/2408.06152v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.06145v1","updated":"2024-08-12T13:41:47Z","published":"2024-08-12T13:41:47Z","title":"Efficient and Scalable Point Cloud Generation with Sparse Point-Voxel\n Diffusion Models","summary":" We propose a novel point cloud U-Net diffusion architecture for 3D generative\nmodeling capable of generating high-quality and diverse 3D shapes while\nmaintaining fast generation times. Our network employs a dual-branch\narchitecture, combining the high-resolution representations of points with the\ncomputational efficiency of sparse voxels. Our fastest variant outperforms all\nnon-diffusion generative approaches on unconditional shape generation, the most\npopular benchmark for evaluating point cloud generative models, while our\nlargest model achieves state-of-the-art results among diffusion methods, with a\nruntime approximately 70% of the previously state-of-the-art PVD. Beyond\nunconditional generation, we perform extensive evaluations, including\nconditional generation on all categories of ShapeNet, demonstrating the\nscalability of our model to larger datasets, and implicit generation which\nallows our network to produce high quality point clouds on fewer timesteps,\nfurther decreasing the generation time. Finally, we evaluate the architecture's\nperformance in point cloud completion and super-resolution. Our model excels in\nall tasks, establishing it as a state-of-the-art diffusion U-Net for point\ncloud generative modeling. 
The code is publicly available at\nhttps://github.com/JohnRomanelis/SPVD.git.\n","authors":["Ioannis Romanelis","Vlassios Fotis","Athanasios Kalogeras","Christos Alexakos","Konstantinos Moustakas","Adrian Munteanu"],"pdf_url":"https://arxiv.org/pdf/2408.06145v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.06137v1","updated":"2024-08-12T13:27:11Z","published":"2024-08-12T13:27:11Z","title":"MR3D-Net: Dynamic Multi-Resolution 3D Sparse Voxel Grid Fusion for\n LiDAR-Based Collective Perception","summary":" The safe operation of automated vehicles depends on their ability to perceive\nthe environment comprehensively. However, occlusion, sensor range, and\nenvironmental factors limit their perception capabilities. To overcome these\nlimitations, collective perception enables vehicles to exchange information.\nHowever, fusing this exchanged information is a challenging task. Early fusion\napproaches require large amounts of bandwidth, while intermediate fusion\napproaches face interchangeability issues. Late fusion of shared detections is\ncurrently the only feasible approach. However, it often results in inferior\nperformance due to information loss. To address this issue, we propose\nMR3D-Net, a dynamic multi-resolution 3D sparse voxel grid fusion backbone\narchitecture for LiDAR-based collective perception. We show that sparse voxel\ngrids at varying resolutions provide a meaningful and compact environment\nrepresentation that can adapt to the communication bandwidth. MR3D-Net achieves\nstate-of-the-art performance on the OPV2V 3D object detection benchmark while\nreducing the required bandwidth by up to 94% compared to early fusion. Code is\navailable at https://github.com/ekut-es/MR3D-Net\n","authors":["Sven Teufel","Jörg Gamerdinger","Georg Volk","Oliver Bringmann"],"pdf_url":"https://arxiv.org/pdf/2408.06137v1.pdf","comment":"Accepted at IEEE ITSC 2024"},{"id":"http://arxiv.org/abs/2406.16633v2","updated":"2024-08-12T13:09:42Z","published":"2024-06-24T13:30:55Z","title":"MLAAN: Scaling Supervised Local Learning with Multilaminar Leap\n Augmented Auxiliary Network","summary":" Deep neural networks (DNNs) typically employ an end-to-end (E2E) training\nparadigm which presents several challenges, including high GPU memory\nconsumption, inefficiency, and difficulties in model parallelization during\ntraining. Recent research has sought to address these issues, with one\npromising approach being local learning. This method involves partitioning the\nbackbone network into gradient-isolated modules and manually designing\nauxiliary networks to train these local modules. Existing methods often neglect\nthe interaction of information between local modules, leading to myopic issues\nand a performance gap compared to E2E training. To address these limitations,\nwe propose the Multilaminar Leap Augmented Auxiliary Network (MLAAN).\nSpecifically, MLAAN comprises Multilaminar Local Modules (MLM) and Leap\nAugmented Modules (LAM). MLM captures both local and global features through\nindependent and cascaded auxiliary networks, alleviating performance issues\ncaused by insufficient global features. However, overly simplistic auxiliary\nnetworks can impede MLM's ability to capture global information. To address\nthis, we further design LAM, an enhanced auxiliary network that uses the\nExponential Moving Average (EMA) method to facilitate information exchange\nbetween local modules, thereby mitigating the shortsightedness resulting from\ninadequate interaction. 
The synergy between MLM and LAM has demonstrated\nexcellent performance. Our experiments on the CIFAR-10, STL-10, SVHN, and\nImageNet datasets show that MLAAN can be seamlessly integrated into existing\nlocal learning frameworks, significantly enhancing their performance and even\nsurpassing end-to-end (E2E) training methods, while also reducing GPU memory\nconsumption.\n","authors":["Yuming Zhang","Shouxin Zhang","Peizhe Wang","Feiyu Zhu","Dongzhi Guan","Junhao Su","Jiabin Liu","Changpeng Cai"],"pdf_url":"https://arxiv.org/pdf/2406.16633v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.06123v1","updated":"2024-08-12T13:05:43Z","published":"2024-08-12T13:05:43Z","title":"DPDETR: Decoupled Position Detection Transformer for Infrared-Visible\n Object Detection","summary":" Infrared-visible object detection aims to achieve robust object detection by\nleveraging the complementary information of infrared and visible image pairs.\nHowever, the commonly existing modality misalignment problem presents two\nchallenges: fusing misaligned complementary features is difficult, and\ncurrent methods cannot accurately locate objects in both modalities under\nmisalignment conditions. In this paper, we propose a Decoupled Position\nDetection Transformer (DPDETR) to address these problems. Specifically, we\nexplicitly formulate the object category, visible modality position, and\ninfrared modality position to enable the network to learn the intrinsic\nrelationships and output accurate positions of objects in both modalities. To\nfuse misaligned object features accurately, we propose a Decoupled Position\nMultispectral Cross-attention module that adaptively samples and aggregates\nmultispectral complementary features with the constraint of infrared and\nvisible reference positions. Additionally, we design a query-decoupled\nMultispectral Decoder structure to address the optimization gap among the three\nkinds of object information in our task and propose a Decoupled Position\nContrastive DeNoising Training strategy to enhance the DPDETR's ability to learn\ndecoupled positions. Experiments on DroneVehicle and KAIST datasets demonstrate\nsignificant improvements compared to other state-of-the-art methods. The code\nwill be released at https://github.com/gjj45/DPDETR.\n","authors":["Junjie Guo","Chenqiang Gao","Fangcen Liu","Deyu Meng"],"pdf_url":"https://arxiv.org/pdf/2408.06123v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.06110v1","updated":"2024-08-12T12:47:37Z","published":"2024-08-12T12:47:37Z","title":"RISurConv: Rotation Invariant Surface Attention-Augmented Convolutions\n for 3D Point Cloud Classification and Segmentation","summary":" Despite the progress on 3D point cloud deep learning, most prior works focus\non learning features that are invariant to translation and point permutation,\nand very limited effort has been devoted to rotation invariance.\nSeveral recent studies achieve rotation invariance at the cost of lower\naccuracy. In this work, we close this gap by proposing a novel yet effective\nrotation invariant architecture for 3D point cloud classification and\nsegmentation. 
Instead of traditional pointwise operations, we construct local\ntriangle surfaces to capture more detailed surface structure, based on which we\ncan extract highly expressive rotation invariant surface properties which are\nthen integrated into an attention-augmented convolution operator named\nRISurConv to generate refined attention features via self-attention layers.\nBased on RISurConv we build an effective neural network for 3D point cloud\nanalysis that is invariant to arbitrary rotations while maintaining high\naccuracy. We verify the performance on various benchmarks, obtaining superior\nresults that surpass the previous state of the art by a large margin. We achieve\nan overall accuracy of 96.0% (+4.7%) on ModelNet40, 93.1% (+12.8%) on\nScanObjectNN, and class accuracies of 91.5% (+3.6%), 82.7% (+5.1%), and 78.5%\n(+9.2%) on the three categories of the FG3D dataset for the fine-grained\nclassification task. Additionally, we achieve 81.5% (+1.0%) mIoU on ShapeNet\nfor the segmentation task. Code is available here:\nhttps://github.com/cszyzhang/RISurConv\n","authors":["Zhiyuan Zhang","Licheng Yang","Zhiyu Xiang"],"pdf_url":"https://arxiv.org/pdf/2408.06110v1.pdf","comment":"ECCV 2024 (oral)"},{"id":"http://arxiv.org/abs/2407.05623v4","updated":"2024-08-12T12:47:11Z","published":"2024-07-08T05:31:51Z","title":"Momentum Auxiliary Network for Supervised Local Learning","summary":" Deep neural networks conventionally employ end-to-end backpropagation for\ntheir training process, which lacks biological credibility and triggers a\nlocking dilemma during network parameter updates, leading to significant GPU\nmemory use. Supervised local learning segments the network into multiple\nlocal blocks updated by independent auxiliary networks. However, these methods\ncannot replace end-to-end training due to lower accuracy, as gradients only\npropagate within their local block, creating a lack of information exchange\nbetween blocks. To address this issue and establish information transfer across\nblocks, we propose a Momentum Auxiliary Network (MAN) that establishes a\ndynamic interaction mechanism. The MAN leverages an exponential moving average\n(EMA) of the parameters from adjacent local blocks to enhance information flow.\nThis auxiliary network, updated through EMA, helps bridge the informational gap\nbetween blocks. Nevertheless, we observe that directly applying EMA parameters\nhas certain limitations due to feature discrepancies among local blocks. To\novercome this, we introduce learnable biases, further boosting performance. We\nhave validated our method on four image classification datasets (CIFAR-10,\nSTL-10, SVHN, ImageNet), attaining superior performance and substantial memory\nsavings. Notably, our method can reduce GPU memory usage by more than 45\\% on\nthe ImageNet dataset compared to end-to-end training, while achieving higher\nperformance. The Momentum Auxiliary Network thus offers a new perspective for\nsupervised local learning. 
Our code is available at:\nhttps://github.com/JunhaoSu0/MAN.\n","authors":["Junhao Su","Changpeng Cai","Feiyu Zhu","Chenghao He","Xiaojie Xu","Dongzhi Guan","Chenyang Si"],"pdf_url":"https://arxiv.org/pdf/2407.05623v4.pdf","comment":"Accepted by ECCV2024(Oral)"},{"id":"http://arxiv.org/abs/2408.06083v1","updated":"2024-08-12T11:58:45Z","published":"2024-08-12T11:58:45Z","title":"Towards Robust Monocular Depth Estimation in Non-Lambertian Surfaces","summary":" In the field of monocular depth estimation (MDE), many models with excellent\nzero-shot performance in general scenes emerge recently. However, these methods\noften fail in predicting non-Lambertian surfaces, such as transparent or mirror\n(ToM) surfaces, due to the unique reflective properties of these regions.\nPrevious methods utilize externally provided ToM masks and aim to obtain\ncorrect depth maps through direct in-painting of RGB images. These methods\nhighly depend on the accuracy of additional input masks, and the use of random\ncolors during in-painting makes them insufficiently robust. We are committed to\nincrementally enabling the baseline model to directly learn the uniqueness of\nnon-Lambertian surface regions for depth estimation through a well-designed\ntraining framework. Therefore, we propose non-Lambertian surface regional\nguidance, which constrains the predictions of MDE model from the gradient\ndomain to enhance its robustness. Noting the significant impact of lighting on\nthis task, we employ the random tone-mapping augmentation during training to\nensure the network can predict correct results for varying lighting inputs.\nAdditionally, we propose an optional novel lighting fusion module, which uses\nVariational Autoencoders to fuse multiple images and obtain the most\nadvantageous input RGB image for depth estimation when multi-exposure images\nare available. Our method achieves accuracy improvements of 33.39% and 5.21% in\nzero-shot testing on the Booster and Mirror3D dataset for non-Lambertian\nsurfaces, respectively, compared to the Depth Anything V2. The state-of-the-art\nperformance of 90.75 in delta1.05 within the ToM regions on the TRICKY2024\ncompetition test set demonstrates the effectiveness of our approach.\n","authors":["Junrui Zhang","Jiaqi Li","Yachuan Huang","Yiran Wang","Jinghong Zheng","Liao Shen","Zhiguo Cao"],"pdf_url":"https://arxiv.org/pdf/2408.06083v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.11962v2","updated":"2024-08-12T11:56:30Z","published":"2023-07-22T02:38:03Z","title":"MIMONet: Multi-Input Multi-Output On-Device Deep Learning","summary":" Future intelligent robots are expected to process multiple inputs\nsimultaneously (such as image and audio data) and generate multiple outputs\naccordingly (such as gender and emotion), similar to humans. Recent research\nhas shown that multi-input single-output (MISO) deep neural networks (DNN)\noutperform traditional single-input single-output (SISO) models, representing a\nsignificant step towards this goal. In this paper, we propose MIMONet, a novel\non-device multi-input multi-output (MIMO) DNN framework that achieves high\naccuracy and on-device efficiency in terms of critical performance metrics such\nas latency, energy, and memory usage. Leveraging existing SISO model\ncompression techniques, MIMONet develops a new deep-compression method that is\nspecifically tailored to MIMO models. This new method explores unique yet\nnon-trivial properties of the MIMO model, resulting in boosted accuracy and\non-device efficiency. 
Extensive experiments on three embedded platforms\ncommonly used in robotic systems, as well as a case study using the TurtleBot3\nrobot, demonstrate that MIMONet achieves higher accuracy and superior on-device\nefficiency compared to state-of-the-art SISO and MISO models, as well as a\nbaseline MIMO model we constructed. Our evaluation highlights the real-world\napplicability of MIMONet and its potential to significantly enhance the\nperformance of intelligent robotic systems.\n","authors":["Zexin Li","Xiaoxi He","Yufei Li","Wei Yang","Lothar Thiele","Cong Liu"],"pdf_url":"https://arxiv.org/pdf/2307.11962v2.pdf","comment":"Submitted to ICRA 2025"},{"id":"http://arxiv.org/abs/2408.06079v1","updated":"2024-08-12T11:56:06Z","published":"2024-08-12T11:56:06Z","title":"Towards Adversarial Robustness via Debiased High-Confidence Logit\n Alignment","summary":" Despite the significant advances that deep neural networks (DNNs) have\nachieved in various visual tasks, they still exhibit vulnerability to\nadversarial examples, leading to serious security concerns. Recent adversarial\ntraining techniques have utilized inverse adversarial attacks to generate\nhigh-confidence examples, aiming to align the distributions of adversarial\nexamples with the high-confidence regions of their corresponding classes.\nHowever, in this paper, our investigation reveals that high-confidence outputs\nunder inverse adversarial attacks are correlated with biased feature\nactivation. Specifically, training with inverse adversarial examples causes the\nmodel's attention to shift towards background features, introducing a spurious\ncorrelation bias. To address this bias, we propose Debiased High-Confidence\nAdversarial Training (DHAT), a novel approach that not only aligns the logits\nof adversarial examples with debiased high-confidence logits obtained from\ninverse adversarial examples, but also restores the model's attention to its\nnormal state by enhancing foreground logit orthogonality. Extensive experiments\ndemonstrate that DHAT achieves state-of-the-art performance and exhibits robust\ngeneralization capabilities across various vision datasets. Additionally, DHAT\ncan seamlessly integrate with existing advanced adversarial training techniques\nfor improving the performance.\n","authors":["Kejia Zhang","Juanjuan Weng","Zhiming Luo","Shaozi Li"],"pdf_url":"https://arxiv.org/pdf/2408.06079v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.11474v3","updated":"2024-08-12T11:55:21Z","published":"2024-04-17T15:28:53Z","title":"Towards Highly Realistic Artistic Style Transfer via Stable Diffusion\n with Step-aware and Layer-aware Prompt","summary":" Artistic style transfer aims to transfer the learned artistic style onto an\narbitrary content image, generating artistic stylized images. Existing\ngenerative adversarial network-based methods fail to generate highly realistic\nstylized images and always introduce obvious artifacts and disharmonious\npatterns. Recently, large-scale pre-trained diffusion models opened up a new\nway for generating highly realistic artistic stylized images. However,\ndiffusion model-based methods generally fail to preserve the content structure\nof input content images well, introducing some undesired content structure and\nstyle patterns. 
To address the above problems, we propose a novel pre-trained\ndiffusion-based artistic style transfer method, called LSAST, which can\ngenerate highly realistic artistic stylized images while preserving the content\nstructure of input content images well, without bringing obvious artifacts and\ndisharmonious style patterns. Specifically, we introduce a Step-aware and\nLayer-aware Prompt Space, a set of learnable prompts, which can learn the style\ninformation from the collection of artworks and dynamically adjust the input\nimages' content structure and style pattern. To train our prompt space, we\npropose a novel inversion method, called Step-aware and Layer-aware Prompt\nInversion, which allows the prompt space to learn the style information of the\nartworks collection. In addition, we inject a pre-trained conditional branch of\nControlNet into our LSAST, which further improves our framework's ability to\nmaintain content structure. Extensive experiments demonstrate that our proposed\nmethod can generate more highly realistic artistic stylized images than the\nstate-of-the-art artistic style transfer methods.\n","authors":["Zhanjie Zhang","Quanwei Zhang","Huaizhong Lin","Wei Xing","Juncheng Mo","Shuaicheng Huang","Jinheng Xie","Guangyuan Li","Junsheng Luan","Lei Zhao","Dalong Zhang","Lixia Chen"],"pdf_url":"https://arxiv.org/pdf/2404.11474v3.pdf","comment":"Accepted by IJCAI2024"},{"id":"http://arxiv.org/abs/2408.06075v1","updated":"2024-08-12T11:48:57Z","published":"2024-08-12T11:48:57Z","title":"Five Pitfalls When Assessing Synthetic Medical Images with Reference\n Metrics","summary":" Reference metrics have been developed to objectively and quantitatively\ncompare two images. Especially for evaluating the quality of reconstructed or\ncompressed images, these metrics have proven very useful. Extensive tests of\nsuch metrics on benchmarks of artificially distorted natural images have\nrevealed which metrics best correlate with human perception of quality. Direct\ntransfer of these metrics to the evaluation of generative models in medical\nimaging, however, can easily lead to pitfalls, because assumptions about image\ncontent, image data format and image interpretation are often very different.\nAlso, the correlation of reference metrics and human perception of quality can\nvary strongly for different kinds of distortions, and commonly used metrics,\nsuch as SSIM, PSNR and MAE, are not the best choice for all situations. We\nselected five pitfalls that showcase unexpected and probably undesired\nreference metric scores and discuss strategies to avoid them.\n","authors":["Melanie Dohmen","Tuan Truong","Ivo M. Baltruschat","Matthias Lenga"],"pdf_url":"https://arxiv.org/pdf/2408.06075v1.pdf","comment":"10 pages, 5 figures, accepted at Deep Generative Models workshop @\n MICCAI 2024"},{"id":"http://arxiv.org/abs/2408.06072v1","updated":"2024-08-12T11:47:11Z","published":"2024-08-12T11:47:11Z","title":"CogVideoX: Text-to-Video Diffusion Models with An Expert Transformer","summary":" We introduce CogVideoX, a large-scale diffusion transformer model designed\nfor generating videos based on text prompts. To efficiently model video data, we\npropose to leverage a 3D Variational Autoencoder (VAE) to compress videos along\nboth spatial and temporal dimensions. To improve the text-video alignment, we\npropose an expert transformer with the expert adaptive LayerNorm to facilitate\nthe deep fusion between the two modalities. 
By employing a progressive training\ntechnique, CogVideoX is adept at producing coherent, long-duration videos\ncharacterized by significant motions. In addition, we develop an effective\ntext-video data processing pipeline that includes various data preprocessing\nstrategies and a video captioning method. It significantly helps enhance the\nperformance of CogVideoX, improving both generation quality and semantic\nalignment. Results show that CogVideoX demonstrates state-of-the-art\nperformance across both multiple machine metrics and human evaluations. The\nmodel weights of both the 3D Causal VAE and CogVideoX are publicly available at\nhttps://github.com/THUDM/CogVideo.\n","authors":["Zhuoyi Yang","Jiayan Teng","Wendi Zheng","Ming Ding","Shiyu Huang","Jiazheng Xu","Yuanming Yang","Wenyi Hong","Xiaohan Zhang","Guanyu Feng","Da Yin","Xiaotao Gu","Yuxuan Zhang","Weihan Wang","Yean Cheng","Ting Liu","Bin Xu","Yuxiao Dong","Jie Tang"],"pdf_url":"https://arxiv.org/pdf/2408.06072v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.06071v1","updated":"2024-08-12T11:44:47Z","published":"2024-08-12T11:44:47Z","title":"A-BDD: Leveraging Data Augmentations for Safe Autonomous Driving in\n Adverse Weather and Lighting","summary":" High-autonomy vehicle functions rely on machine learning (ML) algorithms to\nunderstand the environment. Despite displaying remarkable performance in fair\nweather scenarios, perception algorithms are heavily affected by adverse\nweather and lighting conditions. To overcome these difficulties, ML engineers\nmainly rely on comprehensive real-world datasets. However, the difficulties in\nreal-world data collection for critical areas of the operational design domain\n(ODD) often means synthetic data is required for perception training and safety\nvalidation. Thus, we present A-BDD, a large set of over 60,000 synthetically\naugmented images based on BDD100K that are equipped with semantic segmentation\nand bounding box annotations (inherited from the BDD100K dataset). The dataset\ncontains augmented data for rain, fog, overcast and sunglare/shadow with\nvarying intensity levels. We further introduce novel strategies utilizing\nfeature-based image quality metrics like FID and CMMD, which help identify\nuseful augmented and real-world data for ML training and testing. By conducting\nexperiments on A-BDD, we provide evidence that data augmentations can play a\npivotal role in closing performance gaps in adverse weather and lighting\nconditions.\n","authors":["Felix Assion","Florens Gressner","Nitin Augustine","Jona Klemenc","Ahmed Hammam","Alexandre Krattinger","Holger Trittenbach","Sascha Riemer"],"pdf_url":"https://arxiv.org/pdf/2408.06071v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.06070v1","updated":"2024-08-12T11:41:18Z","published":"2024-08-12T11:41:18Z","title":"ControlNeXt: Powerful and Efficient Control for Image and Video\n Generation","summary":" Diffusion models have demonstrated remarkable and robust abilities in both\nimage and video generation. To achieve greater control over generated results,\nresearchers introduce additional architectures, such as ControlNet, Adapters\nand ReferenceNet, to integrate conditioning controls. However, current\ncontrollable generation methods often require substantial additional\ncomputational resources, especially for video generation, and face challenges\nin training or exhibit weak control. In this paper, we propose ControlNeXt: a\npowerful and efficient method for controllable image and video generation. 
We\nfirst design a more straightforward and efficient architecture, replacing heavy\nadditional branches with minimal additional cost compared to the base model.\nSuch a concise structure also allows our method to seamlessly integrate with\nother LoRA weights, enabling style alteration without the need for additional\ntraining. As for training, we reduce up to 90% of learnable parameters compared\nto the alternatives. Furthermore, we propose another method called Cross\nNormalization (CN) as a replacement for Zero-Convolution' to achieve fast and\nstable training convergence. We have conducted various experiments with\ndifferent base models across images and videos, demonstrating the robustness of\nour method.\n","authors":["Bohao Peng","Jian Wang","Yuechen Zhang","Wenbo Li","Ming-Chang Yang","Jiaya Jia"],"pdf_url":"https://arxiv.org/pdf/2408.06070v1.pdf","comment":"controllable generation"},{"id":"http://arxiv.org/abs/2408.06054v1","updated":"2024-08-12T11:00:04Z","published":"2024-08-12T11:00:04Z","title":"Parallel transport on matrix manifolds and Exponential Action","summary":" We express parallel transport for several common matrix Lie groups with a\nfamily of pseudo-Riemannian metrics in terms of matrix exponential and\nexponential actions. The expression for parallel transport is preserved by\ntaking the quotient under certain scenarios. In particular, for a Stiefel\nmanifold of orthogonal matrices of size $n\\times d$, we give an expression for\nparallel transport along a geodesic from time zero to $t$, that could be\ncomputed with time complexity of $O(nd^2)$ for small $t$, and of $O(td^3)$ for\nlarge t, contributing a step in a long-standing open problem in matrix\nmanifolds. A similar result holds for flag manifolds with the canonical metric.\nWe also show the parallel transport formulas for the generalized linear group,\nand the special orthogonal group under these metrics.\n","authors":["Du Nguyen","Stefan Sommer"],"pdf_url":"https://arxiv.org/pdf/2408.06054v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.15289v2","updated":"2024-08-12T10:53:03Z","published":"2024-05-24T07:22:35Z","title":"Learning Invariant Causal Mechanism from Vision-Language Models","summary":" Large-scale pre-trained vision-language models such as CLIP have been widely\napplied to a variety of downstream scenarios. In real-world applications, the\nCLIP model is often utilized in more diverse scenarios than those encountered\nduring its training, a challenge known as the out-of-distribution (OOD)\nproblem. However, our experiments reveal that CLIP performs unsatisfactorily in\ncertain domains. Through a causal analysis, we find that CLIP's current\nprediction process cannot guarantee a low OOD risk. The lowest OOD risk can be\nachieved when the prediction process is based on invariant causal mechanisms,\ni.e., predicting solely based on invariant latent factors. However, theoretical\nanalysis indicates that CLIP does not identify these invariant latent factors.\nTherefore, we propose the Invariant Causal Mechanism for CLIP (CLIP-ICM), a\nframework that first identifies invariant latent factors using interventional\ndata and then performs invariant predictions across various domains. 
Our method\nis simple yet effective, without significant computational overhead.\nExperimental results demonstrate that CLIP-ICM significantly improves CLIP's\nperformance in OOD scenarios.\n","authors":["Zeen Song","Siyu Zhao","Xingyu Zhang","Jiangmeng Li","Changwen Zheng","Wenwen Qiang"],"pdf_url":"https://arxiv.org/pdf/2405.15289v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.06047v1","updated":"2024-08-12T10:39:59Z","published":"2024-08-12T10:39:59Z","title":"BooW-VTON: Boosting In-the-Wild Virtual Try-On via Mask-Free Pseudo Data\n Training","summary":" Image-based virtual try-on is an increasingly popular and important task to\ngenerate realistic try-on images of a specific person. Existing methods always\nemploy an accurate mask to remove the original garment in the source image,\nthus achieving realistic synthesized images in simple and conventional try-on\nscenarios based on a powerful diffusion model. Therefore, acquiring a suitable\nmask is vital to the try-on performance of these methods. However, obtaining precise\ninpainting masks, especially for complex wild try-on data containing diverse\nforeground occlusions and person poses, is not easy, as Figure 1-Top shows. This\ndifficulty often results in poor performance in more practical and challenging\nreal-life scenarios, such as the selfie scene shown in Figure 1-Bottom. To this\nend, we propose a novel training paradigm combined with an efficient data\naugmentation method to acquire large-scale unpaired training data from wild\nscenarios, thereby significantly facilitating the try-on performance of our\nmodel without the need for additional inpainting masks. Besides, a try-on\nlocalization loss is designed to localize a more accurate try-on area to obtain\nmore reasonable try-on results. It is noted that our method only needs the\nreference cloth image, source pose image and source person image as input,\nwhich is more cost-effective and user-friendly compared to existing methods.\nExtensive qualitative and quantitative experiments have demonstrated superior\nperformance in wild scenarios with such a low-demand input.\n","authors":["Xuanpu Zhang","Dan Song","Pengxin Zhan","Qingguo Chen","Zhao Xu","Weihua Luo","Kaifu Zhang","Anan Liu"],"pdf_url":"https://arxiv.org/pdf/2408.06047v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.05206v3","updated":"2024-08-12T10:23:27Z","published":"2024-07-06T23:16:41Z","title":"Helios: An extremely low power event-based gesture recognition for\n always-on smart eyewear","summary":" This paper introduces Helios, the first extremely low-power, real-time,\nevent-based hand gesture recognition system designed for all-day use on smart\neyewear. As augmented reality (AR) evolves, current smart glasses like the Meta\nRay-Bans prioritize visual and wearable comfort at the expense of\nfunctionality. Existing human-machine interfaces (HMIs) in these devices, such\nas capacitive touch and voice controls, present limitations in ergonomics,\nprivacy and power consumption. 
Helios addresses these challenges by leveraging\nnatural hand interactions for a more intuitive and comfortable user experience.\nOur system utilizes a extremely low-power and compact 3mmx4mm/20mW event camera\nto perform natural hand-based gesture recognition for always-on smart eyewear.\nThe camera's output is processed by a convolutional neural network (CNN)\nrunning on a NXP Nano UltraLite compute platform, consuming less than 350mW.\nHelios can recognize seven classes of gestures, including subtle microgestures\nlike swipes and pinches, with 91% accuracy. We also demonstrate real-time\nperformance across 20 users at a remarkably low latency of 60ms. Our user\ntesting results align with the positive feedback we received during our recent\nsuccessful demo at AWE-USA-2024.\n","authors":["Prarthana Bhattacharyya","Joshua Mitton","Ryan Page","Owen Morgan","Ben Menzies","Gabriel Homewood","Kemi Jacobs","Paolo Baesso","Dave Trickett","Chris Mair","Taru Muhonen","Rory Clark","Louis Berridge","Richard Vigars","Iain Wallace"],"pdf_url":"https://arxiv.org/pdf/2407.05206v3.pdf","comment":"Accepted at ECCV-Integrating Computer Vision in Smart Eyewear, 2024.\n 18 pages, 10 figures. First three authors contributed equally to this paper"},{"id":"http://arxiv.org/abs/2408.00343v2","updated":"2024-08-12T10:19:08Z","published":"2024-08-01T07:27:54Z","title":"IN-Sight: Interactive Navigation through Sight","summary":" Current visual navigation systems often treat the environment as static,\nlacking the ability to adaptively interact with obstacles. This limitation\nleads to navigation failure when encountering unavoidable obstructions. In\nresponse, we introduce IN-Sight, a novel approach to self-supervised path\nplanning, enabling more effective navigation strategies through interaction\nwith obstacles. Utilizing RGB-D observations, IN-Sight calculates\ntraversability scores and incorporates them into a semantic map, facilitating\nlong-range path planning in complex, maze-like environments. To precisely\nnavigate around obstacles, IN-Sight employs a local planner, trained\nimperatively on a differentiable costmap using representation learning\ntechniques. The entire framework undergoes end-to-end training within the\nstate-of-the-art photorealistic Intel SPEAR Simulator. We validate the\neffectiveness of IN-Sight through extensive benchmarking in a variety of\nsimulated scenarios and ablation studies. Moreover, we demonstrate the system's\nreal-world applicability with zero-shot sim-to-real transfer, deploying our\nplanner on the legged robot platform ANYmal, showcasing its practical potential\nfor interactive navigation in real environments.\n","authors":["Philipp Schoch","Fan Yang","Yuntao Ma","Stefan Leutenegger","Marco Hutter","Quentin Leboutet"],"pdf_url":"https://arxiv.org/pdf/2408.00343v2.pdf","comment":"The 2024 IEEE/RSJ International Conference on Intelligent Robots and\n Systems (IROS 2024)"},{"id":"http://arxiv.org/abs/2408.06040v1","updated":"2024-08-12T10:15:13Z","published":"2024-08-12T10:15:13Z","title":"ARPA: A Novel Hybrid Model for Advancing Visual Word Disambiguation\n Using Large Language Models and Transformers","summary":" In the rapidly evolving fields of natural language processing and computer\nvision, Visual Word Sense Disambiguation (VWSD) stands as a critical, yet\nchallenging task. The quest for models that can seamlessly integrate and\ninterpret multimodal data is more pressing than ever. 
Imagine a system that can\nunderstand language with the depth and nuance of human cognition, while\nsimultaneously interpreting the rich visual context of the world around it.\n We present ARPA, an architecture that fuses the unparalleled contextual\nunderstanding of large language models with the advanced feature extraction\ncapabilities of transformers, which then pass through a custom Graph Neural\nNetwork (GNN) layer to learn intricate relationships and subtle nuances within\nthe data. This innovative architecture not only sets a new benchmark in visual\nword disambiguation but also introduces a versatile framework poised to\ntransform how linguistic and visual data interact by harnessing the synergistic\nstrengths of its components, ensuring robust performance even in the most\ncomplex disambiguation scenarios. Through a series of experiments and\ncomparative analysis, we reveal the substantial advantages of our model,\nunderscoring its potential to redefine standards in the field. Beyond its\narchitectural prowess, our architecture excels through experimental\nenrichments, including sophisticated data augmentation and multi-modal training\ntechniques.\n ARPA's introduction marks a significant milestone in visual word\ndisambiguation, offering a compelling solution that bridges the gap between\nlinguistic and visual modalities. We invite researchers and practitioners to\nexplore the capabilities of our model, envisioning a future where such hybrid\nmodels drive unprecedented advancements in artificial intelligence.\n","authors":["Aristi Papastavrou","Maria Lymperaiou","Giorgos Stamou"],"pdf_url":"https://arxiv.org/pdf/2408.06040v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.19132v2","updated":"2024-08-12T10:00:17Z","published":"2024-04-29T22:31:21Z","title":"Integrating Present and Past in Unsupervised Continual Learning","summary":" We formulate a unifying framework for unsupervised continual learning (UCL),\nwhich disentangles learning objectives that are specific to the present and the\npast data, encompassing stability, plasticity, and cross-task consolidation.\nThe framework reveals that many existing UCL approaches overlook cross-task\nconsolidation and try to balance plasticity and stability in a shared embedding\nspace. This results in worse performance due to a lack of within-task data\ndiversity and reduced effectiveness in learning the current task. Our method,\nOsiris, which explicitly optimizes all three objectives on separate embedding\nspaces, achieves state-of-the-art performance on all benchmarks, including two\nnovel benchmarks proposed in this paper featuring semantically structured task\nsequences. Compared to standard benchmarks, these two structured benchmarks\nmore closely resemble visual signals received by humans and animals when\nnavigating real-world environments. Finally, we show some preliminary evidence\nthat continual models can benefit from such realistic learning scenarios.\n","authors":["Yipeng Zhang","Laurent Charlin","Richard Zemel","Mengye Ren"],"pdf_url":"https://arxiv.org/pdf/2404.19132v2.pdf","comment":"CoLLAs 2024 (Oral)"},{"id":"http://arxiv.org/abs/2404.12861v2","updated":"2024-08-12T09:53:29Z","published":"2024-04-19T13:01:30Z","title":"Weakly Supervised LiDAR Semantic Segmentation via Scatter Image\n Annotation","summary":" Weakly supervised LiDAR semantic segmentation has made significant strides\nwith limited labeled data. 
However, most existing methods focus on the network\ntraining under weak supervision, while efficient annotation strategies remain\nlargely unexplored. To tackle this gap, we implement LiDAR semantic\nsegmentation using scatter image annotation, effectively integrating an\nefficient annotation strategy with network training. Specifically, we propose\nemploying scatter images to annotate LiDAR point clouds, combining a\npre-trained optical flow estimation network with a foundation image\nsegmentation model to rapidly propagate manual annotations into dense labels\nfor both images and point clouds. Moreover, we propose ScatterNet, a network\nthat includes three pivotal strategies to reduce the performance gap caused by\nsuch annotations. Firstly, it utilizes dense semantic labels as supervision for\nthe image branch, alleviating the modality imbalance between point clouds and\nimages. Secondly, an intermediate fusion branch is proposed to obtain\nmultimodal texture and structural features. Lastly, a perception consistency\nloss is introduced to determine which information needs to be fused and which\nneeds to be discarded during the fusion process. Extensive experiments on the\nnuScenes and SemanticKITTI datasets have demonstrated that our method requires\nless than 0.02% of the labeled points to achieve over 95% of the performance of\nfully-supervised methods. Notably, our labeled points are only 5% of those used\nin the most advanced weakly supervised methods.\n","authors":["Yilong Chen","Zongyi Xu","xiaoshui Huang","Ruicheng Zhang","Xinqi Jiang","Xinbo Gao"],"pdf_url":"https://arxiv.org/pdf/2404.12861v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.06024v1","updated":"2024-08-12T09:24:48Z","published":"2024-08-12T09:24:48Z","title":"Layer-Specific Optimization: Sensitivity Based Convolution Layers Basis\n Search","summary":" Deep neural network models have a complex architecture and are\noverparameterized. The number of parameters is more than the whole dataset,\nwhich is highly resource-consuming. This complicates their application and\nlimits its usage on different devices. Reduction in the number of network\nparameters helps to reduce the size of the model, but at the same time,\nthoughtlessly applied, can lead to a deterioration in the quality of the\nnetwork. One way to reduce the number of model parameters is matrix\ndecomposition, where a matrix is represented as a product of smaller matrices.\nIn this paper, we propose a new way of applying the matrix decomposition with\nrespect to the weights of convolutional layers. The essence of the method is to\ntrain not all convolutions, but only the subset of convolutions (basis\nconvolutions), and represent the rest as linear combinations of the basis ones.\nExperiments on models from the ResNet family and the CIFAR-10 dataset\ndemonstrate that basis convolutions can not only reduce the size of the model\nbut also accelerate the forward and backward passes of the network. Another\ncontribution of this work is that we propose a fast method for selecting a\nsubset of network layers in which the use of matrix decomposition does not\ndegrade the quality of the final model.\n","authors":["Vasiliy Alekseev","Ilya Lukashevich","Ilia Zharikov","Ilya Vasiliev"],"pdf_url":"https://arxiv.org/pdf/2408.06024v1.pdf","comment":"A revived draft of an unpublished (and never-to-be-published)\n article. 
For the sake of history, memory, and old times"},{"id":"http://arxiv.org/abs/2408.06021v1","updated":"2024-08-12T09:21:15Z","published":"2024-08-12T09:21:15Z","title":"ClickAttention: Click Region Similarity Guided Interactive Segmentation","summary":" Interactive segmentation algorithms based on click points have garnered\nsignificant attention from researchers in recent years. However, existing\nstudies typically use sparse click maps as model inputs to segment specific\ntarget objects, which primarily affect local regions and have limited abilities\nto focus on the whole target object, leading to an increased number of clicks. In\naddition, most existing algorithms cannot balance high\nperformance and efficiency well. To address this issue, we propose a click attention\nalgorithm that expands the influence range of positive clicks based on the\nsimilarity between positively-clicked regions and the whole input. We also\npropose a discriminative affinity loss to reduce the attention coupling between\npositive and negative click regions to avoid an accuracy decrease caused by\nmutual interference between positive and negative clicks. Extensive experiments\ndemonstrate that our approach is superior to existing methods and achieves\ncutting-edge performance with fewer parameters. An interactive demo and all\nreproducible code will be released at\nhttps://github.com/hahamyt/ClickAttention.\n","authors":["Long Xu","Shanghong Li","Yongquan Chen","Junkang Chen","Rui Huang","Feng Wu"],"pdf_url":"https://arxiv.org/pdf/2408.06021v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.06019v1","updated":"2024-08-12T09:19:38Z","published":"2024-08-12T09:19:38Z","title":"HeadGAP: Few-shot 3D Head Avatar via Generalizable Gaussian Priors","summary":" In this paper, we present a novel 3D head avatar creation approach capable of\ngeneralizing from few-shot in-the-wild data with high-fidelity and animatable\nrobustness. Given the underconstrained nature of this problem, incorporating\nprior knowledge is essential. Therefore, we propose a framework comprising\nprior learning and avatar creation phases. The prior learning phase leverages\n3D head priors derived from a large-scale multi-view dynamic dataset, and the\navatar creation phase applies these priors for few-shot personalization. Our\napproach effectively captures these priors by utilizing a Gaussian\nSplatting-based auto-decoder network with part-based dynamic modeling. Our\nmethod employs identity-shared encoding with personalized latent codes for\nindividual identities to learn the attributes of Gaussian primitives. During\nthe avatar creation phase, we achieve fast head avatar personalization by\nleveraging inversion and fine-tuning strategies. 
Extensive experiments\ndemonstrate that our model effectively exploits head priors and successfully\ngeneralizes them to few-shot personalization, achieving photo-realistic\nrendering quality, multi-view consistency, and stable animation.\n","authors":["Xiaozheng Zheng","Chao Wen","Zhaohu Li","Weiyi Zhang","Zhuo Su","Xu Chang","Yang Zhao","Zheng Lv","Xiaoyuan Zhang","Yongjie Zhang","Guidong Wang","Lan Xu"],"pdf_url":"https://arxiv.org/pdf/2408.06019v1.pdf","comment":"Project page: https://headgap.github.io/"},{"id":"http://arxiv.org/abs/2407.18070v2","updated":"2024-08-12T09:15:04Z","published":"2024-07-25T14:25:17Z","title":"CSWin-UNet: Transformer UNet with Cross-Shaped Windows for Medical Image\n Segmentation","summary":" Deep learning, especially convolutional neural networks (CNNs) and\nTransformer architectures, have become the focus of extensive research in\nmedical image segmentation, achieving impressive results. However, CNNs come\nwith inductive biases that limit their effectiveness in more complex, varied\nsegmentation scenarios. Conversely, while Transformer-based methods excel at\ncapturing global and long-range semantic details, they suffer from high\ncomputational demands. In this study, we propose CSWin-UNet, a novel U-shaped\nsegmentation method that incorporates the CSWin self-attention mechanism into\nthe UNet to facilitate horizontal and vertical stripes self-attention. This\nmethod significantly enhances both computational efficiency and receptive field\ninteractions. Additionally, our innovative decoder utilizes a content-aware\nreassembly operator that strategically reassembles features, guided by\npredicted kernels, for precise image resolution restoration. Our extensive\nempirical evaluations on diverse datasets, including synapse multi-organ CT,\ncardiac MRI, and skin lesions, demonstrate that CSWin-UNet maintains low model\ncomplexity while delivering high segmentation accuracy.\n","authors":["Xiao Liu","Peng Gao","Tao Yu","Fei Wang","Ru-Yue Yuan"],"pdf_url":"https://arxiv.org/pdf/2407.18070v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.06018v1","updated":"2024-08-12T09:14:23Z","published":"2024-08-12T09:14:23Z","title":"Uncertainty-Informed Volume Visualization using Implicit Neural\n Representation","summary":" The increasing adoption of Deep Neural Networks (DNNs) has led to their\napplication in many challenging scientific visualization tasks. While advanced\nDNNs offer impressive generalization capabilities, understanding factors such\nas model prediction quality, robustness, and uncertainty is crucial. These\ninsights can enable domain scientists to make informed decisions about their\ndata. However, DNNs inherently lack ability to estimate prediction uncertainty,\nnecessitating new research to construct robust uncertainty-aware visualization\ntechniques tailored for various visualization tasks. In this work, we propose\nuncertainty-aware implicit neural representations to model scalar field data\nsets effectively and comprehensively study the efficacy and benefits of\nestimated uncertainty information for volume visualization tasks. We evaluate\nthe effectiveness of two principled deep uncertainty estimation techniques: (1)\nDeep Ensemble and (2) Monte Carlo Dropout (MCDropout). These techniques enable\nuncertainty-informed volume visualization in scalar field data sets. 
Our\nextensive exploration across multiple data sets demonstrates that\nuncertainty-aware models produce informative volume visualization results.\nMoreover, integrating prediction uncertainty enhances the trustworthiness of\nour DNN model, making it suitable for robustly analyzing and visualizing\nreal-world scientific volumetric data sets.\n","authors":["Shanu Saklani","Chitwan Goel","Shrey Bansal","Zhe Wang","Soumya Dutta","Tushar M. Athawale","David Pugmire","Christopher R. Johnson"],"pdf_url":"https://arxiv.org/pdf/2408.06018v1.pdf","comment":"To appear in IEEE Workshop on Uncertainty Visualization in\n conjunction with IEEE VIS 2024, Florida, USA"},{"id":"http://arxiv.org/abs/2408.06014v1","updated":"2024-08-12T08:59:56Z","published":"2024-08-12T08:59:56Z","title":"A Sharpness Based Loss Function for Removing Out-of-Focus Blur","summary":" The success of modern Deep Neural Network (DNN) approaches can be attributed\nto the use of complex optimization criteria beyond standard losses such as mean\nabsolute error (MAE) or mean squared error (MSE). In this work, we propose a\nnovel method of utilising a no-reference sharpness metric Q introduced by Zhu\nand Milanfar for removing out-of-focus blur from images. We also introduce a\nnovel dataset of real-world out-of-focus images for assessing restoration\nmodels. Our fine-tuned method produces images with a 7.5 % increase in\nperceptual quality (LPIPS) as compared to a standard model trained only on MAE.\nFurthermore, we observe a 6.7 % increase in Q (reflecting sharper restorations)\nand 7.25 % increase in PSNR over most state-of-the-art (SOTA) methods.\n","authors":["Uditangshu Aurangabadkar","Darren Ramsook","Anil Kokaram"],"pdf_url":"https://arxiv.org/pdf/2408.06014v1.pdf","comment":"6 pages, IEEE MMSP"},{"id":"http://arxiv.org/abs/2408.06010v1","updated":"2024-08-12T08:56:49Z","published":"2024-08-12T08:56:49Z","title":"DEEPTalk: Dynamic Emotion Embedding for Probabilistic Speech-Driven 3D\n Face Animation","summary":" Speech-driven 3D facial animation has garnered lots of attention thanks to\nits broad range of applications. Despite recent advancements in achieving\nrealistic lip motion, current methods fail to capture the nuanced emotional\nundertones conveyed through speech and produce monotonous facial motion. These\nlimitations result in blunt and repetitive facial animations, reducing user\nengagement and hindering their applicability. To address these challenges, we\nintroduce DEEPTalk, a novel approach that generates diverse and emotionally\nrich 3D facial expressions directly from speech inputs. To achieve this, we\nfirst train DEE (Dynamic Emotion Embedding), which employs probabilistic\ncontrastive learning to forge a joint emotion embedding space for both speech\nand facial motion. This probabilistic framework captures the uncertainty in\ninterpreting emotions from speech and facial motion, enabling the derivation of\nemotion vectors from its multifaceted space. Moreover, to generate dynamic\nfacial motion, we design TH-VQVAE (Temporally Hierarchical VQ-VAE) as an\nexpressive and robust motion prior overcoming limitations of VAEs and VQ-VAEs.\nUtilizing these strong priors, we develop DEEPTalk, A talking head generator\nthat non-autoregressively predicts codebook indices to create dynamic facial\nmotion, incorporating a novel emotion consistency loss. 
Extensive experiments\non various datasets demonstrate the effectiveness of our approach in creating\ndiverse, emotionally expressive talking faces that maintain accurate lip-sync.\nSource code will be made publicly available soon.\n","authors":["Jisoo Kim","Jungbin Cho","Joonho Park","Soonmin Hwang","Da Eun Kim","Geon Kim","Youngjae Yu"],"pdf_url":"https://arxiv.org/pdf/2408.06010v1.pdf","comment":"First two authors contributed equally"},{"id":"http://arxiv.org/abs/2407.18100v2","updated":"2024-08-12T08:53:54Z","published":"2024-07-25T15:03:36Z","title":"DINOv2 Rocks Geological Image Analysis: Classification, Segmentation,\n and Interpretability","summary":" This study investigates the interpretability, classification, and\nsegmentation of CT-scan images of rock samples, with a particular focus on the\napplication of DINOv2 within Geosciences. We compared various segmentation\ntechniques to evaluate their efficacy, efficiency, and adaptability in\ngeological image analysis. The methods assessed include the Otsu thresholding\nmethod, clustering techniques (K-means and fuzzy C-means), a supervised machine\nlearning approach (Random Forest), and deep learning methods (UNet and DINOv2).\nWe tested these methods using ten binary sandstone datasets and three\nmulti-class calcite datasets. To begin, we provide a thorough interpretability\nanalysis of DINOv2's features in the geoscientific context, discussing its\nsuitability and inherent ability to process CT-scanned rock data. In terms of\nclassification, the out-of-the-box DINOv2 demonstrates an impressive capability\nto perfectly classify rock images, even when the CT scans are out of its\noriginal training set. Regarding segmentation, thresholding and unsupervised\nmethods, while fast, perform poorly despite image preprocessing, whereas\nsupervised methods show better results. We underscore the computational demands\nof deep learning but highlight its minimal intervention, superior\ngeneralization, and performance without additional image preprocessing.\nAdditionally, we observe a lack of correlation between a network's depth or the\nnumber of parameters and its performance. Our results show that a LoRA\nfine-tuned DINOv2 excels in out-of-distribution segmentation and significantly\noutperforms other methods in multi-class segmentation. By systematically\ncomparing these methods, we identify the most efficient strategy for meticulous\nand laborious segmentation tasks. DINOv2 proves advantageous, achieving\nsegmentations that could be described as \"better than ground-truth\" against\nrelatively small training sets.\n","authors":["Florent Brondolo","Samuel Beaussant"],"pdf_url":"https://arxiv.org/pdf/2407.18100v2.pdf","comment":"Minor typos fixing, link to the code, small changes"},{"id":"http://arxiv.org/abs/2408.06000v1","updated":"2024-08-12T08:49:00Z","published":"2024-08-12T08:49:00Z","title":"An Analysis for Image-to-Image Translation and Style Transfer","summary":" With the development of generative technologies in deep learning, a large\nnumber of image-to-image translation and style transfer models have emerged at\nan explosive rate in recent years. These two technologies have made significant\nprogress and can generate realistic images. However, many communities tend to\nconfuse the two, because both generate the desired image based on the input\nimage and both cover the two definitions of content and style. 
In fact, there\nare indeed significant differences between the two, and there is currently a\nlack of clear explanations to distinguish the two technologies, which is not\nconducive to the advancement of technology. We hope to serve the entire\ncommunity by introducing the differences and connections between image-to-image\ntranslation and style transfer. The entire discussion process involves the\nconcepts, forms, training modes, evaluation processes, and visualization\nresults of the two technologies. Finally, we conclude that image-to-image\ntranslation divides images by domain, and the types of images in the domain are\nlimited, and the scope involved is small, but the conversion ability is strong\nand can achieve strong semantic changes. Style transfer divides image types by\nsingle image, and the scope involved is large, but the transfer ability is\nlimited, and it transfers more texture and color of the image.\n","authors":["Xiaoming Yu","Jie Tian","Zhenhua Hu"],"pdf_url":"https://arxiv.org/pdf/2408.06000v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.00427v2","updated":"2024-08-12T08:45:19Z","published":"2024-08-01T09:59:57Z","title":"CARMIL: Context-Aware Regularization on Multiple Instance Learning\n models for Whole Slide Images","summary":" Multiple Instance Learning (MIL) models have proven effective for cancer\nprognosis from Whole Slide Images. However, the original MIL formulation\nincorrectly assumes the patches of the same image to be independent, leading to\na loss of spatial context as information flows through the network.\nIncorporating contextual knowledge into predictions is particularly important\ngiven the inclination for cancerous cells to form clusters and the presence of\nspatial indicators for tumors. State-of-the-art methods often use attention\nmechanisms eventually combined with graphs to capture spatial knowledge. In\nthis paper, we take a novel and transversal approach, addressing this issue\nthrough the lens of regularization. We propose Context-Aware Regularization for\nMultiple Instance Learning (CARMIL), a versatile regularization scheme designed\nto seamlessly integrate spatial knowledge into any MIL model. Additionally, we\npresent a new and generic metric to quantify the Context-Awareness of any MIL\nmodel when applied to Whole Slide Images, resolving a previously unexplored gap\nin the field. The efficacy of our framework is evaluated for two survival\nanalysis tasks on glioblastoma (TCGA GBM) and colon cancer data (TCGA COAD).\n","authors":["Thiziri Nait Saada","Valentina Di Proietto","Benoit Schmauch","Katharina Von Loga","Lucas Fidon"],"pdf_url":"https://arxiv.org/pdf/2408.00427v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.10105v5","updated":"2024-08-12T08:41:10Z","published":"2023-12-15T04:11:34Z","title":"SeiT++: Masked Token Modeling Improves Storage-efficient Training","summary":" Recent advancements in Deep Neural Network (DNN) models have significantly\nimproved performance across computer vision tasks. However, achieving highly\ngeneralizable and high-performing vision models requires expansive datasets,\nresulting in significant storage requirements. This storage challenge is a\ncritical bottleneck for scaling up models. A recent breakthrough by SeiT\nproposed the use of Vector-Quantized (VQ) feature vectors (i.e., tokens) as\nnetwork inputs for vision classification. This approach achieved 90% of the\nperformance of a model trained on full-pixel images with only 1% of the\nstorage. 
While SeiT needs labeled data, its potential in scenarios beyond fully\nsupervised learning remains largely untapped. In this paper, we extend SeiT by\nintegrating Masked Token Modeling (MTM) for self-supervised pre-training.\nRecognizing that self-supervised approaches often demand more data due to the\nlack of labels, we introduce TokenAdapt and ColorAdapt. These methods\nfacilitate comprehensive token-friendly data augmentation, effectively\naddressing the increased data requirements of self-supervised learning. We\nevaluate our approach across various scenarios, including storage-efficient\nImageNet-1k classification, fine-grained classification, ADE-20k semantic\nsegmentation, and robustness benchmarks. Experimental results demonstrate\nconsistent performance improvement in diverse experiments, validating the\neffectiveness of our method. Code is available at\nhttps://github.com/naver-ai/seit.\n","authors":["Minhyun Lee","Song Park","Byeongho Heo","Dongyoon Han","Hyunjung Shim"],"pdf_url":"https://arxiv.org/pdf/2312.10105v5.pdf","comment":"Accepted to ECCV 2024. First two authors contributed equally"},{"id":"http://arxiv.org/abs/2403.12559v2","updated":"2024-08-12T08:37:24Z","published":"2024-03-19T09:14:52Z","title":"Confidence Self-Calibration for Multi-Label Class-Incremental Learning","summary":" The partial label challenge in Multi-Label Class-Incremental Learning (MLCIL)\narises when only the new classes are labeled during training, while past and\nfuture labels remain unavailable. This issue leads to a proliferation of\nfalse-positive errors due to erroneously high confidence multi-label\npredictions, exacerbating catastrophic forgetting within the disjoint label\nspace. In this paper, we aim to refine multi-label confidence calibration in\nMLCIL and propose a Confidence Self-Calibration (CSC) approach. Firstly, for\nlabel relationship calibration, we introduce a class-incremental graph\nconvolutional network that bridges the isolated label spaces by constructing\nlearnable, dynamically extended label relationship graph. Then, for confidence\ncalibration, we present a max-entropy regularization for each multi-label\nincrement, facilitating confidence self-calibration through the penalization of\nover-confident output distributions. Our approach attains new state-of-the-art\nresults in MLCIL tasks on both MS-COCO and PASCAL VOC datasets, with the\ncalibration of label confidences confirmed through our methodology.\n","authors":["Kaile Du","Yifan Zhou","Fan Lyu","Yuyang Li","Chen Lu","Guangcan Liu"],"pdf_url":"https://arxiv.org/pdf/2403.12559v2.pdf","comment":"Accepted at the European Conference on Computer Vision (ECCV) 2024"},{"id":"http://arxiv.org/abs/2305.13840v3","updated":"2024-08-12T08:30:05Z","published":"2023-05-23T09:03:19Z","title":"Control-A-Video: Controllable Text-to-Video Diffusion Models with Motion\n Prior and Reward Feedback Learning","summary":" Recent advances in text-to-image (T2I) diffusion models have enabled\nimpressive image generation capabilities guided by text prompts. However,\nextending these techniques to video generation remains challenging, with\nexisting text-to-video (T2V) methods often struggling to produce high-quality\nand motion-consistent videos. In this work, we introduce Control-A-Video, a\ncontrollable T2V diffusion model that can generate videos conditioned on text\nprompts and reference control maps like edge and depth maps. 
To tackle video\nquality and motion consistency issues, we propose novel strategies to\nincorporate content prior and motion prior into the diffusion-based generation\nprocess. Specifically, we employ a first-frame condition scheme to transfer\nvideo generation from the image domain. Additionally, we introduce\nresidual-based and optical flow-based noise initialization to infuse motion\npriors from reference videos, promoting relevance among frame latents for\nreduced flickering. Furthermore, we present a Spatio-Temporal Reward Feedback\nLearning (ST-ReFL) algorithm that optimizes the video diffusion model using\nmultiple reward models for video quality and motion consistency, leading to\nsuperior outputs. Comprehensive experiments demonstrate that our framework\ngenerates higher-quality, more consistent videos compared to existing\nstate-of-the-art methods in controllable text-to-video generation\n","authors":["Weifeng Chen","Yatai Ji","Jie Wu","Hefeng Wu","Pan Xie","Jiashi Li","Xin Xia","Xuefeng Xiao","Liang Lin"],"pdf_url":"https://arxiv.org/pdf/2305.13840v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.05539v3","updated":"2024-08-12T08:26:26Z","published":"2023-11-09T17:34:57Z","title":"A Deep Learning Method for Simultaneous Denoising and Missing Wedge\n Reconstruction in Cryogenic Electron Tomography","summary":" Cryogenic electron tomography is a technique for imaging biological samples\nin 3D. A microscope collects a series of 2D projections of the sample, and the\ngoal is to reconstruct the 3D density of the sample called the tomogram.\nReconstruction is difficult as the 2D projections are noisy and can not be\nrecorded from all directions, resulting in a missing wedge of information.\nTomograms conventionally reconstructed with filtered back-projection suffer\nfrom noise and strong artifacts due to the missing wedge. Here, we propose a\ndeep-learning approach for simultaneous denoising and missing wedge\nreconstruction called DeepDeWedge. The algorithm requires no ground truth data\nand is based on fitting a neural network to the 2D projections using a\nself-supervised loss. DeepDeWedge is simpler than current state-of-the-art\napproaches for denoising and missing wedge reconstruction, performs\ncompetitively and produces more denoised tomograms with higher overall\ncontrast.\n","authors":["Simon Wiedemann","Reinhard Heckel"],"pdf_url":"https://arxiv.org/pdf/2311.05539v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.21335v2","updated":"2024-08-12T08:21:36Z","published":"2024-07-31T04:57:06Z","title":"On-the-fly Point Feature Representation for Point Clouds Analysis","summary":" Point cloud analysis is challenging due to its unique characteristics of\nunorderness, sparsity and irregularity. Prior works attempt to capture local\nrelationships by convolution operations or attention mechanisms, exploiting\ngeometric information from coordinates implicitly. These methods, however, are\ninsufficient to describe the explicit local geometry, e.g., curvature and\norientation. In this paper, we propose On-the-fly Point Feature Representation\n(OPFR), which captures abundant geometric information explicitly through Curve\nFeature Generator module. This is inspired by Point Feature Histogram (PFH)\nfrom computer vision community. However, the utilization of vanilla PFH\nencounters great difficulties when applied to large datasets and dense point\nclouds, as it demands considerable time for feature generation. 
In contrast, we\nintroduce the Local Reference Constructor module, which approximates the local\ncoordinate systems based on triangle sets. Owing to this, our OPFR only\nrequires extra 1.56ms for inference (65x faster than vanilla PFH) and 0.012M\nmore parameters, and it can serve as a versatile plug-and-play module for\nvarious backbones, particularly MLP-based and Transformer-based backbones\nexamined in this study. Additionally, we introduce the novel Hierarchical\nSampling module aimed at enhancing the quality of triangle sets, thereby\nensuring robustness of the obtained geometric features. Our proposed method\nimproves overall accuracy (OA) on ModelNet40 from 90.7% to 94.5% (+3.8%) for\nclassification, and OA on S3DIS Area-5 from 86.4% to 90.0% (+3.6%) for semantic\nsegmentation, respectively, building upon PointNet++ backbone. When integrated\nwith Point Transformer backbone, we achieve state-of-the-art results on both\ntasks: 94.8% OA on ModelNet40 and 91.7% OA on S3DIS Area-5.\n","authors":["Jiangyi Wang","Zhongyao Cheng","Na Zhao","Jun Cheng","Xulei Yang"],"pdf_url":"https://arxiv.org/pdf/2407.21335v2.pdf","comment":"Accepted by ACM MM 2024"},{"id":"http://arxiv.org/abs/2408.05985v1","updated":"2024-08-12T08:21:04Z","published":"2024-08-12T08:21:04Z","title":"Diffuse-UDA: Addressing Unsupervised Domain Adaptation in Medical Image\n Segmentation with Appearance and Structure Aligned Diffusion Models","summary":" The scarcity and complexity of voxel-level annotations in 3D medical imaging\npresent significant challenges, particularly due to the domain gap between\nlabeled datasets from well-resourced centers and unlabeled datasets from\nless-resourced centers. This disparity affects the fairness of artificial\nintelligence algorithms in healthcare. We introduce Diffuse-UDA, a novel method\nleveraging diffusion models to tackle Unsupervised Domain Adaptation (UDA) in\nmedical image segmentation. Diffuse-UDA generates high-quality image-mask pairs\nwith target domain characteristics and various structures, thereby enhancing\nUDA tasks. Initially, pseudo labels for target domain samples are generated.\nSubsequently, a specially tailored diffusion model, incorporating deformable\naugmentations, is trained on image-label or image-pseudo-label pairs from both\ndomains. Finally, source domain labels guide the diffusion model to generate\nimage-label pairs for the target domain. Comprehensive evaluations on several\nbenchmarks demonstrate that Diffuse-UDA outperforms leading UDA and\nsemi-supervised strategies, achieving performance close to or even surpassing\nthe theoretical upper bound of models trained directly on target domain data.\nDiffuse-UDA offers a pathway to advance the development and deployment of AI\nsystems in medical imaging, addressing disparities between healthcare\nenvironments. This approach enables the exploration of innovative AI-driven\ndiagnostic tools, improves outcomes, saves time, and reduces human error.\n","authors":["Haifan Gong","Yitao Wang","Yihan Wang","Jiashun Xiao","Xiang Wan","Haofeng Li"],"pdf_url":"https://arxiv.org/pdf/2408.05985v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.05974v1","updated":"2024-08-12T08:02:37Z","published":"2024-08-12T08:02:37Z","title":"Unseen No More: Unlocking the Potential of CLIP for Generative Zero-shot\n HOI Detection","summary":" Zero-shot human-object interaction (HOI) detector is capable of generalizing\nto HOI categories even not encountered during training. 
Inspired by the\nimpressive zero-shot capabilities offered by CLIP, latest methods strive to\nleverage CLIP embeddings for improving zero-shot HOI detection. However, these\nembedding-based methods train the classifier on seen classes only, inevitably\nresulting in seen-unseen confusion for the model during inference. Besides, we\nfind that using prompt-tuning and adapters further increases the gap between\nseen and unseen accuracy. To tackle this challenge, we present the first\ngeneration-based model using CLIP for zero-shot HOI detection, coined HOIGen.\nIt allows to unlock the potential of CLIP for feature generation instead of\nfeature extraction only. To achieve it, we develop a CLIP-injected feature\ngenerator in accordance with the generation of human, object and union\nfeatures. Then, we extract realistic features of seen samples and mix them with\nsynthetic features together, allowing the model to train seen and unseen\nclasses jointly. To enrich the HOI scores, we construct a generative prototype\nbank in a pairwise HOI recognition branch, and a multi-knowledge prototype bank\nin an image-wise HOI recognition branch, respectively. Extensive experiments on\nHICO-DET benchmark demonstrate our HOIGen achieves superior performance for\nboth seen and unseen classes under various zero-shot settings, compared with\nother top-performing methods. Code is available at:\nhttps://github.com/soberguo/HOIGen\n","authors":["Yixin Guo","Yu Liu","Jianghao Li","Weimin Wang","Qi Jia"],"pdf_url":"https://arxiv.org/pdf/2408.05974v1.pdf","comment":"Accepted by ACM MM 2024"},{"id":"http://arxiv.org/abs/2407.21454v2","updated":"2024-08-12T07:50:32Z","published":"2024-07-31T08:59:33Z","title":"StreetSurfaceVis: a dataset of crowdsourced street-level imagery with\n semi-automated annotations of road surface type and quality","summary":" Road unevenness significantly impacts the safety and comfort of various\ntraffic participants, especially vulnerable road users such as cyclists and\nwheelchair users. This paper introduces StreetSurfaceVis, a novel dataset\ncomprising 9,122 street-level images collected from a crowdsourcing platform\nand manually annotated by road surface type and quality. The dataset is\nintended to train models for comprehensive surface assessments of road\nnetworks. Existing open datasets are constrained by limited geospatial coverage\nand camera setups, typically excluding cycleways and footways. By crafting a\nheterogeneous dataset, we aim to fill this gap and enable robust models that\nmaintain high accuracy across diverse image sources. However, the frequency\ndistribution of road surface types and qualities is highly imbalanced. We\naddress the challenge of ensuring sufficient images per class while reducing\nmanual annotation by proposing a sampling strategy that incorporates various\nexternal label prediction resources. More precisely, we estimate the impact of\n(1) enriching the image data with OpenStreetMap tags, (2) iterative training\nand application of a custom surface type classification model, (3) amplifying\nunderrepresented classes through prompt-based classification with GPT-4o or\nsimilarity search using image embeddings. 
We show that utilizing a combination\nof these strategies effectively reduces manual annotation workload while\nensuring sufficient class representation.\n","authors":["Alexandra Kapp","Edith Hoffmann","Esther Weigmann","Helena Mihaljević"],"pdf_url":"https://arxiv.org/pdf/2407.21454v2.pdf","comment":"11 pages, 2 figures"},{"id":"http://arxiv.org/abs/2408.05966v1","updated":"2024-08-12T07:44:19Z","published":"2024-08-12T07:44:19Z","title":"Freehand Sketch Generation from Mechanical Components","summary":" Drawing freehand sketches of mechanical components on multimedia devices for\nAI-based engineering modeling has become a new trend. However, its development\nis being impeded because existing works cannot produce suitable sketches for\ndata-driven research. These works either generate sketches lacking a freehand\nstyle or utilize generative models not originally designed for this task\nresulting in poor effectiveness. To address this issue, we design a two-stage\ngenerative framework mimicking the human sketching behavior pattern, called\nMSFormer, which is the first time to produce humanoid freehand sketches\ntailored for mechanical components. The first stage employs Open CASCADE\ntechnology to obtain multi-view contour sketches from mechanical components,\nfiltering perturbing signals for the ensuing generation process. Meanwhile, we\ndesign a view selector to simulate viewpoint selection tasks during human\nsketching for picking out information-rich sketches. The second stage\ntranslates contour sketches into freehand sketches by a transformer-based\ngenerator. To retain essential modeling features as much as possible and\nrationalize stroke distribution, we introduce a novel edge-constraint stroke\ninitialization. Furthermore, we utilize a CLIP vision encoder and a new loss\nfunction incorporating the Hausdorff distance to enhance the generalizability\nand robustness of the model. Extensive experiments demonstrate that our\napproach achieves state-of-the-art performance for generating freehand sketches\nin the mechanical domain. Project page: https://mcfreeskegen.github.io .\n","authors":["Zhichao Liao","Di Huang","Heming Fang","Yue Ma","Fengyuan Piao","Xinghui Li","Long Zeng","Pingfa Feng"],"pdf_url":"https://arxiv.org/pdf/2408.05966v1.pdf","comment":"Published at ACM Multimedia (ACM MM) 2024"},{"id":"http://arxiv.org/abs/2401.16424v2","updated":"2024-08-12T07:41:32Z","published":"2024-01-29T18:59:56Z","title":"Computer Vision for Primate Behavior Analysis in the Wild","summary":" Advances in computer vision as well as increasingly widespread video-based\nbehavioral monitoring have great potential for transforming how we study animal\ncognition and behavior. However, there is still a fairly large gap between the\nexciting prospects and what can actually be achieved in practice today,\nespecially in videos from the wild. With this perspective paper, we want to\ncontribute towards closing this gap, by guiding behavioral scientists in what\ncan be expected from current methods and steering computer vision researchers\ntowards problems that are relevant to advance research in animal behavior. We\nstart with a survey of the state-of-the-art methods for computer vision\nproblems that are directly relevant to the video-based study of animal\nbehavior, including object detection, multi-individual tracking, individual\nidentification, and (inter)action recognition. We then review methods for\neffort-efficient learning, which is one of the biggest challenges from a\npractical perspective. 
Finally, we close with an outlook into the future of the\nemerging field of computer vision for animal behavior, where we argue that the\nfield should develop approaches to unify detection, tracking, identification\nand (inter)action recognition in a single, video-based framework.\n","authors":["Richard Vogg","Timo Lüddecke","Jonathan Henrich","Sharmita Dey","Matthias Nuske","Valentin Hassler","Derek Murphy","Julia Fischer","Julia Ostner","Oliver Schülke","Peter M. Kappeler","Claudia Fichtel","Alexander Gail","Stefan Treue","Hansjörg Scherberger","Florentin Wörgötter","Alexander S. Ecker"],"pdf_url":"https://arxiv.org/pdf/2401.16424v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.05964v1","updated":"2024-08-12T07:33:11Z","published":"2024-08-12T07:33:11Z","title":"Target Detection of Safety Protective Gear Using the Improved YOLOv5","summary":" In high-risk railway construction, personal protective equipment monitoring\nis critical but challenging due to small and frequently obstructed targets. We\npropose YOLO-EA, an innovative model that enhances safety measure detection by\nintegrating ECA into its backbone's convolutional layers, improving discernment\nof minuscule objects like hardhats. YOLO-EA further refines target recognition\nunder occlusion by replacing GIoU with EIoU loss. YOLO-EA's effectiveness was\nempirically substantiated using a dataset derived from real-world railway\nconstruction site surveillance footage. It outperforms YOLOv5, achieving 98.9%\nprecision and 94.7% recall, up 2.5% and 0.5% respectively, while maintaining\nreal-time performance at 70.774 fps. This highly efficient and precise YOLO-EA\nholds great promise for practical application in intricate construction\nscenarios, enforcing stringent safety compliance during complex railway\nconstruction projects.\n","authors":["Hao Liu","Xue Qin"],"pdf_url":"https://arxiv.org/pdf/2408.05964v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.08353v2","updated":"2024-08-12T07:20:43Z","published":"2024-04-12T09:44:18Z","title":"TDANet: Target-Directed Attention Network For Object-Goal Visual\n Navigation With Zero-Shot Ability","summary":" The generalization of the end-to-end deep reinforcement learning (DRL) for\nobject-goal visual navigation is a long-standing challenge since object classes\nand placements vary in new test environments. Learning domain-independent\nvisual representation is critical for enabling the trained DRL agent with the\nability to generalize to unseen scenes and objects. In this letter, a\ntarget-directed attention network (TDANet) is proposed to learn the end-to-end\nobject-goal visual navigation policy with zero-shot ability. TDANet features a\nnovel target attention (TA) module that learns both the spatial and semantic\nrelationships among objects to help TDANet focus on the most relevant observed\nobjects to the target. With the Siamese architecture (SA) design, TDANet\ndistinguishes the difference between the current and target states and\ngenerates the domain-independent visual representation. To evaluate the\nnavigation performance of TDANet, extensive experiments are conducted in the\nAI2-THOR embodied AI environment. The simulation results demonstrate a strong\ngeneralization ability of TDANet to unseen scenes and target objects, with\nhigher navigation success rate (SR) and success weighted by length (SPL) than\nother state-of-the-art models. 
TDANet is finally deployed on a wheeled robot in\nreal scenes, demonstrating satisfactory generalization of TDANet to the real\nworld.\n","authors":["Shiwei Lian","Feitian Zhang"],"pdf_url":"https://arxiv.org/pdf/2404.08353v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.18279v2","updated":"2024-08-12T07:14:00Z","published":"2023-05-29T17:50:33Z","title":"Contextual Object Detection with Multimodal Large Language Models","summary":" Recent Multimodal Large Language Models (MLLMs) are remarkable in\nvision-language tasks, such as image captioning and question answering, but\nlack the essential perception ability, i.e., object detection. In this work, we\naddress this limitation by introducing a novel research problem of contextual\nobject detection -- understanding visible objects within different human-AI\ninteractive contexts. Three representative scenarios are investigated,\nincluding the language cloze test, visual captioning, and question answering.\nMoreover, we present ContextDET, a unified multimodal model that is capable of\nend-to-end differentiable modeling of visual-language contexts, so as to\nlocate, identify, and associate visual objects with language inputs for\nhuman-AI interaction. Our ContextDET involves three key submodels: (i) a visual\nencoder for extracting visual representations, (ii) a pre-trained LLM for\nmultimodal context decoding, and (iii) a visual decoder for predicting bounding\nboxes given contextual object words. The new generate-then-detect framework\nenables us to detect object words within human vocabulary. Extensive\nexperiments show the advantages of ContextDET on our proposed CODE benchmark,\nopen-vocabulary detection, and referring image segmentation. Github:\nhttps://github.com/yuhangzang/ContextDET.\n","authors":["Yuhang Zang","Wei Li","Jun Han","Kaiyang Zhou","Chen Change Loy"],"pdf_url":"https://arxiv.org/pdf/2305.18279v2.pdf","comment":"IJCV 2024"},{"id":"http://arxiv.org/abs/2308.09908v4","updated":"2024-08-12T07:13:54Z","published":"2023-08-19T05:15:02Z","title":"LEGO: Learning and Graph-Optimized Modular Tracker for Online\n Multi-Object Tracking with Point Clouds","summary":" Online multi-object tracking (MOT) plays a pivotal role in autonomous\nsystems. The state-of-the-art approaches usually employ a tracking-by-detection\nmethod, and data association plays a critical role. This paper proposes a\nlearning and graph-optimized (LEGO) modular tracker to improve data association\nperformance in the existing literature. The proposed LEGO tracker integrates\ngraph optimization and self-attention mechanisms, which efficiently formulate\nthe association score map, facilitating the accurate and efficient matching of\nobjects across time frames. To further enhance the state update process, the\nKalman filter is added to ensure consistent tracking by incorporating temporal\ncoherence in the object states. Our proposed method utilizing LiDAR alone has\nshown exceptional performance compared to other online tracking approaches,\nincluding LiDAR-based and LiDAR-camera fusion-based methods. 
LEGO ranked 1st at\nthe time of submitting results to KITTI object tracking evaluation ranking\nboard and remains 2nd at the time of submitting this paper, among all online\ntrackers in the KITTI MOT benchmark for cars1\n","authors":["Zhenrong Zhang","Jianan Liu","Yuxuan Xia","Tao Huang","Qing-Long Han","Hongbin Liu"],"pdf_url":"https://arxiv.org/pdf/2308.09908v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.05956v1","updated":"2024-08-12T07:13:08Z","published":"2024-08-12T07:13:08Z","title":"Boosting Adverse Weather Crowd Counting via Multi-queue Contrastive\n Learning","summary":" Currently, most crowd counting methods have outstanding performance under\nnormal weather conditions. However, they often struggle to maintain their\nperformance in extreme and adverse weather conditions due to significant\ndifferences in the domain and a lack of adverse weather images for training. To\naddress this issue and enhance the model's robustness in adverse weather, we\npropose a two-stage crowd counting method. Specifically, in the first stage, we\nintroduce a multi-queue MoCo contrastive learning strategy to tackle the\nproblem of weather class imbalance. This strategy facilitates the learning of\nweather-aware representations by the model. In the second stage, we propose to\nrefine the representations under the guidance of contrastive learning, enabling\nthe conversion of the weather-aware representations to the normal weather\ndomain. While significantly improving the robustness, our method only\nmarginally increases the weight of the model. In addition, we also create a new\nsynthetic adverse weather dataset. Extensive experimental results show that our\nmethod achieves competitive performance.\n","authors":["Tianhang Pan","Zhuoran Zheng","Xiuyi Jia"],"pdf_url":"https://arxiv.org/pdf/2408.05956v1.pdf","comment":"11 pages, 7 figures"},{"id":"http://arxiv.org/abs/2408.05955v1","updated":"2024-08-12T07:09:12Z","published":"2024-08-12T07:09:12Z","title":"Probabilistic Vision-Language Representation for Weakly Supervised\n Temporal Action Localization","summary":" Weakly supervised temporal action localization (WTAL) aims to detect action\ninstances in untrimmed videos using only video-level annotations. Since many\nexisting works optimize WTAL models based on action classification labels, they\nencounter the task discrepancy problem (i.e., localization-by-classification).\nTo tackle this issue, recent studies have attempted to utilize action category\nnames as auxiliary semantic knowledge through vision-language pre-training\n(VLP). However, there are still areas where existing research falls short.\nPrevious approaches primarily focused on leveraging textual information from\nlanguage models but overlooked the alignment of dynamic human action and VLP\nknowledge in a joint space. Furthermore, the deterministic representation\nemployed in previous studies struggles to capture fine-grained human motions.\nTo address these problems, we propose a novel framework that aligns human\naction knowledge and VLP knowledge in a probabilistic embedding space.\nMoreover, we propose intra- and inter-distribution contrastive learning to\nenhance the probabilistic embedding space based on statistical similarities.\nExtensive experiments and ablation studies reveal that our method significantly\noutperforms all previous state-of-the-art methods. 
Code is available at\nhttps://github.com/sejong-rcv/PVLR.\n","authors":["Geuntaek Lim","Hyunwoo Kim","Joonsoo Kim","Yukyung Choi"],"pdf_url":"https://arxiv.org/pdf/2408.05955v1.pdf","comment":"Accepted to ACM MM 2024"},{"id":"http://arxiv.org/abs/2408.05953v1","updated":"2024-08-12T07:04:52Z","published":"2024-08-12T07:04:52Z","title":"A Simple Task-aware Contrastive Local Descriptor Selection Strategy for\n Few-shot Learning between inter class and intra class","summary":" Few-shot image classification aims to classify novel classes with few labeled\nsamples. Recent research indicates that deep local descriptors have better\nrepresentational capabilities. These studies recognize the impact of background\nnoise on classification performance. They typically filter query descriptors\nusing all local descriptors in the support classes or engage in bidirectional\nselection between local descriptors in support and query sets. However, they\nignore the fact that background features may be useful for the classification\nperformance of specific tasks. This paper proposes a novel task-aware\ncontrastive local descriptor selection network (TCDSNet). First, we calculate\nthe contrastive discriminative score for each local descriptor in the support\nclass, and select discriminative local descriptors to form a support descriptor\nsubset. Finally, we leverage support descriptor subsets to adaptively select\ndiscriminative query descriptors for specific tasks. Extensive experiments\ndemonstrate that our method outperforms state-of-the-art methods on both\ngeneral and fine-grained datasets.\n","authors":["Qian Qiao","Yu Xie","Shaoyao Huang","Fanzhang Li"],"pdf_url":"https://arxiv.org/pdf/2408.05953v1.pdf","comment":"Submitted to ICANN 2024"},{"id":"http://arxiv.org/abs/2408.05952v1","updated":"2024-08-12T07:03:35Z","published":"2024-08-12T07:03:35Z","title":"Optimizing Vision Transformers with Data-Free Knowledge Transfer","summary":" The groundbreaking performance of transformers in Natural Language Processing\n(NLP) tasks has led to their replacement of traditional Convolutional Neural\nNetworks (CNNs), owing to the efficiency and accuracy achieved through the\nself-attention mechanism. This success has inspired researchers to explore the\nuse of transformers in computer vision tasks to attain enhanced long-term\nsemantic awareness. Vision transformers (ViTs) have excelled in various\ncomputer vision tasks due to their superior ability to capture long-distance\ndependencies using the self-attention mechanism. Contemporary ViTs like Data\nEfficient Transformers (DeiT) can effectively learn both global semantic\ninformation and local texture information from images, achieving performance\ncomparable to traditional CNNs. However, their impressive performance comes\nwith a high computational cost due to very large number of parameters,\nhindering their deployment on devices with limited resources like smartphones,\ncameras, drones etc. Additionally, ViTs require a large amount of data for\ntraining to achieve performance comparable to benchmark CNN models. Therefore,\nwe identified two key challenges in deploying ViTs on smaller form factor\ndevices: the high computational requirements of large models and the need for\nextensive training data. 
As a solution to these challenges, we propose\ncompressing large ViT models using Knowledge Distillation (KD), which is\nimplemented data-free to circumvent limitations related to data availability.\nAdditionally, we conducted experiments on object detection within the same\nenvironment in addition to classification tasks. Based on our analysis, we\nfound that data-free knowledge distillation is an effective method to overcome\nboth issues, enabling the deployment of ViTs on less resource-constrained\ndevices.\n","authors":["Gousia Habib","Damandeep Singh","Ishfaq Ahmad Malik","Brejesh Lall"],"pdf_url":"https://arxiv.org/pdf/2408.05952v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.05945v1","updated":"2024-08-12T06:46:05Z","published":"2024-08-12T06:46:05Z","title":"MV2DFusion: Leveraging Modality-Specific Object Semantics for\n Multi-Modal 3D Detection","summary":" The rise of autonomous vehicles has significantly increased the demand for\nrobust 3D object detection systems. While cameras and LiDAR sensors each offer\nunique advantages--cameras provide rich texture information and LiDAR offers\nprecise 3D spatial data--relying on a single modality often leads to\nperformance limitations. This paper introduces MV2DFusion, a multi-modal\ndetection framework that integrates the strengths of both worlds through an\nadvanced query-based fusion mechanism. By introducing an image query generator\nto align with image-specific attributes and a point cloud query generator,\nMV2DFusion effectively combines modality-specific object semantics without\nbiasing toward one single modality. Then the sparse fusion process can be\naccomplished based on the valuable object semantics, ensuring efficient and\naccurate object detection across various scenarios. Our framework's flexibility\nallows it to integrate with any image and point cloud-based detectors,\nshowcasing its adaptability and potential for future advancements. Extensive\nevaluations on the nuScenes and Argoverse2 datasets demonstrate that MV2DFusion\nachieves state-of-the-art performance, particularly excelling in long-range\ndetection scenarios.\n","authors":["Zitian Wang","Zehao Huang","Yulu Gao","Naiyan Wang","Si Liu"],"pdf_url":"https://arxiv.org/pdf/2408.05945v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.05940v1","updated":"2024-08-12T06:33:38Z","published":"2024-08-12T06:33:38Z","title":"Spb3DTracker: A Robust LiDAR-Based Person Tracker for Noisy Environment","summary":" Person detection and tracking (PDT) has seen significant advancements with 2D\ncamera-based systems in the autonomous vehicle field, leading to widespread\nadoption of these algorithms. However, growing privacy concerns have recently\nemerged as a major issue, prompting a shift towards LiDAR-based PDT as a viable\nalternative. Within this domain, \"Tracking-by-Detection\" (TBD) has become a\nprominent methodology. Despite its effectiveness, LiDAR-based PDT has not yet\nachieved the same level of performance as camera-based PDT. This paper examines\nkey components of the LiDAR-based PDT framework, including detection\npost-processing, data association, motion modeling, and lifecycle management.\nBuilding upon these insights, we introduce SpbTrack, a robust person tracker\ndesigned for diverse environments. Our method achieves superior performance on\nnoisy datasets and state-of-the-art results on KITTI Dataset benchmarks and\na custom office indoor dataset among LiDAR-based trackers. 
Project page at\nanonymous.\n","authors":["Eunsoo Im","Changhyun Jee","Jung Kwon Lee"],"pdf_url":"https://arxiv.org/pdf/2408.05940v1.pdf","comment":"17 pages, 5 figures"},{"id":"http://arxiv.org/abs/2408.05939v1","updated":"2024-08-12T06:27:29Z","published":"2024-08-12T06:27:29Z","title":"UniPortrait: A Unified Framework for Identity-Preserving Single- and\n Multi-Human Image Personalization","summary":" This paper presents UniPortrait, an innovative human image personalization\nframework that unifies single- and multi-ID customization with high face\nfidelity, extensive facial editability, free-form input description, and\ndiverse layout generation. UniPortrait consists of only two plug-and-play\nmodules: an ID embedding module and an ID routing module. The ID embedding\nmodule extracts versatile editable facial features with a decoupling strategy\nfor each ID and embeds them into the context space of diffusion models. The ID\nrouting module then combines and distributes these embeddings adaptively to\ntheir respective regions within the synthesized image, achieving the\ncustomization of single and multiple IDs. With a carefully designed two-stage\ntraining scheme, UniPortrait achieves superior performance in both single- and\nmulti-ID customization. Quantitative and qualitative experiments demonstrate\nthe advantages of our method over existing approaches as well as its good\nscalability, e.g., the universal compatibility with existing generative control\ntools. The project page is at\nhttps://aigcdesigngroup.github.io/UniPortrait-Page/ .\n","authors":["Junjie He","Yifeng Geng","Liefeng Bo"],"pdf_url":"https://arxiv.org/pdf/2408.05939v1.pdf","comment":"Tech report; Project page:\n https://aigcdesigngroup.github.io/UniPortrait-Page/"},{"id":"http://arxiv.org/abs/2408.05938v1","updated":"2024-08-12T06:25:44Z","published":"2024-08-12T06:25:44Z","title":"Deep Geometric Moments Promote Shape Consistency in Text-to-3D\n Generation","summary":" To address the data scarcity associated with 3D assets, 2D-lifting techniques\nsuch as Score Distillation Sampling (SDS) have become a widely adopted practice\nin text-to-3D generation pipelines. However, the diffusion models used in these\ntechniques are prone to viewpoint bias and thus lead to geometric\ninconsistencies such as the Janus problem. To counter this, we introduce MT3D,\na text-to-3D generative model that leverages a high-fidelity 3D object to\novercome viewpoint bias and explicitly infuse geometric understanding into the\ngeneration pipeline. Firstly, we employ depth maps derived from a high-quality\n3D model as control signals to guarantee that the generated 2D images preserve\nthe fundamental shape and structure, thereby reducing the inherent viewpoint\nbias. Next, we utilize deep geometric moments to ensure geometric consistency\nin the 3D representation explicitly. 
By incorporating geometric details from a\n3D asset, MT3D enables the creation of diverse and geometrically consistent\nobjects, thereby improving the quality and usability of our 3D representations.\n","authors":["Utkarsh Nath","Rajeev Goel","Eun Som Jeon","Changhoon Kim","Kyle Min","Yezhou Yang","Yingzhen Yang","Pavan Turaga"],"pdf_url":"https://arxiv.org/pdf/2408.05938v1.pdf","comment":"9 pages, 8 figures"},{"id":"http://arxiv.org/abs/2408.05936v1","updated":"2024-08-12T06:23:10Z","published":"2024-08-12T06:23:10Z","title":"Multi-scale Contrastive Adaptor Learning for Segmenting Anything in\n Underperformed Scenes","summary":" Foundational vision models, such as the Segment Anything Model (SAM), have\nachieved significant breakthroughs through extensive pre-training on\nlarge-scale visual datasets. Despite their general success, these models may\nfall short in specialized tasks with limited data, and fine-tuning such\nlarge-scale models is often not feasible. Current strategies involve\nincorporating adaptors into the pre-trained SAM to facilitate downstream task\nperformance with minimal model adjustment. However, these strategies can be\nhampered by suboptimal learning approaches for the adaptors. In this paper, we\nintroduce a novel Multi-scale Contrastive Adaptor learning method named\nMCA-SAM, which enhances adaptor performance through a meticulously designed\ncontrastive learning framework at both token and sample levels. Our Token-level\nContrastive adaptor (TC-adaptor) focuses on refining local representations by\nimproving the discriminability of patch tokens, while the Sample-level\nContrastive adaptor (SC-adaptor) amplifies global understanding across\ndifferent samples. Together, these adaptors synergistically enhance feature\ncomparison within and across samples, bolstering the model's representational\nstrength and its ability to adapt to new tasks. Empirical results demonstrate\nthat MCA-SAM sets new benchmarks, outperforming existing methods in three\nchallenging domains: camouflage object detection, shadow segmentation, and\npolyp segmentation. Specifically, MCA-SAM exhibits substantial relative\nperformance enhancements, achieving a 20.0% improvement in MAE on the COD10K\ndataset, a 6.0% improvement in MAE on the CAMO dataset, a 15.4% improvement in\nBER on the ISTD dataset, and a 7.9% improvement in mDice on the Kvasir-SEG\ndataset.\n","authors":["Ke Zhou","Zhongwei Qiu","Dongmei Fu"],"pdf_url":"https://arxiv.org/pdf/2408.05936v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.18568v2","updated":"2024-08-12T06:11:33Z","published":"2024-06-02T13:25:44Z","title":"A Diagnostic Model for Acute Lymphoblastic Leukemia Using Metaheuristics\n and Deep Learning Methods","summary":" Acute lymphoblastic leukemia (ALL) severity is determined by the presence and\nratios of blast cells (abnormal white blood cells) in both bone marrow and\nperipheral blood. Manual diagnosis of this disease is a tedious and\ntime-consuming operation, making it difficult for professionals to accurately\nexamine blast cell characteristics. To address this difficulty, researchers use\ndeep learning and machine learning. In this paper, a ResNet-based feature\nextractor is utilized to detect ALL, along with a variety of feature selectors\nand classifiers. To get the best results, a variety of transfer learning\nmodels, including the Resnet, VGG, EfficientNet, and DensNet families, are used\nas deep feature extractors. 
Following extraction, different feature selectors\nare used, including Genetic algorithm, PCA, ANOVA, Random Forest, Univariate,\nMutual information, Lasso, XGB, Variance, and Binary ant colony. After feature\nqualification, a variety of classifiers are used, with MLP outperforming the\nothers. The recommended technique is used to categorize ALL and HEM in the\nselected dataset, C-NMC 2019. This technique achieved an impressive 90.71%\naccuracy and 95.76% sensitivity for the relevant classifications, and its\nmetrics on this dataset outperformed those of other methods.\n","authors":["Amir Masoud Rahmani","Parisa Khoshvaght","Hamid Alinejad-Rokny","Samira Sadeghi","Parvaneh Asghari","Zohre Arabi","Mehdi Hosseinzadeh"],"pdf_url":"https://arxiv.org/pdf/2406.18568v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.03468v2","updated":"2024-08-12T06:01:33Z","published":"2024-07-28T08:19:09Z","title":"MultiHateClip: A Multilingual Benchmark Dataset for Hateful Video\n Detection on YouTube and Bilibili","summary":" Hate speech is a pressing issue in modern society, with significant effects\nboth online and offline. Recent research in hate speech detection has primarily\ncentered on text-based media, largely overlooking multimodal content such as\nvideos. Existing studies on hateful video datasets have predominantly focused\non English content within a Western context and have been limited to binary\nlabels (hateful or non-hateful), lacking detailed contextual information. This\nstudy presents MultiHateClip, a novel multilingual dataset created through\nhate lexicons and human annotation. It aims to enhance the detection of hateful\nvideos on platforms such as YouTube and Bilibili, including content in both\nEnglish and Chinese. Comprising 2,000 videos annotated for\nhatefulness, offensiveness, and normalcy, this dataset provides a\ncross-cultural perspective on gender-based hate speech. Through a detailed\nexamination of human annotation results, we discuss the differences between\nChinese and English hateful videos and underscore the importance of different\nmodalities in hateful and offensive video analysis. Evaluations of\nstate-of-the-art video classification models, such as VLM, GPT-4V and Qwen-VL,\non MultiHateClip highlight the existing challenges in accurately distinguishing\nbetween hateful and offensive content and the urgent need for models that are\nboth multimodally and culturally nuanced. MultiHateClip represents a\nfoundational advance in enhancing hateful video detection by underscoring the\nnecessity of a multimodal and culturally sensitive approach in combating online\nhate speech.\n","authors":["Han Wang","Tan Rui Yang","Usman Naseem","Roy Ka-Wei Lee"],"pdf_url":"https://arxiv.org/pdf/2408.03468v2.pdf","comment":"10 pages, 3 figures, ACM Multimedia 2024"},{"id":"http://arxiv.org/abs/2408.05927v1","updated":"2024-08-12T05:33:45Z","published":"2024-08-12T05:33:45Z","title":"A Simple Early Exiting Framework for Accelerated Sampling in Diffusion\n Models","summary":" Diffusion models have shown remarkable performance in generation problems\nover various domains including images, videos, text, and audio. A practical\nbottleneck of diffusion models is their sampling speed, due to the repeated\nevaluation of score estimation networks during inference. In this work, we\npropose a novel framework capable of adaptively allocating compute required for\nthe score estimation, thereby reducing the overall sampling time of diffusion\nmodels. 
We observe that the amount of computation required for the score\nestimation may vary along the time step for which the score is estimated. Based\non this observation, we propose an early-exiting scheme, where we skip the\nsubset of parameters in the score estimation network during the inference,\nbased on a time-dependent exit schedule. Using the diffusion models for image\nsynthesis, we show that our method could significantly improve the sampling\nthroughput of the diffusion models without compromising image quality.\nFurthermore, we also demonstrate that our method seamlessly integrates with\nvarious types of solvers for faster sampling, capitalizing on their\ncompatibility to enhance overall efficiency. The source code and our\nexperiments are available at \\url{https://github.com/taehong-moon/ee-diffusion}\n","authors":["Taehong Moon","Moonseok Choi","EungGu Yun","Jongmin Yoon","Gayoung Lee","Jaewoong Cho","Juho Lee"],"pdf_url":"https://arxiv.org/pdf/2408.05927v1.pdf","comment":"ICML 2024"},{"id":"http://arxiv.org/abs/2408.05923v1","updated":"2024-08-12T05:07:12Z","published":"2024-08-12T05:07:12Z","title":"Image Denoising Using Green Channel Prior","summary":" Image denoising is an appealing and challenging task, in that noise\nstatistics of real-world observations may vary with local image contents and\ndifferent image channels. Specifically, the green channel usually has twice the\nsampling rate in raw data. To handle noise variances and leverage such\nchannel-wise prior information, we propose a simple and effective green channel\nprior-based image denoising (GCP-ID) method, which integrates GCP into the\nclassic patch-based denoising framework. Briefly, we exploit the green channel\nto guide the search for similar patches, which aims to improve the patch\ngrouping quality and encourage sparsity in the transform domain. The grouped\nimage patches are then reformulated into RGGB arrays to explicitly characterize\nthe density of green samples. Furthermore, to enhance the adaptivity of GCP-ID\nto various image contents, we cast the noise estimation problem into a\nclassification task and train an effective estimator based on convolutional\nneural networks (CNNs). Experiments on real-world datasets demonstrate the\ncompetitive performance of the proposed GCP-ID method for image and video\ndenoising applications in both raw and sRGB spaces. Our code is available at\nhttps://github.com/ZhaomingKong/GCP-ID.\n","authors":["Zhaoming Kong","Fangxi Deng","Xiaowei Yang"],"pdf_url":"https://arxiv.org/pdf/2408.05923v1.pdf","comment":"arXiv admin note: text overlap with arXiv:2402.08235"},{"id":"http://arxiv.org/abs/2408.05918v1","updated":"2024-08-12T04:46:55Z","published":"2024-08-12T04:46:55Z","title":"PAFormer: Part Aware Transformer for Person Re-identification","summary":" Within the domain of person re-identification (ReID), partial ReID methods\nare considered mainstream, aiming to measure feature distances through\ncomparisons of body parts between samples. However, in practice, previous\nmethods often lack sufficient awareness of anatomical aspect of body parts,\nresulting in the failure to capture features of the same body parts across\ndifferent samples. To address this issue, we introduce \\textbf{Part Aware\nTransformer (PAFormer)}, a pose estimation based ReID model which can perform\nprecise part-to-part comparison. 
To inject part awareness into pose\ntokens, we introduce learnable parameters called `pose tokens', which estimate\nthe correlation between each body part and partial regions of the image.\nNotably, at the inference phase, PAFormer operates without additional modules\nrelated to body part localization, which are commonly used in previous ReID\nmethodologies leveraging pose estimation models. Additionally, leveraging the\nenhanced awareness of body parts, PAFormer suggests the use of a learning-based\nvisibility predictor to estimate the degree of occlusion for each body part.\nAlso, we introduce a teacher forcing technique using ground truth visibility\nscores, which enables PAFormer to be trained only with visible parts. Extensive\nexperiments show that our method outperforms existing approaches on\nwell-known ReID benchmark datasets.\n","authors":["Hyeono Jung","Jangwon Lee","Jiwon Yoo","Dami Ko","Gyeonghwan Kim"],"pdf_url":"https://arxiv.org/pdf/2408.05918v1.pdf","comment":"34 pages, 8 figures"},{"id":"http://arxiv.org/abs/2408.05914v1","updated":"2024-08-12T04:05:19Z","published":"2024-08-12T04:05:19Z","title":"Deep Multimodal Collaborative Learning for Polyp Re-Identification","summary":" Colonoscopic Polyp Re-Identification aims to match the same polyp from a\nlarge gallery with images from different views taken using different cameras\nand plays an important role in the prevention and treatment of colorectal\ncancer in computer-aided diagnosis. However, traditional methods for object\nReID directly adopting CNN models trained on the ImageNet dataset usually\nproduce unsatisfactory retrieval performance on colonoscopic datasets due to\nthe large domain gap. Worse, these solutions typically learn unimodal\nrepresentations on the basis of visual samples, which fails to explore\ncomplementary information from different modalities. To address this challenge,\nwe propose a novel Deep Multimodal Collaborative Learning framework named DMCL\nfor polyp re-identification, which can effectively encourage modality\ncollaboration and reinforce generalization capability in medical scenarios. On\nthis basis, a dynamic multimodal feature fusion strategy is introduced to\nleverage the optimized multimodal representations for multimodal fusion via\nend-to-end training. Experiments on the standard benchmarks show the benefits\nof the multimodal setting over state-of-the-art unimodal ReID models,\nespecially when combined with the specialized multimodal fusion strategy.\n","authors":["Suncheng Xiang","Jincheng Li","Zhengjie Zhang","Shilun Cai","Jiale Guan","Dahong Qian"],"pdf_url":"https://arxiv.org/pdf/2408.05914v1.pdf","comment":"Work in progress. arXiv admin note: text overlap with\n arXiv:2307.10625"},{"id":"http://arxiv.org/abs/2304.05215v4","updated":"2024-08-12T03:33:12Z","published":"2023-04-11T13:33:45Z","title":"A Billion-scale Foundation Model for Remote Sensing Images","summary":" As the potential of foundation models in visual tasks has garnered\nsignificant attention, pretraining these models before downstream tasks has\nbecome a crucial step. The three key factors in pretraining foundation models\nare the pretraining method, the size of the pretraining dataset, and the number\nof model parameters. Recently, research in the remote sensing field has focused\nprimarily on the pretraining method and the size of the dataset, with limited\nemphasis on the number of model parameters. 
This paper addresses this gap by\nexamining the effect of increasing the number of model parameters on the\nperformance of foundation models in downstream tasks such as rotated object\ndetection and semantic segmentation. We pretrained foundation models with\nvarying numbers of parameters, including 86M, 605.26M, 1.3B, and 2.4B, to\ndetermine whether performance in downstream tasks improved with an increase in\nparameters. To the best of our knowledge, this is the first billion-scale\nfoundation model in the remote sensing field. Furthermore, we propose an\neffective method for scaling up and fine-tuning a vision transformer in the\nremote sensing field. To evaluate general performance in downstream tasks, we\nemployed the DOTA v2.0 and DIOR-R benchmark datasets for rotated object\ndetection, and the Potsdam and LoveDA datasets for semantic segmentation.\nExperimental results demonstrated that, across all benchmark datasets and\ndownstream tasks, the performance of the foundation models and data efficiency\nimproved as the number of parameters increased. Moreover, our models achieve\nstate-of-the-art performance on several datasets, including DIOR-R, Potsdam,\nand LoveDA.\n","authors":["Keumgang Cha","Junghoon Seo","Taekyung Lee"],"pdf_url":"https://arxiv.org/pdf/2304.05215v4.pdf","comment":"This manuscript is the accepted version for IEEE Journal of Selected\n Topics in Applied Earth Observations and Remote Sensing (IEEE J-STARS)"},{"id":"http://arxiv.org/abs/2404.03179v2","updated":"2024-08-12T03:31:57Z","published":"2024-04-04T03:28:57Z","title":"UniAV: Unified Audio-Visual Perception for Multi-Task Video Event\n Localization","summary":" Video localization tasks aim to temporally locate specific instances in\nvideos, including temporal action localization (TAL), sound event detection\n(SED) and audio-visual event localization (AVEL). Existing methods\nover-specialize on each task, overlooking the fact that these instances often\noccur in the same video to form the complete video content. In this work, we\npresent UniAV, a Unified Audio-Visual perception network, to achieve joint\nlearning of TAL, SED and AVEL tasks for the first time. UniAV can leverage\ndiverse data available in task-specific datasets, allowing the model to learn\nand share mutually beneficial knowledge across tasks and modalities. To tackle\nthe challenges posed by substantial variations in datasets\n(size/domain/duration) and distinct task characteristics, we propose to\nuniformly encode visual and audio modalities of all videos to derive generic\nrepresentations, while also designing task-specific experts to capture unique\nknowledge for each task. Besides, we develop a unified language-aware\nclassifier by utilizing a pre-trained text encoder, enabling the model to\nflexibly detect various types of instances and previously unseen ones by simply\nchanging prompts during inference. 
UniAV outperforms its single-task\ncounterparts by a large margin with fewer parameters, achieving on-par or\nsuperior performance compared to state-of-the-art task-specific methods across\nthe ActivityNet 1.3, DESED and UnAV-100 benchmarks.\n","authors":["Tiantian Geng","Teng Wang","Yanfu Zhang","Jinming Duan","Weili Guan","Feng Zheng","Ling Shao"],"pdf_url":"https://arxiv.org/pdf/2404.03179v2.pdf","comment":"This work has been submitted to the IEEE for possible publication.\n Copyright may be transferred without notice, after which this version may no\n longer be accessible"},{"id":"http://arxiv.org/abs/2309.14660v4","updated":"2024-08-12T03:31:38Z","published":"2023-09-26T04:32:38Z","title":"CoFiI2P: Coarse-to-Fine Correspondences for Image-to-Point Cloud\n Registration","summary":" Image-to-point cloud (I2P) registration is a fundamental task for robots and\nautonomous vehicles to achieve cross-modality data fusion and localization.\nCurrent I2P registration methods primarily focus on estimating correspondences\nat the point or pixel level, often neglecting global alignment. As a result,\nI2P matching can easily converge to a local optimum if it lacks high-level\nguidance from global constraints. To improve the success rate and general\nrobustness, this paper introduces CoFiI2P, a novel I2P registration network\nthat extracts correspondences in a coarse-to-fine manner. First, the image and\npoint cloud data are processed through a two-stream encoder-decoder network for\nhierarchical feature extraction. Second, a coarse-to-fine matching module is\ndesigned to leverage these features and establish robust feature\ncorrespondences. Specifically, in the coarse matching phase, a novel I2P\ntransformer module is employed to capture both homogeneous and heterogeneous\nglobal information from the image and point cloud data. This enables the\nestimation of coarse super-point/super-pixel matching pairs with discriminative\ndescriptors. In the fine matching module, point/pixel pairs are established\nwith the guidance of super-point/super-pixel correspondences. Finally, based on\nmatching pairs, the transform matrix is estimated with the EPnP-RANSAC\nalgorithm. Experiments conducted on the KITTI Odometry dataset demonstrate that\nCoFiI2P achieves impressive results, with a relative rotation error (RRE) of\n1.14 degrees and a relative translation error (RTE) of 0.29 meters, while\nmaintaining real-time speed. Additional experiments on the nuScenes dataset\nconfirm our method's generalizability. The project page is available at\n\\url{https://whu-usi3dv.github.io/CoFiI2P}.\n","authors":["Shuhao Kang","Youqi Liao","Jianping Li","Fuxun Liang","Yuhao Li","Xianghong Zou","Fangning Li","Xieyuanli Chen","Zhen Dong","Bisheng Yang"],"pdf_url":"https://arxiv.org/pdf/2309.14660v4.pdf","comment":"Submitted to IEEE RA-L (under review); project page is available at:\n https://whu-usi3dv.github.io/CoFiI2P"},{"id":"http://arxiv.org/abs/2408.05905v1","updated":"2024-08-12T03:31:29Z","published":"2024-08-12T03:31:29Z","title":"Weakly Supervised Video Anomaly Detection and Localization with\n Spatio-Temporal Prompts","summary":" The current weakly supervised video anomaly detection (WSVAD) task aims to\nachieve frame-level anomalous event detection with only coarse video-level\nannotations available. Existing works typically involve extracting global\nfeatures from full-resolution video frames and training frame-level classifiers\nto detect anomalies in the temporal dimension. 
However, most anomalous events\ntend to occur in localized spatial regions rather than across entire video frames,\nwhich implies that existing frame-level feature-based works may be misled by the\ndominant background information and lack the interpretation of the detected\nanomalies. To address this dilemma, this paper introduces a novel method called\nSTPrompt that learns spatio-temporal prompt embeddings for weakly supervised\nvideo anomaly detection and localization (WSVADL) based on pre-trained\nvision-language models (VLMs). Our proposed method employs a two-stream network\nstructure, with one stream focusing on the temporal dimension and the other\nprimarily on the spatial dimension. By leveraging the learned knowledge from\npre-trained VLMs and incorporating natural motion priors from raw videos, our\nmodel learns prompt embeddings that are aligned with spatio-temporal regions of\nvideos (e.g., patches of individual frames) to identify specific local regions\nof anomalies, enabling accurate video anomaly detection while mitigating the\ninfluence of background information. Without relying on detailed\nspatio-temporal annotations or auxiliary object detection/tracking, our method\nachieves state-of-the-art performance on three public benchmarks for the WSVADL\ntask.\n","authors":["Peng Wu","Xuerong Zhou","Guansong Pang","Zhiwei Yang","Qingsen Yan","Peng Wang","Yanning Zhang"],"pdf_url":"https://arxiv.org/pdf/2408.05905v1.pdf","comment":"Accepted by ACMMM2024"},{"id":"http://arxiv.org/abs/2405.05803v2","updated":"2024-08-12T03:29:51Z","published":"2024-05-09T14:38:53Z","title":"Boosting Multimodal Large Language Models with Visual Tokens Withdrawal\n for Rapid Inference","summary":" Multimodal large language models (MLLMs) demand considerable computations for\ninference due to the extensive parameters and the additional input tokens\nneeded for visual information representation. Herein, we introduce Visual\nTokens Withdrawal (VTW), a plug-and-play module to boost MLLMs for rapid\ninference. Our approach is inspired by two intriguing phenomena we have\nobserved: (1) the attention sink phenomenon that is prevalent in LLMs also\npersists in MLLMs, suggesting that initial tokens and nearest tokens receive\nthe majority of attention, while middle vision tokens garner minimal attention\nin deep layers; (2) the presence of information migration, which implies that\nvisual information is transferred to subsequent text tokens within the first\nfew layers of MLLMs. As per our findings, we conclude that vision tokens are\nunnecessary in the deep layers of MLLMs. Thus, we strategically withdraw them\nat a certain layer, enabling only text tokens to engage in subsequent layers.\nTo pinpoint the ideal layer for VTW, we initially analyze a limited set of tiny\ndatasets and choose the first layer that meets the Kullback-Leibler divergence\ncriterion. Our VTW approach can cut computational overhead by over 40\\% across\ndiverse multimodal tasks while maintaining performance. Our code is released at\n\\url{https://github.com/lzhxmu/VTW}.\n","authors":["Zhihang Lin","Mingbao Lin","Luxi Lin","Rongrong Ji"],"pdf_url":"https://arxiv.org/pdf/2405.05803v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.07128v4","updated":"2024-08-12T03:05:59Z","published":"2023-12-12T10:04:11Z","title":"MS-Twins: Multi-Scale Deep Self-Attention Networks for Medical Image\n Segmentation","summary":" Chest X-ray is one of the most common radiological examination types for the\ndiagnosis of chest diseases. 
Nowadays, the automatic classification technology\nof radiological images has been widely used in clinical diagnosis and treatment\nplans. However, each disease has its own different response characteristic\nreceptive field region, which is the main challenge for chest disease\nclassification tasks. Besides, the imbalance of sample data categories further\nincreases the difficulty of tasks. To solve these problems, we propose a new\nmulti-label chest disease image classification scheme based on a multi-scale\nattention network. In this scheme, multi-scale information is iteratively fused\nto focus on regions with a high probability of disease, to effectively mine\nmore meaningful information from data, and the classification performance can\nbe improved only by image level annotation. We also designed a new loss\nfunction to improve the rationality of visual perception and the performance of\nmulti-label image classification by forcing the consistency of attention\nregions before and after image transformation. A comprehensive experiment was\ncarried out on the public Chest X-Ray14 and CheXpert datasets to achieve state\nof the art results, which verified the effectiveness of this method in chest\nX-ray image classification.\n","authors":["Jing Xu"],"pdf_url":"https://arxiv.org/pdf/2312.07128v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.18330v2","updated":"2024-08-12T03:00:37Z","published":"2024-03-27T08:11:25Z","title":"Tracking-Assisted Object Detection with Event Cameras","summary":" Event-based object detection has recently garnered attention in the computer\nvision community due to the exceptional properties of event cameras, such as\nhigh dynamic range and no motion blur. However, feature asynchronism and\nsparsity cause invisible objects due to no relative motion to the camera,\nposing a significant challenge in the task. Prior works have studied various\nimplicit-learned memories to retain as many temporal cues as possible. However,\nimplicit memories still struggle to preserve long-term features effectively. In\nthis paper, we consider those invisible objects as pseudo-occluded objects and\naim to detect them by tracking through occlusions. Firstly, we introduce the\nvisibility attribute of objects and contribute an auto-labeling algorithm to\nnot only clean the existing event camera dataset but also append additional\nvisibility labels to it. Secondly, we exploit tracking strategies for\npseudo-occluded objects to maintain their permanence and retain their bounding\nboxes, even when features have not been available for a very long time. These\nstrategies can be treated as an explicit-learned memory guided by the tracking\nobjective to record the displacements of objects across frames. Lastly, we\npropose a spatio-temporal feature aggregation module to enrich the latent\nfeatures and a consistency loss to increase the robustness of the overall\npipeline. We conduct comprehensive experiments to verify our method's\neffectiveness where still objects are retained, but real occluded objects are\ndiscarded. 
The results demonstrate that (1) the additional visibility labels\ncan assist in supervised training, and (2) our method outperforms\nstate-of-the-art approaches with a significant improvement of 7.9% absolute\nmAP.\n","authors":["Ting-Kang Yen","Igor Morawski","Shusil Dangi","Kai He","Chung-Yi Lin","Jia-Fong Yeh","Hung-Ting Su","Winston Hsu"],"pdf_url":"https://arxiv.org/pdf/2403.18330v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.08434v2","updated":"2024-08-12T02:57:30Z","published":"2024-05-14T08:56:09Z","title":"TP3M: Transformer-based Pseudo 3D Image Matching with Reference Image","summary":" Image matching is still challenging in scenes with large viewpoint or\nillumination changes or with low texture. In this paper, we propose a\nTransformer-based pseudo 3D image matching method. It upgrades the 2D features\nextracted from the source image to 3D features with the help of a reference\nimage and matches them to the 2D features extracted from the destination image by\ncoarse-to-fine 3D matching. Our key discovery is that by introducing the\nreference image, the source image's fine points are screened and their feature\ndescriptors are further enriched from 2D to 3D, which improves the matching\nperformance with the destination image. Experimental results on multiple\ndatasets show that the proposed method achieves state-of-the-art results on the\ntasks of homography estimation, pose estimation and visual localization,\nespecially in challenging scenes.\n","authors":["Liming Han","Zhaoxiang Liu","Shiguo Lian"],"pdf_url":"https://arxiv.org/pdf/2405.08434v2.pdf","comment":"Accepted by ICRA 2024"},{"id":"http://arxiv.org/abs/2408.05900v1","updated":"2024-08-12T02:48:00Z","published":"2024-08-12T02:48:00Z","title":"Classifier Guidance Enhances Diffusion-based Adversarial Purification by\n Preserving Predictive Information","summary":" Adversarial purification is one of the promising approaches to defend neural\nnetworks against adversarial attacks. Recently, methods utilizing diffusion\nprobabilistic models have achieved great success for adversarial purification\nin image classification tasks. However, such methods fall into the dilemma of\nbalancing the needs for noise removal and information preservation. This paper\npoints out that existing adversarial purification methods based on diffusion\nmodels gradually lose sample information during the core denoising process,\ncausing occasional label shift in subsequent classification tasks. As a remedy,\nwe suggest suppressing such information loss by introducing guidance from the\nclassifier confidence. Specifically, we propose the Classifier-cOnfidence gUided\nPurification (COUP) algorithm, which purifies adversarial examples while\nkeeping away from the classifier decision boundary. Experimental results show\nthat COUP can achieve better adversarial robustness under strong attack\nmethods.\n","authors":["Mingkun Zhang","Jianing Li","Wei Chen","Jiafeng Guo","Xueqi Cheng"],"pdf_url":"https://arxiv.org/pdf/2408.05900v1.pdf","comment":"Accepted by ECAI 2024"},{"id":"http://arxiv.org/abs/2408.05901v1","updated":"2024-08-12T02:48:00Z","published":"2024-08-12T02:48:00Z","title":"HcNet: Image Modeling with Heat Conduction Equation","summary":" Foundation models, such as CNNs and ViTs, have powered the development of\nimage modeling. However, general guidance to model architecture design is still\nmissing. 
The design of many modern model architectures, such as residual\nstructures, multiplicative gating signal, and feed-forward networks, can be\ninterpreted in terms of the heat conduction equation. This finding inspired us\nto model images by the heat conduction equation, where the essential idea is to\nconceptualize image features as temperatures and model their information\ninteraction as the diffusion of thermal energy. We can take advantage of the\nrich knowledge in the heat conduction equation to guide us in designing new and\nmore interpretable models. As an example, we propose Heat Conduction Layer and\nRefine Approximation Layer inspired by solving the heat conduction equation\nusing Finite Difference Method and Fourier series, respectively. This paper\ndoes not aim to present a state-of-the-art model; instead, it seeks to\nintegrate the overall architectural design of the model into the heat\nconduction theory framework. Nevertheless, our Heat Conduction Network (HcNet)\nstill shows competitive performance. Code available at\n\\url{https://github.com/ZheminZhang1/HcNet}.\n","authors":["Zhemin Zhang","Xun Gong"],"pdf_url":"https://arxiv.org/pdf/2408.05901v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.15002v2","updated":"2024-08-12T02:46:06Z","published":"2024-01-26T17:03:38Z","title":"BackdoorBench: A Comprehensive Benchmark and Analysis of Backdoor\n Learning","summary":" As an emerging and vital topic for studying deep neural networks'\nvulnerability (DNNs), backdoor learning has attracted increasing interest in\nrecent years, and many seminal backdoor attack and defense algorithms are being\ndeveloped successively or concurrently, in the status of a rapid arms race.\nHowever, mainly due to the diverse settings, and the difficulties of\nimplementation and reproducibility of existing works, there is a lack of a\nunified and standardized benchmark of backdoor learning, causing unfair\ncomparisons, and unreliable conclusions (e.g., misleading, biased or even false\nconclusions). Consequently, it is difficult to evaluate the current progress\nand design the future development roadmap of this literature. To alleviate this\ndilemma, we build a comprehensive benchmark of backdoor learning called\nBackdoorBench. Our benchmark makes three valuable contributions to the research\ncommunity. 1) We provide an integrated implementation of state-of-the-art\n(SOTA) backdoor learning algorithms (currently including 16 attack and 27\ndefense algorithms), based on an extensible modular-based codebase. 2) We\nconduct comprehensive evaluations of 12 attacks against 16 defenses, with 5\npoisoning ratios, based on 4 models and 4 datasets, thus 11,492 pairs of\nevaluations in total. 3) Based on above evaluations, we present abundant\nanalysis from 8 perspectives via 18 useful analysis tools, and provide several\ninspiring insights about backdoor learning. We hope that our efforts could\nbuild a solid foundation of backdoor learning to facilitate researchers to\ninvestigate existing algorithms, develop more innovative algorithms, and\nexplore the intrinsic mechanism of backdoor learning. 
Finally, we have created\na user-friendly website at http://backdoorbench.com, which collects all\nimportant information of BackdoorBench, including codebase, docs, leaderboard,\nand model Zoo.\n","authors":["Baoyuan Wu","Hongrui Chen","Mingda Zhang","Zihao Zhu","Shaokui Wei","Danni Yuan","Mingli Zhu","Ruotong Wang","Li Liu","Chao Shen"],"pdf_url":"https://arxiv.org/pdf/2401.15002v2.pdf","comment":"We have uploaded a new version, which can be accessed at\n arXiv:2407.19845"},{"id":"http://arxiv.org/abs/2408.05894v1","updated":"2024-08-12T02:16:47Z","published":"2024-08-12T02:16:47Z","title":"GlyphPattern: An Abstract Pattern Recognition for Vision-Language Models","summary":" Vision-Language Models (VLMs) building upon the foundation of powerful large\nlanguage models have made rapid progress in reasoning across visual and textual\ndata. While VLMs perform well on vision tasks that they are trained on, our\nresults highlight key challenges in abstract pattern recognition. We present\nGlyphPattern, a 954 item dataset that pairs 318 human-written descriptions of\nvisual patterns from 40 writing systems with three visual presentation styles.\n GlyphPattern evaluates abstract pattern recognition in VLMs, requiring models\nto understand and judge natural language descriptions of visual patterns.\nGlyphPattern patterns are drawn from a large-scale cognitive science\ninvestigation of human writing systems; as a result, they are rich in spatial\nreference and compositionality. Our experiments show that GlyphPattern is\nchallenging for state-of-the-art VLMs (GPT-4o achieves only 55% accuracy), with\nmarginal gains from few-shot prompting. Our detailed error analysis reveals\nchallenges at multiple levels, including visual processing, natural language\nunderstanding, and pattern generalization.\n","authors":["Zixuan Wu","Yoolim Kim","Carolyn Jane Anderson"],"pdf_url":"https://arxiv.org/pdf/2408.05894v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.05892v1","updated":"2024-08-12T02:10:18Z","published":"2024-08-12T02:10:18Z","title":"Polyp SAM 2: Advancing Zero shot Polyp Segmentation in Colorectal Cancer\n Detection","summary":" Polyp segmentation plays a crucial role in the early detection and diagnosis\nof colorectal cancer. However, obtaining accurate segmentations often requires\nlabor-intensive annotations and specialized models. Recently, Meta AI Research\nreleased a general Segment Anything Model 2 (SAM 2), which has demonstrated\npromising performance in several segmentation tasks. In this work, we evaluate\nthe performance of SAM 2 in segmenting polyps under various prompted settings.\nWe hope this report will provide insights to advance the field of polyp\nsegmentation and promote more interesting work in the future. This project is\npublicly available at https://github.com/ sajjad-sh33/Polyp-SAM-2.\n","authors":["Mobina Mansoori","Sajjad Shahabodini","Jamshid Abouei","Konstantinos N. Plataniotis","Arash Mohammadi"],"pdf_url":"https://arxiv.org/pdf/2408.05892v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.05891v1","updated":"2024-08-12T02:09:25Z","published":"2024-08-12T02:09:25Z","title":"CMAB: A First National-Scale Multi-Attribute Building Dataset Derived\n from Open Source Data and GeoAI","summary":" Rapidly acquiring three-dimensional (3D) building data, including geometric\nattributes like rooftop, height, and structure, as well as indicative\nattributes like function, quality, and age, is essential for accurate urban\nanalysis, simulations, and policy updates. 
Existing large-scale building\ndatasets lack accuracy, extensibility, and indicative attributes. This paper\npresents a geospatial artificial intelligence (GeoAI) framework for large-scale\nbuilding modeling, introducing the first Multi-Attribute Building dataset\n(CMAB) in China at a national scale. The dataset covers 3,667 natural cities\nwith a total rooftop area of 21.3 billion square meters, achieving an F1-score of\n89.93% in rooftop extraction through the OCRNet. We trained bootstrap\naggregated XGBoost models with city administrative classifications,\nincorporating building features such as morphology, location, and function.\nUsing multi-source data, including billions of high-resolution Google Earth\nimages and 60 million street view images (SVI), we generated rooftop, height,\nfunction, age, and quality attributes for each building. Accuracy was validated\nthrough model benchmarks, existing similar products, and manual SVI validation.\nThe results support urban planning and sustainable development.\n","authors":["Yecheng Zhang","Huimin Zhao","Ying Long"],"pdf_url":"https://arxiv.org/pdf/2408.05891v1.pdf","comment":"43 pages, 20 figures"},{"id":"http://arxiv.org/abs/2407.17480v2","updated":"2024-08-12T01:57:37Z","published":"2024-07-02T06:08:30Z","title":"Universal Approximation Theory: The basic theory for deep learning-based\n computer vision models","summary":" Computer vision (CV) is one of the most crucial fields in artificial\nintelligence. In recent years, a variety of deep learning models based on\nconvolutional neural networks (CNNs) and Transformers have been designed to\ntackle diverse problems in CV. These algorithms have found practical\napplications in areas such as robotics and facial recognition. Despite the\nincreasing power of current CV models, several fundamental questions remain\nunresolved: Why do CNNs require deep layers? What ensures the generalization\nability of CNNs? Why do residual-based networks outperform fully convolutional\nnetworks like VGG? What is the fundamental difference between residual-based\nCNNs and Transformer-based networks? Why can CNNs utilize LoRA and pruning\ntechniques? The root cause of these questions lies in the lack of a robust\ntheoretical foundation for deep learning models in CV. To address these\ncritical issues and techniques, we employ the Universal Approximation Theorem\n(UAT) to provide a theoretical basis for convolution- and Transformer-based\nmodels in CV. By doing so, we aim to elucidate these questions from a\ntheoretical perspective.\n","authors":["Wei Wang","Qing Li"],"pdf_url":"https://arxiv.org/pdf/2407.17480v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.05889v1","updated":"2024-08-12T01:49:13Z","published":"2024-08-12T01:49:13Z","title":"Enhancing 3D Transformer Segmentation Model for Medical Image with\n Token-level Representation Learning","summary":" In the field of medical images, although various works find that the Swin Transformer\nhas promising effectiveness on pixelwise dense prediction, whether pre-training\nthese models without using an extra dataset can further boost the performance for\nthe downstream semantic segmentation remains unexplored. Applications of\nprevious representation learning methods are hindered by the limited number of\n3D volumes and high computational cost. In addition, most pretext tasks\ndesigned specifically for Transformers are not applicable to the hierarchical\nstructure of the Swin Transformer. 
Thus, this work proposes a token-level\nrepresentation learning loss that maximizes agreement between token embeddings\nfrom different augmented views individually instead of volume-level global\nfeatures. Moreover, we identify a potential representation collapse exclusively\ncaused by this new loss. To prevent collapse, we invent a simple\n"rotate-and-restore" mechanism, which rotates and flips one augmented view of\nthe input volume, and later restores the order of tokens in the feature maps. We\nalso modify the contrastive loss to address the discrimination between tokens\nat the same position but from different volumes. We test our pre-training\nscheme on two public medical segmentation datasets, and the results on the\ndownstream segmentation task show greater improvement from our method than from other\nstate-of-the-art pre-training methods.\n","authors":["Xinrong Hu","Dewen Zeng","Yawen Wu","Xueyang Li","Yiyu Shi"],"pdf_url":"https://arxiv.org/pdf/2408.05889v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.10885v3","updated":"2024-08-12T01:24:33Z","published":"2024-05-17T16:22:52Z","title":"FA-Depth: Toward Fast and Accurate Self-supervised Monocular Depth\n Estimation","summary":" Most existing methods often rely on complex models to predict scene depth\nwith high accuracy, resulting in slow inference that is not conducive to\ndeployment. To better balance precision and speed, we first designed SmallDepth\nbased on sparsity. Second, to enhance the feature representation ability of\nSmallDepth during training under the condition of equal complexity during\ninference, we propose an equivalent transformation module (ETM). Third, to\nimprove the ability of each layer in the case of a fixed SmallDepth to perceive\ndifferent context information and improve the robustness of SmallDepth to the\nleft-right direction and illumination changes, we propose pyramid loss. Fourth,\nto further improve the accuracy of SmallDepth, we utilized the proposed\nfunction approximation loss (APX) to transfer knowledge from the pretrained\nHQDecv2, obtained by optimizing the previous HQDec to address grid artifacts in\nsome regions, to SmallDepth. Extensive experiments demonstrate that each\nproposed component improves the precision of SmallDepth without changing the\ncomplexity of SmallDepth during inference, and the developed approach achieves\nstate-of-the-art results on KITTI at an inference speed of more than 500 frames\nper second and with approximately 2M parameters. The code and models will be\npublicly available at https://github.com/fwucas/FA-Depth.\n","authors":["Fei Wang","Jun Cheng"],"pdf_url":"https://arxiv.org/pdf/2405.10885v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.10034v2","updated":"2024-08-12T00:39:01Z","published":"2024-04-15T17:25:21Z","title":"A Realistic Protocol for Evaluation of Weakly Supervised Object\n Localization","summary":" Weakly Supervised Object Localization (WSOL) allows training deep learning\nmodels for classification and localization (LOC) using only global class-level\nlabels. The absence of bounding box (bbox) supervision during training raises\nchallenges in the literature for hyper-parameter tuning, model selection, and\nevaluation. WSOL methods rely on a validation set with bbox annotations for\nmodel selection, and a test set with bbox annotations for threshold estimation\nfor producing bboxes from localization maps. This approach, however, is not\naligned with the WSOL setting as these annotations are typically unavailable in\nreal-world scenarios. 
Our initial empirical analysis shows a significant\ndecline in LOC performance when model selection and threshold estimation rely\nsolely on class labels and the image itself, respectively, compared to using\nmanual bbox annotations. This highlights the importance of incorporating bbox\nlabels for optimal model performance. In this paper, a new WSOL evaluation\nprotocol is proposed that provides LOC information without the need for manual\nbbox annotations. In particular, we generated noisy pseudo-boxes from a\npretrained off-the-shelf region proposal method such as Selective Search, CLIP,\nand RPN for model selection. These bboxes are also employed to estimate the\nthreshold from LOC maps, circumventing the need for test-set bbox annotations.\nOur experiments with several WSOL methods on ILSVRC and CUB datasets show that\nusing the proposed pseudo-bboxes for validation facilitates the model selection\nand threshold estimation, with LOC performance comparable to those selected\nusing GT bboxes on the validation set and threshold estimation on the test set.\nIt also outperforms models selected using class-level labels, and then\ndynamically thresholded based solely on LOC maps.\n","authors":["Shakeeb Murtaza","Soufiane Belharbi","Marco Pedersoli","Eric Granger"],"pdf_url":"https://arxiv.org/pdf/2404.10034v2.pdf","comment":"13 pages, 5 figures"},{"id":"http://arxiv.org/abs/2408.00963v2","updated":"2024-08-12T23:00:47Z","published":"2024-08-02T00:35:18Z","title":"MIS-ME: A Multi-modal Framework for Soil Moisture Estimation","summary":" Soil moisture estimation is an important task to enable precision agriculture\nin creating optimal plans for irrigation, fertilization, and harvest. It is\ncommon to utilize statistical and machine learning models to estimate soil\nmoisture from traditional data sources such as weather forecasts, soil\nproperties, and crop properties. However, there is a growing interest in\nutilizing aerial and geospatial imagery to estimate soil moisture. Although\nthese images capture high-resolution crop details, they are expensive to curate\nand challenging to interpret. Imagine, an AI-enhanced software tool that\npredicts soil moisture using visual cues captured by smartphones and\nstatistical data given by weather forecasts. This work is a first step towards\nthat goal of developing a multi-modal approach for soil moisture estimation. In\nparticular, we curate a dataset consisting of real-world images taken from\nground stations and their corresponding weather data. We also propose MIS-ME -\nMeteorological & Image based Soil Moisture Estimator, a multi-modal framework\nfor soil moisture estimation. 
Our extensive analysis shows that MIS-ME achieves\na MAPE of 10.14%, outperforming traditional unimodal approaches with a\nreduction of 3.25% in MAPE for meteorological data and 2.15% in MAPE for image\ndata, highlighting the effectiveness of tailored multi-modal approaches.\n","authors":["Mohammed Rakib","Adil Aman Mohammed","Cole Diggins","Sumit Sharma","Jeff Michael Sadler","Tyson Ochsner","Arun Bagavathi"],"pdf_url":"https://arxiv.org/pdf/2408.00963v2.pdf","comment":"Accepted by DSAA2024"},{"id":"http://arxiv.org/abs/2408.06507v1","updated":"2024-08-12T21:47:15Z","published":"2024-08-12T21:47:15Z","title":"Benchmarking tree species classification from proximally-sensed laser\n scanning data: introducing the FOR-species20K dataset","summary":" Proximally-sensed laser scanning offers significant potential for automated\nforest data capture, but challenges remain in automatically identifying tree\nspecies without additional ground data. Deep learning (DL) shows promise for\nautomation, yet progress is slowed by the lack of large, diverse, openly\navailable labeled datasets of single tree point clouds. This has impacted the\nrobustness of DL models and the ability to establish best practices for species\nclassification.\n To overcome these challenges, the FOR-species20K benchmark dataset was\ncreated, comprising over 20,000 tree point clouds from 33 species, captured\nusing terrestrial (TLS), mobile (MLS), and drone laser scanning (ULS) across\nvarious European forests, with some data from other regions. This dataset\nenables the benchmarking of DL models for tree species classification,\nincluding both point cloud-based (PointNet++, MinkNet, MLP-Mixer, DGCNNs) and\nmulti-view image-based methods (SimpleView, DetailView, YOLOv5).\n 2D image-based models generally performed better (average OA = 0.77) than 3D\npoint cloud-based models (average OA = 0.72), with consistent results across\ndifferent scanning platforms and sensors. The top model, DetailView, was\nparticularly robust, handling data imbalances well and generalizing effectively\nacross tree sizes.\n The FOR-species20K dataset, available at https://zenodo.org/records/13255198,\nis a key resource for developing and benchmarking DL models for tree species\nclassification using laser scanning data, providing a foundation for future\nadvancements in the field.\n","authors":["Stefano Puliti","Emily R. Lines","Jana Müllerová","Julian Frey","Zoe Schindler","Adrian Straker","Matthew J. Allen","Lukas Winiwarter","Nataliia Rehush","Hristina Hristova","Brent Murray","Kim Calders","Louise Terryn","Nicholas Coops","Bernhard Höfle","Samuli Junttila","Martin Krůček","Grzegorz Krok","Kamil Král","Shaun R. Levick","Linda Luck","Azim Missarov","Martin Mokroš","Harry J. F. Owen","Krzysztof Stereńczak","Timo P. Pitkänen","Nicola Puletti","Ninni Saarinen","Chris Hopkinson","Chiara Torresan","Enrico Tomelleri","Hannah Weiser","Rasmus Astrup"],"pdf_url":"https://arxiv.org/pdf/2408.06507v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.10395v2","updated":"2024-08-12T21:36:10Z","published":"2024-06-14T19:49:45Z","title":"BrainFounder: Towards Brain Foundation Models for Neuroimage Analysis","summary":" The burgeoning field of brain health research increasingly leverages\nartificial intelligence (AI) to interpret and analyze neurological data. 
This\nstudy introduces a novel approach towards the creation of medical foundation\nmodels by integrating a large-scale multi-modal magnetic resonance imaging\n(MRI) dataset derived from 41,400 participants. Our method involves\na novel two-stage pretraining approach using vision transformers. The first\nstage is dedicated to encoding anatomical structures in generally healthy\nbrains, identifying key features such as shapes and sizes of different brain\nregions. The second stage concentrates on spatial information, encompassing\naspects like location and the relative positioning of brain structures. We\nrigorously evaluate our model, BrainFounder, using the Brain Tumor Segmentation\n(BraTS) challenge and Anatomical Tracings of Lesions After Stroke v2.0 (ATLAS\nv2.0) datasets. BrainFounder demonstrates a significant performance gain,\nsurpassing the achievements of the previous winning solutions using fully\nsupervised learning. Our findings underscore the impact of scaling up both the\ncomplexity of the model and the volume of unlabeled training data derived from\ngenerally healthy brains, which enhances the accuracy and predictive\ncapabilities of the model in complex neuroimaging tasks with MRI. The\nimplications of this research provide transformative insights and practical\napplications in healthcare and make substantial steps towards the creation of\nfoundation models for Medical AI. Our pretrained models and training code can\nbe found at https://github.com/lab-smile/GatorBrain.\n","authors":["Joseph Cox","Peng Liu","Skylar E. Stolte","Yunchao Yang","Kang Liu","Kyle B. See","Huiwen Ju","Ruogu Fang"],"pdf_url":"https://arxiv.org/pdf/2406.10395v2.pdf","comment":"19 pages, 5 figures, to be published in Medical Image Analysis"},{"id":"http://arxiv.org/abs/2408.06502v1","updated":"2024-08-12T21:35:59Z","published":"2024-08-12T21:35:59Z","title":"Prompt Recovery for Image Generation Models: A Comparative Study of\n Discrete Optimizers","summary":" Recovering natural language prompts for image generation models, based solely\non the generated images, is a difficult discrete optimization problem. In this\nwork, we present the first head-to-head comparison of recent discrete\noptimization techniques for the problem of prompt inversion. We evaluate Greedy\nCoordinate Gradients (GCG), PEZ, Random Search, AutoDAN and BLIP2's image\ncaptioner across various evaluation metrics related to the quality of inverted\nprompts and the quality of the images generated by the inverted prompts. We\nfind that focusing on the CLIP similarity between the inverted prompts and the\nground truth image acts as a poor proxy for the similarity between the ground truth\nimage and the image generated by the inverted prompts. While the discrete\noptimizers effectively minimize their objectives, simply using responses from a\nwell-trained captioner often leads to generated images that more closely\nresemble those produced by the original prompts.\n","authors":["Joshua Nathaniel Williams","Avi Schwarzschild","J. 
Zico Kolter"],"pdf_url":"https://arxiv.org/pdf/2408.06502v1.pdf","comment":"9 Pages, 4 Figures"},{"id":"http://arxiv.org/abs/2408.06494v1","updated":"2024-08-12T21:04:16Z","published":"2024-08-12T21:04:16Z","title":"What Color Scheme is More Effective in Assisting Readers to Locate\n Information in a Color-Coded Article?","summary":" Color coding, a technique assigning specific colors to cluster information\ntypes, has proven advantages in aiding human cognitive activities, especially\nreading and comprehension. The rise of Large Language Models (LLMs) has\nstreamlined document coding, enabling simple automatic text labeling with\nvarious schemes. This has the potential to make color-coding more accessible\nand benefit more users. However, the impact of color choice on information\nseeking is understudied. We conducted a user study assessing various color\nschemes' effectiveness in LLM-coded text documents, standardizing contrast\nratios to approximately 5.55:1 across schemes. Participants performed timed\ninformation-seeking tasks in color-coded scholarly abstracts. Results showed\nnon-analogous and yellow-inclusive color schemes improved performance, with the\nlatter also being more preferred by participants. These findings can inform\nbetter color scheme choices for text annotation. As LLMs advance document\ncoding, we advocate for more research focusing on the \"color\" aspect of\ncolor-coding techniques.\n","authors":["Ho Yin Ng","Zeyu He","Ting-Hao 'Kenneth' Huang"],"pdf_url":"https://arxiv.org/pdf/2408.06494v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.04212v2","updated":"2024-08-12T20:34:05Z","published":"2024-08-08T04:34:29Z","title":"Is SAM 2 Better than SAM in Medical Image Segmentation?","summary":" The Segment Anything Model (SAM) has demonstrated impressive performance in\nzero-shot promptable segmentation on natural images. The recently released\nSegment Anything Model 2 (SAM 2) claims to outperform SAM on images and extends\nthe model's capabilities to video segmentation. Evaluating the performance of\nthis new model in medical image segmentation, specifically in a zero-shot\npromptable manner, is crucial. In this work, we conducted extensive studies\nusing multiple datasets from various imaging modalities to compare the\nperformance of SAM and SAM 2. We employed two point-prompt strategies: (i)\nmultiple positive prompts where one prompt is placed near the centroid of the\ntarget structure, while the remaining prompts are randomly placed within the\nstructure, and (ii) combined positive and negative prompts where one positive\nprompt is placed near the centroid of the target structure, and two negative\nprompts are positioned outside the structure, maximizing the distance from the\npositive prompt and from each other. The evaluation encompassed 24 unique\norgan-modality combinations, including abdominal structures, cardiac\nstructures, fetal head images, skin lesions and polyp images across 11 publicly\navailable MRI, CT, ultrasound, dermoscopy, and endoscopy datasets. Preliminary\nresults based on 2D images indicate that while SAM 2 may perform slightly\nbetter in a few cases, it does not generally surpass SAM for medical image\nsegmentation. Notably, SAM 2 performs worse than SAM in lower contrast imaging\nmodalities, such as CT and ultrasound. However, for MRI images, SAM 2 performs\non par with or better than SAM. 
Like SAM, SAM 2 also suffers from\nover-segmentation issues, particularly when the boundaries of the target organ\nare fuzzy.\n","authors":["Sourya Sengupta","Satrajit Chakrabarty","Ravi Soni"],"pdf_url":"https://arxiv.org/pdf/2408.04212v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.06467v1","updated":"2024-08-12T19:42:09Z","published":"2024-08-12T19:42:09Z","title":"Generalization Enhancement Strategies to Enable Cross-year Cropland\n Mapping with Convolutional Neural Networks Trained Using Historical Samples","summary":" The accuracy of mapping agricultural fields across large areas is steadily\nimproving with high-resolution satellite imagery and deep learning (DL) models,\neven in regions where fields are small and geometrically irregular. However,\ndeveloping effective DL models often requires large, expensive label datasets,\ntypically available only for specific years or locations. This limits the\nability to create annual maps essential for agricultural monitoring, as domain\nshifts occur between years and regions due to changes in farming practices and\nenvironmental conditions. The challenge is to design a model flexible enough to\naccount for these shifts without needing yearly labels. While domain adaptation\ntechniques or semi-supervised training are common solutions, we explored\nenhancing the model's generalization power. Our results indicate that a\nholistic approach is essential, combining methods to improve generalization.\nSpecifically, using an area-based loss function, such as Tversky-focal loss\n(TFL), significantly improved predictions across multiple years. The use of\ndifferent augmentation techniques helped to encode different types of\ninvariance, particularly photometric augmentations encoded invariance to\nbrightness changes, though they increased false positives. The combination of\nphotometric augmentation, TFL loss, and MC-dropout produced the best results,\nalthough dropout alone led to more false negatives in subsequent year\npredictions. Additionally, the choice of input normalization had a significant\nimpact, with the best results obtained when statistics were calculated either\nlocally or across the entire dataset over all bands (lab and gab). We developed\na workflow that enabled a U-Net model to generate effective multi-year crop\nmaps over large areas. Our code, available at:\nhttps://github.com/agroimpacts/cnn-generalization-enhancement, will be\nregularly updated with improvements.\n","authors":["Sam Khallaghi","Rahebe Abedi","Hanan Abou Ali","Mary Dziedzorm Asipunu","Ismail Alatise","Nguyen Ha","Boka Luo","Cat Mai","Lei Song","Amos Wussah","Sitian Xiong","Qi Zhang","Lyndon D. Estes"],"pdf_url":"https://arxiv.org/pdf/2408.06467v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.07546v2","updated":"2024-08-12T19:33:52Z","published":"2024-06-11T17:59:48Z","title":"Commonsense-T2I Challenge: Can Text-to-Image Generation Models\n Understand Commonsense?","summary":" We present a novel task and benchmark for evaluating the ability of\ntext-to-image(T2I) generation models to produce images that align with\ncommonsense in real life, which we call Commonsense-T2I. Given two adversarial\ntext prompts containing an identical set of action words with minor\ndifferences, such as \"a lightbulb without electricity\" v.s. \"a lightbulb with\nelectricity\", we evaluate whether T2I models can conduct visual-commonsense\nreasoning, e.g. produce images that fit \"the lightbulb is unlit\" vs. \"the\nlightbulb is lit\" correspondingly. 
Commonsense-T2I presents an adversarial\nchallenge, providing pairwise text prompts along with expected outputs. The\ndataset is carefully hand-curated by experts and annotated with fine-grained\nlabels, such as commonsense type and likelihood of the expected outputs, to\nassist analyzing model behavior. We benchmark a variety of state-of-the-art\n(sota) T2I models and surprisingly find that, there is still a large gap\nbetween image synthesis and real life photos--even the DALL-E 3 model could\nonly achieve 48.92% on Commonsense-T2I, and the stable diffusion XL model only\nachieves 24.92% accuracy. Our experiments show that GPT-enriched prompts cannot\nsolve this challenge, and we include a detailed analysis about possible reasons\nfor such deficiency. We aim for Commonsense-T2I to serve as a high-quality\nevaluation benchmark for T2I commonsense checking, fostering advancements in\nreal life image generation.\n","authors":["Xingyu Fu","Muyu He","Yujie Lu","William Yang Wang","Dan Roth"],"pdf_url":"https://arxiv.org/pdf/2406.07546v2.pdf","comment":"COLM 2024, Project Url: https://zeyofu.github.io/CommonsenseT2I/"},{"id":"http://arxiv.org/abs/2310.13356v4","updated":"2024-08-12T19:28:46Z","published":"2023-10-20T08:45:30Z","title":"Sync-NeRF: Generalizing Dynamic NeRFs to Unsynchronized Videos","summary":" Recent advancements in 4D scene reconstruction using neural radiance fields\n(NeRF) have demonstrated the ability to represent dynamic scenes from\nmulti-view videos. However, they fail to reconstruct the dynamic scenes and\nstruggle to fit even the training views in unsynchronized settings. It happens\nbecause they employ a single latent embedding for a frame while the multi-view\nimages at the same frame were actually captured at different moments. To\naddress this limitation, we introduce time offsets for individual\nunsynchronized videos and jointly optimize the offsets with NeRF. By design,\nour method is applicable for various baselines and improves them with large\nmargins. Furthermore, finding the offsets naturally works as synchronizing the\nvideos without manual effort. Experiments are conducted on the common Plenoptic\nVideo Dataset and a newly built Unsynchronized Dynamic Blender Dataset to\nverify the performance of our method. Project page:\nhttps://seoha-kim.github.io/sync-nerf\n","authors":["Seoha Kim","Jeongmin Bae","Youngsik Yun","Hahyun Lee","Gun Bang","Youngjung Uh"],"pdf_url":"https://arxiv.org/pdf/2310.13356v4.pdf","comment":"AAAI 2024. Project page: https://seoha-kim.github.io/sync-nerf"},{"id":"http://arxiv.org/abs/2408.06459v1","updated":"2024-08-12T19:19:23Z","published":"2024-08-12T19:19:23Z","title":"InfLocNet: Enhanced Lung Infection Localization and Disease Detection\n from Chest X-Ray Images Using Lightweight Deep Learning","summary":" In recent years, the integration of deep learning techniques into medical\nimaging has revolutionized the diagnosis and treatment of lung diseases,\nparticularly in the context of COVID-19 and pneumonia. This paper presents a\nnovel, lightweight deep learning based segmentation-classification network\ndesigned to enhance the detection and localization of lung infections using\nchest X-ray images. By leveraging the power of transfer learning with\npre-trained VGG-16 weights, our model achieves robust performance even with\nlimited training data. The architecture incorporates refined skip connections\nwithin the UNet++ framework, reducing semantic gaps and improving precision in\nsegmentation tasks. 
Additionally, a classification module is integrated at the\nend of the encoder block, enabling simultaneous classification and\nsegmentation. This dual functionality enhances the model's versatility,\nproviding comprehensive diagnostic insights while optimizing computational\nefficiency. Experimental results demonstrate that our proposed lightweight\nnetwork outperforms existing methods in terms of accuracy and computational\nrequirements, making it a viable solution for real-time and resource\nconstrained medical imaging applications. Furthermore, the streamlined design\nfacilitates easier hyperparameter tuning and deployment on edge devices. This\nwork underscores the potential of advanced deep learning architectures in\nimproving clinical outcomes through precise and efficient medical image\nanalysis. Our model achieved remarkable results with an Intersection over Union\n(IoU) of 93.59% and a Dice Similarity Coefficient (DSC) of 97.61% in lung area\nsegmentation, and an IoU of 97.67% and a DSC of 87.61% for infection region\nlocalization. Additionally, it demonstrated high accuracy of 93.86% and\nsensitivity of 89.55% in detecting chest diseases, highlighting its efficacy\nand reliability.\n","authors":["Md. Asiful Islam Miah","Shourin Paul","Sunanda Das","M. M. A. Hashem"],"pdf_url":"https://arxiv.org/pdf/2408.06459v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.06457v1","updated":"2024-08-12T19:17:57Z","published":"2024-08-12T19:17:57Z","title":"Advanced Vision Transformers and Open-Set Learning for Robust Mosquito\n Classification: A Novel Approach to Entomological Studies","summary":" Mosquito-related diseases pose a significant threat to global public health,\nnecessitating efficient and accurate mosquito classification for effective\nsurveillance and control. This work presents an innovative approach to mosquito\nclassification by leveraging state-of-the-art vision transformers and open-set\nlearning techniques. A novel framework has been introduced that integrates\nTransformer-based deep learning models with comprehensive data augmentation and\npreprocessing methods, enabling robust and precise identification of ten\nmosquito species. The Swin Transformer model achieves the best performance for\ntraditional closed-set learning with 99.80\\% accuracy and 0.998 F1 score. The\nlightweight MobileViT technique attains an almost similar accuracy of 98.90\\%\nwith significantly reduced parameters and model complexities. Next, the applied\ndeep learning models' adaptability and generalizability in a static environment\nhave been enhanced by using new classes of data samples during the inference\nstage that have not been included in the training set. The proposed framework's\nability to handle unseen classes like insects similar to mosquitoes, even\nhumans, through open-set learning further enhances its practical applicability\nemploying the OpenMax technique and Weibull distribution. The traditional CNN\nmodel, Xception, outperforms the latest transformer with higher accuracy and F1\nscore for open-set learning. The study's findings highlight the transformative\npotential of advanced deep-learning architectures in entomology, providing a\nstrong groundwork for future research and development in mosquito surveillance\nand vector control. 
The implications of this work extend beyond mosquito\nclassification, offering valuable insights for broader ecological and\nenvironmental monitoring applications.\n","authors":["Ahmed Akib Jawad Karim","Muhammad Zawad Mahmud","Riasat Khan"],"pdf_url":"https://arxiv.org/pdf/2408.06457v1.pdf","comment":"23 pages, 15 figures"},{"id":"http://arxiv.org/abs/2408.06447v1","updated":"2024-08-12T18:53:03Z","published":"2024-08-12T18:53:03Z","title":"S-SAM: SVD-based Fine-Tuning of Segment Anything Model for Medical Image\n Segmentation","summary":" Medical image segmentation has been traditionally approached by training or\nfine-tuning the entire model to cater to any new modality or dataset. However,\nthis approach often requires tuning a large number of parameters during\ntraining. With the introduction of the Segment Anything Model (SAM) for\nprompted segmentation of natural images, many efforts have been made towards\nadapting it efficiently for medical imaging, thus reducing the training time\nand resources. However, these methods still require expert annotations for\nevery image in the form of point prompts or bounding box prompts during\ntraining and inference, making it tedious to employ them in practice. In this\npaper, we propose an adaptation technique, called S-SAM, that only trains\nparameters equal to 0.4% of SAM's parameters and at the same time uses simply\nthe label names as prompts for producing precise masks. This not only makes\ntuning SAM more efficient than the existing adaptation methods but also removes\nthe burden of providing expert prompts. We call this modified version S-SAM and\nevaluate it on five different modalities including endoscopic images, x-ray,\nultrasound, CT, and histology images. Our experiments show that S-SAM\noutperforms state-of-the-art methods as well as existing SAM adaptation methods\nwhile tuning a significantly less number of parameters. We release the code for\nS-SAM at https://github.com/JayParanjape/SVDSAM.\n","authors":["Jay N. Paranjape","Shameema Sikder","S. Swaroop Vedula","Vishal M. Patel"],"pdf_url":"https://arxiv.org/pdf/2408.06447v1.pdf","comment":"Accepted in MICCAI 2024"},{"id":"http://arxiv.org/abs/2408.06437v1","updated":"2024-08-12T18:29:48Z","published":"2024-08-12T18:29:48Z","title":"HAT: History-Augmented Anchor Transformer for Online Temporal Action\n Localization","summary":" Online video understanding often relies on individual frames, leading to\nframe-by-frame predictions. Recent advancements such as Online Temporal Action\nLocalization (OnTAL), extend this approach to instance-level predictions.\nHowever, existing methods mainly focus on short-term context, neglecting\nhistorical information. To address this, we introduce the History-Augmented\nAnchor Transformer (HAT) Framework for OnTAL. By integrating historical\ncontext, our framework enhances the synergy between long-term and short-term\ninformation, improving the quality of anchor features crucial for\nclassification and localization. We evaluate our model on both procedural\negocentric (PREGO) datasets (EGTEA and EPIC) and standard non-PREGO OnTAL\ndatasets (THUMOS and MUSES). Results show that our model outperforms\nstate-of-the-art approaches significantly on PREGO datasets and achieves\ncomparable or slightly superior performance on non-PREGO datasets, underscoring\nthe importance of leveraging long-term history, especially in procedural and\negocentric action scenarios. 
Code is available at:\nhttps://github.com/sakibreza/ECCV24-HAT/\n","authors":["Sakib Reza","Yuexi Zhang","Mohsen Moghaddam","Octavia Camps"],"pdf_url":"https://arxiv.org/pdf/2408.06437v1.pdf","comment":"Accepted to ECCV 2024"},{"id":"http://arxiv.org/abs/2404.18699v2","updated":"2024-08-12T18:12:44Z","published":"2024-04-29T13:47:59Z","title":"Convergence Properties of Score-Based Models for Linear Inverse Problems\n Using Graduated Optimisation","summary":" The incorporation of generative models as regularisers within variational\nformulations for inverse problems has proven effective across numerous image\nreconstruction tasks. However, the resulting optimisation problem is often\nnon-convex and challenging to solve. In this work, we show that score-based\ngenerative models (SGMs) can be used in a graduated optimisation framework to\nsolve inverse problems. We show that the resulting graduated non-convexity flow\nconverge to stationary points of the original problem and provide a numerical\nconvergence analysis of a 2D toy example. We further provide experiments on\ncomputed tomography image reconstruction, where we show that this framework is\nable to recover high-quality images, independent of the initial value. The\nexperiments highlight the potential of using SGMs in graduated optimisation\nframeworks. The source code is publicly available on GitHub.\n","authors":["Pascal Fernsel","Željko Kereta","Alexander Denker"],"pdf_url":"https://arxiv.org/pdf/2404.18699v2.pdf","comment":"8 pages"},{"id":"http://arxiv.org/abs/2408.06429v1","updated":"2024-08-12T18:10:51Z","published":"2024-08-12T18:10:51Z","title":"Wavelet based inpainting detection","summary":" With the advancement in image editing tools, manipulating digital images has\nbecome alarmingly easy. Inpainting, which is used to remove objects or fill in\nparts of an image, serves as a powerful tool for both image restoration and\nforgery. This paper introduces a novel approach for detecting image inpainting\nforgeries by combining DT-CWT with Hierarchical Feature segmentation and with\nnoise inconsistency analysis. The DT-CWT offers several advantages for this\ntask, including inherent shift-invariance, which makes it robust to minor\nmanipulations during the inpainting process, and directional selectivity, which\nhelps capture subtle artifacts introduced by inpainting in specific frequency\nbands and orientations. By first applying color image segmentation and then\nanalyzing for each segment, noise inconsistency obtained via DT-CW we can\nidentify patterns indicative of inpainting forgeries. The proposed method is\nevaluated on a benchmark dataset created for this purpose and is compared with\nexisting forgery detection techniques. Our approach demonstrates superior\nresults compared with SOTA in detecting inpainted images.\n","authors":["Barglazan Adrian-Alin","Brad Remus Ovidiu"],"pdf_url":"https://arxiv.org/pdf/2408.06429v1.pdf","comment":null}],"Information Retrieval":[{"id":"http://arxiv.org/abs/2408.02854v3","updated":"2024-08-12T16:44:05Z","published":"2024-08-05T22:34:28Z","title":"Wiping out the limitations of Large Language Models -- A Taxonomy for\n Retrieval Augmented Generation","summary":" Current research on RAGs is distributed across various disciplines, and since\nthe technology is evolving very quickly, its unit of analysis is mostly on\ntechnological innovations, rather than applications in business contexts. 
Thus,\nin this research, we aim to create a taxonomy to conceptualize a comprehensive\noverview of the constituting characteristics that define RAG applications,\nfacilitating the adoption of this technology in the IS community. To the best\nof our knowledge, no RAG application taxonomies have been developed so far. We\ndescribe our methodology for developing the taxonomy, which includes the\ncriteria for selecting papers, an explanation of our rationale for employing a\nLarge Language Model (LLM)-supported approach to extract and identify initial\ncharacteristics, and a concise overview of our systematic process for\nconceptualizing the taxonomy. Our systematic taxonomy development process\nincludes four iterative phases designed to refine and enhance our understanding\nand presentation of RAG's core dimensions. We have developed a total of five\nmeta-dimensions and sixteen dimensions to comprehensively capture the concept\nof Retrieval-Augmented Generation (RAG) applications. When discussing our\nfindings, we also detail the specific research areas and pose key research\nquestions to guide future information system researchers as they explore the\nemerging topics of RAG systems.\n","authors":["Mahei Manhai Li","Irina Nikishina","Özge Sevgili","Martin Semmann"],"pdf_url":"https://arxiv.org/pdf/2408.02854v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.06051v1","updated":"2024-08-12T10:55:42Z","published":"2024-08-12T10:55:42Z","title":"Perceptual Similarity for Measuring Decision-Making Style and Policy\n Diversity in Games","summary":" Defining and measuring decision-making styles, also known as playstyles, is\ncrucial in gaming, where these styles reflect a broad spectrum of individuality\nand diversity. However, finding a universally applicable measure for these\nstyles poses a challenge. Building on Playstyle Distance, the first\nunsupervised metric to measure playstyle similarity based on game screens and\nraw actions, we introduce three enhancements to increase accuracy: multiscale\nanalysis with varied state granularity, a perceptual kernel rooted in\npsychology, and the utilization of the intersection-over-union method for\nefficient evaluation. These innovations not only advance measurement precision\nbut also offer insights into human cognition of similarity. Across two racing\ngames and seven Atari games, our techniques significantly improve the precision\nof zero-shot playstyle classification, achieving an accuracy exceeding 90\npercent with fewer than 512 observation-action pairs, which is less than half\nan episode of these games. Furthermore, our experiments with 2048 and Go\ndemonstrate the potential of discrete playstyle measures in puzzle and board\ngames. We also develop an algorithm for assessing decision-making diversity\nusing these measures. 
Our findings improve the measurement of end-to-end game\nanalysis and the evolution of artificial intelligence for diverse playstyles.\n","authors":["Chiu-Chou Lin","Wei-Chen Chiu","I-Chen Wu"],"pdf_url":"https://arxiv.org/pdf/2408.06051v1.pdf","comment":"TMLR 08/2024 https://openreview.net/forum?id=30C9AWBW49"},{"id":"http://arxiv.org/abs/2408.06034v1","updated":"2024-08-12T09:53:58Z","published":"2024-08-12T09:53:58Z","title":"The landscape of ontologies in materials science and engineering: A\n survey and evaluation","summary":" Ontologies are widely used in materials science to describe experiments,\nprocesses, material properties, and experimental and computational workflows.\nNumerous online platforms are available for accessing and sharing ontologies in\nMaterials Science and Engineering (MSE). Additionally, several surveys of these\nontologies have been conducted. However, these studies often lack comprehensive\nanalysis and quality control metrics. This paper provides an overview of\nontologies used in Materials Science and Engineering to assist domain experts\nin selecting the most suitable ontology for a given purpose. Sixty selected\nontologies are analyzed and compared based on the requirements outlined in this\npaper. Statistical data on ontology reuse and key metrics are also presented.\nThe evaluation results provide valuable insights into the strengths and\nweaknesses of the investigated MSE ontologies. This enables domain experts to\nselect suitable ontologies and to incorporate relevant terms from existing\nresources.\n","authors":["Ebrahim Norouzi","Jörg Waitelonis","Harald Sack"],"pdf_url":"https://arxiv.org/pdf/2408.06034v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.15740v2","updated":"2024-08-12T08:21:32Z","published":"2024-03-23T06:36:32Z","title":"Protecting Copyrighted Material with Unique Identifiers in Large\n Language Model Training","summary":" A major public concern regarding the training of large language models (LLMs)\nis whether they abusing copyrighted online text. Previous membership inference\nmethods may be misled by similar examples in vast amounts of training data.\nAdditionally, these methods are often too complex for general users to\nunderstand and use, making them centralized, lacking transparency, and\ntrustworthiness. To address these issues, we propose an alternative\n\\textit{insert-and-detection} methodology, advocating that web users and\ncontent platforms employ \\textbf{\\textit{unique identifiers}} for reliable and\nindependent membership inference. Users and platforms can create their own\nidentifiers, embed them in copyrighted text, and independently detect them in\nfuture LLMs. As an initial demonstration, we introduce \\textit{ghost\nsentences}, a primitive form of unique identifiers, consisting primarily of\npassphrases made up of random words. By embedding one ghost sentences in a few\ncopyrighted texts, users can detect its membership using a perplexity test and\na \\textit{user-friendly} last-$k$ words test. The perplexity test is based on\nthe fact that LLMs trained on natural language should exhibit high perplexity\nwhen encountering unnatural passphrases. As the repetition increases, users can\nleverage the verbatim memorization ability of LLMs to perform a last-$k$ words\ntest by chatting with LLMs without writing any code. Both tests offer rigorous\nstatistical guarantees for membership inference. 
For LLaMA-13B, a perplexity\ntest on 30 ghost sentences with an average of 7 repetitions in 148K examples\nyields a 0.891 ROC AUC. For the last-$k$ words test with OpenLLaMA-3B, 11 out\nof 16 users, with an average of 24 examples each, successfully identify their\ndata from 1.8M examples.\n","authors":["Shuai Zhao","Linchao Zhu","Ruijie Quan","Yi Yang"],"pdf_url":"https://arxiv.org/pdf/2403.15740v2.pdf","comment":"Preprint, work in progress"},{"id":"http://arxiv.org/abs/2408.05948v1","updated":"2024-08-12T06:48:43Z","published":"2024-08-12T06:48:43Z","title":"ConvKGYarn: Spinning Configurable and Scalable Conversational Knowledge\n Graph QA datasets with Large Language Models","summary":" The rapid advancement of Large Language Models (LLMs) and conversational\nassistants necessitates dynamic, scalable, and configurable conversational\ndatasets for training and evaluation. These datasets must accommodate diverse\nuser interaction modes, including text and voice, each presenting unique\nmodeling challenges. Knowledge Graphs (KGs), with their structured and evolving\nnature, offer an ideal foundation for current and precise knowledge. Although\nhuman-curated KG-based conversational datasets exist, they struggle to keep\npace with the rapidly changing user information needs. We present ConvKGYarn, a\nscalable method for generating up-to-date and configurable conversational KGQA\ndatasets. Qualitative psychometric analyses confirm our method can generate\nhigh-quality datasets rivaling a popular conversational KGQA dataset while\noffering it at scale and covering a wide range of human-interaction\nconfigurations. We showcase its utility by testing LLMs on diverse\nconversations - exploring model behavior on conversational KGQA sets with\ndifferent configurations grounded in the same KG fact set. Our results\nhighlight the ability of ConvKGYarn to improve KGQA foundations and evaluate\nparametric knowledge of LLMs, thus offering a robust solution to the constantly\nevolving landscape of conversational assistants.\n","authors":["Ronak Pradeep","Daniel Lee","Ali Mousavi","Jeff Pound","Yisi Sang","Jimmy Lin","Ihab Ilyas","Saloni Potdar","Mostafa Arefiyan","Yunyao Li"],"pdf_url":"https://arxiv.org/pdf/2408.05948v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.05933v1","updated":"2024-08-12T06:16:37Z","published":"2024-08-12T06:16:37Z","title":"Optimizing RAG Techniques for Automotive Industry PDF Chatbots: A Case\n Study with Locally Deployed Ollama Models","summary":" With the growing demand for offline PDF chatbots in automotive industrial\nproduction environments, optimizing the deployment of large language models\n(LLMs) in local, low-performance settings has become increasingly important.\nThis study focuses on enhancing Retrieval-Augmented Generation (RAG) techniques\nfor processing complex automotive industry documents using locally deployed\nOllama models. Based on the Langchain framework, we propose a multi-dimensional\noptimization approach for Ollama's local RAG implementation. Our method\naddresses key challenges in automotive document processing, including\nmulti-column layouts and technical specifications. We introduce improvements in\nPDF processing, retrieval mechanisms, and context compression, tailored to the\nunique characteristics of automotive industry documents. Additionally, we\ndesign custom classes supporting embedding pipelines and an agent supporting\nself-RAG based on LangGraph best practices. 
To evaluate our approach, we\nconstructed a proprietary dataset comprising typical automotive industry\ndocuments, including technical reports and corporate regulations. We compared\nour optimized RAG model and self-RAG agent against a naive RAG baseline across\nthree datasets: our automotive industry dataset, QReCC, and CoQA. Results\ndemonstrate significant improvements in context precision, context recall,\nanswer relevancy, and faithfulness, with particularly notable performance on\nthe automotive industry dataset. Our optimization scheme provides an effective\nsolution for deploying local RAG systems in the automotive sector, addressing\nthe specific needs of PDF chatbots in industrial production environments. This\nresearch has important implications for advancing information processing and\nintelligent production in the automotive industry.\n","authors":["Fei Liu","Zejun Kang","Xing Han"],"pdf_url":"https://arxiv.org/pdf/2408.05933v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.05883v1","updated":"2024-08-12T00:51:21Z","published":"2024-08-12T00:51:21Z","title":"Low-Rank Approximation, Adaptation, and Other Tales","summary":" Low-rank approximation is a fundamental technique in modern data analysis,\nwidely utilized across various fields such as signal processing, machine\nlearning, and natural language processing. Despite its ubiquity, the mechanics\nof low-rank approximation and its application in adaptation can sometimes be\nobscure, leaving practitioners and researchers with questions about its true\ncapabilities and limitations. This paper seeks to clarify low-rank\napproximation and adaptation by offering a comprehensive guide that reveals\ntheir inner workings and explains their utility in a clear and accessible way.\nOur focus here is to develop a solid intuition for how low-rank approximation\nand adaptation operate, and why they are so effective. We begin with basic\nconcepts and gradually build up to the mathematical underpinnings, ensuring\nthat readers of all backgrounds can gain a deeper understanding of low-rank\napproximation and adaptation. We strive to strike a balance between informal\nexplanations and rigorous mathematics, ensuring that both newcomers and\nexperienced experts can benefit from this survey. Additionally, we introduce\nnew low-rank decomposition and adaptation algorithms that have not yet been\nexplored in the field, hoping that future researchers will investigate their\npotential applicability.\n","authors":["Jun Lu"],"pdf_url":"https://arxiv.org/pdf/2408.05883v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.13033v2","updated":"2024-08-12T23:36:58Z","published":"2024-02-20T14:18:43Z","title":"Enhancing Node Representations for Real-World Complex Networks with\n Topological Augmentation","summary":" Graph augmentation methods play a crucial role in improving the performance\nand enhancing generalisation capabilities in Graph Neural Networks (GNNs).\nExisting graph augmentation methods mainly perturb the graph structures, and\nare usually limited to pairwise node relations. These methods cannot fully\naddress the complexities of real-world large-scale networks, which often\ninvolve higher-order node relations beyond only being pairwise. Meanwhile,\nreal-world graph datasets are predominantly modelled as simple graphs, due to\nthe scarcity of data that can be used to form higher-order edges. 
Therefore,\nreconfiguring the higher-order edges as an integration into graph augmentation\nstrategies lights up a promising research path to address the aforementioned\nissues. In this paper, we present Topological Augmentation (TopoAug), a novel\ngraph augmentation method that builds a combinatorial complex from the original\ngraph by constructing virtual hyperedges directly from the raw data. TopoAug\nthen produces auxiliary node features by extracting information from the\ncombinatorial complex, which are used for enhancing GNN performances on\ndownstream tasks. We design three diverse virtual hyperedge construction\nstrategies to accompany the construction of combinatorial complexes: (1) via\ngraph statistics, (2) from multiple data perspectives, and (3) utilising\nmulti-modality. Furthermore, to facilitate TopoAug evaluation, we provide 23\nnovel real-world graph datasets across various domains including social media,\nbiology, and e-commerce. Our empirical study shows that TopoAug consistently\nand significantly outperforms GNN baselines and other graph augmentation\nmethods, across a variety of application contexts, which clearly indicates that\nit can effectively incorporate higher-order node relations into the graph\naugmentation for real-world complex networks.\n","authors":["Xiangyu Zhao","Zehui Li","Mingzhu Shen","Guy-Bart Stan","Pietro Liò","Yiren Zhao"],"pdf_url":"https://arxiv.org/pdf/2402.13033v2.pdf","comment":"In 27th European Conference on Artificial Intelligence (ECAI 2024).\n 13 pages, 2 figures, 13 tables"},{"id":"http://arxiv.org/abs/2408.06512v1","updated":"2024-08-12T22:02:39Z","published":"2024-08-12T22:02:39Z","title":"Learned Ranking Function: From Short-term Behavior Predictions to\n Long-term User Satisfaction","summary":" We present the Learned Ranking Function (LRF), a system that takes short-term\nuser-item behavior predictions as input and outputs a slate of recommendations\nthat directly optimizes for long-term user satisfaction. Most previous work is\nbased on optimizing the hyperparameters of a heuristic function. We propose to\nmodel the problem directly as a slate optimization problem with the objective\nof maximizing long-term user satisfaction. We also develop a novel constraint\noptimization algorithm that stabilizes objective trade-offs for multi-objective\noptimization. We evaluate our approach with live experiments and describe its\ndeployment on YouTube.\n","authors":["Yi Wu","Daryl Chang","Jennifer She","Zhe Zhao","Li Wei","Lukasz Heldt"],"pdf_url":"https://arxiv.org/pdf/2408.06512v1.pdf","comment":"RecSys 24"},{"id":"http://arxiv.org/abs/2408.06201v1","updated":"2024-08-12T14:50:04Z","published":"2024-08-12T14:50:04Z","title":"Investigating Characteristics of Media Recommendation Solicitation in\n r/ifyoulikeblank","summary":" Despite the existence of search-based recommender systems like Google,\nNetflix, and Spotify, online users sometimes may turn to crowdsourced\nrecommendations in places like the r/ifyoulikeblank subreddit. In this\nexploratory study, we probe why users go to r/ifyoulikeblank, how they look for\nrecommendation, and how the subreddit users respond to recommendation requests.\nTo answer, we collected sample posts from r/ifyoulikeblank and analyzed them\nusing a qualitative approach. Our analysis reveals that users come to this\nsubreddit for various reasons, such as exhausting popular search systems, not\nknowing what or how to search for an item, and thinking crowd have better\nknowledge than search systems. 
Examining users query and their description, we\nfound novel information users provide during recommendation seeking using\nr/ifyoulikeblank. For example, sometimes they ask for artifacts recommendation\nbased on the tools used to create them. Or, sometimes indicating a\nrecommendation seeker's time constraints can help better suit recommendations\nto their needs. Finally, recommendation responses and interactions revealed\npatterns of how requesters and responders refine queries and recommendations.\nOur work informs future intelligent recommender systems design.\n","authors":["Md Momen Bhuiyan","Donghan Hu","Andrew Jelson","Tanushree Mitra","Sang Won Lee"],"pdf_url":"https://arxiv.org/pdf/2408.06201v1.pdf","comment":"page 23"},{"id":"http://arxiv.org/abs/2408.05896v1","updated":"2024-08-12T02:18:42Z","published":"2024-08-12T02:18:42Z","title":"Scalable recommender system based on factor analysis","summary":" Recommender systems have become crucial in the modern digital landscape,\nwhere personalized content, products, and services are essential for enhancing\nuser experience. This paper explores statistical models for recommender\nsystems, focusing on crossed random effects models and factor analysis. We\nextend the crossed random effects model to include random slopes, enabling the\ncapture of varying covariate effects among users and items. Additionally, we\ninvestigate the use of factor analysis in recommender systems, particularly for\nsettings with incomplete data. The paper also discusses scalable solutions\nusing the Expectation Maximization (EM) and variational EM algorithms for\nparameter estimation, highlighting the application of these models to predict\nuser-item interactions effectively.\n","authors":["Disha Ghandwani","Trevor Hastie"],"pdf_url":"https://arxiv.org/pdf/2408.05896v1.pdf","comment":null}],"Machine Learning":[{"id":"http://arxiv.org/abs/2402.00798v4","updated":"2024-08-12T17:54:32Z","published":"2024-02-01T17:30:50Z","title":"Formal-LLM: Integrating Formal Language and Natural Language for\n Controllable LLM-based Agents","summary":" Recent advancements on Large Language Models (LLMs) enable AI Agents to\nautomatically generate and execute multi-step plans to solve complex tasks.\nHowever, since LLM's content generation process is hardly controllable, current\nLLM-based agents frequently generate invalid or non-executable plans, which\njeopardizes the performance of the generated plans and corrupts users' trust in\nLLM-based agents. In response, this paper proposes a novel \"Formal-LLM\"\nframework for LLM-based agents by integrating the expressiveness of natural\nlanguage and the precision of formal language. Specifically, the framework\nallows agent developers to express their requirements or constraints for the\nplanning process as an automaton. A stack-based LLM plan generation process is\nthen conducted under the supervision of the automaton to ensure that the\ngenerated plan satisfies the constraints, making the planning process\ncontrollable. We conduct experiments on both benchmark tasks and practical\nreal-life tasks, and our framework achieves over 50% overall performance\nincrease, which validates the feasibility and effectiveness of employing\nFormal-LLM to guide the plan generation of agents, preventing the agents from\ngenerating invalid and unsuccessful plans. Further, more controllable LLM-based\nagents can facilitate the broader utilization of LLM in application scenarios\nwhere high validity of planning is essential. 
The source code of this work is\navailable at https://github.com/agiresearch/Formal-LLM.\n","authors":["Zelong Li","Wenyue Hua","Hao Wang","He Zhu","Yongfeng Zhang"],"pdf_url":"https://arxiv.org/pdf/2402.00798v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.17012v2","updated":"2024-08-12T17:53:13Z","published":"2023-09-29T06:53:10Z","title":"Benchmarking Cognitive Biases in Large Language Models as Evaluators","summary":" Large Language Models (LLMs) have recently been shown to be effective as\nautomatic evaluators with simple prompting and in-context learning. In this\nwork, we assemble 15 LLMs of four different size ranges and evaluate their\noutput responses by preference ranking from the other LLMs as evaluators, such\nas System Star is better than System Square. We then evaluate the quality of\nranking outputs introducing the Cognitive Bias Benchmark for LLMs as Evaluators\n(CoBBLEr), a benchmark to measure six different cognitive biases in LLM\nevaluation outputs, such as the Egocentric bias where a model prefers to rank\nits own outputs highly in evaluation. We find that LLMs are biased text quality\nevaluators, exhibiting strong indications on our bias benchmark (average of 40%\nof comparisons across all models) within each of their evaluations that\nquestion their robustness as evaluators. Furthermore, we examine the\ncorrelation between human and machine preferences and calculate the average\nRank-Biased Overlap (RBO) score to be 49.6%, indicating that machine\npreferences are misaligned with humans. According to our findings, LLMs may\nstill be unable to be utilized for automatic annotation aligned with human\npreferences. Our project page is at: https://minnesotanlp.github.io/cobbler.\n","authors":["Ryan Koo","Minhwa Lee","Vipul Raheja","Jong Inn Park","Zae Myung Kim","Dongyeop Kang"],"pdf_url":"https://arxiv.org/pdf/2309.17012v2.pdf","comment":"Publishsed at 2024. 29 pages, 9 figures, 14 tables"},{"id":"http://arxiv.org/abs/2408.06335v1","updated":"2024-08-12T17:52:11Z","published":"2024-08-12T17:52:11Z","title":"LOLgorithm: Integrating Semantic,Syntactic and Contextual Elements for\n Humor Classification","summary":" This paper explores humor detection through a linguistic lens, prioritizing\nsyntactic, semantic, and contextual features over computational methods in\nNatural Language Processing. We categorize features into syntactic, semantic,\nand contextual dimensions, including lexicons, structural statistics, Word2Vec,\nWordNet, and phonetic style. Our proposed model, Colbert, utilizes BERT\nembeddings and parallel hidden layers to capture sentence congruity. By\ncombining syntactic, semantic, and contextual features, we train Colbert for\nhumor detection. Feature engineering examines essential syntactic and semantic\nfeatures alongside BERT embeddings. SHAP interpretations and decision trees\nidentify influential features, revealing that a holistic approach improves\nhumor detection accuracy on unseen data. Integrating linguistic cues from\ndifferent dimensions enhances the model's ability to understand humor\ncomplexity beyond traditional computational methods.\n","authors":["Tanisha Khurana","Kaushik Pillalamarri","Vikram Pande","Munindar Singh"],"pdf_url":"https://arxiv.org/pdf/2408.06335v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.06318v1","updated":"2024-08-12T17:39:01Z","published":"2024-08-12T17:39:01Z","title":"Can We Rely on LLM Agents to Draft Long-Horizon Plans? 
Let's Take\n TravelPlanner as an Example","summary":" Large language models (LLMs) have brought autonomous agents closer to\nartificial general intelligence (AGI) due to their promising generalization and\nemergent capabilities. There is, however, a lack of studies on how LLM-based\nagents behave, why they could potentially fail, and how to improve them,\nparticularly in demanding real-world planning tasks. In this paper, as an\neffort to fill the gap, we present our study using a realistic benchmark,\nTravelPlanner, where an agent must meet multiple constraints to generate\naccurate plans. We leverage this benchmark to address four key research\nquestions: (1) are LLM agents robust enough to lengthy and noisy contexts when\nit comes to reasoning and planning? (2) can few-shot prompting adversely impact\nthe performance of LLM agents in scenarios with long context? (3) can we rely\non refinement to improve plans, and (4) can fine-tuning LLMs with both positive\nand negative feedback lead to further improvement? Our comprehensive\nexperiments indicate that, firstly, LLMs often fail to attend to crucial parts\nof a long context, despite their ability to handle extensive reference\ninformation and few-shot examples; secondly, they still struggle with analyzing\nthe long plans and cannot provide accurate feedback for refinement; thirdly, we\npropose Feedback-Aware Fine-Tuning (FAFT), which leverages both positive and\nnegative feedback, resulting in substantial gains over Supervised Fine-Tuning\n(SFT). Our findings offer in-depth insights to the community on various aspects\nrelated to real-world planning applications.\n","authors":["Yanan Chen","Ali Pesaranghader","Tanmana Sadhu","Dong Hoon Yi"],"pdf_url":"https://arxiv.org/pdf/2408.06318v1.pdf","comment":"13 pages, 2 figures, 4 tables"},{"id":"http://arxiv.org/abs/2408.06316v1","updated":"2024-08-12T17:31:28Z","published":"2024-08-12T17:31:28Z","title":"Body Transformer: Leveraging Robot Embodiment for Policy Learning","summary":" In recent years, the transformer architecture has become the de facto\nstandard for machine learning algorithms applied to natural language processing\nand computer vision. Despite notable evidence of successful deployment of this\narchitecture in the context of robot learning, we claim that vanilla\ntransformers do not fully exploit the structure of the robot learning problem.\nTherefore, we propose Body Transformer (BoT), an architecture that leverages\nthe robot embodiment by providing an inductive bias that guides the learning\nprocess. We represent the robot body as a graph of sensors and actuators, and\nrely on masked attention to pool information throughout the architecture. The\nresulting architecture outperforms the vanilla transformer, as well as the\nclassical multilayer perceptron, in terms of task completion, scaling\nproperties, and computational efficiency when representing either imitation or\nreinforcement learning policies. 
Additional material including the open-source\ncode is available at https://sferrazza.cc/bot_site.\n","authors":["Carmelo Sferrazza","Dun-Ming Huang","Fangchen Liu","Jongmin Lee","Pieter Abbeel"],"pdf_url":"https://arxiv.org/pdf/2408.06316v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.18957v2","updated":"2024-08-12T17:20:35Z","published":"2024-03-27T19:02:13Z","title":"Moderating Illicit Online Image Promotion for Unsafe User-Generated\n Content Games Using Large Vision-Language Models","summary":" Online user generated content games (UGCGs) are increasingly popular among\nchildren and adolescents for social interaction and more creative online\nentertainment. However, they pose a heightened risk of exposure to explicit\ncontent, raising growing concerns for the online safety of children and\nadolescents. Despite these concerns, few studies have addressed the issue of\nillicit image-based promotions of unsafe UGCGs on social media, which can\ninadvertently attract young users. This challenge arises from the difficulty of\nobtaining comprehensive training data for UGCG images and the unique nature of\nthese images, which differ from traditional unsafe content. In this work, we\ntake the first step towards studying the threat of illicit promotions of unsafe\nUGCGs. We collect a real-world dataset comprising 2,924 images that display\ndiverse sexually explicit and violent content used to promote UGCGs by their\ngame creators. Our in-depth studies reveal a new understanding of this problem\nand the urgent need for automatically flagging illicit UGCG promotions. We\nadditionally create a cutting-edge system, UGCG-Guard, designed to aid social\nmedia platforms in effectively identifying images used for illicit UGCG\npromotions. This system leverages recently introduced large vision-language\nmodels (VLMs) and employs a novel conditional prompting strategy for zero-shot\ndomain adaptation, along with chain-of-thought (CoT) reasoning for contextual\nidentification. UGCG-Guard achieves outstanding results, with an accuracy rate\nof 94% in detecting these images used for the illicit promotion of such games\nin real-world scenarios.\n","authors":["Keyan Guo","Ayush Utkarsh","Wenbo Ding","Isabelle Ondracek","Ziming Zhao","Guo Freeman","Nishant Vishwamitra","Hongxin Hu"],"pdf_url":"https://arxiv.org/pdf/2403.18957v2.pdf","comment":"To Appear in the 33rd USENIX Security Symposium, August 14-16, 2024"},{"id":"http://arxiv.org/abs/2402.05346v2","updated":"2024-08-12T17:19:06Z","published":"2024-02-08T01:41:28Z","title":"KIX: A Knowledge and Interaction-Centric Metacognitive Framework for\n Task Generalization","summary":" People aptly exhibit general intelligence behaviors in solving a variety of\ntasks with flexibility and ability to adapt to novel situations by reusing and\napplying high-level knowledge acquired over time. But artificial agents are\nmore like specialists, lacking such generalist behaviors. Artificial agents\nwill require understanding and exploiting critical structured knowledge\nrepresentations. We present a metacognitive generalization framework,\nKnowledge-Interaction-eXecution (KIX), and argue that interactions with objects\nleveraging type space facilitate the learning of transferable interaction\nconcepts and generalization. 
It is a natural way of integrating knowledge into\nreinforcement learning and is promising to act as an enabler for autonomous and\ngeneralist behaviors in artificial intelligence systems.\n","authors":["Arun Kumar","Paul Schrater"],"pdf_url":"https://arxiv.org/pdf/2402.05346v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.02075v2","updated":"2024-08-12T17:17:00Z","published":"2024-06-04T07:54:31Z","title":"ReLU-KAN: New Kolmogorov-Arnold Networks that Only Need Matrix Addition,\n Dot Multiplication, and ReLU","summary":" Limited by the complexity of basis function (B-spline) calculations,\nKolmogorov-Arnold Networks (KAN) suffer from restricted parallel computing\ncapability on GPUs. This paper proposes a novel ReLU-KAN implementation that\ninherits the core idea of KAN. By adopting ReLU (Rectified Linear Unit) and\npoint-wise multiplication, we simplify the design of KAN's basis function and\noptimize the computation process for efficient CUDA computing. The proposed\nReLU-KAN architecture can be readily implemented on existing deep learning\nframeworks (e.g., PyTorch) for both inference and training. Experimental\nresults demonstrate that ReLU-KAN achieves a 20x speedup compared to\ntraditional KAN with 4-layer networks. Furthermore, ReLU-KAN exhibits a more\nstable training process with superior fitting ability while preserving the\n\"catastrophic forgetting avoidance\" property of KAN. You can get the code in\nhttps://github.com/quiqi/relu_kan\n","authors":["Qi Qiu","Tao Zhu","Helin Gong","Liming Chen","Huansheng Ning"],"pdf_url":"https://arxiv.org/pdf/2406.02075v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.06302v1","updated":"2024-08-12T17:14:41Z","published":"2024-08-12T17:14:41Z","title":"Finding Patterns in Ambiguity: Interpretable Stress Testing in the\n Decision~Boundary","summary":" The increasing use of deep learning across various domains highlights the\nimportance of understanding the decision-making processes of these black-box\nmodels. Recent research focusing on the decision boundaries of deep\nclassifiers, relies on generated synthetic instances in areas of low\nconfidence, uncovering samples that challenge both models and humans. We\npropose a novel approach to enhance the interpretability of deep binary\nclassifiers by selecting representative samples from the decision boundary -\nprototypes - and applying post-model explanation algorithms. We evaluate the\neffectiveness of our approach through 2D visualizations and GradientSHAP\nanalysis. Our experiments demonstrate the potential of the proposed method,\nrevealing distinct and compact clusters and diverse prototypes that capture\nessential features that lead to low-confidence decisions. By offering a more\naggregated view of deep classifiers' decision boundaries, our work contributes\nto the responsible development and deployment of reliable machine learning\nsystems.\n","authors":["Inês Gomes","Luís F. Teixeira","Jan N. 
van Rijn","Carlos Soares","André Restivo","Luís Cunha","Moisés Santos"],"pdf_url":"https://arxiv.org/pdf/2408.06302v1.pdf","comment":"To be published in the Responsible Generative AI workshop at CVPR"},{"id":"http://arxiv.org/abs/2408.06300v1","updated":"2024-08-12T17:09:28Z","published":"2024-08-12T17:09:28Z","title":"Inverse designing metamaterials with programmable nonlinear functional\n responses in graph space","summary":" Material responses to static and dynamic stimuli, represented as nonlinear\ncurves, are design targets for engineering functionalities like structural\nsupport, impact protection, and acoustic and photonic bandgaps.\nThree-dimensional metamaterials offer significant tunability due to their\ninternal structure, yet existing methods struggle to capture their complex\nbehavior-to-structure relationships. We present GraphMetaMat, a graph-based\nframework capable of designing three-dimensional metamaterials with\nprogrammable responses and arbitrary manufacturing constraints. Integrating\ngraph networks, physics biases, reinforcement learning, and tree search,\nGraphMetaMat can target stress-strain curves spanning four orders of magnitude\nand complex behaviors, as well as viscoelastic transmission responses with\nvarying attenuation gaps. GraphMetaMat can create cushioning materials for\nprotective equipment and vibration-damping panels for electric vehicles,\noutperforming commercial materials, and enabling the automatic design of\nmaterials with on-demand functionalities.\n","authors":["Marco Maurizi","Derek Xu","Yu-Tong Wang","Desheng Yao","David Hahn","Mourad Oudich","Anish Satpati","Mathieu Bauchy","Wei Wang","Yizhou Sun","Yun Jing","Xiaoyu Rayne Zheng"],"pdf_url":"https://arxiv.org/pdf/2408.06300v1.pdf","comment":"19 pages, 5 figures"},{"id":"http://arxiv.org/abs/2408.06297v1","updated":"2024-08-12T17:08:31Z","published":"2024-08-12T17:08:31Z","title":"LEARN: An Invex Loss for Outlier Oblivious Robust Online Optimization","summary":" We study a robust online convex optimization framework, where an adversary\ncan introduce outliers by corrupting loss functions in an arbitrary number of\nrounds k, unknown to the learner. Our focus is on a novel setting allowing\nunbounded domains and large gradients for the losses without relying on a\nLipschitz assumption. We introduce the Log Exponential Adjusted Robust and\niNvex (LEARN) loss, a non-convex (invex) robust loss function to mitigate the\neffects of outliers and develop a robust variant of the online gradient descent\nalgorithm by leveraging the LEARN loss. We establish tight regret guarantees\n(up to constants), in a dynamic setting, with respect to the uncorrupted rounds\nand conduct experiments to validate our theory. Furthermore, we present a\nunified analysis framework for developing online optimization algorithms for\nnon-convex (invex) losses, utilizing it to provide regret bounds with respect\nto the LEARN loss, which may be of independent interest.\n","authors":["Adarsh Barik","Anand Krishna","Vincent Y. F. 
Tan"],"pdf_url":"https://arxiv.org/pdf/2408.06297v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15602v2","updated":"2024-08-12T17:02:48Z","published":"2023-08-29T19:47:31Z","title":"An Experimental Comparison of Partitioning Strategies for Distributed\n Graph Neural Network Training","summary":" Recently, graph neural networks (GNNs) have gained much attention as a\ngrowing area of deep learning capable of learning on graph-structured data.\nHowever, the computational and memory requirements for training GNNs on\nlarge-scale graphs make it necessary to distribute the training. A prerequisite\nfor distributed GNN training is to partition the input graph into smaller parts\nthat are distributed among multiple machines of a compute cluster. Although\ngraph partitioning has been studied with regard to graph analytics and graph\ndatabases, its effect on GNN training performance is largely unexplored. As a\nconsequence, it is unclear whether investing computational efforts into\nhigh-quality graph partitioning would pay off in GNN training scenarios.\n In this paper, we study the effectiveness of graph partitioning for\ndistributed GNN training. Our study aims to understand how different factors\nsuch as GNN parameters, mini-batch size, graph type, features size, and\nscale-out factor influence the effectiveness of graph partitioning. We conduct\nexperiments with two different GNN systems using vertex and edge partitioning.\nWe found that high-quality graph partitioning is a very effective optimization\nto speed up GNN training and to reduce memory consumption. Furthermore, our\nresults show that invested partitioning time can quickly be amortized by\nreduced GNN training time, making it a relevant optimization for most GNN\nscenarios. Compared to research on distributed graph processing, our study\nreveals that graph partitioning plays an even more significant role in\ndistributed GNN training, which motivates further research on the graph\npartitioning problem.\n","authors":["Nikolai Merkel","Daniel Stoll","Ruben Mayer","Hans-Arno Jacobsen"],"pdf_url":"https://arxiv.org/pdf/2308.15602v2.pdf","comment":"To be published in Proceedings of the 28th International Conference\n on Extending Database Technology (EDBT), 25th, March-28th March, 2025"},{"id":"http://arxiv.org/abs/2402.03110v2","updated":"2024-08-12T16:58:54Z","published":"2024-02-05T15:38:01Z","title":"Non-Stationary Latent Auto-Regressive Bandits","summary":" We consider the stochastic multi-armed bandit problem with non-stationary\nrewards. We present a novel formulation of non-stationarity in the environment\nwhere changes in the mean reward of the arms over time are due to some unknown,\nlatent, auto-regressive (AR) state of order $k$. We call this new environment\nthe latent AR bandit. Different forms of the latent AR bandit appear in many\nreal-world settings, especially in emerging scientific fields such as\nbehavioral health or education where there are few mechanistic models of the\nenvironment. If the AR order $k$ is known, we propose an algorithm that\nachieves $\\tilde{O}(k\\sqrt{T})$ regret in this setting. Empirically, our\nalgorithm outperforms standard UCB across multiple non-stationary environments,\neven if $k$ is mis-specified.\n","authors":["Anna L. Trella","Walter Dempsey","Finale Doshi-Velez","Susan A. 
Murphy"],"pdf_url":"https://arxiv.org/pdf/2402.03110v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.06292v1","updated":"2024-08-12T16:58:11Z","published":"2024-08-12T16:58:11Z","title":"The AI Scientist: Towards Fully Automated Open-Ended Scientific\n Discovery","summary":" One of the grand challenges of artificial general intelligence is developing\nagents capable of conducting scientific research and discovering new knowledge.\nWhile frontier models have already been used as aids to human scientists, e.g.\nfor brainstorming ideas, writing code, or prediction tasks, they still conduct\nonly a small part of the scientific process. This paper presents the first\ncomprehensive framework for fully automatic scientific discovery, enabling\nfrontier large language models to perform research independently and\ncommunicate their findings. We introduce The AI Scientist, which generates\nnovel research ideas, writes code, executes experiments, visualizes results,\ndescribes its findings by writing a full scientific paper, and then runs a\nsimulated review process for evaluation. In principle, this process can be\nrepeated to iteratively develop ideas in an open-ended fashion, acting like the\nhuman scientific community. We demonstrate its versatility by applying it to\nthree distinct subfields of machine learning: diffusion modeling,\ntransformer-based language modeling, and learning dynamics. Each idea is\nimplemented and developed into a full paper at a cost of less than $15 per\npaper. To evaluate the generated papers, we design and validate an automated\nreviewer, which we show achieves near-human performance in evaluating paper\nscores. The AI Scientist can produce papers that exceed the acceptance\nthreshold at a top machine learning conference as judged by our automated\nreviewer. This approach signifies the beginning of a new era in scientific\ndiscovery in machine learning: bringing the transformative benefits of AI\nagents to the entire research process of AI itself, and taking us closer to a\nworld where endless affordable creativity and innovation can be unleashed on\nthe world's most challenging problems. Our code is open-sourced at\nhttps://github.com/SakanaAI/AI-Scientist\n","authors":["Chris Lu","Cong Lu","Robert Tjarko Lange","Jakob Foerster","Jeff Clune","David Ha"],"pdf_url":"https://arxiv.org/pdf/2408.06292v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.06291v1","updated":"2024-08-12T16:57:57Z","published":"2024-08-12T16:57:57Z","title":"Mambular: A Sequential Model for Tabular Deep Learning","summary":" The analysis of tabular data has traditionally been dominated by\ngradient-boosted decision trees (GBDTs), known for their proficiency with mixed\ncategorical and numerical features. However, recent deep learning innovations\nare challenging this dominance. We introduce Mambular, an adaptation of the\nMamba architecture optimized for tabular data. We extensively benchmark\nMambular against state-of-the-art models, including neural networks and\ntree-based methods, and demonstrate its competitive performance across diverse\ndatasets. Additionally, we explore various adaptations of Mambular to\nunderstand its effectiveness for tabular data. We investigate different pooling\nstrategies, feature interaction mechanisms, and bi-directional processing. Our\nanalysis shows that interpreting features as a sequence and passing them\nthrough Mamba layers results in surprisingly performant models. 
The results\nhighlight Mambular's potential as a versatile and powerful architecture for\ntabular data analysis, expanding the scope of deep learning applications in\nthis domain.\n The source code is available at https://github.com/basf/mamba-tabular.\n","authors":["Anton Frederik Thielmann","Manish Kumar","Christoph Weisser","Arik Reuter","Benjamin Säfken","Soheila Samiee"],"pdf_url":"https://arxiv.org/pdf/2408.06291v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.17003v2","updated":"2024-08-12T16:56:11Z","published":"2024-02-26T20:19:14Z","title":"Monitoring Fidelity of Online Reinforcement Learning Algorithms in\n Clinical Trials","summary":" Online reinforcement learning (RL) algorithms offer great potential for\npersonalizing treatment for participants in clinical trials. However, deploying\nan online, autonomous algorithm in the high-stakes healthcare setting makes\nquality control and data quality especially difficult to achieve. This paper\nproposes algorithm fidelity as a critical requirement for deploying online RL\nalgorithms in clinical trials. It emphasizes the responsibility of the\nalgorithm to (1) safeguard participants and (2) preserve the scientific utility\nof the data for post-trial analyses. We also present a framework for\npre-deployment planning and real-time monitoring to help algorithm developers\nand clinical researchers ensure algorithm fidelity. To illustrate our\nframework's practical application, we present real-world examples from the\nOralytics clinical trial. Since Spring 2023, this trial has successfully\ndeployed an autonomous, online RL algorithm to personalize behavioral\ninterventions for participants at risk for dental disease.\n","authors":["Anna L. Trella","Kelly W. Zhang","Inbal Nahum-Shani","Vivek Shetty","Iris Yan","Finale Doshi-Velez","Susan A. Murphy"],"pdf_url":"https://arxiv.org/pdf/2402.17003v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.04489v4","updated":"2024-08-12T16:54:55Z","published":"2023-06-07T15:00:38Z","title":"Fair Column Subset Selection","summary":" The problem of column subset selection asks for a subset of columns from an\ninput matrix such that the matrix can be reconstructed as accurately as\npossible within the span of the selected columns. A natural extension is to\nconsider a setting where the matrix rows are partitioned into two groups, and\nthe goal is to choose a subset of columns that minimizes the maximum\nreconstruction error of both groups, relative to their respective best rank-k\napproximation. Extending the known results of column subset selection to this\nfair setting is not straightforward: in certain scenarios it is unavoidable to\nchoose columns separately for each group, resulting in double the expected\ncolumn count. We propose a deterministic leverage-score sampling strategy for\nthe fair setting and show that sampling a column subset of minimum size becomes\nNP-hard in the presence of two groups. Despite these negative results, we give\nan approximation algorithm that guarantees a solution within 1.5 times the\noptimal solution size. We also present practical heuristic algorithms based on\nrank-revealing QR factorization. 
Finally, we validate our methods through an\nextensive set of experiments using real-world data.\n","authors":["Antonis Matakos","Bruno Ordozgoiti","Suhas Thejaswi"],"pdf_url":"https://arxiv.org/pdf/2306.04489v4.pdf","comment":"KDD 2024"},{"id":"http://arxiv.org/abs/2311.17693v3","updated":"2024-08-12T16:52:09Z","published":"2023-11-29T15:00:06Z","title":"Toward a Surgeon-in-the-Loop Ophthalmic Robotic Apprentice using\n Reinforcement and Imitation Learning","summary":" Robot-assisted surgical systems have demonstrated significant potential in\nenhancing surgical precision and minimizing human errors. However, existing\nsystems cannot accommodate individual surgeons' unique preferences and\nrequirements. Additionally, they primarily focus on general surgeries (e.g.,\nlaparoscopy) and are unsuitable for highly precise microsurgeries, such as\nophthalmic procedures. Thus, we propose an image-guided approach for\nsurgeon-centered autonomous agents that can adapt to the individual surgeon's\nskill level and preferred surgical techniques during ophthalmic cataract\nsurgery. Our approach trains reinforcement and imitation learning agents\nsimultaneously using curriculum learning approaches guided by image data to\nperform all tasks of the incision phase of cataract surgery. By integrating the\nsurgeon's actions and preferences into the training process, our approach\nenables the robot to implicitly learn and adapt to the individual surgeon's\nunique techniques through surgeon-in-the-loop demonstrations. This results in a\nmore intuitive and personalized surgical experience for the surgeon while\nensuring consistent performance for the autonomous robotic apprentice. We\ndefine and evaluate the effectiveness of our approach in a simulated\nenvironment using our proposed metrics and highlight the trade-off between a\ngeneric agent and a surgeon-centered adapted agent. Finally, our approach has\nthe potential to extend to other ophthalmic and microsurgical procedures,\nopening the door to a new generation of surgeon-in-the-loop autonomous surgical\nrobots. We provide an open-source simulation framework for future development\nand reproducibility at\nhttps://github.com/amrgomaaelhady/CataractAdaptSurgRobot.\n","authors":["Amr Gomaa","Bilal Mahdy","Niko Kleer","Antonio Krüger"],"pdf_url":"https://arxiv.org/pdf/2311.17693v3.pdf","comment":"Accepted at IROS'24"},{"id":"http://arxiv.org/abs/2407.21025v2","updated":"2024-08-12T16:51:02Z","published":"2024-07-14T22:07:48Z","title":"Reinforcement Learning in High-frequency Market Making","summary":" This paper establishes a new and comprehensive theoretical analysis for the\napplication of reinforcement learning (RL) in high-frequency market making. We\nbridge modern RL theory and the continuous-time statistical models in\nhigh-frequency financial economics. Unlike most existing literature, which\nfocuses on methodological research developing various RL methods for the\nmarket-making problem, our work is a pilot study providing theoretical\nanalysis. We target the effects of sampling frequency and find an interesting\ntradeoff between the error and the complexity of the RL algorithm when tweaking\nthe value of the time increment\n$\\Delta$: as $\\Delta$ becomes smaller, the error will be smaller but the\ncomplexity will be larger. We also study the two-player case under the\ngeneral-sum game framework and establish the convergence of Nash equilibrium to\nthe continuous-time game equilibrium as $\\Delta\\rightarrow0$. 
The Nash\nQ-learning algorithm, which is an online multi-agent RL method, is applied to\nsolve the equilibrium. Our theories are not only useful for practitioners to\nchoose the sampling frequency, but also very general and applicable to other\nhigh-frequency financial decision making problems, e.g., optimal executions, as\nlong as the time-discretization of a continuous-time Markov decision process is\nadopted. Monte Carlo simulation evidence supports all of our theories.\n","authors":["Yuheng Zheng","Zihan Ding"],"pdf_url":"https://arxiv.org/pdf/2407.21025v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.06285v1","updated":"2024-08-12T16:49:22Z","published":"2024-08-12T16:49:22Z","title":"Synthetic Patient-Physician Dialogue Generation from Clinical Notes\n Using LLM","summary":" Medical dialogue systems (MDS) enhance patient-physician communication,\nimprove healthcare accessibility, and reduce costs. However, acquiring suitable\ndata to train these systems poses significant challenges. Privacy concerns\nprevent the use of real conversations, necessitating synthetic alternatives.\nSynthetic dialogue generation from publicly available clinical notes offers a\npromising solution to this issue, providing realistic data while safeguarding\nprivacy. Our approach, SynDial, uses a single LLM iteratively with zero-shot\nprompting and a feedback loop to generate and refine high-quality synthetic\ndialogues. The feedback consists of weighted evaluation scores for similarity\nand extractiveness. The iterative process ensures dialogues meet predefined\nthresholds, achieving superior extractiveness as a result of the feedback loop.\nAdditionally, evaluation shows that the generated dialogues excel in the\nfactuality metric compared to the baselines and have comparable diversity\nscores with GPT4.\n","authors":["Trisha Das","Dina Albassam","Jimeng Sun"],"pdf_url":"https://arxiv.org/pdf/2408.06285v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.06277v1","updated":"2024-08-12T16:39:18Z","published":"2024-08-12T16:39:18Z","title":"Multi-marginal Schrödinger Bridges with Iterative Reference","summary":" Practitioners frequently aim to infer an unobserved population trajectory\nusing sample snapshots at multiple time points. For instance, in single-cell\nsequencing, scientists would like to learn how gene expression evolves over\ntime. But sequencing any cell destroys that cell. So we cannot access any\ncell's full trajectory, but we can access snapshot samples from many cells.\nStochastic differential equations are commonly used to analyze systems with\nfull individual-trajectory access; since here we have only sample snapshots,\nthese methods are inapplicable. The deep learning community has recently\nexplored using Schr\\"odinger bridges (SBs) and their extensions to estimate\nthese dynamics. However, these methods either (1) interpolate between just two\ntime points or (2) require a single fixed reference dynamic within the SB,\nwhich is often just set to be Brownian motion. But learning piecewise from\nadjacent time points can fail to capture long-term dependencies. And\npractitioners are typically able to specify a model class for the reference\ndynamic but not the exact values of the parameters within it. So we propose a\nnew method that (1) learns the unobserved trajectories from sample snapshots\nacross multiple time points and (2) requires specification only of a class of\nreference dynamics, not a single fixed one. 
In particular, we suggest an\niterative projection method inspired by Schr\\\"odinger bridges; we alternate\nbetween learning a piecewise SB on the unobserved trajectories and using the\nlearned SB to refine our best guess for the dynamics within the reference\nclass. We demonstrate the advantages of our method via a well-known simulated\nparametric model from ecology, simulated and real data from systems biology,\nand real motion-capture data.\n","authors":["Yunyi Shen","Renato Berlinghieri","Tamara Broderick"],"pdf_url":"https://arxiv.org/pdf/2408.06277v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.08042v2","updated":"2024-08-12T16:37:31Z","published":"2024-03-12T19:34:50Z","title":"CT evaluation of 2D and 3D holistic deep learning methods for the\n volumetric segmentation of airway lesions","summary":" This research embarked on a comparative exploration of the holistic\nsegmentation capabilities of Convolutional Neural Networks (CNNs) in both 2D\nand 3D formats, focusing on cystic fibrosis (CF) lesions. The study utilized\ndata from two CF reference centers, covering five major CF structural changes.\nInitially, it compared the 2D and 3D models, highlighting the 3D model's\nsuperior capability in capturing complex features like mucus plugs and\nconsolidations. To improve the 2D model's performance, a loss adapted to fine\nstructures segmentation was implemented and evaluated, significantly enhancing\nits accuracy, though not surpassing the 3D model's performance. The models\nunderwent further validation through external evaluation against pulmonary\nfunction tests (PFTs), confirming the robustness of the findings. Moreover,\nthis study went beyond comparing metrics; it also included comprehensive\nassessments of the models' interpretability and reliability, providing valuable\ninsights for their clinical application.\n","authors":["Amel Imene Hadj Bouzid","Baudouin Denis de Senneville","Fabien Baldacci","Pascal Desbarats","Patrick Berger","Ilyes Benlala","Gaël Dournes"],"pdf_url":"https://arxiv.org/pdf/2403.08042v2.pdf","comment":"6 pages, 3 figures, 2 tables, IEEE International Symposium on\n Biomedical Imaging (ISBI) 2024"},{"id":"http://arxiv.org/abs/2408.06266v1","updated":"2024-08-12T16:24:51Z","published":"2024-08-12T16:24:51Z","title":"Anchored Preference Optimization and Contrastive Revisions: Addressing\n Underspecification in Alignment","summary":" Large Language Models (LLMs) are often aligned using contrastive alignment\nobjectives and preference pair datasets. The interaction between model, paired\ndata, and objective makes alignment a complicated procedure, sometimes\nproducing subpar results. We study this and find that (i) preference data gives\na better learning signal when the underlying responses are contrastive, and\n(ii) alignment objectives lead to better performance when they specify more\ncontrol over the model during training. Based on these insights, we introduce\nContrastive Learning from AI Revisions (CLAIR), a data-creation method which\nleads to more contrastive preference pairs, and Anchored Preference\nOptimization (APO), a controllable and more stable alignment objective. We\nalign Llama-3-8B-Instruct using various comparable datasets and alignment\nobjectives and measure MixEval-Hard scores, which correlate highly with human\njudgments. The CLAIR preferences lead to the strongest performance out of all\ndatasets, and APO consistently outperforms less controllable objectives. 
Our\nbest model, trained on 32K CLAIR preferences with APO, improves\nLlama-3-8B-Instruct by 7.65%, closing the gap with GPT4-turbo by 45%. Our code\nis available at https://github.com/ContextualAI/CLAIR_and_APO.\n","authors":["Karel D'Oosterlinck","Winnie Xu","Chris Develder","Thomas Demeester","Amanpreet Singh","Christopher Potts","Douwe Kiela","Shikib Mehri"],"pdf_url":"https://arxiv.org/pdf/2408.06266v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.03701v2","updated":"2024-08-12T16:22:38Z","published":"2024-02-06T04:42:36Z","title":"Unified Discrete Diffusion for Categorical Data","summary":" Discrete diffusion models have seen a surge of attention with applications on\nnaturally discrete data such as language and graphs. Although discrete-time\ndiscrete diffusion has been established for a while, only recently Campbell et\nal. (2022) introduced the first framework for continuous-time discrete\ndiffusion. However, their training and sampling processes differ significantly\nfrom the discrete-time version, necessitating nontrivial approximations for\ntractability. In this paper, we first present a series of mathematical\nsimplifications of the variational lower bound that enable more accurate and\neasy-to-optimize training for discrete diffusion. In addition, we derive a\nsimple formulation for backward denoising that enables exact and accelerated\nsampling, and importantly, an elegant unification of discrete-time and\ncontinuous-time discrete diffusion. Thanks to simpler analytical formulations,\nboth forward and now also backward probabilities can flexibly accommodate any\nnoise distribution, including different noise distributions for multi-element\nobjects. Experiments show that our proposed USD3 (for Unified Simplified\nDiscrete Denoising Diffusion) outperform all SOTA baselines on established\ndatasets. We open-source our unified code at\nhttps://github.com/LingxiaoShawn/USD3.\n","authors":["Lingxiao Zhao","Xueying Ding","Lijun Yu","Leman Akoglu"],"pdf_url":"https://arxiv.org/pdf/2402.03701v2.pdf","comment":"Unify Discrete Denoising Diffusion"},{"id":"http://arxiv.org/abs/2408.06262v1","updated":"2024-08-12T16:22:30Z","published":"2024-08-12T16:22:30Z","title":"DUNE: A Machine Learning Deep UNet++ based Ensemble Approach to Monthly,\n Seasonal and Annual Climate Forecasting","summary":" Capitalizing on the recent availability of ERA5 monthly averaged long-term\ndata records of mean atmospheric and climate fields based on high-resolution\nreanalysis, deep-learning architectures offer an alternative to physics-based\ndaily numerical weather predictions for subseasonal to seasonal (S2S) and\nannual means. A novel Deep UNet++-based Ensemble (DUNE) neural architecture is\nintroduced, employing multi-encoder-decoder structures with residual blocks.\nWhen initialized from a prior month or year, this architecture produced the\nfirst AI-based global monthly, seasonal, or annual mean forecast of 2-meter\ntemperatures (T2m) and sea surface temperatures (SST). ERA5 monthly mean data\nis used as input for T2m over land, SST over oceans, and solar radiation at the\ntop of the atmosphere for each month of 40 years to train the model. Validation\nforecasts are performed for an additional two years, followed by five years of\nforecast evaluations to account for natural annual variability. AI-trained\ninference forecast weights generate forecasts in seconds, enabling ensemble\nseasonal forecasts. 
Root Mean Squared Error (RMSE), Anomaly Correlation\nCoefficient (ACC), and Heidke Skill Score (HSS) statistics are presented\nglobally and over specific regions. These forecasts outperform persistence,\nclimatology, and multiple linear regression for all domains. DUNE forecasts\ndemonstrate comparable statistical accuracy to NOAA's operational monthly and\nseasonal probabilistic outlook forecasts over the US but at significantly\nhigher resolutions. RMSE and ACC error statistics for other recent AI-based\ndaily forecasts also show superior performance for DUNE-based forecasts. The\nDUNE model's application to an ensemble data assimilation cycle shows\ncomparable forecast accuracy with a single high-resolution model, potentially\neliminating the need for retraining on extrapolated datasets.\n","authors":["Pratik Shukla","Milton Halem"],"pdf_url":"https://arxiv.org/pdf/2408.06262v1.pdf","comment":"Excluding Appendix: 18 pages, 14 figures"},{"id":"http://arxiv.org/abs/2408.06261v1","updated":"2024-08-12T16:21:29Z","published":"2024-08-12T16:21:29Z","title":"Open-Source Molecular Processing Pipeline for Generating Molecules","summary":" Generative models for molecules have shown considerable promise for use in\ncomputational chemistry, but remain difficult to use for non-experts. For this\nreason, we introduce open-source infrastructure for easily building generative\nmolecular models into the widely used DeepChem [Ramsundar et al., 2019] library\nwith the aim of creating a robust and reusable molecular generation pipeline.\nIn particular, we add high quality PyTorch [Paszke et al., 2019]\nimplementations of the Molecular Generative Adversarial Networks (MolGAN) [Cao\nand Kipf, 2022] and Normalizing Flows [Papamakarios et al., 2021]. Our\nimplementations show strong performance comparable with past work [Kuznetsov\nand Polykovskiy, 2021, Cao and Kipf, 2022].\n","authors":["Shreyas V","Jose Siguenza","Karan Bania","Bharath Ramsundar"],"pdf_url":"https://arxiv.org/pdf/2408.06261v1.pdf","comment":"Presented at the 2024 Molecular Machine Learning Conference (MoML\n 2024)"},{"id":"http://arxiv.org/abs/2407.21770v3","updated":"2024-08-12T16:20:37Z","published":"2024-07-31T17:46:51Z","title":"MoMa: Efficient Early-Fusion Pre-training with Mixture of Modality-Aware\n Experts","summary":" We introduce MoMa, a novel modality-aware mixture-of-experts (MoE)\narchitecture designed for pre-training mixed-modal, early-fusion language\nmodels. MoMa processes images and text in arbitrary sequences by dividing\nexpert modules into modality-specific groups. These groups exclusively process\ndesignated tokens while employing learned routing within each group to maintain\nsemantically informed adaptivity. Our empirical results reveal substantial\npre-training efficiency gains through this modality-specific parameter\nallocation. Under a 1-trillion-token training budget, the MoMa 1.4B model,\nfeaturing 4 text experts and 4 image experts, achieves impressive FLOPs\nsavings: 3.7x overall, with 2.6x for text and 5.2x for image processing\ncompared to a compute-equivalent dense baseline, measured by pre-training loss.\nThis outperforms the standard expert-choice MoE with 8 mixed-modal experts,\nwhich achieves 3x overall FLOPs savings (3x for text, 2.8x for image).\nCombining MoMa with mixture-of-depths (MoD) further improves pre-training FLOPs\nsavings to 4.2x overall (text: 3.4x, image: 5.3x), although this combination\nhurts performance in causal inference due to increased sensitivity to router\naccuracy. 
These results demonstrate MoMa's potential to significantly advance\nthe efficiency of mixed-modal, early-fusion language model pre-training, paving\nthe way for more resource-efficient and capable multimodal AI systems.\n","authors":["Xi Victoria Lin","Akshat Shrivastava","Liang Luo","Srinivasan Iyer","Mike Lewis","Gargi Ghosh","Luke Zettlemoyer","Armen Aghajanyan"],"pdf_url":"https://arxiv.org/pdf/2407.21770v3.pdf","comment":"v2 -> update related work section v3 -> fix spelling"},{"id":"http://arxiv.org/abs/2408.06258v1","updated":"2024-08-12T16:14:55Z","published":"2024-08-12T16:14:55Z","title":"Deep Learning System Boundary Testing through Latent Space Style Mixing","summary":" Evaluating the behavioral frontier of deep learning (DL) systems is crucial\nfor understanding their generalizability and robustness. However, boundary\ntesting is challenging due to their high-dimensional input space. Generative\nartificial intelligence offers a promising solution by modeling data\ndistribution within compact latent space representations, thereby facilitating\nfiner-grained explorations. In this work, we introduce MIMICRY, a novel\nblack-box system-agnostic test generator that leverages these latent\nrepresentations to generate frontier inputs for the DL systems under test.\nSpecifically, MIMICRY uses style-based generative adversarial networks trained\nto learn the representation of inputs with disentangled features. This\nrepresentation enables embedding style-mixing operations between a source and a\ntarget input, combining their features to explore the boundary between them. We\nevaluated the effectiveness of different MIMICRY configurations in generating\nboundary inputs for four popular DL image classification systems. Our results\nshow that manipulating the latent space allows for effective and efficient\nexploration of behavioral frontiers. As opposed to a model-based baseline,\nMIMICRY generates a higher quality frontier of behaviors which includes more\nand closer inputs. Additionally, we assessed the validity of these inputs,\nrevealing a high validity rate according to human assessors.\n","authors":["Amr Abdellatif","Xingcheng Chen","Vincenzo Riccio","Andrea Stocco"],"pdf_url":"https://arxiv.org/pdf/2408.06258v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.06257v1","updated":"2024-08-12T16:14:52Z","published":"2024-08-12T16:14:52Z","title":"Reciprocal Learning","summary":" We demonstrate that a wide array of machine learning algorithms are specific\ninstances of one single paradigm: reciprocal learning. These instances range\nfrom active learning over multi-armed bandits to self-training. We show that\nall these algorithms do not only learn parameters from data but also vice\nversa: They iteratively alter training data in a way that depends on the\ncurrent model fit. We introduce reciprocal learning as a generalization of\nthese algorithms using the language of decision theory. This allows us to study\nunder what conditions they converge. The key is to guarantee that reciprocal\nlearning contracts such that the Banach fixed-point theorem applies. In this\nway, we find that reciprocal learning algorithms converge at linear rates to an\napproximately optimal model under relatively mild assumptions on the loss\nfunction, if their predictions are probabilistic and the sample adaption is\nboth non-greedy and either randomized or regularized. 
We interpret these\nfindings and provide corollaries that relate them to specific active learning,\nself-training, and bandit algorithms.\n","authors":["Julian Rodemann","Christoph Jansen","Georg Schollmeyer"],"pdf_url":"https://arxiv.org/pdf/2408.06257v1.pdf","comment":"41 pages, 3 figures"},{"id":"http://arxiv.org/abs/2302.04611v3","updated":"2024-08-12T16:05:43Z","published":"2023-02-09T12:59:16Z","title":"A Text-guided Protein Design Framework","summary":" Current AI-assisted protein design mainly utilizes protein sequential and\nstructural information. Meanwhile, there exists tremendous knowledge curated by\nhumans in the text format describing proteins' high-level functionalities. Yet,\nwhether the incorporation of such text data can help protein design tasks has\nnot been explored. To bridge this gap, we propose ProteinDT, a multi-modal\nframework that leverages textual descriptions for protein design. ProteinDT\nconsists of three subsequent steps: ProteinCLAP which aligns the representation\nof two modalities, a facilitator that generates the protein representation from\nthe text modality, and a decoder that creates the protein sequences from the\nrepresentation. To train ProteinDT, we construct a large dataset,\nSwissProtCLAP, with 441K text and protein pairs. We quantitatively verify the\neffectiveness of ProteinDT on three challenging tasks: (1) over 90\\% accuracy\nfor text-guided protein generation; (2) best hit ratio on 12 zero-shot\ntext-guided protein editing tasks; (3) superior performance on four out of six\nprotein property prediction benchmarks.\n","authors":["Shengchao Liu","Yanjing Li","Zhuoxinran Li","Anthony Gitter","Yutao Zhu","Jiarui Lu","Zhao Xu","Weili Nie","Arvind Ramanathan","Chaowei Xiao","Jian Tang","Hongyu Guo","Anima Anandkumar"],"pdf_url":"https://arxiv.org/pdf/2302.04611v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.02461v3","updated":"2024-08-12T15:54:14Z","published":"2024-07-02T17:40:06Z","title":"Decentralized Intelligence Network (DIN)","summary":" Decentralized Intelligence Network (DIN) is a theoretical framework\naddressing data fragmentation and siloing challenges, enabling scalable AI\nthrough data sovereignty. It facilitates effective AI utilization within\nsovereign networks by overcoming barriers to accessing diverse data sources,\nleveraging: 1) personal data stores to ensure data sovereignty, where data\nremains securely within Participants' control; 2) a scalable federated learning\nprotocol implemented on a public blockchain for decentralized AI training,\nwhere only model parameter updates are shared, keeping data within the personal\ndata stores; and 3) a scalable, trustless cryptographic rewards mechanism on a\npublic blockchain to incentivize participation and ensure fair reward\ndistribution through a decentralized auditing protocol. This approach\nguarantees that no entity can prevent or control access to training data or\ninfluence financial benefits, as coordination and reward distribution are\nmanaged on the public blockchain with an immutable record. The framework\nsupports effective AI training by allowing Participants to maintain control\nover their data, benefit financially, and contribute to a decentralized,\nscalable ecosystem that leverages collective AI to develop beneficial\nalgorithms.\n","authors":["Abraham Nash"],"pdf_url":"https://arxiv.org/pdf/2407.02461v3.pdf","comment":"14 pages, 1 figure. 
These works have been selected for presentation\n as a speaker at the Summit on Responsible Decentralized Intelligence - Future\n of Decentralization and AI, hosted by Berkeley RDI on August 6, 2024, at the\n Verizon Center, Cornell Tech Campus, Roosevelt Island, NYC"},{"id":"http://arxiv.org/abs/2408.06229v1","updated":"2024-08-12T15:29:32Z","published":"2024-08-12T15:29:32Z","title":"A Comprehensive Case Study on the Performance of Machine Learning\n Methods on the Classification of Solar Panel Electroluminescence Images","summary":" Photovoltaics (PV) are widely used to harvest solar energy, an important form\nof renewable energy. Photovoltaic arrays consist of multiple solar panels\nconstructed from solar cells. Solar cells in the field are vulnerable to\nvarious defects, and electroluminescence (EL) imaging provides effective and\nnon-destructive diagnostics to detect those defects. We use multiple\ntraditional machine learning and modern deep learning models to classify EL\nsolar cell images into different functional/defective categories. Because of\nthe asymmetry in the number of functional vs. defective cells, an imbalanced\nlabel problem arises in the EL image data. The current literature lacks\ninsights on which methods and metrics to use for model training and prediction.\nIn this paper, we comprehensively compare different machine learning and deep\nlearning methods under different performance metrics on the classification of\nsolar cell EL images from monocrystalline and polycrystalline modules. We\nprovide a comprehensive discussion on different metrics. Our results provide\ninsights and guidelines for practitioners in selecting prediction methods and\nperformance metrics.\n","authors":["Xinyi Song","Kennedy Odongo","Francis G. Pascual","Yili Hong"],"pdf_url":"https://arxiv.org/pdf/2408.06229v1.pdf","comment":"30 pages, 14 figures"},{"id":"http://arxiv.org/abs/2408.06226v1","updated":"2024-08-12T15:28:40Z","published":"2024-08-12T15:28:40Z","title":"A Large-Scale Study of Model Integration in ML-Enabled Software Systems","summary":" The rise of machine learning (ML) and its embedding in systems has\ndrastically changed the engineering of software-intensive systems.\nTraditionally, software engineering focuses on manually created artifacts such\nas source code and the process of creating them, as well as best practices for\nintegrating them, i.e., software architectures. In contrast, the development of\nML artifacts, i.e. ML models, comes from data science and focuses on the ML\nmodels and their training data. However, to deliver value to end users, these\nML models must be embedded in traditional software, often forming complex\ntopologies. In fact, ML-enabled software can easily incorporate many different\nML models. While the challenges and practices of building ML-enabled systems\nhave been studied to some extent, beyond isolated examples, little is known\nabout the characteristics of real-world ML-enabled systems. Properly embedding\nML models in systems so that they can be easily maintained or reused is far\nfrom trivial. We need to improve our empirical understanding of such systems,\nwhich we address by presenting the first large-scale study of real ML-enabled\nsoftware systems, covering over 2,928 open source systems on GitHub. We\nclassified and analyzed them to determine their characteristics, as well as\ntheir practices for reusing ML models and related code, and the architecture of\nthese systems. 
Our findings provide practitioners and researchers with insight\ninto practices for embedding and integrating ML models, bringing data science\nand software engineering closer together.\n","authors":["Yorick Sens","Henriette Knopp","Sven Peldszus","Thorsten Berger"],"pdf_url":"https://arxiv.org/pdf/2408.06226v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.06220v1","updated":"2024-08-12T15:21:35Z","published":"2024-08-12T15:21:35Z","title":"A Digital Twin Framework Utilizing Machine Learning for Robust\n Predictive Maintenance: Enhancing Tire Health Monitoring","summary":" We introduce a novel digital twin framework for predictive maintenance of\nlong-term physical systems. Using monitoring tire health as an application, we\nshow how the digital twin framework can be used to enhance automotive safety\nand efficiency, and how the technical challenges can be overcome using a\nthree-step approach. Firstly, for managing the data complexity over a long\noperation span, we employ data reduction techniques to concisely represent\nphysical tires using historical performance and usage data. Relying on these\ndata, for fast real-time prediction, we train a transformer-based model offline\non our concise dataset to predict future tire health over time, represented as\nRemaining Casing Potential (RCP). Based on our architecture, our model\nquantifies both epistemic and aleatoric uncertainty, providing reliable\nconfidence intervals around predicted RCP. Secondly, to incorporate real-time\ndata, we update the predictive model in the digital twin framework, ensuring\nits accuracy throughout its life span with the aid of hybrid modeling and the\nuse of discrepancy function. Thirdly, to assist decision making in predictive\nmaintenance, we implement a Tire State Decision Algorithm, which strategically\ndetermines the optimal timing for tire replacement based on RCP forecasted by\nour transformer model. This approach ensures our digital twin accurately\npredicts system health, continually refines its digital representation, and\nsupports predictive maintenance decisions. Our framework effectively embodies a\nphysical system, leveraging big data and machine learning for predictive\nmaintenance, model updates, and decision-making.\n","authors":["Vispi Karkaria","Jie Chen","Christopher Luey","Chase Siuta","Damien Lim","Robert Radulescu","Wei Chen"],"pdf_url":"https://arxiv.org/pdf/2408.06220v1.pdf","comment":"Paper accepted at ASME IDETC 2024, and fast-tracked for ASME Journal\n of Computing and Information Science in Engineering"},{"id":"http://arxiv.org/abs/2401.04343v2","updated":"2024-08-12T15:07:50Z","published":"2024-01-09T03:53:59Z","title":"Private Fine-tuning of Large Language Models with Zeroth-order\n Optimization","summary":" Differentially private stochastic gradient descent (DP-SGD) allows models to\nbe trained in a privacy-preserving manner, but has proven difficult to scale to\nthe era of foundation models. We introduce DP-ZO, a private fine-tuning\nframework for large language models by privatizing zeroth order optimization\nmethods. A key insight into the design of our method is that the direction of\nthe gradient in the zeroth-order optimization we use is random and the only\ninformation from training data is the step size, i.e., a scalar. Therefore, we\nonly need to privatize the scalar step size, which is memory-efficient. DP-ZO\nprovides a strong privacy-utility trade-off across different tasks, and model\nsizes that are comparable to DP-SGD in $(\\varepsilon,\\delta)$-DP. 
Notably,\nDP-ZO possesses significant advantages over DP-SGD in memory efficiency, and\nobtains higher utility in $\\varepsilon$-DP when using the Laplace mechanism.\n","authors":["Xinyu Tang","Ashwinee Panda","Milad Nasr","Saeed Mahloujifar","Prateek Mittal"],"pdf_url":"https://arxiv.org/pdf/2401.04343v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.02875v3","updated":"2024-08-12T15:07:04Z","published":"2024-06-05T02:50:27Z","title":"Leveraging KANs For Enhanced Deep Koopman Operator Discovery","summary":" Multi-layer perceptrons (MLPs) have been extensively utilized in discovering\nDeep Koopman operators for linearizing nonlinear dynamics. With the emergence\nof Kolmogorov-Arnold Networks (KANs) as a more efficient and accurate\nalternative to the MLP Neural Network, we propose a comparison of the\nperformance of each network type in the context of learning Koopman operators\nwith control. In this work, we propose a KANs-based deep Koopman framework with\napplications to an orbital Two-Body Problem (2BP) and the pendulum for\ndata-driven discovery of linear system dynamics. KANs were found to be superior\nin nearly all aspects of training: learning 31 times faster, being 15 times\nmore parameter-efficient, and predicting 1.25 times more accurately compared\nto the MLP Deep Neural Networks (DNNs) in the case of the 2BP. Thus,\nKANs show potential as an efficient tool in the development of Deep\nKoopman Theory.\n","authors":["George Nehma","Madhur Tiwari"],"pdf_url":"https://arxiv.org/pdf/2406.02875v3.pdf","comment":"6 pages, 4 figures, 2 tables"},{"id":"http://arxiv.org/abs/2408.06212v1","updated":"2024-08-12T15:02:26Z","published":"2024-08-12T15:02:26Z","title":"Computability of Classification and Deep Learning: From Theoretical\n Limits to Practical Feasibility through Quantization","summary":" The unwavering success of deep learning in the past decade led to the\nincreasing prevalence of deep learning methods in various application fields.\nHowever, the downsides of deep learning, most prominently its lack of\ntrustworthiness, may not be compatible with safety-critical or\nhigh-responsibility applications requiring stricter performance guarantees.\nRecently, several instances of deep learning applications have been shown to be\nsubject to theoretical limitations of computability, undermining the\nfeasibility of performance guarantees when employed on real-world computers. We\nextend the findings by studying computability in the deep learning framework\nfrom two perspectives: from an application viewpoint in the context of\nclassification problems and a general limitation viewpoint in the context of\ntraining neural networks. In particular, we show restrictions on the\nalgorithmic solvability of classification problems that also render the\nalgorithmic detection of failure in computations in a general setting\ninfeasible. 
Subsequently, we prove algorithmic limitations in training deep\nneural networks even in cases where the underlying problem is well-behaved.\nFinally, we end with a positive observation, showing that in quantized versions\nof classification and deep network training, computability restrictions do not\narise or can be overcome to a certain degree.\n","authors":["Holger Boche","Vit Fojtik","Adalbert Fono","Gitta Kutyniok"],"pdf_url":"https://arxiv.org/pdf/2408.06212v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.10794v4","updated":"2024-08-12T14:56:43Z","published":"2023-12-17T19:06:29Z","title":"A mathematical perspective on Transformers","summary":" Transformers play a central role in the inner workings of large language\nmodels. We develop a mathematical framework for analyzing Transformers based on\ntheir interpretation as interacting particle systems, which reveals that\nclusters emerge in long time. Our study explores the underlying theory and\noffers new perspectives for mathematicians as well as computer scientists.\n","authors":["Borjan Geshkovski","Cyril Letrouit","Yury Polyanskiy","Philippe Rigollet"],"pdf_url":"https://arxiv.org/pdf/2312.10794v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.14499v2","updated":"2024-08-12T14:50:01Z","published":"2024-07-19T17:50:11Z","title":"Discover-then-Name: Task-Agnostic Concept Bottlenecks via Automated\n Concept Discovery","summary":" Concept Bottleneck Models (CBMs) have recently been proposed to address the\n'black-box' problem of deep neural networks, by first mapping images to a\nhuman-understandable concept space and then linearly combining concepts for\nclassification. Such models typically require first coming up with a set of\nconcepts relevant to the task and then aligning the representations of a\nfeature extractor to map to these concepts. However, even with powerful\nfoundational feature extractors like CLIP, there are no guarantees that the\nspecified concepts are detectable. In this work, we leverage recent advances in\nmechanistic interpretability and propose a novel CBM approach -- called\nDiscover-then-Name-CBM (DN-CBM) -- that inverts the typical paradigm: instead\nof pre-selecting concepts based on the downstream classification task, we use\nsparse autoencoders to first discover concepts learnt by the model, and then\nname them and train linear probes for classification. Our concept extraction\nstrategy is efficient, since it is agnostic to the downstream task, and uses\nconcepts already known to the model. We perform a comprehensive evaluation\nacross multiple datasets and CLIP architectures and show that our method yields\nsemantically meaningful concepts, assigns appropriate names to them that make\nthem easy to interpret, and yields performant and interpretable CBMs. Code\navailable at https://github.com/neuroexplicit-saar/discover-then-name.\n","authors":["Sukrut Rao","Sweta Mahajan","Moritz Böhle","Bernt Schiele"],"pdf_url":"https://arxiv.org/pdf/2407.14499v2.pdf","comment":"40 pages, 21 figures, 6 tables, European Conference on Computer\n Vision (ECCV) 2024"},{"id":"http://arxiv.org/abs/2408.03459v2","updated":"2024-08-12T14:45:34Z","published":"2024-08-06T22:11:00Z","title":"On the Generalization of Preference Learning with DPO","summary":" Large language models (LLMs) have demonstrated remarkable capabilities but\noften struggle to align with human preferences, leading to harmful or\nundesirable outputs. 
Preference learning, which trains models to distinguish\nbetween preferred and non-preferred responses based on human feedback, has\nbecome a crucial component for ensuring that LLMs align with human values.\nDespite their widespread adoption in real-world systems, a thorough theoretical\nunderstanding of the generalization guarantees for these models remains lacking.\nThis paper bridges that gap by introducing a new theoretical framework to\nanalyze the generalization guarantees of models trained with direct preference\noptimization (DPO). While existing generalization theory often focuses on\noverparameterized models achieving near-optimal loss or models independent of\nthe training process, our framework rigorously assesses how well models\ngeneralize after a finite number of gradient steps, reflecting real-world LLM\ntraining practices. By analyzing the reward margin associated with each sample\nand its trajectory throughout training, we can effectively bound the\ngeneralization error. We derive learning guarantees showing that, under\nspecific conditions, models trained with DPO can correctly discern preferred\nresponses on unseen data with high probability. These insights are empirically\nvalidated on contemporary LLMs, underscoring the practical relevance of our\ntheoretical findings.\n","authors":["Shawn Im","Yixuan Li"],"pdf_url":"https://arxiv.org/pdf/2408.03459v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.03636v2","updated":"2024-08-12T14:39:56Z","published":"2024-08-07T08:51:10Z","title":"Time is Not Enough: Time-Frequency based Explanation for Time-Series\n Black-Box Models","summary":" Despite the massive attention given to time-series explanations due to their\nextensive applications, a notable limitation in existing approaches is their\nprimary reliance on the time-domain. This overlooks the inherent characteristic\nof time-series data containing both time and frequency features. In this work,\nwe present Spectral eXplanation (SpectralX), an XAI framework that provides\ntime-frequency explanations for time-series black-box classifiers. This easily\nadaptable framework enables users to \"plug-in\" various perturbation-based XAI\nmethods for any pre-trained time-series classification models to assess their\nimpact on the explanation quality without having to modify the framework\narchitecture. Additionally, we introduce Feature Importance Approximations\n(FIA), a new perturbation-based XAI method. These methods consist of feature\ninsertion, deletion, and combination techniques to enhance computational\nefficiency and class-specific explanations in time-series classification tasks.\nWe conduct extensive experiments on the generated synthetic dataset and various\nUCR Time-Series datasets to first compare the explanation performance of FIA\nand other existing perturbation-based XAI methods in both time-domain and\ntime-frequency domain, and then show the superiority of our FIA in the\ntime-frequency domain with the SpectralX framework. Finally, we conduct a user\nstudy to confirm the practicality of our FIA in the SpectralX framework for\nclass-specific time-frequency based time-series explanations. 
The source code\nis available in https://github.com/gustmd0121/Time_is_not_Enough\n","authors":["Hyunseung Chung","Sumin Jo","Yeonsu Kwon","Edward Choi"],"pdf_url":"https://arxiv.org/pdf/2408.03636v2.pdf","comment":"Accepted to CIKM 2024 (10 pages, 9 figures, 9 tables)"},{"id":"http://arxiv.org/abs/2406.12507v2","updated":"2024-08-12T14:38:18Z","published":"2024-06-18T11:18:46Z","title":"Improving the Evaluation and Actionability of Explanation Methods for\n Multivariate Time Series Classification","summary":" Explanation for Multivariate Time Series Classification (MTSC) is an\nimportant topic that is under explored. There are very few quantitative\nevaluation methodologies and even fewer examples of actionable explanation,\nwhere the explanation methods are shown to objectively improve specific\ncomputational tasks on time series data. In this paper we focus on analyzing\nInterpretTime, a recent evaluation methodology for attribution methods applied\nto MTSC. We showcase some significant weaknesses of the original methodology\nand propose ideas to improve both its accuracy and efficiency. Unlike related\nwork, we go beyond evaluation and also showcase the actionability of the\nproduced explainer ranking, by using the best attribution methods for the task\nof channel selection in MTSC. We find that perturbation-based methods such as\nSHAP and Feature Ablation work well across a set of datasets, classifiers and\ntasks and outperform gradient-based methods. We apply the best ranked\nexplainers to channel selection for MTSC and show significant data size\nreduction and improved classifier accuracy.\n","authors":["Davide Italo Serramazza","Thach Le Nguyen","Georgiana Ifrim"],"pdf_url":"https://arxiv.org/pdf/2406.12507v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.06186v1","updated":"2024-08-12T14:34:06Z","published":"2024-08-12T14:34:06Z","title":"Improving Structural Diversity of Blackbox LLMs via\n Chain-of-Specification Prompting","summary":" The capability to generate diverse text is a key challenge facing large\nlanguage models (LLMs). Thus far, diversity has been studied via metrics such\nas $n$-gram diversity or diversity of BERT embeddings. However, for these kinds\nof diversity, the user has little control over the dimensions along which\ndiversity is considered. For example, in the poetry domain, one might desire\ndiversity in terms of rhyme and meter, whereas in the code domain, one might\ndesire diversity in terms of the kinds of expressions used to solve a problem.\nWe propose a diversity metric called structural diversity, where the user\nprovides a mapping from generated text to features capturing the kinds of\ndiversity that they care about. In addition, we propose a novel strategy called\nchain-of-specification (CoS) prompting for improving diversity by first having\nthe LLM generate a specification encoding one instance of structural features,\nand then prompting the LLM to generate text that satisfies these features;\nnotably, our strategy works with blackbox LLMs. 
In our experiments, we show\nthat for structural diversity in the poetry and code domains, CoS significantly\nimproves diversity compared to several baselines.\n","authors":["Halley Young","Yimeng Zeng","Jacob Gardner","Osbert Bastani"],"pdf_url":"https://arxiv.org/pdf/2408.06186v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.06183v1","updated":"2024-08-12T14:29:54Z","published":"2024-08-12T14:29:54Z","title":"Centralized and Federated Heart Disease Classification Models Using UCI\n Dataset and their Shapley-value Based Interpretability","summary":" Cardiovascular diseases are a leading cause of mortality worldwide,\nhighlighting the need for accurate diagnostic methods. This study benchmarks\ncentralized and federated machine learning algorithms for heart disease\nclassification using the UCI dataset which includes 920 patient records from\nfour hospitals in the USA, Hungary and Switzerland. Our benchmark is supported\nby Shapley-value interpretability analysis to quantify features' importance for\nclassification. In the centralized setup, various binary classification\nalgorithms are trained on pooled data, with a support vector machine (SVM)\nachieving the highest testing accuracy of 83.3\\%, surpassing the established\nbenchmark of 78.7\\% with logistic regression. Additionally, federated learning\nalgorithms with four clients (hospitals) are explored, leveraging the dataset's\nnatural partition to enhance privacy without sacrificing accuracy. Federated\nSVM, an uncommon approach in the literature, achieves a top testing accuracy of\n73.8\\%. Our interpretability analysis aligns with existing medical knowledge of\nheart disease indicators. Overall, this study establishes a benchmark for\nefficient and interpretable pre-screening tools for heart disease while\nmaintaining patients' privacy.\n","authors":["Mario Padilla Rodriguez","Mohamed Nafea"],"pdf_url":"https://arxiv.org/pdf/2408.06183v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.01606v4","updated":"2024-08-12T14:26:38Z","published":"2023-12-04T03:38:17Z","title":"Investigating the ability of deep learning to predict Welding Depth and\n Pore Volume in Hairpin Welding","summary":" To advance quality assurance in the welding process, this study presents a\ndeep learning DL model that enables the prediction of two critical welds' Key\nPerformance Characteristics (KPCs): welding depth and average pore volume. In\nthe proposed approach, a wide range of laser welding Key Input Characteristics\n(KICs) is utilized, including welding beam geometries, welding feed rates, path\nrepetitions for weld beam geometries, and bright light weld ratios for all\npaths, all of which were obtained from hairpin welding experiments. Two DL\nnetworks are employed with multiple hidden dense layers and linear activation\nfunctions to investigate the capabilities of deep neural networks in capturing\nthe complex nonlinear relationships between the welding input and output\nvariables (KPCs and KICs). Applying DL networks to the small numerical\nexperimental hairpin welding dataset has shown promising results, achieving\nMean Absolute Error (MAE) values 0.1079 for predicting welding depth and 0.0641\nfor average pore volume. 
This, in turn, promises significant advantages in\ncontrolling welding outcomes, moving beyond the current trend of relying only\non defect classification in weld monitoring, to capture the correlation between\nthe weld parameters and weld geometries.\n","authors":["Amena Darwish","Stefan Ericson","Rohollah Ghasemi","Tobias Andersson","Dan Lönn","Andreas Andersson Lassila","Kent Salomonsson"],"pdf_url":"https://arxiv.org/pdf/2312.01606v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.20324v2","updated":"2024-08-12T14:19:44Z","published":"2024-03-29T17:51:50Z","title":"Localising the Seizure Onset Zone from Single-Pulse Electrical\n Stimulation Responses with a CNN Transformer","summary":" Epilepsy is one of the most common neurological disorders, often requiring\nsurgical intervention when medication fails to control seizures. For effective\nsurgical outcomes, precise localisation of the epileptogenic focus - often\napproximated through the Seizure Onset Zone (SOZ) - is critical yet remains a\nchallenge. Active probing through electrical stimulation is already standard\nclinical practice for identifying epileptogenic areas. Our study advances the\napplication of deep learning for SOZ localisation using Single-Pulse Electrical\nStimulation (SPES) responses, with two key contributions. Firstly, we implement\nan existing deep learning model to compare two SPES analysis paradigms:\ndivergent and convergent. These paradigms evaluate outward and inward effective\nconnections, respectively. We assess the generalisability of these models to\nunseen patients and electrode placements using held-out test sets. Our findings\nreveal a notable improvement in moving from a divergent (AUROC: 0.574) to a\nconvergent approach (AUROC: 0.666), marking the first application of the latter\nin this context. Secondly, we demonstrate the efficacy of CNN Transformers with\ncross-channel attention in handling heterogeneous electrode placements,\nincreasing the AUROC to 0.730. These findings represent a significant step in\nmodelling patient-specific intracranial EEG electrode placements in SPES.\nFuture work will explore integrating these models into clinical decision-making\nprocesses to bridge the gap between deep learning research and practical\nhealthcare applications.\n","authors":["Jamie Norris","Aswin Chari","Dorien van Blooijs","Gerald Cooray","Karl Friston","Martin Tisdall","Richard Rosch"],"pdf_url":"https://arxiv.org/pdf/2403.20324v2.pdf","comment":"21 pages, 6 figures, accepted at Machine Learning for Healthcare 2024"},{"id":"http://arxiv.org/abs/2407.09212v3","updated":"2024-08-12T13:13:59Z","published":"2024-07-12T12:20:39Z","title":"Generating $SROI^-$ Ontologies via Knowledge Graph Query Embedding\n Learning","summary":" Query embedding approaches answer complex logical queries over incomplete\nknowledge graphs (KGs) by computing and operating on low-dimensional vector\nrepresentations of entities, relations, and queries. However, current query\nembedding models heavily rely on excessively parameterized neural networks and\ncannot explain the knowledge learned from the graph. We propose a novel query\nembedding method, AConE, which explains the knowledge learned from the graph in\nthe form of $SROI^-$ description logic axioms while being more\nparameter-efficient than most existing approaches. AConE associates queries to\na $SROI^-$ description logic concept. 
Every $SROI^-$ concept is embedded as a\ncone in complex vector space, and each $SROI^-$ relation is embedded as a\ntransformation that rotates and scales cones. We show theoretically that AConE\ncan learn $SROI^-$ axioms and that it defines an algebra whose operations\ncorrespond one-to-one to $SROI^-$ description logic concept constructs. Our\nempirical study on multiple query datasets shows that AConE achieves superior\nresults over previous baselines with fewer parameters. Notably, on the WN18RR\ndataset, AConE achieves significant improvement over baseline models. We provide\ncomprehensive analyses showing that the capability to represent axioms\npositively impacts the results of query answering.\n","authors":["Yunjie He","Daniel Hernandez","Mojtaba Nayyeri","Bo Xiong","Yuqicheng Zhu","Evgeny Kharlamov","Steffen Staab"],"pdf_url":"https://arxiv.org/pdf/2407.09212v3.pdf","comment":"Accepted by ECAI 2024"},{"id":"http://arxiv.org/abs/2408.06121v1","updated":"2024-08-12T13:03:34Z","published":"2024-08-12T13:03:34Z","title":"A Methodological Report on Anomaly Detection on Dynamic Knowledge Graphs","summary":" In this paper, we explore different approaches to anomaly detection on\ndynamic knowledge graphs, specifically in a microservices environment for\nKubernetes applications. Our approach explores three dynamic knowledge graph\nrepresentations: sequential data, one-hop graph structure, and two-hop graph\nstructure, with each representation incorporating increasingly complex\nstructural information. Each phase includes different machine learning and deep\nlearning models. We empirically analyse their performance and propose an\napproach based on ensemble learning of these models. Our approach significantly\noutperforms the baseline on the ISWC 2024 Dynamic Knowledge Graph Anomaly\nDetection dataset, providing a robust solution for anomaly detection in dynamic\ncomplex data.\n","authors":["Xiaohua Lu","Leshanshui Yang"],"pdf_url":"https://arxiv.org/pdf/2408.06121v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.11046v3","updated":"2024-08-12T12:41:57Z","published":"2024-07-08T12:32:10Z","title":"A Survey on LoRA of Large Language Models","summary":" Low-Rank Adaptation~(LoRA), which updates the dense neural network layers\nwith pluggable low-rank matrices, is one of the best-performing\nparameter-efficient fine-tuning paradigms. Furthermore, it has significant\nadvantages in cross-task generalization and privacy preservation. Hence, LoRA\nhas gained much attention recently, and the body of related literature has\ngrown exponentially. It is necessary to conduct a comprehensive overview of the\ncurrent progress on LoRA. This survey categorizes and reviews the progress from\nthe perspectives of (1) downstream adaptation improving variants that improve\nLoRA's performance on downstream tasks; (2) cross-task generalization methods\nthat mix multiple LoRA plugins to achieve cross-task generalization; (3)\nefficiency-improving methods that boost the computation-efficiency of LoRA; (4)\ndata privacy-preserving methods that use LoRA in federated learning; (5)\napplications. This survey also discusses future directions in this\nfield. 
Finally, we provide a GitHub\npage~\\footnote{\\href{https://github.com/ZJU-LLMs/Awesome-LoRAs.git}{https://github.com/ZJU-LLMs/Awesome-LoRAs.git}}\nfor readers to check the updates and initiate discussions on this survey paper.\n","authors":["Yuren Mao","Yuhang Ge","Yijiang Fan","Wenyi Xu","Yu Mi","Zhonghao Hu","Yunjun Gao"],"pdf_url":"https://arxiv.org/pdf/2407.11046v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.06102v1","updated":"2024-08-12T12:36:06Z","published":"2024-08-12T12:36:06Z","title":"Contexts Matter: An Empirical Study on Contextual Influence in Fairness\n Testing for Deep Learning Systems","summary":" Background: Fairness testing for deep learning systems has become\nincreasingly important. However, much work assumes a perfect context and\nconditions in the other parts of the pipeline: well-tuned hyperparameters for accuracy,\nrectified bias in the data, and mitigated bias in the labeling. Yet, these are\noften difficult to achieve in practice due to their resource-/labour-intensive\nnature. Aims: In this paper, we aim to understand how varying contexts affect\nfairness testing outcomes. Method: We conduct an extensive empirical study,\nwhich covers $10,800$ cases, to investigate how contexts can change the\nfairness testing results at the model level, contrary to the existing assumptions. We\nalso study why these outcomes were observed through the lens of correlation/fitness\nlandscape analysis. Results: Our results show that different context types and\nsettings generally have a significant impact on the testing, which is mainly\ncaused by the shifts of the fitness landscape under varying contexts.\nConclusions: Our findings provide key insights for practitioners to evaluate\nthe test generators and hint at future research directions.\n","authors":["Chengwen Du","Tao Chen"],"pdf_url":"https://arxiv.org/pdf/2408.06102v1.pdf","comment":"Received by ESEM 24"},{"id":"http://arxiv.org/abs/2408.06101v1","updated":"2024-08-12T12:32:15Z","published":"2024-08-12T12:32:15Z","title":"Generalization capabilities of MeshGraphNets to unseen geometries for\n fluid dynamics","summary":" This work investigates the generalization capabilities of MeshGraphNets\n(MGN) [Pfaff et al. Learning Mesh-Based Simulation with Graph Networks. ICML\n2021] to unseen geometries for fluid dynamics, e.g. predicting the flow around\na new obstacle that was not part of the training data. For this purpose, we\ncreate a new benchmark dataset for data-driven computational fluid dynamics\n(CFD) which extends DeepMind's flow around a cylinder dataset by including\ndifferent shapes and multiple objects. We then use this new dataset to extend\nthe generalization experiments conducted by DeepMind on MGNs by testing how\nwell an MGN can generalize to different shapes. In our numerical tests, we show\nthat MGNs can sometimes generalize well to various shapes by training on a\ndataset of one obstacle shape and testing on a dataset of another obstacle\nshape.\n","authors":["Robin Schmöcker","Alexander Henkes","Julian Roth","Thomas Wick"],"pdf_url":"https://arxiv.org/pdf/2408.06101v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.06099v1","updated":"2024-08-12T12:30:48Z","published":"2024-08-12T12:30:48Z","title":"Approximating Discrimination Within Models When Faced With Several\n Non-Binary Sensitive Attributes","summary":" Discrimination mitigation with machine learning (ML) models can be\ncomplicated because multiple factors may interweave with each other, both\nhierarchically and historically. 
Yet few existing fairness measures are able to\ncapture the discrimination level within ML models in the face of multiple\nsensitive attributes. To bridge this gap, we propose a fairness measure based\non distances between sets from a manifold perspective, named as 'harmonic\nfairness measure via manifolds (HFM)' with two optional versions, which can\ndeal with a fine-grained discrimination evaluation for several sensitive\nattributes of multiple values. To accelerate the computation of distances of\nsets, we further propose two approximation algorithms named 'Approximation of\ndistance between sets for one sensitive attribute with multiple values\n(ApproxDist)' and 'Approximation of extended distance between sets for several\nsensitive attributes with multiple values (ExtendDist)' to respectively resolve\nbias evaluation of one single sensitive attribute with multiple values and that\nof several sensitive attributes with multiple values. Moreover, we provide an\nalgorithmic effectiveness analysis for ApproxDist under certain assumptions to\nexplain how well it could work. The empirical results demonstrate that our\nproposed fairness measure HFM is valid and approximation algorithms (i.e.,\nApproxDist and ExtendDist) are effective and efficient.\n","authors":["Yijun Bian","Yujie Luo","Ping Xu"],"pdf_url":"https://arxiv.org/pdf/2408.06099v1.pdf","comment":"The first two authors contributed equally, listed in alphabetical\n order. arXiv admin note: substantial text overlap with arXiv:2405.09251"},{"id":"http://arxiv.org/abs/2408.06087v1","updated":"2024-08-12T12:04:14Z","published":"2024-08-12T12:04:14Z","title":"Building Decision Making Models Through Language Model Regime","summary":" We propose a novel approach for decision making problems leveraging the\ngeneralization capabilities of large language models (LLMs). Traditional\nmethods such as expert systems, planning algorithms, and reinforcement learning\noften exhibit limited generalization, typically requiring the training of new\nmodels for each unique task. In contrast, LLMs demonstrate remarkable success\nin generalizing across varied language tasks, inspiring a new strategy for\ntraining decision making models. Our approach, referred to as \"Learning then\nUsing\" (LTU), entails a two-stage process. Initially, the \\textit{learning}\nphase develops a robust foundational decision making model by integrating\ndiverse knowledge from various domains and decision making contexts. The\nsubsequent \\textit{using} phase refines this foundation model for specific\ndecision making scenarios. Distinct from other studies that employ LLMs for\ndecision making through supervised learning, our LTU method embraces a\nversatile training methodology that combines broad pre-training with targeted\nfine-tuning. Experiments in e-commerce domains such as advertising and search\noptimization have shown that LTU approach outperforms traditional supervised\nlearning regimes in decision making capabilities and generalization. The LTU\napproach is the first practical training architecture for both single-step and\nmulti-step decision making tasks combined with LLMs, which can be applied\nbeyond game and robot domains. 
It provides a robust and adaptable framework for\ndecision making, enhances the effectiveness and flexibility of various systems\nin tackling various challenges.\n","authors":["Yu Zhang","Haoxiang Liu","Feijun Jiang","Weihua Luo","Kaifu Zhang"],"pdf_url":"https://arxiv.org/pdf/2408.06087v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.06071v1","updated":"2024-08-12T11:44:47Z","published":"2024-08-12T11:44:47Z","title":"A-BDD: Leveraging Data Augmentations for Safe Autonomous Driving in\n Adverse Weather and Lighting","summary":" High-autonomy vehicle functions rely on machine learning (ML) algorithms to\nunderstand the environment. Despite displaying remarkable performance in fair\nweather scenarios, perception algorithms are heavily affected by adverse\nweather and lighting conditions. To overcome these difficulties, ML engineers\nmainly rely on comprehensive real-world datasets. However, the difficulties in\nreal-world data collection for critical areas of the operational design domain\n(ODD) often means synthetic data is required for perception training and safety\nvalidation. Thus, we present A-BDD, a large set of over 60,000 synthetically\naugmented images based on BDD100K that are equipped with semantic segmentation\nand bounding box annotations (inherited from the BDD100K dataset). The dataset\ncontains augmented data for rain, fog, overcast and sunglare/shadow with\nvarying intensity levels. We further introduce novel strategies utilizing\nfeature-based image quality metrics like FID and CMMD, which help identify\nuseful augmented and real-world data for ML training and testing. By conducting\nexperiments on A-BDD, we provide evidence that data augmentations can play a\npivotal role in closing performance gaps in adverse weather and lighting\nconditions.\n","authors":["Felix Assion","Florens Gressner","Nitin Augustine","Jona Klemenc","Ahmed Hammam","Alexandre Krattinger","Holger Trittenbach","Sascha Riemer"],"pdf_url":"https://arxiv.org/pdf/2408.06071v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.06069v1","updated":"2024-08-12T11:41:07Z","published":"2024-08-12T11:41:07Z","title":"Fully Bayesian Differential Gaussian Processes through Stochastic\n Differential Equations","summary":" Traditional deep Gaussian processes model the data evolution using a discrete\nhierarchy, whereas differential Gaussian processes (DIFFGPs) represent the\nevolution as an infinitely deep Gaussian process. However, prior DIFFGP methods\noften overlook the uncertainty of kernel hyperparameters and assume them to be\nfixed and time-invariant, failing to leverage the unique synergy between\ncontinuous-time models and approximate inference. In this work, we propose a\nfully Bayesian approach that treats the kernel hyperparameters as random\nvariables and constructs coupled stochastic differential equations (SDEs) to\nlearn their posterior distribution and that of inducing points. By\nincorporating estimation uncertainty on hyperparameters, our method enhances\nthe model's flexibility and adaptability to complex dynamics. Additionally, our\napproach provides a time-varying, comprehensive, and realistic posterior\napproximation through coupling variables using SDE methods. Experimental\nresults demonstrate the advantages of our method over traditional approaches,\nshowcasing its superior performance in terms of flexibility, accuracy, and\nother metrics. 
Our work opens up exciting research avenues for advancing\nBayesian inference and offers a powerful modeling tool for continuous-time\nGaussian processes.\n","authors":["Jian Xu","Zhiqi Lin","Min Chen","Junmei Yang","Delu Zeng","John Paisley"],"pdf_url":"https://arxiv.org/pdf/2408.06069v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.06067v1","updated":"2024-08-12T11:39:21Z","published":"2024-08-12T11:39:21Z","title":"Don't You (Project Around Discs)? Neural Network Surrogate and Projected\n Gradient Descent for Calibrating an Intervertebral Disc Finite Element Model","summary":" Accurate calibration of finite element (FE) models of human intervertebral\ndiscs (IVDs) is essential for their reliability and application in diagnosing\nand planning treatments for spinal conditions. Traditional calibration methods\nare computationally intensive, requiring iterative, derivative-free\noptimization algorithms that often take hours or days to converge.\n This study addresses these challenges by introducing a novel, efficient, and\neffective calibration method for an L4-L5 IVD FE model using a neural network\n(NN) surrogate. The NN surrogate predicts simulation outcomes with high\naccuracy, outperforming other machine learning models, and significantly\nreduces the computational cost associated with traditional FE simulations.\nNext, a Projected Gradient Descent (PGD) approach guided by gradients of the NN\nsurrogate is proposed to efficiently calibrate FE models. Our method explicitly\nenforces feasibility with a projection step, thus maintaining material bounds\nthroughout the optimization process.\n The proposed method is evaluated against state-of-the-art Genetic Algorithm\n(GA) and inverse model baselines on synthetic and in vitro experimental\ndatasets. Our approach demonstrates superior performance on synthetic data,\nachieving a Mean Absolute Error (MAE) of 0.06 compared to the baselines' MAE of\n0.18 and 0.54, respectively. On experimental specimens, our method outperforms\nthe baseline in 5 out of 6 cases. Most importantly, our approach reduces\ncalibration time to under three seconds, compared to up to 8 days per sample\nrequired by traditional calibration. Such efficiency paves the way for applying\nmore complex FE models, enabling accurate patient-specific simulations and\nadvancing spinal treatment planning.\n","authors":["Matan Atad","Gabriel Gruber","Marx Ribeiro","Luis Fernando Nicolini","Robert Graf","Hendrik Möller","Kati Nispel","Ivan Ezhov","Daniel Rueckert","Jan S. Kirschke"],"pdf_url":"https://arxiv.org/pdf/2408.06067v1.pdf","comment":"Under submission. Project code:\n https://github.com/matanat/IVD-CalibNN/"},{"id":"http://arxiv.org/abs/2408.06063v1","updated":"2024-08-12T11:29:54Z","published":"2024-08-12T11:29:54Z","title":"TruVRF: Towards Triple-Granularity Verification on Machine Unlearning","summary":" The concept of the right to be forgotten has led to growing interest in\nmachine unlearning, but reliable validation methods are lacking, creating\nopportunities for dishonest model providers to mislead data contributors.\nTraditional invasive methods like backdoor injection are not feasible for\nlegacy data. To address this, we introduce TruVRF, a non-invasive unlearning\nverification framework operating at class-, volume-, and sample-level\ngranularities. 
TruVRF includes three Unlearning-Metrics designed to detect\ndifferent types of dishonest servers: Neglecting, Lazy, and Deceiving.\nUnlearning-Metric-I checks class alignment, Unlearning-Metric-II verifies\nsample count, and Unlearning-Metric-III confirms specific sample deletion.\nEvaluations on three datasets show TruVRF's robust performance, with over 90%\naccuracy for Metrics I and III, and a 4.8% to 8.2% inference deviation for\nMetric II. TruVRF also demonstrates generalizability and practicality across\nvarious conditions and with state-of-the-art unlearning frameworks like SISA\nand Amnesiac Unlearning.\n","authors":["Chunyi Zhou","Anmin Fu","Zhiyang Dai"],"pdf_url":"https://arxiv.org/pdf/2408.06063v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2206.08442v3","updated":"2024-08-12T11:17:34Z","published":"2022-06-16T20:48:19Z","title":"A Look at Value-Based Decision-Time vs. Background Planning Methods\n Across Different Settings","summary":" In model-based reinforcement learning (RL), an agent can leverage a learned\nmodel to improve its way of behaving in different ways. Two of the prevalent\nways to do this are through decision-time and background planning methods. In\nthis study, we are interested in understanding how the value-based versions of\nthese two planning methods will compare against each other across different\nsettings. Towards this goal, we first consider the simplest instantiations of\nvalue-based decision-time and background planning methods and provide\ntheoretical results on which one will perform better in the regular RL and\ntransfer learning settings. Then, we consider the modern instantiations of them\nand provide hypotheses on which one will perform better in the same settings.\nFinally, we perform illustrative experiments to validate these theoretical\nresults and hypotheses. Overall, our findings suggest that even though\nvalue-based versions of the two planning methods perform on par in their\nsimplest instantiations, the modern instantiations of value-based decision-time\nplanning methods can perform on par or better than the modern instantiations of\nvalue-based background planning methods in both the regular RL and transfer\nlearning settings.\n","authors":["Safa Alver","Doina Precup"],"pdf_url":"https://arxiv.org/pdf/2206.08442v3.pdf","comment":"Accepted to EWRL 2024"},{"id":"http://arxiv.org/abs/2401.09493v5","updated":"2024-08-12T11:06:01Z","published":"2024-01-17T01:17:12Z","title":"Identifying Three-Dimensional Radiative Patterns Associated with Early\n Tropical Cyclone Intensification","summary":" Cloud radiative feedback impacts early tropical cyclone (TC) intensification,\nbut limitations in existing diagnostic frameworks make them unsuitable for\nstudying asymmetric or transient radiative heating. We propose a linear\nVariational Encoder-Decoder (VED) to learn the hidden relationship between\nradiation and the surface intensification of realistic simulated TCs. Limiting\nVED model inputs enables using its uncertainty to identify periods when\nradiation has more importance for intensification. A close examination of the\nextracted 3D radiative structures suggests that longwave radiative forcing from\ninner core deep convection and shallow clouds both contribute to\nintensification, with the deep convection having the most impact overall. We\nfind that deep convection downwind of the shallow clouds is critical to the\nintensification of Haiyan. 
Our work demonstrates that machine learning can\ndiscover thermodynamic-kinematic relationships without relying on axisymmetric\nor deterministic assumptions, paving the way towards the objective discovery of\nprocesses leading to TC intensification in realistic conditions.\n","authors":["Frederick Iat-Hin Tam","Tom Beucler","James H. Ruppert Jr"],"pdf_url":"https://arxiv.org/pdf/2401.09493v5.pdf","comment":"15 pages, 6 figures (main text)"},{"id":"http://arxiv.org/abs/2407.14861v2","updated":"2024-08-12T10:55:59Z","published":"2024-07-20T12:42:24Z","title":"Improving Bias Correction Standards by Quantifying its Effects on\n Treatment Outcomes","summary":" With the growing access to administrative health databases, retrospective\nstudies have become crucial evidence for medical treatments. Yet,\nnon-randomized studies frequently face selection biases, requiring mitigation\nstrategies. Propensity score matching (PSM) addresses these biases by selecting\ncomparable populations, allowing for analysis without further methodological\nconstraints. However, PSM has several drawbacks. Different matching methods can\nproduce significantly different Average Treatment Effects (ATE) for the same\ntask, even when meeting all validation criteria. To prevent cherry-picking the\nbest method, public authorities must involve field experts and engage in\nextensive discussions with researchers.\n To address this issue, we introduce a novel metric, A2A, to reduce the number\nof valid matches. A2A constructs artificial matching tasks that mirror the\noriginal ones but with known outcomes, assessing each matching method's\nperformance comprehensively from propensity estimation to ATE estimation. When\ncombined with Standardized Mean Difference, A2A enhances the precision of model\nselection, resulting in a reduction of up to 50% in ATE estimation errors\nacross synthetic tasks and up to 90% in predicted ATE variability across both\nsynthetic and real-world datasets. To our knowledge, A2A is the first metric\ncapable of evaluating outcome correction accuracy using covariates not involved\nin selection.\n Computing A2A requires solving hundreds of PSMs, we therefore automate all\nmanual steps of the PSM pipeline. We integrate PSM methods from Python and R,\nour automated pipeline, a new metric, and reproducible experiments into\npopmatch, our new Python package, to enhance reproducibility and accessibility\nto bias correction methods.\n","authors":["Alexandre Abraham","Andrés Hoyos Idrobo"],"pdf_url":"https://arxiv.org/pdf/2407.14861v2.pdf","comment":"ECML PKDD 2024, 18 pages, 2 figures, 5 tables"},{"id":"http://arxiv.org/abs/2408.06051v1","updated":"2024-08-12T10:55:42Z","published":"2024-08-12T10:55:42Z","title":"Perceptual Similarity for Measuring Decision-Making Style and Policy\n Diversity in Games","summary":" Defining and measuring decision-making styles, also known as playstyles, is\ncrucial in gaming, where these styles reflect a broad spectrum of individuality\nand diversity. However, finding a universally applicable measure for these\nstyles poses a challenge. Building on Playstyle Distance, the first\nunsupervised metric to measure playstyle similarity based on game screens and\nraw actions, we introduce three enhancements to increase accuracy: multiscale\nanalysis with varied state granularity, a perceptual kernel rooted in\npsychology, and the utilization of the intersection-over-union method for\nefficient evaluation. 
These innovations not only advance measurement precision\nbut also offer insights into human cognition of similarity. Across two racing\ngames and seven Atari games, our techniques significantly improve the precision\nof zero-shot playstyle classification, achieving an accuracy exceeding 90\npercent with fewer than 512 observation-action pairs, which is less than half\nan episode of these games. Furthermore, our experiments with 2048 and Go\ndemonstrate the potential of discrete playstyle measures in puzzle and board\ngames. We also develop an algorithm for assessing decision-making diversity\nusing these measures. Our findings improve the measurement of end-to-end game\nanalysis and the evolution of artificial intelligence for diverse playstyles.\n","authors":["Chiu-Chou Lin","Wei-Chen Chiu","I-Chen Wu"],"pdf_url":"https://arxiv.org/pdf/2408.06051v1.pdf","comment":"TMLR 08/2024 https://openreview.net/forum?id=30C9AWBW49"},{"id":"http://arxiv.org/abs/2408.06050v1","updated":"2024-08-12T10:55:29Z","published":"2024-08-12T10:55:29Z","title":"What Ails Generative Structure-based Drug Design: Too Little or Too Much\n Expressivity?","summary":" Several generative models with elaborate training and sampling procedures\nhave been proposed recently to accelerate structure-based drug design (SBDD);\nhowever, perplexingly, their empirical performance turns out to be suboptimal.\nWe seek to better understand this phenomenon from both theoretical and\nempirical perspectives. Since most of these models apply graph neural networks\n(GNNs), one may suspect that they inherit the representational limitations of\nGNNs. We analyze this aspect, establishing the first such results for\nprotein-ligand complexes. A plausible counterview may attribute the\nunderperformance of these models to their excessive parameterizations, inducing\nexpressivity at the expense of generalization. We also investigate this\npossibility with a simple metric-aware approach that learns an economical\nsurrogate for affinity to infer an unlabelled molecular graph and optimizes for\nlabels conditioned on this graph and molecular properties. The resulting model\nachieves state-of-the-art results using 100x fewer trainable parameters and\naffords up to 1000x speedup. Collectively, our findings underscore the need to\nreassess and redirect the existing paradigm and efforts for SBDD.\n","authors":["Rafał Karczewski","Samuel Kaski","Markus Heinonen","Vikas Garg"],"pdf_url":"https://arxiv.org/pdf/2408.06050v1.pdf","comment":"25 pages, 11 figures"},{"id":"http://arxiv.org/abs/2303.15244v3","updated":"2024-08-12T10:27:06Z","published":"2023-03-27T14:29:04Z","title":"Manifold Learning by Mixture Models of VAEs for Inverse Problems","summary":" Representing a manifold of very high-dimensional data with generative models\nhas been shown to be computationally efficient in practice. However, this\nrequires that the data manifold admits a global parameterization. In order to\nrepresent manifolds of arbitrary topology, we propose to learn a mixture model\nof variational autoencoders. Here, every encoder-decoder pair represents one\nchart of a manifold. We propose a loss function for maximum likelihood\nestimation of the model weights and choose an architecture that provides us the\nanalytical expression of the charts and of their inverses. Once the manifold is\nlearned, we use it for solving inverse problems by minimizing a data fidelity\nterm restricted to the learned manifold. 
To solve the resulting minimization\nproblem, we propose a Riemannian gradient descent algorithm on the learned\nmanifold. We demonstrate the performance of our method for low-dimensional toy\nexamples as well as for deblurring and electrical impedance tomography on\ncertain image manifolds.\n","authors":["Giovanni S. Alberti","Johannes Hertrich","Matteo Santacesaria","Silvia Sciutto"],"pdf_url":"https://arxiv.org/pdf/2303.15244v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.05206v3","updated":"2024-08-12T10:23:27Z","published":"2024-07-06T23:16:41Z","title":"Helios: An extremely low power event-based gesture recognition for\n always-on smart eyewear","summary":" This paper introduces Helios, the first extremely low-power, real-time,\nevent-based hand gesture recognition system designed for all-day use on smart\neyewear. As augmented reality (AR) evolves, current smart glasses like the Meta\nRay-Bans prioritize visual and wearable comfort at the expense of\nfunctionality. Existing human-machine interfaces (HMIs) in these devices, such\nas capacitive touch and voice controls, present limitations in ergonomics,\nprivacy and power consumption. Helios addresses these challenges by leveraging\nnatural hand interactions for a more intuitive and comfortable user experience.\nOur system utilizes an extremely low-power and compact 3mmx4mm/20mW event camera\nto perform natural hand-based gesture recognition for always-on smart eyewear.\nThe camera's output is processed by a convolutional neural network (CNN)\nrunning on an NXP Nano UltraLite compute platform, consuming less than 350mW.\nHelios can recognize seven classes of gestures, including subtle microgestures\nlike swipes and pinches, with 91% accuracy. We also demonstrate real-time\nperformance across 20 users at a remarkably low latency of 60ms. Our user\ntesting results align with the positive feedback we received during our recent\nsuccessful demo at AWE-USA-2024.\n","authors":["Prarthana Bhattacharyya","Joshua Mitton","Ryan Page","Owen Morgan","Ben Menzies","Gabriel Homewood","Kemi Jacobs","Paolo Baesso","Dave Trickett","Chris Mair","Taru Muhonen","Rory Clark","Louis Berridge","Richard Vigars","Iain Wallace"],"pdf_url":"https://arxiv.org/pdf/2407.05206v3.pdf","comment":"Accepted at ECCV-Integrating Computer Vision in Smart Eyewear, 2024.\n 18 pages, 10 figures. First three authors contributed equally to this paper"},{"id":"http://arxiv.org/abs/2408.00343v2","updated":"2024-08-12T10:19:08Z","published":"2024-08-01T07:27:54Z","title":"IN-Sight: Interactive Navigation through Sight","summary":" Current visual navigation systems often treat the environment as static,\nlacking the ability to adaptively interact with obstacles. This limitation\nleads to navigation failure when encountering unavoidable obstructions. In\nresponse, we introduce IN-Sight, a novel approach to self-supervised path\nplanning, enabling more effective navigation strategies through interaction\nwith obstacles. Utilizing RGB-D observations, IN-Sight calculates\ntraversability scores and incorporates them into a semantic map, facilitating\nlong-range path planning in complex, maze-like environments. To precisely\nnavigate around obstacles, IN-Sight employs a local planner, trained\nimperatively on a differentiable costmap using representation learning\ntechniques. The entire framework undergoes end-to-end training within the\nstate-of-the-art photorealistic Intel SPEAR Simulator. 
We validate the\neffectiveness of IN-Sight through extensive benchmarking in a variety of\nsimulated scenarios and ablation studies. Moreover, we demonstrate the system's\nreal-world applicability with zero-shot sim-to-real transfer, deploying our\nplanner on the legged robot platform ANYmal, showcasing its practical potential\nfor interactive navigation in real environments.\n","authors":["Philipp Schoch","Fan Yang","Yuntao Ma","Stefan Leutenegger","Marco Hutter","Quentin Leboutet"],"pdf_url":"https://arxiv.org/pdf/2408.00343v2.pdf","comment":"The 2024 IEEE/RSJ International Conference on Intelligent Robots and\n Systems (IROS 2024)"},{"id":"http://arxiv.org/abs/2408.06039v1","updated":"2024-08-12T10:13:45Z","published":"2024-08-12T10:13:45Z","title":"Spacetime $E(n)$-Transformer: Equivariant Attention for Spatio-temporal\n Graphs","summary":" We introduce an $E(n)$-equivariant Transformer architecture for\nspatio-temporal graph data. By imposing rotation, translation, and permutation\nequivariance inductive biases in both space and time, we show that the\nSpacetime $E(n)$-Transformer (SET) outperforms purely spatial and temporal\nmodels without symmetry-preserving properties. We benchmark SET against said\nmodels on the charged $N$-body problem, a simple physical system with complex\ndynamics. While existing spatio-temporal graph neural networks focus on\nsequential modeling, we empirically demonstrate that leveraging underlying\ndomain symmetries yields considerable improvements for modeling dynamical\nsystems on graphs.\n","authors":["Sergio G. Charles"],"pdf_url":"https://arxiv.org/pdf/2408.06039v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.19132v2","updated":"2024-08-12T10:00:17Z","published":"2024-04-29T22:31:21Z","title":"Integrating Present and Past in Unsupervised Continual Learning","summary":" We formulate a unifying framework for unsupervised continual learning (UCL),\nwhich disentangles learning objectives that are specific to the present and the\npast data, encompassing stability, plasticity, and cross-task consolidation.\nThe framework reveals that many existing UCL approaches overlook cross-task\nconsolidation and try to balance plasticity and stability in a shared embedding\nspace. This results in worse performance due to a lack of within-task data\ndiversity and reduced effectiveness in learning the current task. Our method,\nOsiris, which explicitly optimizes all three objectives on separate embedding\nspaces, achieves state-of-the-art performance on all benchmarks, including two\nnovel benchmarks proposed in this paper featuring semantically structured task\nsequences. Compared to standard benchmarks, these two structured benchmarks\nmore closely resemble visual signals received by humans and animals when\nnavigating real-world environments. Finally, we show some preliminary evidence\nthat continual models can benefit from such realistic learning scenarios.\n","authors":["Yipeng Zhang","Laurent Charlin","Richard Zemel","Mengye Ren"],"pdf_url":"https://arxiv.org/pdf/2404.19132v2.pdf","comment":"CoLLAs 2024 (Oral)"},{"id":"http://arxiv.org/abs/2401.16356v5","updated":"2024-08-12T09:57:03Z","published":"2024-01-29T17:59:26Z","title":"cDVGAN: One Flexible Model for Multi-class Gravitational Wave Signal and\n Glitch Generation","summary":" Simulating realistic time-domain observations of gravitational waves (GWs)\nand GW detector glitches can help in advancing GW data analysis. 
Simulated data\ncan be used in downstream tasks by augmenting datasets for signal searches,\nbalancing data sets for machine learning, and validating detection schemes. In\nthis work, we present Conditional Derivative GAN (cDVGAN), a novel conditional\nmodel in the Generative Adversarial Network framework for simulating multiple\nclasses of time-domain observations that represent gravitational waves (GWs)\nand detector glitches. cDVGAN can also generate generalized hybrid samples that\nspan the variation between classes through interpolation in the conditioned\nclass vector. cDVGAN introduces an additional player into the typical 2-player\nadversarial game of GANs, where an auxiliary discriminator analyzes the\nfirst-order derivative time-series. Our results show that this provides\nsynthetic data that better captures the features of the original data. cDVGAN\nconditions on three classes, two denoised from LIGO blip and tomte glitch\nevents from its 3rd observing run (O3), and the third representing binary black\nhole (BBH) mergers. Our proposed cDVGAN outperforms 4 different baseline GAN\nmodels in replicating the features of the three classes. Specifically, our\nexperiments show that training convolutional neural networks (CNNs) with our\ncDVGAN-generated data improves the detection of samples embedded in detector\nnoise beyond the synthetic data from other state-of-the-art GAN models. Our\nbest synthetic dataset yields as much as a 4.2% increase in\narea-under-the-curve (AUC) performance compared to synthetic datasets from\nbaseline GANs. Moreover, training the CNN with hybrid samples from our cDVGAN\noutperforms CNNs trained only on the standard classes, when identifying real\nsamples embedded in LIGO detector background (4% AUC improvement for cDVGAN).\n","authors":["Tom Dooney","Lyana Curier","Daniel Tan","Melissa Lopez","Chris Van Den Broeck","Stefano Bromuri"],"pdf_url":"https://arxiv.org/pdf/2401.16356v5.pdf","comment":"20 pages, 17 figures, 5 tables"},{"id":"http://arxiv.org/abs/2408.02932v2","updated":"2024-08-12T09:48:45Z","published":"2024-08-06T03:34:43Z","title":"Doubly Stochastic Adaptive Neighbors Clustering via the Marcus Mapping","summary":" Clustering is a fundamental task in machine learning and data science, and\nsimilarity graph-based clustering is an important approach within this domain.\nDoubly stochastic symmetric similarity graphs provide numerous benefits for\nclustering problems and downstream tasks, yet learning such graphs remains a\nsignificant challenge. Marcus theorem states that a strictly positive symmetric\nmatrix can be transformed into a doubly stochastic symmetric matrix by diagonal\nmatrices. However, in clustering, learning sparse matrices is crucial for\ncomputational efficiency. We extend Marcus theorem by proposing the Marcus\nmapping, which indicates that certain sparse matrices can also be transformed\ninto doubly stochastic symmetric matrices via diagonal matrices. Additionally,\nwe introduce rank constraints into the clustering problem and propose the\nDoubly Stochastic Adaptive Neighbors Clustering algorithm based on the Marcus\nMapping (ANCMM). This ensures that the learned graph naturally divides into the\ndesired number of clusters. We validate the effectiveness of our algorithm\nthrough extensive comparisons with state-of-the-art algorithms. Finally, we\nexplore the relationship between the Marcus mapping and optimal transport. 
We\nprove that the Marcus mapping solves a specific type of optimal transport\nproblem and demonstrate that solving this problem through the Marcus mapping is\nmore efficient than directly applying optimal transport methods.\n","authors":["Jinghui Yuan","Chusheng Zeng","Fangyuan Xie","Zhe Cao","Mulin Chen","Rong Wang","Feiping Nie","Yuan Yuan"],"pdf_url":"https://arxiv.org/pdf/2408.02932v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.02936v2","updated":"2024-08-12T09:46:30Z","published":"2024-08-06T03:42:38Z","title":"Achieving More with Less: A Tensor-Optimization-Powered Ensemble Method","summary":" Ensemble learning is a method that leverages weak learners to produce a\nstrong learner. However, obtaining a large number of base learners requires\nsubstantial time and computational resources. Therefore, it is meaningful to\nstudy how to achieve the performance typically obtained with many base learners\nusing only a few. We argue that to achieve this, it is essential to enhance\nboth classification performance and generalization ability during the ensemble\nprocess. To increase model accuracy, each weak base learner needs to be more\nefficiently integrated. It is observed that different base learners exhibit\nvarying levels of accuracy in predicting different classes. To capitalize on\nthis, we introduce confidence tensors $\\tilde{\\mathbf{\\Theta}}$, where\n$\\tilde{\\mathbf{\\Theta}}_{rst}$ signifies the degree of confidence that the\n$t$-th base classifier assigns the sample to class $r$ while it actually\nbelongs to class $s$. To the best of our knowledge, this is the first time an\nevaluation of the performance of base classifiers across different classes has\nbeen proposed. The proposed confidence tensor compensates for the strengths and\nweaknesses of each base classifier in different classes, enabling the method to\nachieve superior results with a smaller number of base learners. To enhance\ngeneralization performance, we design a smooth and convex objective function\nthat leverages the concept of margin, making the strong learner more\ndiscriminative. Furthermore, it is proved that in the gradient matrix of the loss\nfunction, the sum of each column's elements is zero, allowing us to solve a\nconstrained optimization problem using gradient-based methods. 
We then compare\nour algorithm with random forests of ten times the size and other classical\nmethods across numerous datasets, demonstrating the superiority of our\napproach.\n","authors":["Jinghui Yuan","Weijin Jiang","Zhe Cao","Fangyuan Xie","Rong Wang","Feiping Nie","Yuan Yuan"],"pdf_url":"https://arxiv.org/pdf/2408.02936v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.06029v1","updated":"2024-08-12T09:38:15Z","published":"2024-08-12T09:38:15Z","title":"Graph Clustering with Cross-View Feature Propagation","summary":" Graph clustering is a fundamental and challenging learning task, which is\nconventionally approached by grouping similar vertices based on edge structure\nand feature similarity. In contrast to previous methods, in this paper, we\ninvestigate how multi-view feature propagation can influence cluster discovery\nin graph data. To this end, we present Graph Clustering With Cross-View Feature\nPropagation (GCCFP), a novel method that leverages multi-view feature\npropagation to enhance cluster identification in graph data. GCCFP employs a\nunified objective function that utilizes graph topology and multi-view vertex\nfeatures to determine vertex cluster membership, regularized by a module that\nsupports key latent feature propagation. We derive an iterative algorithm to\noptimize this function, prove model convergence within a finite number of\niterations, and analyze its computational complexity. Our experiments on\nvarious real-world graphs demonstrate the superior clustering performance of\nGCCFP compared to well-established methods, manifesting its effectiveness\nacross different scenarios.\n","authors":["Zhixuan Duan","Zuo Wang","Fanghui Bi"],"pdf_url":"https://arxiv.org/pdf/2408.06029v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.05510v2","updated":"2024-08-12T09:31:30Z","published":"2024-06-08T16:19:18Z","title":"Representation Learning with Conditional Information Flow Maximization","summary":" This paper proposes an information-theoretic representation learning\nframework, named conditional information flow maximization, to extract\nnoise-invariant sufficient representations for the input data and target task.\nIt encourages the learned representations to have good feature uniformity and\nsufficient predictive ability, which can enhance the generalization of\npre-trained language models (PLMs) for the target task. Firstly, an information\nflow maximization principle is proposed to learn more sufficient\nrepresentations for the input and target by simultaneously maximizing both\ninput-representation and representation-label mutual information. 
Unlike the\ninformation bottleneck, we handle the input-representation information in an\nopposite way to avoid the over-compression issue of latent representations.\nIn addition, to mitigate the negative effect of potential redundant features from\nthe input, we design a conditional information minimization principle to\neliminate negative redundant features while preserving noise-invariant features.\nExperiments on 13 language understanding benchmarks demonstrate that our method\neffectively improves the performance of PLMs for classification and regression.\nExtensive experiments show that the learned representations are more\nsufficient, robust and transferable.\n","authors":["Dou Hu","Lingwei Wei","Wei Zhou","Songlin Hu"],"pdf_url":"https://arxiv.org/pdf/2406.05510v2.pdf","comment":"16 pages, accepted to ACL 2024 (main conference), the code is\n available at https://github.com/zerohd4869/CIFM"},{"id":"http://arxiv.org/abs/2408.06027v1","updated":"2024-08-12T09:29:26Z","published":"2024-08-12T09:29:26Z","title":"A Comprehensive Survey on EEG-Based Emotion Recognition: A Graph-Based\n Perspective","summary":" Compared to other modalities, electroencephalogram (EEG) based emotion\nrecognition can intuitively respond to emotional patterns in the human brain\nand, therefore, has become one of the most actively studied tasks in affective\ncomputing. The nature of emotions is a physiological and psychological state\nchange in response to brain region connectivity, making emotion recognition\nfocus more on the dependencies between brain regions than on specific brain\nregions. A significant trend is the application of graphs to encapsulate such\ndependency as dynamic functional connections between nodes across temporal and\nspatial dimensions. Concurrently, the neuroscientific underpinnings behind this\ndependency endow the application of graphs in this field with a distinctive\nsignificance. However, there is neither a comprehensive review nor a tutorial\nfor constructing emotion-relevant graphs in EEG-based emotion recognition. In\nthis paper, we present a comprehensive survey of these studies, delivering a\nsystematic review of graph-related methods in this field from a methodological\nperspective. We propose a unified framework for graph applications in this\nfield and categorize these methods on this basis. Finally, based on previous\nstudies, we also present several open challenges and future directions in this\nfield.\n","authors":["Chenyu Liu","Xinliang Zhou","Yihao Wu","Yi Ding","Liming Zhai","Kun Wang","Ziyu Jia","Yang Liu"],"pdf_url":"https://arxiv.org/pdf/2408.06027v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.06024v1","updated":"2024-08-12T09:24:48Z","published":"2024-08-12T09:24:48Z","title":"Layer-Specific Optimization: Sensitivity Based Convolution Layers Basis\n Search","summary":" Deep neural network models have a complex architecture and are\noverparameterized. The number of parameters exceeds the size of the whole dataset,\nwhich makes these models highly resource-consuming. This complicates their application and\nlimits their usage on different devices. Reduction in the number of network\nparameters helps to reduce the size of the model but, if applied\nthoughtlessly, can lead to a deterioration in the quality of the\nnetwork. 
One way to reduce the number of model parameters is matrix\ndecomposition, where a matrix is represented as a product of smaller matrices.\nIn this paper, we propose a new way of applying the matrix decomposition with\nrespect to the weights of convolutional layers. The essence of the method is to\ntrain not all convolutions, but only a subset of convolutions (basis\nconvolutions), and represent the rest as linear combinations of the basis ones.\nExperiments on models from the ResNet family and the CIFAR-10 dataset\ndemonstrate that basis convolutions can not only reduce the size of the model\nbut also accelerate the forward and backward passes of the network. Another\ncontribution of this work is that we propose a fast method for selecting a\nsubset of network layers in which the use of matrix decomposition does not\ndegrade the quality of the final model.\n","authors":["Vasiliy Alekseev","Ilya Lukashevich","Ilia Zharikov","Ilya Vasiliev"],"pdf_url":"https://arxiv.org/pdf/2408.06024v1.pdf","comment":"A revived draft of an unpublished (and never-to-be-published)\n article. For the sake of history, memory, and old times"},{"id":"http://arxiv.org/abs/2408.06018v1","updated":"2024-08-12T09:14:23Z","published":"2024-08-12T09:14:23Z","title":"Uncertainty-Informed Volume Visualization using Implicit Neural\n Representation","summary":" The increasing adoption of Deep Neural Networks (DNNs) has led to their\napplication in many challenging scientific visualization tasks. While advanced\nDNNs offer impressive generalization capabilities, understanding factors such\nas model prediction quality, robustness, and uncertainty is crucial. These\ninsights can enable domain scientists to make informed decisions about their\ndata. However, DNNs inherently lack the ability to estimate prediction uncertainty,\nnecessitating new research to construct robust uncertainty-aware visualization\ntechniques tailored for various visualization tasks. In this work, we propose\nuncertainty-aware implicit neural representations to model scalar field data\nsets effectively and comprehensively study the efficacy and benefits of\nestimated uncertainty information for volume visualization tasks. We evaluate\nthe effectiveness of two principled deep uncertainty estimation techniques: (1)\nDeep Ensemble and (2) Monte Carlo Dropout (MCDropout). These techniques enable\nuncertainty-informed volume visualization in scalar field data sets. Our\nextensive exploration across multiple data sets demonstrates that\nuncertainty-aware models produce informative volume visualization results.\nMoreover, integrating prediction uncertainty enhances the trustworthiness of\nour DNN model, making it suitable for robustly analyzing and visualizing\nreal-world scientific volumetric data sets.\n","authors":["Shanu Saklani","Chitwan Goel","Shrey Bansal","Zhe Wang","Soumya Dutta","Tushar M. Athawale","David Pugmire","Christopher R. Johnson"],"pdf_url":"https://arxiv.org/pdf/2408.06018v1.pdf","comment":"To appear in IEEE Workshop on Uncertainty Visualization in\n conjunction with IEEE VIS 2024, Florida, USA"},{"id":"http://arxiv.org/abs/2408.06003v1","updated":"2024-08-12T08:52:14Z","published":"2024-08-12T08:52:14Z","title":"LUT Tensor Core: Lookup Table Enables Efficient Low-Bit LLM Inference\n Acceleration","summary":" As large language model (LLM) inference demands ever-greater resources, there\nis a rapidly growing trend of using low-bit weights to shrink memory usage and\nboost inference efficiency. 
However, these low-bit LLMs introduce the need for\nmixed-precision matrix multiplication (mpGEMM), which is a crucial yet\nunder-explored operation that involves multiplying lower-precision weights with\nhigher-precision activations. Unfortunately, current hardware does not natively\nsupport mpGEMM, resulting in indirect and inefficient dequantization-based\nimplementations.\n To address the mpGEMM requirements in low-bit LLMs, we explored the lookup\ntable (LUT)-based approach for mpGEMM. However, a conventional LUT\nimplementation falls short of its potential. To fully harness the power of\nLUT-based mpGEMM, we introduce LUT Tensor Core, a software-hardware co-design\noptimized for low-bit LLM inference. Specifically, we introduce software-based\noperator fusion and table symmetrization techniques to optimize table\nprecompute and table storage, respectively. Then, LUT Tensor Core proposes the\nhardware design featuring an elongated tiling shape design to enhance table\nreuse and a bit-serial design to support various precision combinations in\nmpGEMM. Moreover, we design an end-to-end compilation stack with new\ninstructions for LUT-based mpGEMM, enabling efficient LLM compilation and\noptimizations. The evaluation on low-bit LLMs (e.g., BitNet, LLAMA) shows that\nLUT Tensor Core achieves more than a magnitude of improvements on both compute\ndensity and energy efficiency.\n","authors":["Zhiwen Mo","Lei Wang","Jianyu Wei","Zhichen Zeng","Shijie Cao","Lingxiao Ma","Naifeng Jing","Ting Cao","Jilong Xue","Fan Yang","Mao Yang"],"pdf_url":"https://arxiv.org/pdf/2408.06003v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2101.11003v2","updated":"2024-08-12T08:43:35Z","published":"2021-01-26T10:07:33Z","title":"FDApy: a Python package for functional data","summary":" We introduce FDApy, an open-source Python package for the analysis of\nfunctional data. The package provides tools for the representation of\n(multivariate) functional data defined on different dimensional domains and for\nfunctional data that is irregularly sampled. Additionally, dimension reduction\ntechniques are implemented for multivariate and/or multidimensional functional\ndata that are regularly or irregularly sampled. A toolbox for generating\nfunctional datasets is also provided. The documentation includes installation\nand usage instructions, examples on simulated and real datasets and a complete\ndescription of the API. FDApy is released under the MIT license. The code and\ndocumentation are available at https://github.com/StevenGolovkine/FDApy.\n","authors":["Steven Golovkine"],"pdf_url":"https://arxiv.org/pdf/2101.11003v2.pdf","comment":"18 pages, 11 figures"},{"id":"http://arxiv.org/abs/2408.05992v1","updated":"2024-08-12T08:40:20Z","published":"2024-08-12T08:40:20Z","title":"Transfer learning of state-based potential games for process\n optimization in decentralized manufacturing systems","summary":" This paper presents a novel transfer learning approach in state-based\npotential games (TL-SbPGs) for enhancing distributed self-optimization in\nmanufacturing systems. The approach focuses on the practical relevant\nindustrial setting where sharing and transferring gained knowledge among\nsimilar-behaved players improves the self-learning mechanism in large-scale\nsystems. With TL-SbPGs, the gained knowledge can be reused by other players to\noptimize their policies, thereby improving the learning outcomes of the players\nand accelerating the learning process. 
To accomplish this goal, we develop\ntransfer learning concepts and similarity criteria for players, which offer two\ndistinct settings: (a) predefined similarities between players and (b)\ndynamically inferred similarities between players during training. We formally\nprove the applicability of the SbPG framework in transfer learning.\nAdditionally, we introduce an efficient method to determine the optimal timing\nand weighting of the transfer learning procedure during the training phase.\nThrough experiments on a laboratory-scale testbed, we demonstrate that TL-SbPGs\nsignificantly boost production efficiency and reduce the power consumption of\nthe production schedules, while also outperforming native SbPGs.\n","authors":["Steve Yuwono","Dorothea Schwung","Andreas Schwung"],"pdf_url":"https://arxiv.org/pdf/2408.05992v1.pdf","comment":"This pre-print was submitted to Computers in Industry on May 02, 2024"},{"id":"http://arxiv.org/abs/2403.12559v2","updated":"2024-08-12T08:37:24Z","published":"2024-03-19T09:14:52Z","title":"Confidence Self-Calibration for Multi-Label Class-Incremental Learning","summary":" The partial label challenge in Multi-Label Class-Incremental Learning (MLCIL)\narises when only the new classes are labeled during training, while past and\nfuture labels remain unavailable. This issue leads to a proliferation of\nfalse-positive errors due to erroneously high confidence multi-label\npredictions, exacerbating catastrophic forgetting within the disjoint label\nspace. In this paper, we aim to refine multi-label confidence calibration in\nMLCIL and propose a Confidence Self-Calibration (CSC) approach. Firstly, for\nlabel relationship calibration, we introduce a class-incremental graph\nconvolutional network that bridges the isolated label spaces by constructing a\nlearnable, dynamically extended label relationship graph. Then, for confidence\ncalibration, we present a max-entropy regularization for each multi-label\nincrement, facilitating confidence self-calibration through the penalization of\nover-confident output distributions. Our approach attains new state-of-the-art\nresults in MLCIL tasks on both MS-COCO and PASCAL VOC datasets, with the\ncalibration of label confidences confirmed through our methodology.\n","authors":["Kaile Du","Yifan Zhou","Fan Lyu","Yuyang Li","Chen Lu","Guangcan Liu"],"pdf_url":"https://arxiv.org/pdf/2403.12559v2.pdf","comment":"Accepted at the European Conference on Computer Vision (ECCV) 2024"},{"id":"http://arxiv.org/abs/2401.04191v2","updated":"2024-08-12T08:34:26Z","published":"2024-01-08T19:06:59Z","title":"Dense Hopfield Networks in the Teacher-Student Setting","summary":" Dense Hopfield networks are known for their feature to prototype transition\nand adversarial robustness. However, previous theoretical studies have been\nmostly concerned with their storage capacity. We bridge this gap by studying\nthe phase diagram of p-body Hopfield networks in the teacher-student setting of\nan unsupervised learning problem, uncovering ferromagnetic phases reminiscent\nof the prototype and feature learning regimes. On the Nishimori line, we find\nthe critical size of the training set necessary for efficient pattern\nretrieval. Interestingly, we find that the paramagnetic to ferromagnetic\ntransition of the teacher-student setting coincides with the paramagnetic to\nspin-glass transition of the direct model, i.e. with random patterns. 
Outside\nof the Nishimori line, we investigate the learning performance in relation to\nthe inference temperature and dataset noise. Moreover, we show that using a\nlarger p for the student than the teacher gives the student an extensive\ntolerance to noise. We then derive a closed-form expression measuring the\nadversarial robustness of such a student at zero temperature, corroborating the\npositive correlation between number of parameters and robustness observed in\nlarge neural networks. We also use our model to clarify why the prototype phase\nof modern Hopfield networks is adversarially robust.\n","authors":["Robin Thériault","Daniele Tantari"],"pdf_url":"https://arxiv.org/pdf/2401.04191v2.pdf","comment":"34 pages, 9 figures, updated to match published version, implemented\n minor changes proposed in referee reports"},{"id":"http://arxiv.org/abs/2408.05990v1","updated":"2024-08-12T08:33:09Z","published":"2024-08-12T08:33:09Z","title":"Parameters Inference for Nonlinear Wave Equations with Markovian\n Switching","summary":" Traditional partial differential equations with constant coefficients often\nstruggle to capture abrupt changes in real-world phenomena, leading to the\ndevelopment of variable coefficient PDEs and Markovian switching models.\nRecently, research has introduced the concept of PDEs with Markov switching\nmodels, established their well-posedness and presented numerical methods.\nHowever, there has been limited discussion on parameter estimation for the jump\ncoefficients in these models. This paper addresses this gap by focusing on\nparameter inference for the wave equation with Markovian switching. We propose\na Bayesian statistical framework using discrete sparse Bayesian learning to\nestablish its convergence and a uniform error bound. Our method requires fewer\nassumptions and enables independent parameter inference for each segment by\nallowing different underlying structures for the parameter estimation problem\nwithin each segmented time interval. The effectiveness of our approach is\ndemonstrated through three numerical cases, which involve noisy spatiotemporal\ndata from different wave equations with Markovian switching. The results show\nstrong performance in parameter estimation for variable coefficient PDEs.\n","authors":["Yi Zhang","Zhikun Zhang","Xiangjun Wang"],"pdf_url":"https://arxiv.org/pdf/2408.05990v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.13840v3","updated":"2024-08-12T08:30:05Z","published":"2023-05-23T09:03:19Z","title":"Control-A-Video: Controllable Text-to-Video Diffusion Models with Motion\n Prior and Reward Feedback Learning","summary":" Recent advances in text-to-image (T2I) diffusion models have enabled\nimpressive image generation capabilities guided by text prompts. However,\nextending these techniques to video generation remains challenging, with\nexisting text-to-video (T2V) methods often struggling to produce high-quality\nand motion-consistent videos. In this work, we introduce Control-A-Video, a\ncontrollable T2V diffusion model that can generate videos conditioned on text\nprompts and reference control maps like edge and depth maps. To tackle video\nquality and motion consistency issues, we propose novel strategies to\nincorporate content prior and motion prior into the diffusion-based generation\nprocess. Specifically, we employ a first-frame condition scheme to transfer\nvideo generation from the image domain. 
Additionally, we introduce\nresidual-based and optical flow-based noise initialization to infuse motion\npriors from reference videos, promoting relevance among frame latents for\nreduced flickering. Furthermore, we present a Spatio-Temporal Reward Feedback\nLearning (ST-ReFL) algorithm that optimizes the video diffusion model using\nmultiple reward models for video quality and motion consistency, leading to\nsuperior outputs. Comprehensive experiments demonstrate that our framework\ngenerates higher-quality, more consistent videos compared to existing\nstate-of-the-art methods in controllable text-to-video generation.\n","authors":["Weifeng Chen","Yatai Ji","Jie Wu","Hefeng Wu","Pan Xie","Jiashi Li","Xin Xia","Xuefeng Xiao","Liang Lin"],"pdf_url":"https://arxiv.org/pdf/2305.13840v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.03720v4","updated":"2024-08-12T08:27:48Z","published":"2023-09-07T13:52:20Z","title":"A Natural Gas Consumption Forecasting System for Continual Learning\n Scenarios based on Hoeffding Trees with Change Point Detection Mechanism","summary":" Forecasting natural gas consumption, considering seasonality and trends, is\ncrucial in planning its supply and consumption and optimizing the cost of\nobtaining it, mainly by industrial entities. However, in times of threats to\nits supply, it is also a critical element that guarantees the supply of this\nraw material to meet individual consumers' needs, ensuring society's energy\nsecurity. This article introduces a novel multistep-ahead forecasting method for\nnatural gas consumption with change point detection integration for model\ncollection selection with continual learning capabilities using data stream\nprocessing. The performance of the forecasting models based on the proposed\napproach is evaluated in a complex real-world use case of natural gas\nconsumption forecasting. We employed Hoeffding tree predictors as forecasting\nmodels and the Pruned Exact Linear Time (PELT) algorithm for the change point\ndetection procedure. The change point detection integration enables selecting a\ndifferent model collection for successive time frames. Thus, three model\ncollection selection procedures (with and without an error feedback loop) are\ndefined and evaluated for forecasting scenarios with various densities of\ndetected change points. These models were compared with change point agnostic\nbaseline approaches. Our experiments show that fewer change points result in a\nlower forecasting error regardless of the model collection selection procedure\nemployed. Also, simpler model collection selection procedures omitting\nforecasting error feedback lead to more robust forecasting models suitable for\ncontinual learning tasks.\n","authors":["Radek Svoboda","Sebastian Basterrech","Jedrzej Kozal","Jan Platos","Michal Wozniak"],"pdf_url":"https://arxiv.org/pdf/2309.03720v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.05539v3","updated":"2024-08-12T08:26:26Z","published":"2023-11-09T17:34:57Z","title":"A Deep Learning Method for Simultaneous Denoising and Missing Wedge\n Reconstruction in Cryogenic Electron Tomography","summary":" Cryogenic electron tomography is a technique for imaging biological samples\nin 3D. 
A microscope collects a series of 2D projections of the sample, and the\ngoal is to reconstruct the 3D density of the sample called the tomogram.\nReconstruction is difficult as the 2D projections are noisy and cannot be\nrecorded from all directions, resulting in a missing wedge of information.\nTomograms conventionally reconstructed with filtered back-projection suffer\nfrom noise and strong artifacts due to the missing wedge. Here, we propose a\ndeep-learning approach for simultaneous denoising and missing wedge\nreconstruction called DeepDeWedge. The algorithm requires no ground truth data\nand is based on fitting a neural network to the 2D projections using a\nself-supervised loss. DeepDeWedge is simpler than current state-of-the-art\napproaches for denoising and missing wedge reconstruction, performs\ncompetitively and produces more denoised tomograms with higher overall\ncontrast.\n","authors":["Simon Wiedemann","Reinhard Heckel"],"pdf_url":"https://arxiv.org/pdf/2311.05539v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.15740v2","updated":"2024-08-12T08:21:32Z","published":"2024-03-23T06:36:32Z","title":"Protecting Copyrighted Material with Unique Identifiers in Large\n Language Model Training","summary":" A major public concern regarding the training of large language models (LLMs)\nis whether they are abusing copyrighted online text. Previous membership inference\nmethods may be misled by similar examples in vast amounts of training data.\nAdditionally, these methods are often too complex for general users to\nunderstand and use, making them centralized and lacking transparency and\ntrustworthiness. To address these issues, we propose an alternative\n\textit{insert-and-detection} methodology, advocating that web users and\ncontent platforms employ \textbf{\textit{unique identifiers}} for reliable and\nindependent membership inference. Users and platforms can create their own\nidentifiers, embed them in copyrighted text, and independently detect them in\nfuture LLMs. As an initial demonstration, we introduce \textit{ghost\nsentences}, a primitive form of unique identifiers, consisting primarily of\npassphrases made up of random words. By embedding one ghost sentence in a few\ncopyrighted texts, users can detect its membership using a perplexity test and\na \textit{user-friendly} last-$k$ words test. The perplexity test is based on\nthe fact that LLMs trained on natural language should exhibit high perplexity\nwhen encountering unnatural passphrases. As the repetition increases, users can\nleverage the verbatim memorization ability of LLMs to perform a last-$k$ words\ntest by chatting with LLMs without writing any code. Both tests offer rigorous\nstatistical guarantees for membership inference. For LLaMA-13B, a perplexity\ntest on 30 ghost sentences with an average of 7 repetitions in 148K examples\nyields a 0.891 ROC AUC. For the last-$k$ words test with OpenLLaMA-3B, 11 out\nof 16 users, with an average of 24 examples each, successfully identify their\ndata from 1.8M examples.\n","authors":["Shuai Zhao","Linchao Zhu","Ruijie Quan","Yi Yang"],"pdf_url":"https://arxiv.org/pdf/2403.15740v2.pdf","comment":"Preprint, work in progress"},{"id":"http://arxiv.org/abs/2201.08078v4","updated":"2024-08-12T08:14:59Z","published":"2022-01-20T09:22:43Z","title":"Addressing Maximization Bias in Reinforcement Learning with Two-Sample\n Testing","summary":" Value-based reinforcement-learning algorithms have shown strong results in\ngames, robotics, and other real-world applications. 
Overestimation bias is a\nknown threat to those algorithms and can sometimes lead to dramatic performance\ndecreases or even complete algorithmic failure. We frame the bias problem\nstatistically and consider it an instance of estimating the maximum expected\nvalue (MEV) of a set of random variables. We propose the $T$-Estimator (TE)\nbased on two-sample testing for the mean, that flexibly interpolates between\nover- and underestimation by adjusting the significance level of the underlying\nhypothesis tests. We also introduce a generalization, termed $K$-Estimator\n(KE), that obeys the same bias and variance bounds as the TE and relies on a\nnearly arbitrary kernel function. We introduce modifications of $Q$-Learning\nand the Bootstrapped Deep $Q$-Network (BDQN) using the TE and the KE, and prove\nconvergence in the tabular setting. Furthermore, we propose an adaptive variant\nof the TE-based BDQN that dynamically adjusts the significance level to\nminimize the absolute estimation bias. All proposed estimators and algorithms\nare thoroughly tested and validated on diverse tasks and environments,\nillustrating the bias control and performance potential of the TE and KE.\n","authors":["Martin Waltz","Ostap Okhrin"],"pdf_url":"https://arxiv.org/pdf/2201.08078v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.05976v1","updated":"2024-08-12T08:05:30Z","published":"2024-08-12T08:05:30Z","title":"Global-to-Local Support Spectrums for Language Model Explainability","summary":" Existing sample-based methods, like influence functions and representer\npoints, measure the importance of a training point by approximating the effect\nof its removal from training. As such, they are skewed towards outliers and\npoints that are very close to the decision boundaries. The explanations\nprovided by these methods are often static and not specific enough for\ndifferent test points. In this paper, we propose a method to generate an\nexplanation in the form of support spectrums which are based on two main ideas:\nthe support sets and a global-to-local importance measure. The support set is\nthe set of training points, in the predicted class, that ``lie in between'' the\ntest point and training points in the other classes. They indicate how well the\ntest point can be distinguished from the points not in the predicted class. The\nglobal-to-local importance measure is obtained by decoupling existing methods\ninto the global and local components which are then used to select the points\nin the support set. Using this method, we are able to generate explanations\nthat are tailored to specific test points. In the experiments, we show the\neffectiveness of the method in image classification and text generation tasks.\n","authors":["Lucas Agussurja","Xinyang Lu","Bryan Kian Hsiang Low"],"pdf_url":"https://arxiv.org/pdf/2408.05976v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.09548v2","updated":"2024-08-12T08:02:06Z","published":"2024-06-13T19:29:37Z","title":"Between Randomness and Arbitrariness: Some Lessons for Reliable Machine\n Learning at Scale","summary":" To develop rigorous knowledge about ML models -- and the systems in which\nthey are embedded -- we need reliable measurements. But reliable measurement is\nfundamentally challenging, and touches on issues of reproducibility,\nscalability, uncertainty quantification, epistemology, and more. 
This\ndissertation addresses criteria needed to take reliability seriously: both\ncriteria for designing meaningful metrics, and for methodologies that ensure\nthat we can dependably and efficiently measure these metrics at scale and in\npractice. In doing so, this dissertation articulates a research vision for a\nnew field of scholarship at the intersection of machine learning, law, and\npolicy. Within this frame, we cover topics that fit under three different\nthemes: (1) quantifying and mitigating sources of arbitrariness in ML, (2)\ntaming randomness in uncertainty estimation and optimization algorithms, in\norder to achieve scalability without sacrificing reliability, and (3) providing\nmethods for evaluating generative-AI systems, with specific focuses on\nquantifying memorization in language models and training latent diffusion\nmodels on open-licensed data. By making contributions in these three themes,\nthis dissertation serves as an empirical proof by example that research on\nreliable measurement for machine learning is intimately and inescapably bound\nup with research in law and policy. These different disciplines pose similar\nresearch questions about reliable measurement in machine learning. They are, in\nfact, two complementary sides of the same research vision, which, broadly\nconstrued, aims to construct machine-learning systems that cohere with broader\nsocietal values.\n","authors":["A. Feder Cooper"],"pdf_url":"https://arxiv.org/pdf/2406.09548v2.pdf","comment":"Ph.D. Dissertation"},{"id":"http://arxiv.org/abs/2408.05964v1","updated":"2024-08-12T07:33:11Z","published":"2024-08-12T07:33:11Z","title":"Target Detection of Safety Protective Gear Using the Improved YOLOv5","summary":" In high-risk railway construction, personal protective equipment monitoring\nis critical but challenging due to small and frequently obstructed targets. We\npropose YOLO-EA, an innovative model that enhances safety measure detection by\nintegrating ECA into its backbone's convolutional layers, improving discernment\nof minuscule objects like hardhats. YOLO-EA further refines target recognition\nunder occlusion by replacing GIoU with EIoU loss. YOLO-EA's effectiveness was\nempirically substantiated using a dataset derived from real-world railway\nconstruction site surveillance footage. It outperforms YOLOv5, achieving 98.9%\nprecision and 94.7% recall, up 2.5% and 0.5% respectively, while maintaining\nreal-time performance at 70.774 fps. This highly efficient and precise YOLO-EA\nholds great promise for practical application in intricate construction\nscenarios, enforcing stringent safety compliance during complex railway\nconstruction projects.\n","authors":["Hao Liu","Xue Qin"],"pdf_url":"https://arxiv.org/pdf/2408.05964v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.18508v2","updated":"2024-08-12T07:13:54Z","published":"2024-04-29T08:50:27Z","title":"Scalable Event-by-event Processing of Neuromorphic Sensory Signals With\n Deep State-Space Models","summary":" Event-based sensors are well suited for real-time processing due to their\nfast response times and encoding of the sensory data as successive temporal\ndifferences. These and other valuable properties, such as a high dynamic range,\nare suppressed when the data is converted to a frame-based format. However,\nmost current methods either collapse events into frames or cannot scale up when\nprocessing the event data directly event-by-event. 
In this work, we address the\nkey challenges of scaling up event-by-event modeling of the long event streams\nemitted by such sensors, which is a particularly relevant problem for\nneuromorphic computing. While prior methods can process up to a few thousand\ntime steps, our model, based on modern recurrent deep state-space models,\nscales to event streams of millions of events for both training and\ninference. We leverage their stable parameterization for learning long-range\ndependencies, parallelizability along the sequence dimension, and their ability\nto integrate asynchronous events effectively to scale them up to long event\nstreams. We further augment these with novel event-centric techniques enabling\nour model to match or beat the state-of-the-art performance on several event\nstream benchmarks. In the Spiking Speech Commands task, we improve\nstate-of-the-art by a large margin of 6.6% to 87.1%. On the DVS128-Gestures\ndataset, we achieve competitive results without using frames or convolutional\nneural networks. Our work demonstrates, for the first time, that it is possible\nto use fully event-based processing with purely recurrent networks to achieve\nstate-of-the-art task performance in several event-based benchmarks.\n","authors":["Mark Schöne","Neeraj Mohan Sushma","Jingyue Zhuge","Christian Mayr","Anand Subramoney","David Kappel"],"pdf_url":"https://arxiv.org/pdf/2404.18508v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.04353v3","updated":"2024-08-12T07:11:20Z","published":"2023-04-10T02:22:36Z","title":"Exponentially improved efficient machine learning for quantum many-body\n states with provable guarantees","summary":" Solving the ground state and the ground-state properties of quantum many-body\nsystems is generically a hard task for classical algorithms. For a family of\nHamiltonians defined on an $m$-dimensional space of physical parameters, the\nground state and its properties at an arbitrary parameter configuration can be\npredicted via a machine learning protocol up to a prescribed prediction error\n$\varepsilon$, provided that a sample set (of size $N$) of the states can be\nefficiently prepared and measured. In a recent work [Huang et al., Science 377,\neabk3333 (2022)], a rigorous guarantee for such a generalization was proved.\nUnfortunately, an exponential scaling for the provable sample complexity,\n$N=m^{{\cal{O}}\left(\frac{1}{\varepsilon}\right)}$, was found to be universal\nfor generic gapped Hamiltonians. This result applies to the situation where the\ndimension of the parameter space is large while the scaling with the accuracy\nis not an urgent factor. In this work, we consider an alternative scenario\nwhere $m$ is a finite, not necessarily large constant while the scaling with\nthe prediction error becomes the central concern. By jointly preserving the\nfundamental properties of density matrices in the learning protocol and\nutilizing the continuity of quantum states in the parameter range of interest,\nwe rigorously obtain a polynomial sample complexity for predicting quantum\nmany-body states and their properties, with respect to the uniform prediction\nerror $\varepsilon$ and the number of qubits $n$. Moreover, if restricted to\nlearning local quantum-state properties, the number of samples with respect to\n$n$ can be further reduced exponentially. 
Our results provide theoretical\nguarantees for efficient learning of quantum many-body states and their\nproperties, with model-independent applications not restricted to ground states\nof gapped Hamiltonians.\n","authors":["Yanming Che","Clemens Gneiting","Franco Nori"],"pdf_url":"https://arxiv.org/pdf/2304.04353v3.pdf","comment":"Published on Physical Review Research 6, 033035 (2024)"},{"id":"http://arxiv.org/abs/2408.04903v2","updated":"2024-08-12T07:04:56Z","published":"2024-08-09T07:10:07Z","title":"Axiomatic Characterisations of Sample-based Explainers","summary":" Explaining decisions of black-box classifiers is both important and\ncomputationally challenging. In this paper, we scrutinize explainers that\ngenerate feature-based explanations from samples or datasets. We start by\npresenting a set of desirable properties that explainers would ideally satisfy,\ndelve into their relationships, and highlight incompatibilities of some of\nthem. We identify the entire family of explainers that satisfy two key\nproperties which are compatible with all the others. Its instances provide\nsufficient reasons, called weak abductive explanations. We then unravel its\nvarious subfamilies that satisfy subsets of compatible properties. Indeed, we\nfully characterize all the explainers that satisfy any subset of compatible\nproperties. In particular, we introduce the first (broad family of) explainers\nthat guarantee the existence of explanations and their global consistency. We\ndiscuss some of its instances including the irrefutable explainer and the\nsurrogate explainer whose explanations can be found in polynomial time.\n","authors":["Leila Amgoud","Martin C. Cooper","Salim Debbaoui"],"pdf_url":"https://arxiv.org/pdf/2408.04903v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.05948v1","updated":"2024-08-12T06:48:43Z","published":"2024-08-12T06:48:43Z","title":"ConvKGYarn: Spinning Configurable and Scalable Conversational Knowledge\n Graph QA datasets with Large Language Models","summary":" The rapid advancement of Large Language Models (LLMs) and conversational\nassistants necessitates dynamic, scalable, and configurable conversational\ndatasets for training and evaluation. These datasets must accommodate diverse\nuser interaction modes, including text and voice, each presenting unique\nmodeling challenges. Knowledge Graphs (KGs), with their structured and evolving\nnature, offer an ideal foundation for current and precise knowledge. Although\nhuman-curated KG-based conversational datasets exist, they struggle to keep\npace with the rapidly changing user information needs. We present ConvKGYarn, a\nscalable method for generating up-to-date and configurable conversational KGQA\ndatasets. Qualitative psychometric analyses confirm our method can generate\nhigh-quality datasets rivaling a popular conversational KGQA dataset while\noffering it at scale and covering a wide range of human-interaction\nconfigurations. We showcase its utility by testing LLMs on diverse\nconversations - exploring model behavior on conversational KGQA sets with\ndifferent configurations grounded in the same KG fact set. 
Our results\nhighlight the ability of ConvKGYarn to improve KGQA foundations and evaluate\nparametric knowledge of LLMs, thus offering a robust solution to the constantly\nevolving landscape of conversational assistants.\n","authors":["Ronak Pradeep","Daniel Lee","Ali Mousavi","Jeff Pound","Yisi Sang","Jimmy Lin","Ihab Ilyas","Saloni Potdar","Mostafa Arefiyan","Yunyao Li"],"pdf_url":"https://arxiv.org/pdf/2408.05948v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.16808v3","updated":"2024-08-12T06:36:13Z","published":"2024-01-30T08:11:36Z","title":"Encoding Temporal Statistical-space Priors via Augmented Representation","summary":" Modeling time series data remains a pervasive issue as the temporal dimension\nis inherent to numerous domains. Despite significant strides in time series\nforecasting, high noise-to-signal ratio, non-normality, non-stationarity, and\nlack of data continue challenging practitioners. In response, we leverage a\nsimple representation augmentation technique to overcome these challenges. Our\naugmented representation acts as a statistical-space prior encoded at each time\nstep. In response, we name our method Statistical-space Augmented\nRepresentation (SSAR). The underlying high-dimensional data-generating process\ninspires our representation augmentation. We rigorously examine the empirical\ngeneralization performance on two data sets with two downstream temporal\nlearning algorithms. Our approach significantly beats all five up-to-date\nbaselines. Moreover, the highly modular nature of our approach can easily be\napplied to various settings. Lastly, fully-fledged theoretical perspectives are\navailable throughout the writing for a clear and rigorous understanding.\n","authors":["Insu Choi","Woosung Koh","Gimin Kang","Yuntae Jang","Woo Chang Kim"],"pdf_url":"https://arxiv.org/pdf/2401.16808v3.pdf","comment":"IJCAI 2024 STRL Workshop (Oral)"},{"id":"http://arxiv.org/abs/2312.06363v3","updated":"2024-08-12T06:17:21Z","published":"2023-12-11T13:11:04Z","title":"MMICT: Boosting Multi-Modal Fine-Tuning with In-Context Examples","summary":" Although In-Context Learning (ICL) brings remarkable performance gains to\nLarge Language Models (LLMs), the improvements remain lower than fine-tuning on\ndownstream tasks. This paper introduces Multi-Modal In-Context Tuning (MMICT),\na novel multi-modal fine-tuning paradigm that boosts multi-modal fine-tuning by\nfully leveraging the promising ICL capability of multi-modal LLMs (MM-LLMs). We\npropose the Multi-Modal Hub (M-Hub), a unified module that captures various\nmulti-modal features according to different inputs and objectives. Based on\nM-Hub, MMICT enables MM-LLMs to learn from in-context visual-guided textual\nfeatures and subsequently generate outputs conditioned on the textual-guided\nvisual features. Moreover, leveraging the flexibility of M-Hub, we design a\nvariety of in-context demonstrations. Extensive experiments on a diverse range\nof downstream multi-modal tasks demonstrate that MMICT significantly\noutperforms traditional fine-tuning strategy and the vanilla ICT method that\ndirectly takes the concatenation of all information from different modalities\nas input. 
Our implementation is available at:\nhttps://github.com/KDEGroup/MMICT.\n","authors":["Tao Chen","Enwei Zhang","Yuting Gao","Ke Li","Xing Sun","Yan Zhang","Hui Li","Rongrong Ji"],"pdf_url":"https://arxiv.org/pdf/2312.06363v3.pdf","comment":"TOMM 2024"},{"id":"http://arxiv.org/abs/2212.03637v3","updated":"2024-08-12T06:14:14Z","published":"2022-12-06T15:05:54Z","title":"Unsupervised Anomaly Detection in Time-series: An Extensive Evaluation\n and Analysis of State-of-the-art Methods","summary":" Unsupervised anomaly detection in time-series has been extensively\ninvestigated in the literature. Notwithstanding the relevance of this topic in\nnumerous application fields, a comprehensive and extensive evaluation of recent\nstate-of-the-art techniques taking into account real-world constraints is still\nneeded. Some efforts have been made to compare existing unsupervised\ntime-series anomaly detection methods rigorously. However, only standard\nperformance metrics, namely precision, recall, and F1-score are usually\nconsidered. Essential aspects for assessing their practical relevance are\ntherefore neglected. This paper proposes an in-depth evaluation study of recent\nunsupervised anomaly detection techniques in time-series. Instead of relying\nsolely on standard performance metrics, additional yet informative metrics and\nprotocols are taken into account. In particular, (i) more elaborate performance\nmetrics specifically tailored for time-series are used; (ii) the model size and\nthe model stability are studied; (iii) an analysis of the tested approaches\nwith respect to the anomaly type is provided; and (iv) a clear and unique\nprotocol is followed for all experiments. Overall, this extensive analysis aims\nto assess the maturity of state-of-the-art time-series anomaly detection, give\ninsights regarding their applicability under real-world setups and provide to\nthe community a more complete evaluation protocol.\n","authors":["Nesryne Mejri","Laura Lopez-Fuentes","Kankana Roy","Pavel Chernakov","Enjie Ghorbel","Djamila Aouada"],"pdf_url":"https://arxiv.org/pdf/2212.03637v3.pdf","comment":"Accepted at Expert Systems with Applications journal"},{"id":"http://arxiv.org/abs/2406.18568v2","updated":"2024-08-12T06:11:33Z","published":"2024-06-02T13:25:44Z","title":"A Diagnostic Model for Acute Lymphoblastic Leukemia Using Metaheuristics\n and Deep Learning Methods","summary":" Acute lymphoblastic leukemia (ALL) severity is determined by the presence and\nratios of blast cells (abnormal white blood cells) in both bone marrow and\nperipheral blood. Manual diagnosis of this disease is a tedious and\ntime-consuming operation, making it difficult for professionals to accurately\nexamine blast cell characteristics. To address this difficulty, researchers use\ndeep learning and machine learning. In this paper, a ResNet-based feature\nextractor is utilized to detect ALL, along with a variety of feature selectors\nand classifiers. To get the best results, a variety of transfer learning\nmodels, including the Resnet, VGG, EfficientNet, and DensNet families, are used\nas deep feature extractors. Following extraction, different feature selectors\nare used, including Genetic algorithm, PCA, ANOVA, Random Forest, Univariate,\nMutual information, Lasso, XGB, Variance, and Binary ant colony. After feature\nqualification, a variety of classifiers are used, with MLP outperforming the\nothers. The recommended technique is used to categorize ALL and HEM in the\nselected dataset which is C-NMC 2019. 
This technique got an impressive 90.71%\naccuracy and 95.76% sensitivity for the relevant classifications, and its\nmetrics on this dataset outperformed others.\n","authors":["Amir Masoud Rahmani","Parisa Khoshvaght","Hamid Alinejad-Rokny","Samira Sadeghi","Parvaneh Asghari","Zohre Arabi","Mehdi Hosseinzadeh"],"pdf_url":"https://arxiv.org/pdf/2406.18568v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.02266v2","updated":"2024-08-12T06:08:35Z","published":"2024-08-05T06:47:32Z","title":"One-Shot Collaborative Data Distillation","summary":" Large machine-learning training datasets can be distilled into small\ncollections of informative synthetic data samples. These synthetic sets support\nefficient model learning and reduce the communication cost of data sharing.\nThus, high-fidelity distilled data can support the efficient deployment of\nmachine learning applications in distributed network environments. A naive way\nto construct a synthetic set in a distributed environment is to allow each\nclient to perform local data distillation and to merge local distillations at a\ncentral server. However, the quality of the resulting set is impaired by\nheterogeneity in the distributions of the local data held by clients. To\novercome this challenge, we introduce the first collaborative data distillation\ntechnique, called CollabDM, which captures the global distribution of the data\nand requires only a single round of communication between client and server.\nOur method outperforms the state-of-the-art one-shot learning method on skewed\ndata in distributed learning environments. We also show the promising practical\nbenefits of our method when applied to attack detection in 5G networks.\n","authors":["William Holland","Chandra Thapa","Sarah Ali Siddiqui","Wei Shao","Seyit Camtepe"],"pdf_url":"https://arxiv.org/pdf/2408.02266v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.01360v2","updated":"2024-08-12T05:55:26Z","published":"2024-04-01T11:42:43Z","title":"Deep learning phase recovery: data-driven, physics-driven, or combining\n both?","summary":" Phase recovery, calculating the phase of a light wave from its intensity\nmeasurements, is essential for various applications, such as coherent\ndiffraction imaging, adaptive optics, and biomedical imaging. It enables the\nreconstruction of an object's refractive index distribution or topography as\nwell as the correction of imaging system aberrations. In recent years, deep\nlearning has been proven to be highly effective in addressing phase recovery\nproblems. Two most direct deep learning phase recovery strategies are\ndata-driven (DD) with supervised learning mode and physics-driven (PD) with\nself-supervised learning mode. DD and PD achieve the same goal in different\nways and lack the necessary study to reveal similarities and differences.\nTherefore, in this paper, we comprehensively compare these two deep learning\nphase recovery strategies in terms of time consumption, accuracy,\ngeneralization ability, ill-posedness adaptability, and prior capacity. What's\nmore, we propose a co-driven (CD) strategy of combining datasets and physics\nfor the balance of high- and low-frequency information. The codes for DD, PD,\nand CD are publicly available at https://github.com/kqwang/DLPR.\n","authors":["Kaiqiang Wang","Edmund Y. 
Lam"],"pdf_url":"https://arxiv.org/pdf/2404.01360v2.pdf","comment":"24 pages, 13 figures"},{"id":"http://arxiv.org/abs/2407.11075v3","updated":"2024-08-12T05:23:34Z","published":"2024-07-13T04:29:36Z","title":"A Comprehensive Survey on Kolmogorov Arnold Networks (KAN)","summary":" Through this comprehensive survey of Kolmogorov-Arnold Networks(KAN), we have\ngained a thorough understanding of its theoretical foundation, architectural\ndesign, application scenarios, and current research progress. KAN, with its\nunique architecture and flexible activation functions, excels in handling\ncomplex data patterns and nonlinear relationships, demonstrating wide-ranging\napplication potential. While challenges remain, KAN is poised to pave the way\nfor innovative solutions in various fields, potentially revolutionizing how we\napproach complex computational problems.\n","authors":["Yuntian Hou","Di Zhang"],"pdf_url":"https://arxiv.org/pdf/2407.11075v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.12968v2","updated":"2024-08-12T04:48:11Z","published":"2024-03-19T17:59:56Z","title":"LLMLingua-2: Data Distillation for Efficient and Faithful Task-Agnostic\n Prompt Compression","summary":" This paper focuses on task-agnostic prompt compression for better\ngeneralizability and efficiency. Considering the redundancy in natural\nlanguage, existing approaches compress prompts by removing tokens or lexical\nunits according to their information entropy obtained from a causal language\nmodel such as LLaMa-7B. The challenge is that information entropy may be a\nsuboptimal compression metric: (i) it only leverages unidirectional context and\nmay fail to capture all essential information needed for prompt compression;\n(ii) it is not aligned with the prompt compression objective.\n To address these issues, we propose a data distillation procedure to derive\nknowledge from an LLM to compress prompts without losing crucial information,\nand meantime, introduce an extractive text compression dataset. We formulate\nprompt compression as a token classification problem to guarantee the\nfaithfulness of the compressed prompt to the original one, and use a\nTransformer encoder as the base architecture to capture all essential\ninformation for prompt compression from the full bidirectional context. Our\napproach leads to lower latency by explicitly learning the compression\nobjective with smaller models such as XLM-RoBERTa-large and mBERT.\n We evaluate our method on both in-domain and out-of-domain datasets,\nincluding MeetingBank, LongBench, ZeroScrolls, GSM8K, and BBH. Despite its\nsmall size, our model shows significant performance gains over strong baselines\nand demonstrates robust generalization ability across different LLMs.\nAdditionally, our model is 3x-6x faster than existing prompt compression\nmethods, while accelerating the end-to-end latency by 1.6x-2.9x with\ncompression ratios of 2x-5x. Our code is available at\nhttps://aka.ms/LLMLingua-2.\n","authors":["Zhuoshi Pan","Qianhui Wu","Huiqiang Jiang","Menglin Xia","Xufang Luo","Jue Zhang","Qingwei Lin","Victor Rühle","Yuqing Yang","Chin-Yew Lin","H. 
Vicky Zhao","Lili Qiu","Dongmei Zhang"],"pdf_url":"https://arxiv.org/pdf/2403.12968v2.pdf","comment":"Accepted at Findings of ACL 2024"},{"id":"http://arxiv.org/abs/2408.05917v1","updated":"2024-08-12T04:43:40Z","published":"2024-08-12T04:43:40Z","title":"Inverse design of Non-parameterized Ventilated Acoustic Resonator via\n Variational Autoencoder with Acoustic Response-encoded Latent Space","summary":" Ventilated acoustic resonator(VAR), a type of acoustic metamaterial, emerge\nas an alternative for sound attenuation in environments that require\nventilation, owing to its excellent low-frequency attenuation performance and\nflexible shape adaptability. However, due to the non-linear acoustic responses\nof VARs, the VAR designs are generally obtained within a limited parametrized\ndesign space, and the design relies on the iteration of the numerical\nsimulation which consumes a considerable amount of computational time and\nresources. This paper proposes an acoustic response-encoded variational\nautoencoder (AR-VAE), a novel variational autoencoder-based generative design\nmodel for the efficient and accurate inverse design of VAR even with\nnon-parametrized designs. The AR-VAE matches the high-dimensional acoustic\nresponse with the VAR cross-section image in the dimension-reduced latent\nspace, which enables the AR-VAE to generate various non-parametrized VAR\ncross-section images with the target acoustic response. AR-VAE generates\nnon-parameterized VARs from target acoustic responses, which show a 25-fold\nreduction in mean squared error compared to conventional deep learning-based\nparameter searching methods while exhibiting lower average mean squared error\nand peak frequency variance. By combining the inverse-designed VARs by AR-VAE,\nmulti-cavity VAR was devised for broadband and multitarget peak frequency\nattenuation. The proposed design method presents a new approach for structural\ninverse-design with a high-dimensional non-linear physical response.\n","authors":["Min Woo Cho","Seok Hyeon Hwang","Jun-Young Jang","Jin Yeong Song","Sun-kwang Hwang","Kyoung Je Cha","Dong Yong Park","Kyungjun Song","Sang Min Park"],"pdf_url":"https://arxiv.org/pdf/2408.05917v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.05916v1","updated":"2024-08-12T04:29:54Z","published":"2024-08-12T04:29:54Z","title":"Cluster-Segregate-Perturb (CSP): A Model-agnostic Explainability\n Pipeline for Spatiotemporal Land Surface Forecasting Models","summary":" Satellite images have become increasingly valuable for modelling regional\nclimate change effects. Earth surface forecasting represents one such task that\nintegrates satellite images with meteorological data to capture the joint\nevolution of regional climate change effects. However, understanding the\ncomplex relationship between specific meteorological variables and land surface\nevolution poses a significant challenge. 
In light of this challenge, our paper\nintroduces a pipeline that integrates principles from both perturbation-based\nexplainability techniques like LIME and global marginal explainability\ntechniques like PDP, besides addressing the constraints of using such\ntechniques when applying them to high-dimensional spatiotemporal deep models.\nThe proposed pipeline simplifies the undertaking of diverse investigative\nanalyses, such as marginal sensitivity analysis, marginal correlation analysis,\nlag analysis, etc., on complex land surface forecasting models. In this study, we\nutilised Convolutional Long Short-Term Memory (ConvLSTM) as the surface\nforecasting model and did analyses on the Normalized Difference Vegetation\nIndex (NDVI) of the surface forecasts, since meteorological variables like\ntemperature, pressure, and precipitation significantly influence it. The study\narea encompasses various regions in Europe. Our analyses show that\nprecipitation exhibits the highest sensitivity in the study area, followed by\ntemperature and pressure. Pressure has little to no direct effect on NDVI.\nAdditionally, interesting nonlinear correlations between meteorological\nvariables and NDVI have been uncovered.\n","authors":["Tushar Verma","Sudipan Saha"],"pdf_url":"https://arxiv.org/pdf/2408.05916v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.06839v2","updated":"2024-08-12T03:53:35Z","published":"2023-10-10T17:59:58Z","title":"LongLLMLingua: Accelerating and Enhancing LLMs in Long Context Scenarios\n via Prompt Compression","summary":" In long context scenarios, large language models (LLMs) face three main\nchallenges: higher computational cost, performance reduction, and position\nbias. Research indicates that LLM performance hinges on the density and\nposition of key information in the input prompt. Inspired by these findings, we\npropose LongLLMLingua for prompt compression towards improving LLMs' perception\nof the key information to simultaneously address the three challenges. Our\nextensive evaluation across various long context scenarios demonstrates that\nLongLLMLingua not only enhances performance but also significantly reduces\ncosts and latency. For instance, in the NaturalQuestions benchmark,\nLongLLMLingua boosts performance by up to 21.4% with around 4x fewer tokens in\nGPT-3.5-Turbo, leading to substantial cost savings. It achieves a 94.0% cost\nreduction in the LooGLE benchmark. Moreover, when compressing prompts of about\n10k tokens at ratios of 2x-6x, LongLLMLingua can accelerate end-to-end latency\nby 1.4x-2.6x. Our code is available at https://aka.ms/LongLLMLingua.\n","authors":["Huiqiang Jiang","Qianhui Wu","Xufang Luo","Dongsheng Li","Chin-Yew Lin","Yuqing Yang","Lili Qiu"],"pdf_url":"https://arxiv.org/pdf/2310.06839v2.pdf","comment":"Accepted at ACL 2024"},{"id":"http://arxiv.org/abs/2304.05215v4","updated":"2024-08-12T03:33:12Z","published":"2023-04-11T13:33:45Z","title":"A Billion-scale Foundation Model for Remote Sensing Images","summary":" As the potential of foundation models in visual tasks has garnered\nsignificant attention, pretraining these models before downstream tasks has\nbecome a crucial step. The three key factors in pretraining foundation models\nare the pretraining method, the size of the pretraining dataset, and the number\nof model parameters. Recently, research in the remote sensing field has focused\nprimarily on the pretraining method and the size of the dataset, with limited\nemphasis on the number of model parameters. 
This paper addresses this gap by\nexamining the effect of increasing the number of model parameters on the\nperformance of foundation models in downstream tasks such as rotated object\ndetection and semantic segmentation. We pretrained foundation models with\nvarying numbers of parameters, including 86M, 605.26M, 1.3B, and 2.4B, to\ndetermine whether performance in downstream tasks improved with an increase in\nparameters. To the best of our knowledge, this is the first billion-scale\nfoundation model in the remote sensing field. Furthermore, we propose an\neffective method for scaling up and fine-tuning a vision transformer in the\nremote sensing field. To evaluate general performance in downstream tasks, we\nemployed the DOTA v2.0 and DIOR-R benchmark datasets for rotated object\ndetection, and the Potsdam and LoveDA datasets for semantic segmentation.\nExperimental results demonstrated that, across all benchmark datasets and\ndownstream tasks, the performance of the foundation models and data efficiency\nimproved as the number of parameters increased. Moreover, our models achieve\nthe state-of-the-art performance on several datasets including DIOR-R, Postdam,\nand LoveDA.\n","authors":["Keumgang Cha","Junghoon Seo","Taekyung Lee"],"pdf_url":"https://arxiv.org/pdf/2304.05215v4.pdf","comment":"This manuscript is the accepted version for IEEE Journal of Selected\n Topics in Applied Earth Observations and Remote Sensing (IEEE J-STARS)"},{"id":"http://arxiv.org/abs/2403.18330v2","updated":"2024-08-12T03:00:37Z","published":"2024-03-27T08:11:25Z","title":"Tracking-Assisted Object Detection with Event Cameras","summary":" Event-based object detection has recently garnered attention in the computer\nvision community due to the exceptional properties of event cameras, such as\nhigh dynamic range and no motion blur. However, feature asynchronism and\nsparsity cause invisible objects due to no relative motion to the camera,\nposing a significant challenge in the task. Prior works have studied various\nimplicit-learned memories to retain as many temporal cues as possible. However,\nimplicit memories still struggle to preserve long-term features effectively. In\nthis paper, we consider those invisible objects as pseudo-occluded objects and\naim to detect them by tracking through occlusions. Firstly, we introduce the\nvisibility attribute of objects and contribute an auto-labeling algorithm to\nnot only clean the existing event camera dataset but also append additional\nvisibility labels to it. Secondly, we exploit tracking strategies for\npseudo-occluded objects to maintain their permanence and retain their bounding\nboxes, even when features have not been available for a very long time. These\nstrategies can be treated as an explicit-learned memory guided by the tracking\nobjective to record the displacements of objects across frames. Lastly, we\npropose a spatio-temporal feature aggregation module to enrich the latent\nfeatures and a consistency loss to increase the robustness of the overall\npipeline. We conduct comprehensive experiments to verify our method's\neffectiveness where still objects are retained, but real occluded objects are\ndiscarded. 
The results demonstrate that (1) the additional visibility labels\ncan assist in supervised training, and (2) our method outperforms\nstate-of-the-art approaches with a significant improvement of 7.9% absolute\nmAP.\n","authors":["Ting-Kang Yen","Igor Morawski","Shusil Dangi","Kai He","Chung-Yi Lin","Jia-Fong Yeh","Hung-Ting Su","Winston Hsu"],"pdf_url":"https://arxiv.org/pdf/2403.18330v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.05899v1","updated":"2024-08-12T02:45:58Z","published":"2024-08-12T02:45:58Z","title":"Quantum Gradient Class Activation Map for Model Interpretability","summary":" Quantum machine learning (QML) has recently made significant advancements in\nvarious topics. Despite the successes, the safety and interpretability of QML\napplications have not been thoroughly investigated. This work proposes using\nVariational Quantum Circuits (VQCs) for activation mapping to enhance model\ntransparency, introducing the Quantum Gradient Class Activation Map\n(QGrad-CAM). This hybrid quantum-classical computing framework leverages both\nquantum and classical strengths and gives access to the derivation of an\nexplicit formula of feature map importance. Experimental results demonstrate\nsignificant, fine-grained, class-discriminative visual explanations generated\nacross both image and speech datasets.\n","authors":["Hsin-Yi Lin","Huan-Hsin Tseng","Samuel Yen-Chi Chen","Shinjae Yoo"],"pdf_url":"https://arxiv.org/pdf/2408.05899v1.pdf","comment":"Submitted to IEEE SiPS 2024"},{"id":"http://arxiv.org/abs/2308.07843v6","updated":"2024-08-12T02:40:24Z","published":"2023-08-15T15:43:12Z","title":"Dyadic Reinforcement Learning","summary":" Mobile health aims to enhance health outcomes by delivering interventions to\nindividuals as they go about their daily life. The involvement of care partners\nand social support networks often proves crucial in helping individuals\nmanaging burdensome medical conditions. This presents opportunities in mobile\nhealth to design interventions that target the dyadic relationship -- the\nrelationship between a target person and their care partner -- with the aim of\nenhancing social support. In this paper, we develop dyadic RL, an online\nreinforcement learning algorithm designed to personalize intervention delivery\nbased on contextual factors and past responses of a target person and their\ncare partner. Here, multiple sets of interventions impact the dyad across\nmultiple time intervals. The developed dyadic RL is Bayesian and hierarchical.\nWe formally introduce the problem setup, develop dyadic RL and establish a\nregret bound. We demonstrate dyadic RL's empirical performance through\nsimulation studies on both toy scenarios and on a realistic test bed\nconstructed from data collected in a mobile health study.\n","authors":["Shuangning Li","Lluis Salvat Niell","Sung Won Choi","Inbal Nahum-Shani","Guy Shani","Susan Murphy"],"pdf_url":"https://arxiv.org/pdf/2308.07843v6.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.10240v3","updated":"2024-08-12T02:10:34Z","published":"2024-07-14T15:15:00Z","title":"xLSTMTime : Long-term Time Series Forecasting With xLSTM","summary":" In recent years, transformer-based models have gained prominence in\nmultivariate long-term time series forecasting (LTSF), demonstrating\nsignificant advancements despite facing challenges such as high computational\ndemands, difficulty in capturing temporal dynamics, and managing long-term\ndependencies. 
The emergence of LTSF-Linear, with its straightforward linear\narchitecture, has notably outperformed transformer-based counterparts,\nprompting a reevaluation of the transformer's utility in time series\nforecasting. In response, this paper presents an adaptation of a recent\narchitecture termed extended LSTM (xLSTM) for LTSF. xLSTM incorporates\nexponential gating and a revised memory structure with higher capacity that has\ngood potential for LTSF. Our adopted architecture for LTSF, termed xLSTMTime,\nsurpasses current approaches. We compare xLSTMTime's performance against\nvarious state-of-the-art models across multiple real-world datasets,\ndemonstrating superior forecasting capabilities. Our findings suggest that\nrefined recurrent architectures can offer competitive alternatives to\ntransformer-based models in LTSF tasks, potentially redefining the landscape\nof time series forecasting.\n","authors":["Musleh Alharthi","Ausif Mahmood"],"pdf_url":"https://arxiv.org/pdf/2407.10240v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.05892v1","updated":"2024-08-12T02:10:18Z","published":"2024-08-12T02:10:18Z","title":"Polyp SAM 2: Advancing Zero shot Polyp Segmentation in Colorectal Cancer\n Detection","summary":" Polyp segmentation plays a crucial role in the early detection and diagnosis\nof colorectal cancer. However, obtaining accurate segmentations often requires\nlabor-intensive annotations and specialized models. Recently, Meta AI Research\nreleased a general Segment Anything Model 2 (SAM 2), which has demonstrated\npromising performance in several segmentation tasks. In this work, we evaluate\nthe performance of SAM 2 in segmenting polyps under various prompted settings.\nWe hope this report will provide insights to advance the field of polyp\nsegmentation and promote more interesting work in the future. This project is\npublicly available at https://github.com/sajjad-sh33/Polyp-SAM-2.\n","authors":["Mobina Mansoori","Sajjad Shahabodini","Jamshid Abouei","Konstantinos N. Plataniotis","Arash Mohammadi"],"pdf_url":"https://arxiv.org/pdf/2408.05892v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.00958v2","updated":"2024-08-12T02:08:03Z","published":"2024-07-01T04:29:35Z","title":"Universal Approximation Theory: The basic theory for large language\n models","summary":" Language models have emerged as a critical area of focus in artificial\nintelligence, particularly with the introduction of groundbreaking innovations\nlike ChatGPT. Large-scale Transformer networks have quickly become the leading\napproach for advancing natural language processing algorithms. Built on the\nTransformer architecture, these models enable interactions that closely mimic\nhuman communication and, equipped with extensive knowledge, can even assist in\nguiding human tasks. Despite their impressive capabilities and growing\ncomplexity, a key question remains: the theoretical foundations of large\nlanguage models (LLMs). What makes Transformer so effective for powering\nintelligent language applications, such as translation and coding? What\nunderlies LLMs' ability for In-Context Learning (ICL)? How does the LoRA scheme\nenhance the fine-tuning of LLMs? And what supports the practicality of pruning\nLLMs? 
To address these critical questions and explore the technological\nstrategies within LLMs, we leverage the Universal Approximation Theory (UAT) to\noffer a theoretical backdrop, shedding light on the mechanisms that underpin\nthese advancements.\n","authors":["Wei Wang","Qing Li"],"pdf_url":"https://arxiv.org/pdf/2407.00958v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.21670v3","updated":"2024-08-12T01:50:23Z","published":"2024-07-31T15:13:39Z","title":"Universal Approximation Theory: Foundations for Parallelism in Neural\n Networks","summary":" Neural networks are increasingly evolving towards training large models with\nbig data, a method that has demonstrated superior performance across many\ntasks. However, this approach introduces an urgent problem: current deep\nlearning models are predominantly serial, meaning that as the number of network\nlayers increases, so do the training and inference times. This is unacceptable\nif deep learning is to continue advancing. Therefore, this paper proposes a\ndeep learning parallelization strategy based on the Universal Approximation\nTheorem (UAT). From this foundation, we designed a parallel network called\nPara-Former to test our theory. Unlike traditional serial models, the inference\ntime of Para-Former does not increase with the number of layers, significantly\naccelerating the inference speed of multi-layer networks. Experimental results\nvalidate the effectiveness of this network.\n","authors":["Wei Wang","Qing Li"],"pdf_url":"https://arxiv.org/pdf/2407.21670v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.01528v3","updated":"2024-08-12T01:44:26Z","published":"2024-02-02T16:15:24Z","title":"Decoding Speculative Decoding","summary":" Speculative Decoding is a widely used technique to speed up inference for\nLarge Language Models (LLMs) without sacrificing quality. When performing\ninference, speculative decoding uses a smaller draft model to generate\nspeculative tokens and then uses the target LLM to verify those draft tokens.\nThe speedup provided by speculative decoding heavily depends on the choice of\nthe draft model. In this work, we perform a detailed study comprising over 350\nexperiments with LLaMA-65B and OPT-66B using speculative decoding and delineate\nthe factors that affect the performance gain provided by speculative decoding.\nOur experiments indicate that the performance of speculative decoding depends\nheavily on the latency of the draft model, and the draft model's capability in\nlanguage modeling does not correlate strongly with its performance in\nspeculative decoding. Based on these insights we explore a new design space for\ndraft models and design hardware-efficient draft models for speculative\ndecoding. Our newly designed draft model for LLaMA-65B can provide 111% higher\nthroughput than existing draft models and can generalize further to the LLaMA-2\nmodel family and supervised fine-tuned models.\n","authors":["Minghao Yan","Saurabh Agarwal","Shivaram Venkataraman"],"pdf_url":"https://arxiv.org/pdf/2402.01528v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.05886v1","updated":"2024-08-12T01:27:06Z","published":"2024-08-12T01:27:06Z","title":"Online-Score-Aided Federated Learning: Taming the Resource Constraints\n in Wireless Networks","summary":" While FL is a widely popular distributed ML strategy that protects data\nprivacy, time-varying wireless network parameters and heterogeneous system\nconfigurations of the wireless device pose significant challenges. 
Although the\nlimited radio and computational resources of the network and the clients,\nrespectively, are widely acknowledged, two critical yet often ignored aspects\nare (a) wireless devices can only dedicate a small chunk of their limited\nstorage for the FL task and (b) new training samples may arrive in an online\nmanner in many practical wireless applications. Therefore, we propose a new FL\nalgorithm called OSAFL, specifically designed to learn tasks relevant to\nwireless applications under these practical considerations. Since it has long\nbeen proven that under extreme resource constraints, clients may perform an\narbitrary number of local training steps, which may lead to client drift under\nstatistically heterogeneous data distributions, we leverage normalized gradient\nsimilarities and exploit weighting clients' updates based on optimized scores\nthat facilitate the convergence rate of the proposed OSAFL algorithm. Our\nextensive simulation results on two different tasks -- each with three\ndifferent datasets -- with four popular ML models validate the effectiveness of\nOSAFL compared to six existing state-of-the-art FL baselines.\n","authors":["Md Ferdous Pervej","Minseok Choi","Andreas F. Molisch"],"pdf_url":"https://arxiv.org/pdf/2408.05886v1.pdf","comment":"Under review for possible publication in IEEE Transactions on\n Wireless Communications (TWC)"},{"id":"http://arxiv.org/abs/2408.05885v1","updated":"2024-08-12T01:24:49Z","published":"2024-08-12T01:24:49Z","title":"GFlowNet Training by Policy Gradients","summary":" Generative Flow Networks (GFlowNets) have been shown effective to generate\ncombinatorial objects with desired properties. We here propose a new GFlowNet\ntraining framework, with policy-dependent rewards, that bridges keeping flow\nbalance of GFlowNets to optimizing the expected accumulated reward in\ntraditional Reinforcement-Learning (RL). This enables the derivation of new\npolicy-based GFlowNet training methods, in contrast to existing ones resembling\nvalue-based RL. It is known that the design of backward policies in GFlowNet\ntraining affects efficiency. We further develop a coupled training strategy\nthat jointly solves GFlowNet forward policy training and backward policy\ndesign. Performance analysis is provided with a theoretical guarantee of our\npolicy-based GFlowNet training. Experiments on both simulated and real-world\ndatasets verify that our policy-based strategies provide advanced RL\nperspectives for robust gradient estimation to improve GFlowNet performance.\n","authors":["Puhua Niu","Shili Wu","Mingzhou Fan","Xiaoning Qian"],"pdf_url":"https://arxiv.org/pdf/2408.05885v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.14755v2","updated":"2024-08-12T01:09:36Z","published":"2024-05-23T16:21:57Z","title":"Large language models can be zero-shot anomaly detectors for time\n series?","summary":" Recent studies have shown the ability of large language models to perform a\nvariety of tasks, including time series forecasting. The flexible nature of\nthese models allows them to be used for many applications. In this paper, we\npresent a novel study of large language models used for the challenging task of\ntime series anomaly detection. This problem entails two aspects novel for LLMs:\nthe need for the model to identify part of the input sequence (or multiple\nparts) as anomalous; and the need for it to work with time series data rather\nthan the traditional text input. 
We introduce sigllm, a framework for time\nseries anomaly detection using large language models. Our framework includes a\ntime-series-to-text conversion module, as well as end-to-end pipelines that\nprompt language models to perform time series anomaly detection. We investigate\ntwo paradigms for testing the abilities of large language models to perform the\ndetection task. First, we present a prompt-based detection method that directly\nasks a language model to indicate which elements of the input are anomalies.\nSecond, we leverage the forecasting capability of a large language model to\nguide the anomaly detection process. We evaluated our framework on 11 datasets\nspanning various sources and 10 pipelines. We show that the forecasting method\nsignificantly outperformed the prompting method in all 11 datasets with respect\nto the F1 score. Moreover, while large language models are capable of finding\nanomalies, state-of-the-art deep learning models are still superior in\nperformance, achieving results 30% better than large language models.\n","authors":["Sarah Alnegheimish","Linh Nguyen","Laure Berti-Equille","Kalyan Veeramachaneni"],"pdf_url":"https://arxiv.org/pdf/2405.14755v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.05883v1","updated":"2024-08-12T00:51:21Z","published":"2024-08-12T00:51:21Z","title":"Low-Rank Approximation, Adaptation, and Other Tales","summary":" Low-rank approximation is a fundamental technique in modern data analysis,\nwidely utilized across various fields such as signal processing, machine\nlearning, and natural language processing. Despite its ubiquity, the mechanics\nof low-rank approximation and its application in adaptation can sometimes be\nobscure, leaving practitioners and researchers with questions about its true\ncapabilities and limitations. This paper seeks to clarify low-rank\napproximation and adaptation by offering a comprehensive guide that reveals\ntheir inner workings and explains their utility in a clear and accessible way.\nOur focus here is to develop a solid intuition for how low-rank approximation\nand adaptation operate, and why they are so effective. We begin with basic\nconcepts and gradually build up to the mathematical underpinnings, ensuring\nthat readers of all backgrounds can gain a deeper understanding of low-rank\napproximation and adaptation. We strive to strike a balance between informal\nexplanations and rigorous mathematics, ensuring that both newcomers and\nexperienced experts can benefit from this survey. Additionally, we introduce\nnew low-rank decomposition and adaptation algorithms that have not yet been\nexplored in the field, hoping that future researchers will investigate their\npotential applicability.\n","authors":["Jun Lu"],"pdf_url":"https://arxiv.org/pdf/2408.05883v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.07820v3","updated":"2024-08-12T00:43:56Z","published":"2023-10-11T19:01:28Z","title":"Large Language Models Are Zero-Shot Time Series Forecasters","summary":" By encoding time series as a string of numerical digits, we can frame time\nseries forecasting as next-token prediction in text. Developing this approach,\nwe find that large language models (LLMs) such as GPT-3 and LLaMA-2 can\nsurprisingly zero-shot extrapolate time series at a level comparable to or\nexceeding the performance of purpose-built time series models trained on the\ndownstream tasks. 
To facilitate this performance, we propose procedures for\neffectively tokenizing time series data and converting discrete distributions\nover tokens into highly flexible densities over continuous values. We argue the\nsuccess of LLMs for time series stems from their ability to naturally represent\nmultimodal distributions, in conjunction with biases for simplicity, and\nrepetition, which align with the salient features in many time series, such as\nrepeated seasonal trends. We also show how LLMs can naturally handle missing\ndata without imputation through non-numerical text, accommodate textual side\ninformation, and answer questions to help explain predictions. While we find\nthat increasing model size generally improves performance on time series, we\nshow GPT-4 can perform worse than GPT-3 because of how it tokenizes numbers,\nand poor uncertainty calibration, which is likely the result of alignment\ninterventions such as RLHF.\n","authors":["Nate Gruver","Marc Finzi","Shikai Qiu","Andrew Gordon Wilson"],"pdf_url":"https://arxiv.org/pdf/2310.07820v3.pdf","comment":"NeurIPS 2023. Code available at: https://github.com/ngruver/llmtime"},{"id":"http://arxiv.org/abs/2404.10034v2","updated":"2024-08-12T00:39:01Z","published":"2024-04-15T17:25:21Z","title":"A Realistic Protocol for Evaluation of Weakly Supervised Object\n Localization","summary":" Weakly Supervised Object Localization (WSOL) allows training deep learning\nmodels for classification and localization (LOC) using only global class-level\nlabels. The absence of bounding box (bbox) supervision during training raises\nchallenges in the literature for hyper-parameter tuning, model selection, and\nevaluation. WSOL methods rely on a validation set with bbox annotations for\nmodel selection, and a test set with bbox annotations for threshold estimation\nfor producing bboxes from localization maps. This approach, however, is not\naligned with the WSOL setting as these annotations are typically unavailable in\nreal-world scenarios. Our initial empirical analysis shows a significant\ndecline in LOC performance when model selection and threshold estimation rely\nsolely on class labels and the image itself, respectively, compared to using\nmanual bbox annotations. This highlights the importance of incorporating bbox\nlabels for optimal model performance. In this paper, a new WSOL evaluation\nprotocol is proposed that provides LOC information without the need for manual\nbbox annotations. In particular, we generated noisy pseudo-boxes from a\npretrained off-the-shelf region proposal method such as Selective Search, CLIP,\nand RPN for model selection. 
These bboxes are also employed to estimate the\nthreshold from LOC maps, circumventing the need for test-set bbox annotations.\nOur experiments with several WSOL methods on ILSVRC and CUB datasets show that\nusing the proposed pseudo-bboxes for validation facilitates the model selection\nand threshold estimation, with LOC performance comparable to those selected\nusing GT bboxes on the validation set and threshold estimation on the test set.\nIt also outperforms models selected using class-level labels, and then\ndynamically thresholded based solely on LOC maps.\n","authors":["Shakeeb Murtaza","Soufiane Belharbi","Marco Pedersoli","Eric Granger"],"pdf_url":"https://arxiv.org/pdf/2404.10034v2.pdf","comment":"13 pages, 5 figures"},{"id":"http://arxiv.org/abs/2110.12906v2","updated":"2024-08-12T00:10:48Z","published":"2021-10-22T08:22:36Z","title":"Tackling the Local Bias in Federated Graph Learning","summary":" Federated graph learning (FGL) has become an important research topic in\nresponse to the increasing scale and the distributed nature of graph-structured\ndata in the real world. In FGL, a global graph is distributed across different\nclients, where each client holds a subgraph. Existing FGL methods often fail to\neffectively utilize cross-client edges, losing structural information during\nthe training; additionally, local graphs often exhibit significant distribution\ndivergence. These two issues make local models in FGL less desirable than in\ncentralized graph learning, namely the local bias problem in this paper. To\nsolve this problem, we propose a novel FGL framework to make the local models\nsimilar to the model trained in a centralized setting. Specifically, we design\na distributed learning scheme, fully leveraging cross-client edges to aggregate\ninformation from other clients. In addition, we propose a label-guided sampling\napproach to alleviate the imbalanced local data and meanwhile, distinctly\nreduce the training overhead. Extensive experiments demonstrate that local bias\ncan compromise the model performance and slow down the convergence during\ntraining. Experimental results also verify that our framework successfully\nmitigates local bias, achieving better performance than other baselines with\nlower time and memory overhead.\n","authors":["Binchi Zhang","Minnan Luo","Shangbin Feng","Ziqi Liu","Jun Zhou","Qinghua Zheng"],"pdf_url":"https://arxiv.org/pdf/2110.12906v2.pdf","comment":"19 pages, 6 figures"},{"id":"http://arxiv.org/abs/2403.00013v2","updated":"2024-08-12T00:07:19Z","published":"2024-02-27T07:15:35Z","title":"Prioritizing Informative Features and Examples for Deep Learning from\n Noisy Data","summary":" In this dissertation, we propose a systemic framework that prioritizes\ninformative features and examples to enhance each stage of the development\nprocess. Specifically, we prioritize informative features and examples and\nimprove the performance of feature learning, data labeling, and data selection.\nWe first propose an approach to extract only informative features that are\ninherent to solving a target task by using auxiliary out-of-distribution data.\nWe deactivate the noise features in the target distribution by using that in\nthe out-of-distribution data. Next, we introduce an approach that prioritizes\ninformative examples from unlabeled noisy data in order to reduce the labeling\ncost of active learning. 
In order to solve the purity-information dilemma,\nwhere an attempt to select informative examples induces the selection of many\nnoisy examples, we propose a meta-model that finds the best balance between\npurity and informativeness. Lastly, we suggest an approach that prioritizes\ninformative examples from labeled noisy data to preserve the performance of\ndata selection. For labeled image noise data, we propose a data selection\nmethod that considers the confidence of neighboring samples to maintain the\nperformance of the state-of-the-art Re-labeling models. For labeled text noise\ndata, we present an instruction selection method that takes diversity into\naccount for ranking the quality of instructions with prompting, thereby\nenhancing the performance of aligned large language models.\n Overall, our unified framework induces the deep learning development process\nrobust to noisy data, thereby effectively mitigating noisy features and\nexamples in real-world applications.\n","authors":["Dongmin Park"],"pdf_url":"https://arxiv.org/pdf/2403.00013v2.pdf","comment":"PhD thesis"},{"id":"http://arxiv.org/abs/2402.05071v2","updated":"2024-08-12T23:43:59Z","published":"2024-02-07T18:22:41Z","title":"Revisiting Inexact Fixed-Point Iterations for Min-Max Problems:\n Stochasticity and Structured Nonconvexity","summary":" We focus on constrained, $L$-smooth, potentially stochastic and\nnonconvex-nonconcave min-max problems either satisfying\n$\\rho$-cohypomonotonicity or admitting a solution to the $\\rho$-weakly Minty\nVariational Inequality (MVI), where larger values of the parameter $\\rho>0$\ncorrespond to a greater degree of nonconvexity. These problem classes include\nexamples in two player reinforcement learning, interaction dominant min-max\nproblems, and certain synthetic test problems on which classical min-max\nalgorithms fail. It has been conjectured that first-order methods can tolerate\na value of $\\rho$ no larger than $\\frac{1}{L}$, but existing results in the\nliterature have stagnated at the tighter requirement $\\rho < \\frac{1}{2L}$.\nWith a simple argument, we obtain optimal or best-known complexity guarantees\nwith cohypomonotonicity or weak MVI conditions for $\\rho < \\frac{1}{L}$. First\nmain insight for the improvements in the convergence analyses is to harness the\nrecently proposed $\\textit{conic nonexpansiveness}$ property of operators.\nSecond, we provide a refined analysis for inexact Halpern iteration that\nrelaxes the required inexactness level to improve some state-of-the-art\ncomplexity results even for constrained stochastic convex-concave min-max\nproblems. Third, we analyze a stochastic inexact Krasnosel'ski\\u{\\i}-Mann\niteration with a multilevel Monte Carlo estimator when the assumptions only\nhold with respect to a solution.\n","authors":["Ahmet Alacaoglu","Donghwan Kim","Stephen J. Wright"],"pdf_url":"https://arxiv.org/pdf/2402.05071v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.13033v2","updated":"2024-08-12T23:36:58Z","published":"2024-02-20T14:18:43Z","title":"Enhancing Node Representations for Real-World Complex Networks with\n Topological Augmentation","summary":" Graph augmentation methods play a crucial role in improving the performance\nand enhancing generalisation capabilities in Graph Neural Networks (GNNs).\nExisting graph augmentation methods mainly perturb the graph structures, and\nare usually limited to pairwise node relations. 
These methods cannot fully\naddress the complexities of real-world large-scale networks, which often\ninvolve higher-order node relations beyond only being pairwise. Meanwhile,\nreal-world graph datasets are predominantly modelled as simple graphs, due to\nthe scarcity of data that can be used to form higher-order edges. Therefore,\nreconfiguring the higher-order edges as an integration into graph augmentation\nstrategies lights up a promising research path to address the aforementioned\nissues. In this paper, we present Topological Augmentation (TopoAug), a novel\ngraph augmentation method that builds a combinatorial complex from the original\ngraph by constructing virtual hyperedges directly from the raw data. TopoAug\nthen produces auxiliary node features by extracting information from the\ncombinatorial complex, which are used for enhancing GNN performances on\ndownstream tasks. We design three diverse virtual hyperedge construction\nstrategies to accompany the construction of combinatorial complexes: (1) via\ngraph statistics, (2) from multiple data perspectives, and (3) utilising\nmulti-modality. Furthermore, to facilitate TopoAug evaluation, we provide 23\nnovel real-world graph datasets across various domains including social media,\nbiology, and e-commerce. Our empirical study shows that TopoAug consistently\nand significantly outperforms GNN baselines and other graph augmentation\nmethods, across a variety of application contexts, which clearly indicates that\nit can effectively incorporate higher-order node relations into the graph\naugmentation for real-world complex networks.\n","authors":["Xiangyu Zhao","Zehui Li","Mingzhu Shen","Guy-Bart Stan","Pietro Liò","Yiren Zhao"],"pdf_url":"https://arxiv.org/pdf/2402.13033v2.pdf","comment":"In 27th European Conference on Artificial Intelligence (ECAI 2024).\n 13 pages, 2 figures, 13 tables"},{"id":"http://arxiv.org/abs/2408.06526v1","updated":"2024-08-12T23:10:39Z","published":"2024-08-12T23:10:39Z","title":"Operator Learning Using Random Features: A Tool for Scientific Computing","summary":" Supervised operator learning centers on the use of training data, in the form\nof input-output pairs, to estimate maps between infinite-dimensional spaces. It\nis emerging as a powerful tool to complement traditional scientific computing,\nwhich may often be framed in terms of operators mapping between spaces of\nfunctions. Building on the classical random features methodology for scalar\nregression, this paper introduces the function-valued random features method.\nThis leads to a supervised operator learning architecture that is practical for\nnonlinear problems yet is structured enough to facilitate efficient training\nthrough the optimization of a convex, quadratic cost. Due to the quadratic\nstructure, the trained model is equipped with convergence guarantees and error\nand complexity bounds, properties that are not readily available for most other\noperator learning architectures. At its core, the proposed approach builds a\nlinear combination of random operators. This turns out to be a low-rank\napproximation of an operator-valued kernel ridge regression algorithm, and\nhence the method also has strong connections to Gaussian process regression.\nThe paper designs function-valued random features that are tailored to the\nstructure of two nonlinear operator learning benchmark problems arising from\nparametric partial differential equations. 
Numerical results demonstrate the\nscalability, discretization invariance, and transferability of the\nfunction-valued random features method.\n","authors":["Nicholas H. Nelsen","Andrew M. Stuart"],"pdf_url":"https://arxiv.org/pdf/2408.06526v1.pdf","comment":"36 pages, 1 table, 9 figures. SIGEST version of SIAM J. Sci. Comput.\n Vol. 43 No. 5 (2021) pp. A3212-A3243, hence text overlap with\n arXiv:2005.10224"},{"id":"http://arxiv.org/abs/2408.06525v1","updated":"2024-08-12T23:04:30Z","published":"2024-08-12T23:04:30Z","title":"The NP-hardness of the Gromov-Wasserstein distance","summary":" This note addresses the property frequently mentioned in the literature that\nthe Gromov-Wasserstein (GW) distance is NP-hard. We provide the details on the\nnon-convex nature of the GW optimization problem that imply NP-hardness of the\nGW distance between finite spaces for any instance of an input data. We further\nillustrate the non-convexity of the problem with several explicit examples.\n","authors":["Natalia Kravtsova"],"pdf_url":"https://arxiv.org/pdf/2408.06525v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2108.01772v3","updated":"2024-08-12T23:04:14Z","published":"2021-08-03T22:14:01Z","title":"Nonconvex Factorization and Manifold Formulations are Almost Equivalent\n in Low-rank Matrix Optimization","summary":" In this paper, we consider the geometric landscape connection of the widely\nstudied manifold and factorization formulations in low-rank positive\nsemidefinite (PSD) and general matrix optimization. We establish a sandwich\nrelation on the spectrum of Riemannian and Euclidean Hessians at first-order\nstationary points (FOSPs). As a result of that, we obtain an equivalence on the\nset of FOSPs, second-order stationary points (SOSPs) and strict saddles between\nthe manifold and the factorization formulations. In addition, we show the\nsandwich relation can be used to transfer more quantitative geometric\nproperties from one formulation to another. Similarities and differences in the\nlandscape connection under the PSD case and the general case are discussed. To\nthe best of our knowledge, this is the first geometric landscape connection\nbetween the manifold and the factorization formulations for handling rank\nconstraints, and it provides a geometric explanation for the similar empirical\nperformance of factorization and manifold approaches in low-rank matrix\noptimization observed in the literature. In the general low-rank matrix\noptimization, the landscape connection of two factorization formulations\n(unregularized and regularized ones) is also provided. By applying these\ngeometric landscape connections, in particular, the sandwich relation, we are\nable to solve unanswered questions in literature and establish stronger results\nin the applications on geometric analysis of phase retrieval, well-conditioned\nlow-rank matrix optimization, and the role of regularization in factorization\narising from machine learning and signal processing.\n","authors":["Yuetian Luo","Xudong Li","Anru R. Zhang"],"pdf_url":"https://arxiv.org/pdf/2108.01772v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.00963v2","updated":"2024-08-12T23:00:47Z","published":"2024-08-02T00:35:18Z","title":"MIS-ME: A Multi-modal Framework for Soil Moisture Estimation","summary":" Soil moisture estimation is an important task to enable precision agriculture\nin creating optimal plans for irrigation, fertilization, and harvest. 
It is\ncommon to utilize statistical and machine learning models to estimate soil\nmoisture from traditional data sources such as weather forecasts, soil\nproperties, and crop properties. However, there is a growing interest in\nutilizing aerial and geospatial imagery to estimate soil moisture. Although\nthese images capture high-resolution crop details, they are expensive to curate\nand challenging to interpret. Imagine an AI-enhanced software tool that\npredicts soil moisture using visual cues captured by smartphones and\nstatistical data given by weather forecasts. This work is a first step towards\nthat goal of developing a multi-modal approach for soil moisture estimation. In\nparticular, we curate a dataset consisting of real-world images taken from\nground stations and their corresponding weather data. We also propose MIS-ME -\nMeteorological & Image based Soil Moisture Estimator, a multi-modal framework\nfor soil moisture estimation. Our extensive analysis shows that MIS-ME achieves\na MAPE of 10.14%, outperforming traditional unimodal approaches with a\nreduction of 3.25% in MAPE for meteorological data and 2.15% in MAPE for image\ndata, highlighting the effectiveness of tailored multi-modal approaches.\n","authors":["Mohammed Rakib","Adil Aman Mohammed","Cole Diggins","Sumit Sharma","Jeff Michael Sadler","Tyson Ochsner","Arun Bagavathi"],"pdf_url":"https://arxiv.org/pdf/2408.00963v2.pdf","comment":"Accepted by DSAA2024"},{"id":"http://arxiv.org/abs/2408.06524v1","updated":"2024-08-12T22:53:14Z","published":"2024-08-12T22:53:14Z","title":"From Graphs to Qubits: A Critical Review of Quantum Graph Neural\n Networks","summary":" Quantum Graph Neural Networks (QGNNs) represent a novel fusion of quantum\ncomputing and Graph Neural Networks (GNNs), aimed at overcoming the\ncomputational and scalability challenges inherent in classical GNNs, which are\npowerful tools for analyzing data with complex relational structures but suffer\nfrom limitations such as high computational complexity and over-smoothing in\nlarge-scale applications. Quantum computing, leveraging principles like\nsuperposition and entanglement, offers a pathway to enhanced computational\ncapabilities. This paper critically reviews the state-of-the-art in QGNNs,\nexploring various architectures. We discuss their applications across diverse\nfields such as high-energy physics, molecular chemistry, finance and earth\nsciences, highlighting the potential for quantum advantage. Additionally, we\naddress the significant challenges faced by QGNNs, including noise,\ndecoherence, and scalability issues, proposing potential strategies to mitigate\nthese problems. This comprehensive review aims to provide a foundational\nunderstanding of QGNNs, fostering further research and development in this\npromising interdisciplinary field.\n","authors":["Andrea Ceschini","Francesco Mauro","Francesca De Falco","Alessandro Sebastianelli","Alessio Verdone","Antonello Rosato","Bertrand Le Saux","Massimo Panella","Paolo Gamba","Silvia L. Ullo"],"pdf_url":"https://arxiv.org/pdf/2408.06524v1.pdf","comment":"21 pages, 9 figures, 2 tables. 
arXiv admin note: text overlap with\n arXiv:1909.12264 by other authors"},{"id":"http://arxiv.org/abs/2408.06512v1","updated":"2024-08-12T22:02:39Z","published":"2024-08-12T22:02:39Z","title":"Learned Ranking Function: From Short-term Behavior Predictions to\n Long-term User Satisfaction","summary":" We present the Learned Ranking Function (LRF), a system that takes short-term\nuser-item behavior predictions as input and outputs a slate of recommendations\nthat directly optimizes for long-term user satisfaction. Most previous work is\nbased on optimizing the hyperparameters of a heuristic function. We propose to\nmodel the problem directly as a slate optimization problem with the objective\nof maximizing long-term user satisfaction. We also develop a novel constraint\noptimization algorithm that stabilizes objective trade-offs for multi-objective\noptimization. We evaluate our approach with live experiments and describe its\ndeployment on YouTube.\n","authors":["Yi Wu","Daryl Chang","Jennifer She","Zhe Zhao","Li Wei","Lukasz Heldt"],"pdf_url":"https://arxiv.org/pdf/2408.06512v1.pdf","comment":"RecSys 24"},{"id":"http://arxiv.org/abs/2408.06509v1","updated":"2024-08-12T21:57:18Z","published":"2024-08-12T21:57:18Z","title":"Fooling SHAP with Output Shuffling Attacks","summary":" Explainable AI~(XAI) methods such as SHAP can help discover feature\nattributions in black-box models. If the method reveals a significant\nattribution from a ``protected feature'' (e.g., gender, race) on the model\noutput, the model is considered unfair. However, adversarial attacks can\nsubvert the detection of XAI methods. Previous approaches to constructing such\nan adversarial model require access to underlying data distribution, which may\nnot be possible in many practical scenarios. We relax this constraint and\npropose a novel family of attacks, called shuffling attacks, that are\ndata-agnostic. The proposed attack strategies can adapt any trained machine\nlearning model to fool Shapley value-based explanations. We prove that Shapley\nvalues cannot detect shuffling attacks. However, algorithms that estimate\nShapley values, such as linear SHAP and SHAP, can detect these attacks with\nvarying degrees of effectiveness. We demonstrate the efficacy of the attack\nstrategies by comparing the performance of linear SHAP and SHAP using\nreal-world datasets.\n","authors":["Jun Yuan","Aritra Dasgupta"],"pdf_url":"https://arxiv.org/pdf/2408.06509v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.06502v1","updated":"2024-08-12T21:35:59Z","published":"2024-08-12T21:35:59Z","title":"Prompt Recovery for Image Generation Models: A Comparative Study of\n Discrete Optimizers","summary":" Recovering natural language prompts for image generation models, solely based\non the generated images is a difficult discrete optimization problem. In this\nwork, we present the first head-to-head comparison of recent discrete\noptimization techniques for the problem of prompt inversion. We evaluate Greedy\nCoordinate Gradients (GCG), PEZ , Random Search, AutoDAN and BLIP2's image\ncaptioner across various evaluation metrics related to the quality of inverted\nprompts and the quality of the images generated by the inverted prompts. We\nfind that focusing on the CLIP similarity between the inverted prompts and the\nground truth image acts as a poor proxy for the similarity between ground truth\nimage and the image generated by the inverted prompts. 
While the discrete\noptimizers effectively minimize their objectives, simply using responses from a\nwell-trained captioner often leads to generated images that more closely\nresemble those produced by the original prompts.\n","authors":["Joshua Nathaniel Williams","Avi Schwarzschild","J. Zico Kolter"],"pdf_url":"https://arxiv.org/pdf/2408.06502v1.pdf","comment":"9 Pages, 4 Figures"},{"id":"http://arxiv.org/abs/2408.06500v1","updated":"2024-08-12T21:25:19Z","published":"2024-08-12T21:25:19Z","title":"Music2Latent: Consistency Autoencoders for Latent Audio Compression","summary":" Efficient audio representations in a compressed continuous latent space are\ncritical for generative audio modeling and Music Information Retrieval (MIR)\ntasks. However, some existing audio autoencoders have limitations, such as\nmulti-stage training procedures, slow iterative sampling, or low reconstruction\nquality. We introduce Music2Latent, an audio autoencoder that overcomes these\nlimitations by leveraging consistency models. Music2Latent encodes samples into\na compressed continuous latent space in a single end-to-end training process\nwhile enabling high-fidelity single-step reconstruction. Key innovations\ninclude conditioning the consistency model on upsampled encoder outputs at all\nlevels through cross connections, using frequency-wise self-attention to\ncapture long-range frequency dependencies, and employing frequency-wise learned\nscaling to handle varying value distributions across frequencies at different\nnoise levels. We demonstrate that Music2Latent outperforms existing continuous\naudio autoencoders in sound quality and reconstruction accuracy while achieving\ncompetitive performance on downstream MIR tasks using its latent\nrepresentations. To our knowledge, this represents the first successful attempt\nat training an end-to-end consistency autoencoder model.\n","authors":["Marco Pasini","Stefan Lattner","George Fazekas"],"pdf_url":"https://arxiv.org/pdf/2408.06500v1.pdf","comment":"Accepted to ISMIR 2024"},{"id":"http://arxiv.org/abs/2311.07786v2","updated":"2024-08-12T21:17:30Z","published":"2023-11-13T22:32:02Z","title":"Predicting the First Response Latency of Maintainers and Contributors in\n Pull Requests","summary":" The success of a Pull Request (PR) depends on the responsiveness of the\nmaintainers and the contributor during the review process. Being aware of the\nexpected waiting times can lead to better interactions and managed expectations\nfor both the maintainers and the contributor. In this paper, we propose a\nmachine-learning approach to predict the first response latency of the\nmaintainers following the submission of a PR, and the first response latency of\nthe contributor after receiving the first response from the maintainers. We\ncurate a dataset of 20 large and popular open-source projects on GitHub and\nextract 21 features to characterize projects, contributors, PRs, and review\nprocesses. Using these features, we then evaluate seven types of classifiers to\nidentify the best-performing models. We also conduct permutation feature\nimportance and SHAP analyses to understand the importance and the impact of\ndifferent features on the predicted response latencies. We find that our\nCatBoost models are the most effective for predicting the first response\nlatencies of both maintainers and contributors. 
We also observe that PRs\nsubmitted earlier in the week, containing an average number of commits, and\nwith concise descriptions are more likely to receive faster first responses\nfrom the maintainers. Similarly, PRs with a lower first response latency from\nmaintainers, that received the first response of maintainers earlier in the\nweek, and containing an average number of commits tend to receive faster first\nresponses from the contributors. Additionally, contributors with a higher\nacceptance rate and a history of timely responses in the project are likely to\nboth obtain and provide faster first responses. Moreover, we show the\neffectiveness of our approach in a cross-project setting.\n","authors":["SayedHassan Khatoonabadi","Ahmad Abdellatif","Diego Elias Costa","Emad Shihab"],"pdf_url":"https://arxiv.org/pdf/2311.07786v2.pdf","comment":"Manuscript accepted for publication in IEEE Transactions on Software\n Engineering (TSE)"},{"id":"http://arxiv.org/abs/2404.12358v2","updated":"2024-08-12T21:13:35Z","published":"2024-04-18T17:37:02Z","title":"From $r$ to $Q^*$: Your Language Model is Secretly a Q-Function","summary":" Reinforcement Learning From Human Feedback (RLHF) has been critical to the\nsuccess of the latest generation of generative AI models. In response to the\ncomplex nature of the classical RLHF pipeline, direct alignment algorithms such\nas Direct Preference Optimization (DPO) have emerged as an alternative\napproach. Although DPO solves the same objective as the standard RLHF setup,\nthere is a mismatch between the two approaches. Standard RLHF deploys\nreinforcement learning in a specific token-level MDP, while DPO is derived as a\nbandit problem in which the whole response of the model is treated as a single\narm. In this work we rectify this difference. We theoretically show that we can\nderive DPO in the token-level MDP as a general inverse Q-learning algorithm,\nwhich satisfies the Bellman equation. Using our theoretical results, we provide\nthree concrete empirical insights. First, we show that because of its token\nlevel interpretation, DPO is able to perform some type of credit assignment.\nNext, we prove that under the token level formulation, classical search-based\nalgorithms, such as MCTS, which have recently been applied to the language\ngeneration space, are equivalent to likelihood-based search on a DPO policy.\nEmpirically we show that a simple beam search yields meaningful improvement\nover the base DPO policy. Finally, we show how the choice of reference policy\ncauses implicit rewards to decline during training. We conclude by discussing\napplications of our work, including information elicitation in multi-turn\ndialogue, reasoning, agentic applications and end-to-end training of\nmulti-model systems.\n","authors":["Rafael Rafailov","Joey Hejna","Ryan Park","Chelsea Finn"],"pdf_url":"https://arxiv.org/pdf/2404.12358v2.pdf","comment":"COLM 2024"},{"id":"http://arxiv.org/abs/2305.01094v4","updated":"2024-08-12T20:59:55Z","published":"2023-05-01T21:31:29Z","title":"Performative Prediction with Bandit Feedback: Learning through\n Reparameterization","summary":" Performative prediction, as introduced by Perdomo et al, is a framework for\nstudying social prediction in which the data distribution itself changes in\nresponse to the deployment of a model. 
Existing work in this field usually\nhinges on three assumptions that are easily violated in practice: that the\nperformative risk is convex over the deployed model, that the mapping from the\nmodel to the data distribution is known to the model designer in advance, and\nthe first-order information of the performative risk is available. In this\npaper, we initiate the study of performative prediction problems that do not\nrequire these assumptions. Specifically, we develop a reparameterization\nframework that reparametrizes the performative prediction objective as a\nfunction of the induced data distribution. We then develop a two-level\nzeroth-order optimization procedure, where the first level performs iterative\noptimization on the distribution parameter space, and the second level learns\nthe model that induces a particular target distribution at each iteration.\nUnder mild conditions, this reparameterization allows us to transform the\nnon-convex objective into a convex one and achieve provable regret guarantees.\nIn particular, we provide a regret bound that is sublinear in the total number\nof performative samples taken and is only polynomial in the dimension of the\nmodel parameter.\n","authors":["Yatong Chen","Wei Tang","Chien-Ju Ho","Yang Liu"],"pdf_url":"https://arxiv.org/pdf/2305.01094v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.00664v6","updated":"2024-08-12T20:49:41Z","published":"2023-05-01T05:26:33Z","title":"EvoluNet: Advancing Dynamic Non-IID Transfer Learning on Graphs","summary":" Non-IID transfer learning on graphs is crucial in many high-stakes domains.\nThe majority of existing works assume stationary distribution for both source\nand target domains. However, real-world graphs are intrinsically dynamic,\npresenting challenges in terms of domain evolution and dynamic discrepancy\nbetween source and target domains. To bridge the gap, we shift the problem to\nthe dynamic setting and pose the question: given the label-rich source graphs\nand the label-scarce target graphs both observed in previous T timestamps, how\ncan we effectively characterize the evolving domain discrepancy and optimize\nthe generalization performance of the target domain at the incoming T+1\ntimestamp? To answer it, we propose a generalization bound for dynamic non-IID\ntransfer learning on graphs, which implies the generalization performance is\ndominated by domain evolution and domain discrepancy between source and target\ngraphs. Inspired by the theoretical results, we introduce a novel generic\nframework named EvoluNet. It leverages a transformer-based temporal encoding\nmodule to model temporal information of the evolving domains and then uses a\ndynamic domain unification module to efficiently learn domain-invariant\nrepresentations across the source and target domains. 
Finally, EvoluNet\noutperforms the state-of-the-art models by up to 12.1%, demonstrating its\neffectiveness in transferring knowledge from dynamic source graphs to dynamic\ntarget graphs.\n","authors":["Haohui Wang","Yuzhen Mao","Yujun Yan","Yaoqing Yang","Jianhui Sun","Kevin Choi","Balaji Veeramani","Alison Hu","Edward Bowen","Tyler Cody","Dawei Zhou"],"pdf_url":"https://arxiv.org/pdf/2305.00664v6.pdf","comment":"Accepted at ICML 2024"},{"id":"http://arxiv.org/abs/2408.06486v1","updated":"2024-08-12T20:41:07Z","published":"2024-08-12T20:41:07Z","title":"Implicit Neural Representation For Accurate CFD Flow Field Prediction","summary":" Despite the plethora of deep learning frameworks for flow field prediction,\nmost of them deal with flow fields on regular domains, and although the best\nones can cope with irregular domains, they mostly rely on graph networks, so\nthat real industrial applications remain currently elusive. We present a deep\nlearning framework for 3D flow field prediction applied to blades of aircraft\nengine turbines and compressors. Crucially, we view any 3D field as a function\nfrom coordinates that is modeled by a neural network we call the backbone-net.\nIt inherits the property of coordinate-based MLPs, namely the\ndiscretization-agnostic representation of flow fields in domains of arbitrary\ntopology at infinite resolution. First, we demonstrate the performance of the\nbackbone-net solo in regressing 3D steady simulations of single blade rows in\nvarious flow regimes: it can accurately render important flow characteristics\nsuch as boundary layers, wakes and shock waves. Second, we introduce a\nhyper-net that maps the surface mesh of a blade to the parameters of the\nbackbone-net. By doing so, the flow solution can be directly predicted from the\nblade geometry, irrespective of its parameterization. Together, backbone-net\nand hyper-net form a highly-accurate memory-efficient data-driven proxy to CFD\nsolvers with good generalization on unseen geometries.\n","authors":["Laurent de Vito","Nils Pinnau","Simone Dey"],"pdf_url":"https://arxiv.org/pdf/2408.06486v1.pdf","comment":"ECCOMAS CONGRESS 2024, 9th European Congress on Computational Methods\n in Applied Sciences and Engineering"},{"id":"http://arxiv.org/abs/2408.06465v1","updated":"2024-08-12T19:32:28Z","published":"2024-08-12T19:32:28Z","title":"Kernel Sum of Squares for Data Adapted Kernel Learning of Dynamical\n Systems from Data: A global optimization approach","summary":" This paper examines the application of the Kernel Sum of Squares (KSOS)\nmethod for enhancing kernel learning from data, particularly in the context of\ndynamical systems. Traditional kernel-based methods, despite their theoretical\nsoundness and numerical efficiency, frequently struggle with selecting optimal\nbase kernels and parameter tuning, especially with gradient-based methods prone\nto local optima. KSOS mitigates these issues by leveraging a global\noptimization framework with kernel-based surrogate functions, thereby achieving\nmore reliable and precise learning of dynamical systems. Through comprehensive\nnumerical experiments on the Logistic Map, Henon Map, and Lorentz System, KSOS\nis shown to consistently outperform gradient descent in minimizing the\nrelative-$\\rho$ metric and improving kernel accuracy. 
These results highlight\nKSOS's effectiveness in predicting the behavior of chaotic dynamical systems,\ndemonstrating its capability to adapt kernels to underlying dynamics and\nenhance the robustness and predictive power of kernel-based approaches, making\nit a valuable asset for time series analysis in various scientific fields.\n","authors":["Daniel Lengyel","Panos Parpas","Boumediene Hamzi","Houman Owhadi"],"pdf_url":"https://arxiv.org/pdf/2408.06465v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.11798v3","updated":"2024-08-12T19:22:00Z","published":"2023-11-20T14:31:18Z","title":"Neural Dynamical Operator: Continuous Spatial-Temporal Model with\n Gradient-Based and Derivative-Free Optimization Methods","summary":" Data-driven modeling techniques have been explored in the spatial-temporal\nmodeling of complex dynamical systems for many engineering applications.\nHowever, a systematic approach is still lacking to leverage the information\nfrom different types of data, e.g., with different spatial and temporal\nresolutions, and the combined use of short-term trajectories and long-term\nstatistics. In this work, we build on the recent progress of neural operator\nand present a data-driven modeling framework called neural dynamical operator\nthat is continuous in both space and time. A key feature of the neural\ndynamical operator is the resolution-invariance with respect to both spatial\nand temporal discretizations, without demanding abundant training data in\ndifferent temporal resolutions. To improve the long-term performance of the\ncalibrated model, we further propose a hybrid optimization scheme that\nleverages both gradient-based and derivative-free optimization methods and\nefficiently trains on both short-term time series and long-term statistics. We\ninvestigate the performance of the neural dynamical operator with three\nnumerical examples, including the viscous Burgers' equation, the Navier-Stokes\nequations, and the Kuramoto-Sivashinsky equation. The results confirm the\nresolution-invariance of the proposed modeling framework and also demonstrate\nstable long-term simulations with only short-term time series data. In\naddition, we show that the proposed model can better predict long-term\nstatistics via the hybrid optimization scheme with a combined use of short-term\nand long-term data.\n","authors":["Chuanqi Chen","Jin-Long Wu"],"pdf_url":"https://arxiv.org/pdf/2311.11798v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.06452v1","updated":"2024-08-12T19:01:49Z","published":"2024-08-12T19:01:49Z","title":"Wireless Channel Aware Data Augmentation Methods for Deep Leaning-Based\n Indoor Localization","summary":" Indoor localization is a challenging problem that - unlike outdoor\nlocalization - lacks a universal and robust solution. Machine Learning (ML),\nparticularly Deep Learning (DL), methods have been investigated as a promising\napproach. Although such methods bring remarkable localization accuracy, they\nheavily depend on the training data collected from the environment. The data\ncollection is usually a laborious and time-consuming task, but Data\nAugmentation (DA) can be used to alleviate this issue. In this paper, different\nfrom previously used DA, we propose methods that utilize the domain knowledge\nabout wireless propagation channels and devices. The methods exploit the\ntypical hardware component drift in the transceivers and/or the statistical\nbehavior of the channel, in combination with the measured Power Delay Profile\n(PDP). 
We comprehensively evaluate the proposed methods to demonstrate their\neffectiveness. This investigation mainly focuses on how factors such\nas the number of measurements, augmentation proportion, and the environment of\ninterest impact the effectiveness of the different DA methods. We show that in\nthe low-data regime (few actual measurements available), localization accuracy\nincreases up to 50%, matching non-augmented results in the high-data regime. In\naddition, the proposed methods may outperform the measurement-only high-data\nperformance by up to 33% using only 1/4 of the amount of measured data. We also\nexhibit the effect of different training data distribution and quality on the\neffectiveness of DA. Finally, we demonstrate the power of the proposed methods\nwhen employed along with Transfer Learning (TL) to address the data scarcity in\ntarget and/or source environments.\n","authors":["Omer Gokalp Serbetci","Daoud Burghal","Andreas F. Molisch"],"pdf_url":"https://arxiv.org/pdf/2408.06452v1.pdf","comment":"13 pages, 14 figures"},{"id":"http://arxiv.org/abs/2408.06450v1","updated":"2024-08-12T18:59:13Z","published":"2024-08-12T18:59:13Z","title":"Evaluating Language Models for Efficient Code Generation","summary":" We introduce Differential Performance Evaluation (DPE), a framework designed\nto reliably evaluate Large Language Models (LLMs) for efficient code\ngeneration. Traditional coding benchmarks often fail to provide reliable\ninsights into code efficiency, due to their reliance on simplistic test inputs\nand the absence of effective compound metrics. DPE addresses these issues by\nfocusing on efficiency-demanding programming tasks and establishing an\ninsightful compound metric for performance evaluation. DPE operates in two\nphases: To curate efficiency datasets, it selects efficiency-demanding tasks\nfrom existing coding benchmarks and generates computationally expensive inputs\nto stress the efficiency of LLM solutions. To assess the code efficiency, DPE\nprofiles the new solution and compares it globally against a set of reference\nsolutions that exhibit distinct efficiency levels, where the matched level\ndefines its efficiency score. As a proof of concept, we use DPE to create\nEvalPerf, a benchmark with 121 performance-challenging coding tasks. Our\ncomprehensive evaluation draws interesting findings on the efficiency impact of\nmodel sizes, instruction tuning, and prompting. For example, while the scaling\nlaw fails to account for code efficiency, general instruction tuning benefits\nboth code correctness and efficiency. We also evaluate the evaluation by\nexamining the effectiveness of DPE, showing that EvalPerf is reliable and\nconvenient to use even across platforms.\n","authors":["Jiawei Liu","Songrun Xie","Junhao Wang","Yuxiang Wei","Yifeng Ding","Lingming Zhang"],"pdf_url":"https://arxiv.org/pdf/2408.06450v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.06445v1","updated":"2024-08-12T18:49:02Z","published":"2024-08-12T18:49:02Z","title":"Multi-View Neural Differential Equations for Continuous-Time Stream Data\n in Long-Term Traffic Forecasting","summary":" Long-term traffic flow forecasting plays a crucial role in intelligent\ntransportation as it allows traffic managers to adjust their decisions in\nadvance. 
However, the problem is challenging due to spatio-temporal\ncorrelations and complex dynamic patterns in continuous-time stream data.\nNeural Differential Equations (NDEs) are among the state-of-the-art methods for\nlearning continuous-time traffic dynamics. However, the traditional NDE models\nface issues in long-term traffic forecasting due to failures in capturing\ndelayed traffic patterns, dynamic edge (location-to-location correlation)\npatterns, and abrupt trend patterns. To fill this gap, we propose a new NDE\narchitecture called Multi-View Neural Differential Equations. Our model\ncaptures current states, delayed states, and trends in different state\nvariables (views) by learning latent multiple representations within Neural\nDifferential Equations. Extensive experiments conducted on several real-world\ntraffic datasets demonstrate that our proposed method outperforms the\nstate-of-the-art and achieves superior prediction accuracy for long-term\nforecasting and robustness with noisy or missing inputs.\n","authors":["Zibo Liu","Zhe Jiang","Shigang Chen"],"pdf_url":"https://arxiv.org/pdf/2408.06445v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.18699v2","updated":"2024-08-12T18:12:44Z","published":"2024-04-29T13:47:59Z","title":"Convergence Properties of Score-Based Models for Linear Inverse Problems\n Using Graduated Optimisation","summary":" The incorporation of generative models as regularisers within variational\nformulations for inverse problems has proven effective across numerous image\nreconstruction tasks. However, the resulting optimisation problem is often\nnon-convex and challenging to solve. In this work, we show that score-based\ngenerative models (SGMs) can be used in a graduated optimisation framework to\nsolve inverse problems. We show that the resulting graduated non-convexity flow\nconverge to stationary points of the original problem and provide a numerical\nconvergence analysis of a 2D toy example. We further provide experiments on\ncomputed tomography image reconstruction, where we show that this framework is\nable to recover high-quality images, independent of the initial value. The\nexperiments highlight the potential of using SGMs in graduated optimisation\nframeworks. The source code is publicly available on GitHub.\n","authors":["Pascal Fernsel","Željko Kereta","Alexander Denker"],"pdf_url":"https://arxiv.org/pdf/2404.18699v2.pdf","comment":"8 pages"},{"id":"http://arxiv.org/abs/2408.06425v1","updated":"2024-08-12T18:04:59Z","published":"2024-08-12T18:04:59Z","title":"Bayesian Learning in a Nonlinear Multiscale State-Space Model","summary":" The ubiquity of multiscale interactions in complex systems is\nwell-recognized, with development and heredity serving as a prime example of\nhow processes at different temporal scales influence one another. This work\nintroduces a novel multiscale state-space model to explore the dynamic\ninterplay between systems interacting across different time scales, with\nfeedback between each scale. We propose a Bayesian learning framework to\nestimate unknown states by learning the unknown process noise covariances\nwithin this multiscale model. We develop a Particle Gibbs with Ancestor\nSampling (PGAS) algorithm for inference and demonstrate through simulations the\nefficacy of our approach.\n","authors":["Nayely Vélez-Cruz","Manfred D. 
Laubichler"],"pdf_url":"https://arxiv.org/pdf/2408.06425v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.06421v1","updated":"2024-08-12T18:01:04Z","published":"2024-08-12T18:01:04Z","title":"Neural Networks as Spin Models: From Glass to Hidden Order Through\n Training","summary":" We explore a one-to-one correspondence between a neural network (NN) and a\nstatistical mechanical spin model where neurons are mapped to Ising spins and\nweights to spin-spin couplings. The process of training an NN produces a family\nof spin Hamiltonians parameterized by training time. We study the magnetic\nphases and the melting transition temperature as training progresses. First, we\nprove analytically that the common initial state before training--an NN with\nindependent random weights--maps to a layered version of the classical\nSherrington-Kirkpatrick spin glass exhibiting a replica symmetry breaking. The\nspin-glass-to-paramagnet transition temperature is calculated. Further, we use\nthe Thouless-Anderson-Palmer (TAP) equations--a theoretical technique to\nanalyze the landscape of energy minima of random systems--to determine the\nevolution of the magnetic phases on two types of NNs (one with continuous and\none with binarized activations) trained on the MNIST dataset. The two NN types\ngive rise to similar results, showing a quick destruction of the spin glass and\nthe appearance of a phase with a hidden order, whose melting transition\ntemperature $T_c$ grows as a power law in training time. We also discuss the\nproperties of the spectrum of the spin system's bond matrix in the context of\nrich vs. lazy learning. We suggest that this statistical mechanical view of NNs\nprovides a useful unifying perspective on the training process, which can be\nviewed as selecting and strengthening a symmetry-broken state associated with\nthe training task.\n","authors":["Richard Barney","Michael Winer","Victor Galitksi"],"pdf_url":"https://arxiv.org/pdf/2408.06421v1.pdf","comment":"18 pages, 9 figures"},{"id":"http://arxiv.org/abs/2408.07020v1","updated":"2024-08-12T17:30:17Z","published":"2024-08-12T17:30:17Z","title":"Source Separation of Multi-source Raw Music using a Residual Quantized\n Variational Autoencoder","summary":" I developed a neural audio codec model based on the residual quantized\nvariational autoencoder architecture. I train the model on the Slakh2100\ndataset, a standard dataset for musical source separation, composed of\nmulti-track audio. The model can separate audio sources, achieving almost SoTA\nresults with much less computing power. The code is publicly available at\ngithub.com/LeonardoBerti00/Source-Separation-of-Multi-source-Music-using-Residual-Quantizad-Variational-Autoencoder\n","authors":["Leonardo Berti"],"pdf_url":"https://arxiv.org/pdf/2408.07020v1.pdf","comment":"9 pages"}],"Multimedia":[{"id":"http://arxiv.org/abs/2408.06248v1","updated":"2024-08-12T16:00:17Z","published":"2024-08-12T16:00:17Z","title":"Rethinking Video with a Universal Event-Based Representation","summary":" Traditionally, video is structured as a sequence of discrete image frames.\nRecently, however, a novel video sensing paradigm has emerged which eschews\nvideo frames entirely. These \"event\" sensors aim to mimic the human vision\nsystem with asynchronous sensing, where each pixel has an independent, sparse\ndata stream. 
While these cameras enable high-speed and high-dynamic-range\nsensing, researchers often revert to a framed representation of the event data\nfor existing applications, or build bespoke applications for a particular\ncamera's event data type. At the same time, classical video systems have\nsignificant computational redundancy at the application layer, since pixel\nsamples are repeated across frames in the uncompressed domain.\n To address the shortcomings of existing systems, I introduce Address,\nDecimation, {\\Delta}t Event Representation (AD{\\Delta}ER, pronounced \"adder\"),\na novel intermediate video representation and system framework. The framework\ntranscodes a variety of framed and event camera sources into a single\nevent-based representation, which supports source-modeled lossy compression and\nbackward compatibility with traditional frame-based applications. I demonstrate\nthat AD{\\Delta}ER achieves state-of-the-art application speed and compression\nperformance for scenes with high temporal redundancy. Crucially, I describe how\nAD{\\Delta}ER unlocks an entirely new control mechanism for computer vision:\napplication speed can correlate with both the scene content and the level of\nlossy compression. Finally, I discuss the implications for event-based video on\nlarge-scale video surveillance and resource-constrained sensing.\n","authors":["Andrew Freeman"],"pdf_url":"https://arxiv.org/pdf/2408.06248v1.pdf","comment":"137 pages. PhD dissertation at the University of North Carolina,\n Chapel Hill"},{"id":"http://arxiv.org/abs/2408.06152v1","updated":"2024-08-12T13:48:06Z","published":"2024-08-12T13:48:06Z","title":"Palantir: Towards Efficient Super Resolution for Ultra-high-definition\n Live Streaming","summary":" Neural enhancement through super-resolution deep neural networks opens up new\npossibilities for ultra-high-definition live streaming over existing encoding\nand networking infrastructure. Yet, the heavy SR DNN inference overhead leads\nto severe deployment challenges. To reduce the overhead, existing systems\npropose to apply DNN-based SR only on selected anchor frames while upscaling\nnon-anchor frames via the lightweight reusing-based SR approach. However,\nframe-level scheduling is coarse-grained and fails to deliver optimal\nefficiency. In this work, we propose Palantir, the first neural-enhanced UHD\nlive streaming system with fine-grained patch-level scheduling. In the\npresented solutions, two novel techniques are incorporated to make good\nscheduling decisions for inference overhead optimization and reduce the\nscheduling latency. Firstly, under the guidance of our pioneering and\ntheoretical analysis, Palantir constructs a directed acyclic graph (DAG) for\nlightweight yet accurate quality estimation under any possible anchor patch\nset. Secondly, to further optimize the scheduling latency, Palantir improves\nparallelizability by refactoring the computation subprocedure of the estimation\nprocess into a sparse matrix-matrix multiplication operation. The evaluation\nresults suggest that Palantir incurs a negligible scheduling latency accounting\nfor less than 5.7% of the end-to-end latency requirement. 
When compared to the\nstate-of-the-art real-time frame-level scheduling strategy, Palantir reduces\nthe energy overhead of SR-integrated mobile clients by 38.1% at most (and 22.4%\non average) and the monetary costs of cloud-based SR by 80.1% at most (and\n38.4% on average).\n","authors":["Xinqi Jin","Zhui Zhu","Xikai Sun","Fan Dang","Jiangchuan Liu","Jingao Xu","Kebin Liu","Xinlei Chen","Yunhao Liu"],"pdf_url":"https://arxiv.org/pdf/2408.06152v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.06123v1","updated":"2024-08-12T13:05:43Z","published":"2024-08-12T13:05:43Z","title":"DPDETR: Decoupled Position Detection Transformer for Infrared-Visible\n Object Detection","summary":" Infrared-visible object detection aims to achieve robust object detection by\nleveraging the complementary information of infrared and visible image pairs.\nHowever, the commonly existing modality misalignment problem presents two\nchallenges: fusing misalignment complementary features is difficult, and\ncurrent methods cannot accurately locate objects in both modalities under\nmisalignment conditions. In this paper, we propose a Decoupled Position\nDetection Transformer (DPDETR) to address these problems. Specifically, we\nexplicitly formulate the object category, visible modality position, and\ninfrared modality position to enable the network to learn the intrinsic\nrelationships and output accurate positions of objects in both modalities. To\nfuse misaligned object features accurately, we propose a Decoupled Position\nMultispectral Cross-attention module that adaptively samples and aggregates\nmultispectral complementary features with the constraint of infrared and\nvisible reference positions. Additionally, we design a query-decoupled\nMultispectral Decoder structure to address the optimization gap among the three\nkinds of object information in our task and propose a Decoupled Position\nContrastive DeNosing Training strategy to enhance the DPDETR's ability to learn\ndecoupled positions. Experiments on DroneVehicle and KAIST datasets demonstrate\nsignificant improvements compared to other state-of-the-art methods. The code\nwill be released at https://github.com/gjj45/DPDETR.\n","authors":["Junjie Guo","Chenqiang Gao","Fangcen Liu","Deyu Meng"],"pdf_url":"https://arxiv.org/pdf/2408.06123v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.02113v2","updated":"2024-08-12T11:44:10Z","published":"2024-08-04T18:54:59Z","title":"Diseño de sonido para producciones audiovisuales e historias sonoras\n en el aula. Hacia una docencia creativa mediante el uso de herramientas\n inteligentes","summary":" This study aims to share a teaching experience teaching sound design for\naudiovisual productions and compares different projects tackled by students. It\nis not intended to be a comparative analysis of different types of teaching but\nrather an analysis of different problems observed in different profiles of\nstudents of the subject who study it in different grades. The world of audio\ncan be very interesting for a large part of the students, both those with\ncreative and technical inclinations. Musical creation and production,\nsynchronization with images, dubbing, etc. They are disciplines that are\ngenerally interesting but can have a very high barrier to entry due to their\ngreat technical complexity. Sometimes it can take weeks or even months for the\nuninitiated to begin to use audio editing programs with the necessary ease,\nwhich are not always particularly intuitive for students. 
Learning through the\nuse of PBL methodologies generates, in our experience, results much superior to\nthose that can be observed through the use of other teaching methods such as\nmaster classes. Students acquire technical skills while developing creative\nprojects in which they get personally involved. Despite everything mentioned\nabove, most interactions between teachers and students focus on aspects of\ntechnical correction. From different parameters in reverbs (such as pre-delay,\ndecay, modulation...) to how to correctly adjust compressors, noise gates,\netc.; The number of tools with which to work with audio is incredibly\nextensive, as well as many of its features that can present serious differences\ndepending on their manufacturers.\n","authors":["Miguel Civit","Francisco Cuadrado"],"pdf_url":"https://arxiv.org/pdf/2408.02113v2.pdf","comment":"11 pages, in Spanish language. 1 figure. Preprint from La nueva era\n del podcast (2023)"},{"id":"http://arxiv.org/abs/2305.13840v3","updated":"2024-08-12T08:30:05Z","published":"2023-05-23T09:03:19Z","title":"Control-A-Video: Controllable Text-to-Video Diffusion Models with Motion\n Prior and Reward Feedback Learning","summary":" Recent advances in text-to-image (T2I) diffusion models have enabled\nimpressive image generation capabilities guided by text prompts. However,\nextending these techniques to video generation remains challenging, with\nexisting text-to-video (T2V) methods often struggling to produce high-quality\nand motion-consistent videos. In this work, we introduce Control-A-Video, a\ncontrollable T2V diffusion model that can generate videos conditioned on text\nprompts and reference control maps like edge and depth maps. To tackle video\nquality and motion consistency issues, we propose novel strategies to\nincorporate content prior and motion prior into the diffusion-based generation\nprocess. Specifically, we employ a first-frame condition scheme to transfer\nvideo generation from the image domain. Additionally, we introduce\nresidual-based and optical flow-based noise initialization to infuse motion\npriors from reference videos, promoting relevance among frame latents for\nreduced flickering. Furthermore, we present a Spatio-Temporal Reward Feedback\nLearning (ST-ReFL) algorithm that optimizes the video diffusion model using\nmultiple reward models for video quality and motion consistency, leading to\nsuperior outputs. Comprehensive experiments demonstrate that our framework\ngenerates higher-quality, more consistent videos compared to existing\nstate-of-the-art methods in controllable text-to-video generation\n","authors":["Weifeng Chen","Yatai Ji","Jie Wu","Hefeng Wu","Pan Xie","Jiashi Li","Xin Xia","Xuefeng Xiao","Liang Lin"],"pdf_url":"https://arxiv.org/pdf/2305.13840v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.05966v1","updated":"2024-08-12T07:44:19Z","published":"2024-08-12T07:44:19Z","title":"Freehand Sketch Generation from Mechanical Components","summary":" Drawing freehand sketches of mechanical components on multimedia devices for\nAI-based engineering modeling has become a new trend. However, its development\nis being impeded because existing works cannot produce suitable sketches for\ndata-driven research. These works either generate sketches lacking a freehand\nstyle or utilize generative models not originally designed for this task\nresulting in poor effectiveness. 
To address this issue, we design a two-stage\ngenerative framework mimicking the human sketching behavior pattern, called\nMSFormer, which is the first to produce humanoid freehand sketches\ntailored for mechanical components. The first stage employs Open CASCADE\ntechnology to obtain multi-view contour sketches from mechanical components,\nfiltering perturbing signals for the ensuing generation process. Meanwhile, we\ndesign a view selector to simulate viewpoint selection tasks during human\nsketching for picking out information-rich sketches. The second stage\ntranslates contour sketches into freehand sketches by a transformer-based\ngenerator. To retain essential modeling features as much as possible and\nrationalize stroke distribution, we introduce a novel edge-constraint stroke\ninitialization. Furthermore, we utilize a CLIP vision encoder and a new loss\nfunction incorporating the Hausdorff distance to enhance the generalizability\nand robustness of the model. Extensive experiments demonstrate that our\napproach achieves state-of-the-art performance for generating freehand sketches\nin the mechanical domain. Project page: https://mcfreeskegen.github.io .\n","authors":["Zhichao Liao","Di Huang","Heming Fang","Yue Ma","Fengyuan Piao","Xinghui Li","Long Zeng","Pingfa Feng"],"pdf_url":"https://arxiv.org/pdf/2408.05966v1.pdf","comment":"Published at ACM Multimedia (ACM MM) 2024"},{"id":"http://arxiv.org/abs/2408.05953v1","updated":"2024-08-12T07:04:52Z","published":"2024-08-12T07:04:52Z","title":"A Simple Task-aware Contrastive Local Descriptor Selection Strategy for\n Few-shot Learning between inter class and intra class","summary":" Few-shot image classification aims to classify novel classes with few labeled\nsamples. Recent research indicates that deep local descriptors have better\nrepresentational capabilities. These studies recognize the impact of background\nnoise on classification performance. They typically filter query descriptors\nusing all local descriptors in the support classes or engage in bidirectional\nselection between local descriptors in support and query sets. However, they\nignore the fact that background features may be useful for the classification\nperformance of specific tasks. This paper proposes a novel task-aware\ncontrastive local descriptor selection network (TCDSNet). First, we calculate\nthe contrastive discriminative score for each local descriptor in the support\nclass, and select discriminative local descriptors to form a support descriptor\nsubset. Finally, we leverage support descriptor subsets to adaptively select\ndiscriminative query descriptors for specific tasks. Extensive experiments\ndemonstrate that our method outperforms state-of-the-art methods on both\ngeneral and fine-grained datasets.\n","authors":["Qian Qiao","Yu Xie","Shaoyao Huang","Fanzhang Li"],"pdf_url":"https://arxiv.org/pdf/2408.05953v1.pdf","comment":"Submitted to ICANN 2024"},{"id":"http://arxiv.org/abs/2408.03468v2","updated":"2024-08-12T06:01:33Z","published":"2024-07-28T08:19:09Z","title":"MultiHateClip: A Multilingual Benchmark Dataset for Hateful Video\n Detection on YouTube and Bilibili","summary":" Hate speech is a pressing issue in modern society, with significant effects\nboth online and offline. Recent research in hate speech detection has primarily\ncentered on text-based media, largely overlooking multimodal content such as\nvideos. 
Existing studies on hateful video datasets have predominantly focused\non English content within a Western context and have been limited to binary\nlabels (hateful or non-hateful), lacking detailed contextual information. This\nstudy presents MultiHateClip, a novel multilingual dataset created through\nhate lexicons and human annotation. It aims to enhance the detection of hateful\nvideos on platforms such as YouTube and Bilibili, including content in both\nEnglish and Chinese languages. Comprising 2,000 videos annotated for\nhatefulness, offensiveness, and normalcy, this dataset provides a\ncross-cultural perspective on gender-based hate speech. Through a detailed\nexamination of human annotation results, we discuss the differences between\nChinese and English hateful videos and underscore the importance of different\nmodalities in hateful and offensive video analysis. Evaluations of\nstate-of-the-art video classification models, such as VLM, GPT-4V and Qwen-VL,\non MultiHateClip highlight the existing challenges in accurately distinguishing\nbetween hateful and offensive content and the urgent need for models that are\nboth multimodally and culturally nuanced. MultiHateClip represents a\nfoundational advance in enhancing hateful video detection by underscoring the\nnecessity of a multimodal and culturally sensitive approach in combating online\nhate speech.\n","authors":["Han Wang","Tan Rui Yang","Usman Naseem","Roy Ka-Wei Lee"],"pdf_url":"https://arxiv.org/pdf/2408.03468v2.pdf","comment":"10 pages, 3 figures, ACM Multimedia 2024"},{"id":"http://arxiv.org/abs/2404.03179v2","updated":"2024-08-12T03:31:57Z","published":"2024-04-04T03:28:57Z","title":"UniAV: Unified Audio-Visual Perception for Multi-Task Video Event\n Localization","summary":" Video localization tasks aim to temporally locate specific instances in\nvideos, including temporal action localization (TAL), sound event detection\n(SED) and audio-visual event localization (AVEL). Existing methods\nover-specialize on each task, overlooking the fact that these instances often\noccur in the same video to form the complete video content. In this work, we\npresent UniAV, a Unified Audio-Visual perception network, to achieve joint\nlearning of TAL, SED and AVEL tasks for the first time. UniAV can leverage\ndiverse data available in task-specific datasets, allowing the model to learn\nand share mutually beneficial knowledge across tasks and modalities. To tackle\nthe challenges posed by substantial variations in datasets\n(size/domain/duration) and distinct task characteristics, we propose to\nuniformly encode visual and audio modalities of all videos to derive generic\nrepresentations, while also designing task-specific experts to capture unique\nknowledge for each task. Besides, we develop a unified language-aware\nclassifier by utilizing a pre-trained text encoder, enabling the model to\nflexibly detect various types of instances and previously unseen ones by simply\nchanging prompts during inference. 
UniAV outperforms its single-task\ncounterparts by a large margin with fewer parameters, achieving on-par or\nsuperior performances compared to state-of-the-art task-specific methods across\nActivityNet 1.3, DESED and UnAV-100 benchmarks.\n","authors":["Tiantian Geng","Teng Wang","Yanfu Zhang","Jinming Duan","Weili Guan","Feng Zheng","Ling shao"],"pdf_url":"https://arxiv.org/pdf/2404.03179v2.pdf","comment":"This work has been submitted to the IEEE for possible publication.\n Copyright may be transferred without notice, after which this version may no\n longer be accessible"},{"id":"http://arxiv.org/abs/2408.06468v1","updated":"2024-08-12T19:42:42Z","published":"2024-08-12T19:42:42Z","title":"FoVNet: Configurable Field-of-View Speech Enhancement with Low\n Computation and Distortion for Smart Glasses","summary":" This paper presents a novel multi-channel speech enhancement approach,\nFoVNet, that enables highly efficient speech enhancement within a configurable\nfield of view (FoV) of a smart-glasses user without needing specific\ntarget-talker(s) directions. It advances over prior works by enhancing all\nspeakers within any given FoV, with a hybrid signal processing and deep\nlearning approach designed with high computational efficiency. The neural\nnetwork component is designed with ultra-low computation (about 50 MMACS). A\nmulti-channel Wiener filter and a post-processing module are further used to\nimprove perceptual quality. We evaluate our algorithm with a microphone array\non smart glasses, providing a configurable, efficient solution for augmented\nhearing on energy-constrained devices. FoVNet excels in both computational\nefficiency and speech quality across multiple scenarios, making it a promising\nsolution for smart glasses applications.\n","authors":["Zhongweiyang Xu","Ali Aroudi","Ke Tan","Ashutosh Pandey","Jung-Suk Lee","Buye Xu","Francesco Nesta"],"pdf_url":"https://arxiv.org/pdf/2408.06468v1.pdf","comment":"Accepted by INTERSPEECH2024"},{"id":"http://arxiv.org/abs/2408.07020v1","updated":"2024-08-12T17:30:17Z","published":"2024-08-12T17:30:17Z","title":"Source Separation of Multi-source Raw Music using a Residual Quantized\n Variational Autoencoder","summary":" I developed a neural audio codec model based on the residual quantized\nvariational autoencoder architecture. I train the model on the Slakh2100\ndataset, a standard dataset for musical source separation, composed of\nmulti-track audio. The model can separate audio sources, achieving almost SoTA\nresults with much less computing power. The code is publicly available at\ngithub.com/LeonardoBerti00/Source-Separation-of-Multi-source-Music-using-Residual-Quantizad-Variational-Autoencoder\n","authors":["Leonardo Berti"],"pdf_url":"https://arxiv.org/pdf/2408.07020v1.pdf","comment":"9 pages"},{"id":"http://arxiv.org/abs/2408.05926v1","updated":"2024-08-12T05:22:42Z","published":"2024-08-12T05:22:42Z","title":"BI-MDRG: Bridging Image History in Multimodal Dialogue Response\n Generation","summary":" Multimodal Dialogue Response Generation (MDRG) is a recently proposed task\nwhere the model needs to generate responses in texts, images, or a blend of\nboth based on the dialogue context. Due to the lack of a large-scale dataset\nspecifically for this task and the benefits of leveraging powerful pre-trained\nmodels, previous work relies on the text modality as an intermediary step for\nboth the image input and output of the model rather than adopting an end-to-end\napproach. 
However, this approach can overlook crucial information about the\nimage, hindering 1) image-grounded text response and 2) consistency of objects\nin the image response. In this paper, we propose BI-MDRG that bridges the\nresponse generation path such that the image history information is utilized\nfor enhanced relevance of text responses to the image content and the\nconsistency of objects in sequential image responses. Through extensive\nexperiments on the multimodal dialogue benchmark dataset, we show that BI-MDRG\ncan effectively increase the quality of multimodal dialogue. Additionally,\nrecognizing the gap in benchmark datasets for evaluating the image consistency\nin multimodal dialogue, we have created a curated set of 300 dialogues\nannotated to track object consistency across conversations.\n","authors":["Hee Suk Yoon","Eunseop Yoon","Joshua Tian Jin Tee","Kang Zhang","Yu-Jung Heo","Du-Seong Chang","Chang D. Yoo"],"pdf_url":"https://arxiv.org/pdf/2408.05926v1.pdf","comment":"ECCV 2024"}]},"2024-08-11T00:00:00Z":{"Computation and Language":[{"id":"http://arxiv.org/abs/2408.05874v1","updated":"2024-08-11T22:59:32Z","published":"2024-08-11T22:59:32Z","title":"LLM-Based Robust Product Classification in Commerce and Compliance","summary":" Product classification is a crucial task in international trade, as\ncompliance regulations are verified and taxes and duties are applied based on\nproduct categories. Manual classification of products is time-consuming and\nerror-prone, and the sheer volume of products imported and exported renders the\nmanual process infeasible. Consequently, e-commerce platforms and enterprises\ninvolved in international trade have turned to automatic product classification\nusing machine learning. However, current approaches do not consider the\nreal-world challenges associated with product classification, such as very\nabbreviated and incomplete product descriptions. In addition, recent\nadvancements in generative Large Language Models (LLMs) and their reasoning\ncapabilities are mainly untapped in product classification and e-commerce. In\nthis research, we explore the real-life challenges of industrial classification\nand we propose data perturbations that allow for realistic data simulation.\nFurthermore, we employ LLM-based product classification to improve the\nrobustness of the prediction in presence of incomplete data. Our research shows\nthat LLMs with in-context learning outperform the supervised approaches in the\nclean-data scenario. Additionally, we illustrate that LLMs are significantly\nmore robust than the supervised approaches when data attacks are present.\n","authors":["Sina Gholamian","Gianfranco Romani","Bartosz Rudnikowicz","Laura Skylaki"],"pdf_url":"https://arxiv.org/pdf/2408.05874v1.pdf","comment":"11 pages"},{"id":"http://arxiv.org/abs/2408.05873v1","updated":"2024-08-11T22:58:23Z","published":"2024-08-11T22:58:23Z","title":"Defining Boundaries: A Spectrum of Task Feasibility for Large Language\n Models","summary":" Large language models (LLMs) have shown remarkable performance in various\ntasks but often fail to handle queries that exceed their knowledge and\ncapabilities, leading to incorrect or fabricated responses. This paper\naddresses the need for LLMs to recognize and refuse infeasible tasks due to the\nrequired skills surpassing their capabilities. We first systematically\nconceptualize infeasible tasks for LLMs, providing formal definitions and\ncategorizations that cover a spectrum of related hallucinations. 
We develop and\nbenchmark a new dataset comprising diverse infeasible and feasible tasks to\ntest multiple LLMs' abilities on task feasibility. Furthermore, we explore the\npotential of training enhancements to increase LLMs' refusal capabilities with\nfine-tuning. Experiments validate the effectiveness of our methods, offering\npromising directions for refining the operational boundaries of LLMs in real\napplications.\n","authors":["Wenbo Zhang","Zihang Xu","Hengrui Cai"],"pdf_url":"https://arxiv.org/pdf/2408.05873v1.pdf","comment":"20 pages, 9 tables, 15 Figures"},{"id":"http://arxiv.org/abs/2404.05221v2","updated":"2024-08-11T22:20:19Z","published":"2024-04-08T06:35:09Z","title":"LLM Reasoners: New Evaluation, Library, and Analysis of Step-by-Step\n Reasoning with Large Language Models","summary":" Generating accurate step-by-step reasoning is essential for Large Language\nModels (LLMs) to address complex problems and enhance robustness and\ninterpretability. Despite the flux of research on developing advanced reasoning\napproaches, systematically analyzing the diverse LLMs and reasoning strategies\nin generating reasoning chains remains a significant challenge. The\ndifficulties stem from the lack of two key elements: (1) an automatic method\nfor evaluating the generated reasoning chains on different tasks, and (2) a\nunified formalism and implementation of the diverse reasoning approaches for\nsystematic comparison. This paper aims to close the gap: (1) We introduce\nAutoRace for fully automated reasoning chain evaluation. Existing metrics rely\non expensive human annotations or pre-defined LLM prompts not adaptable to\ndifferent tasks. In contrast, AutoRace automatically creates detailed\nevaluation criteria tailored for each task, and uses GPT-4 for accurate\nevaluation following the criteria. (2) We develop LLM Reasoners, a library for\nstandardized modular implementation of existing and new reasoning algorithms,\nunder a unified formulation of the search, reward, and world model components.\nWith the new evaluation and library, (3) we conduct extensive study of\ndifferent reasoning approaches (e.g., CoT, ToT, RAP). The analysis reveals\ninteresting findings about different factors contributing to reasoning,\nincluding the reward-guidance, breadth-vs-depth in search, world model, and\nprompt formats, etc.\n","authors":["Shibo Hao","Yi Gu","Haotian Luo","Tianyang Liu","Xiyan Shao","Xinyuan Wang","Shuhua Xie","Haodi Ma","Adithya Samavedhi","Qiyue Gao","Zhen Wang","Zhiting Hu"],"pdf_url":"https://arxiv.org/pdf/2404.05221v2.pdf","comment":"Project website: https://www.llm-reasoners.net/"},{"id":"http://arxiv.org/abs/2305.17100v4","updated":"2024-08-11T20:03:12Z","published":"2023-05-26T17:14:43Z","title":"BiomedGPT: A Generalist Vision-Language Foundation Model for Diverse\n Biomedical Tasks","summary":" Traditional biomedical artificial intelligence (AI) models, designed for\nspecific tasks or modalities, often exhibit limited flexibility in real-world\ndeployment and struggle to utilize holistic information. Generalist AI holds\nthe potential to address these limitations due to its versatility in\ninterpreting different data types and generating tailored outputs for diverse\nneeds. 
However, existing biomedical generalist AI solutions are typically\nheavyweight and closed source to researchers, practitioners, and patients.\nHere, we propose BiomedGPT, the first open-source and lightweight\nvision-language foundation model, designed as a generalist capable of\nperforming various biomedical tasks. BiomedGPT achieved state-of-the-art\nresults in 16 out of 25 experiments while maintaining a computing-friendly\nmodel scale. We also conducted human evaluations to assess the capabilities of\nBiomedGPT in radiology visual question answering, report generation, and\nsummarization. BiomedGPT exhibits robust prediction ability with a low error\nrate of 3.8% in question answering, satisfactory performance with an error rate\nof 8.3% in writing complex radiology reports, and competitive summarization\nability with a nearly equivalent preference score to human experts. Our method\ndemonstrates that effective training with diverse data can lead to more\npractical biomedical AI for improving diagnosis and workflow efficiency.\n","authors":["Kai Zhang","Rong Zhou","Eashan Adhikarla","Zhiling Yan","Yixin Liu","Jun Yu","Zhengliang Liu","Xun Chen","Brian D. Davison","Hui Ren","Jing Huang","Chen Chen","Yuyin Zhou","Sunyang Fu","Wei Liu","Tianming Liu","Xiang Li","Yong Chen","Lifang He","James Zou","Quanzheng Li","Hongfang Liu","Lichao Sun"],"pdf_url":"https://arxiv.org/pdf/2305.17100v4.pdf","comment":"Fix incorrect citations and add journal reference for the published\n version. Nat Med (2024)"},{"id":"http://arxiv.org/abs/2404.01365v3","updated":"2024-08-11T19:43:36Z","published":"2024-04-01T17:56:06Z","title":"Prompt-prompted Adaptive Structured Pruning for Efficient LLM Generation","summary":" With the development of transformer-based large language models (LLMs), they\nhave been applied to many fields due to their remarkable utility, but this\ncomes at a considerable computational cost at deployment. Fortunately, some\nmethods such as pruning or constructing a mixture of experts (MoE) aim at\nexploiting sparsity in transformer feedforward (FF) blocks to gain boosts in\nspeed and reduction in memory requirements. However, these techniques can be\nvery costly and inflexible in practice, as they often require training or are\nrestricted to specific types of architectures. To address this, we introduce\nGRIFFIN, a novel training-free and calibration-free method that selects unique\nFF experts at the sequence level for efficient generation across a plethora of\nLLMs with different non-ReLU activation functions. This is possible due to a\ncritical observation that many trained LLMs naturally produce highly structured\nFF activation patterns within a sequence, which we call flocking. Despite our\nmethod's simplicity, we show with 50% of the FF parameters, GRIFFIN maintains\nthe original model's performance with little to no degradation on a variety of\nclassification and generation tasks, all while improving latency (e.g.\n1.29$\\times$ and 1.25$\\times$ speed-ups in Gemma 7B and Llama 2 13B,\nrespectively, on an NVIDIA L40). 
Code is available at\nhttps://github.com/hdong920/GRIFFIN.\n","authors":["Harry Dong","Beidi Chen","Yuejie Chi"],"pdf_url":"https://arxiv.org/pdf/2404.01365v3.pdf","comment":"Revision 1: Updated abstract with code link; re-ran top-k + sampling\n rows in Table 4, conclusions unchanged Revision 2: Reframing and new\n experiments, conclusions unchanged"},{"id":"http://arxiv.org/abs/2305.06424v4","updated":"2024-08-11T18:56:50Z","published":"2023-05-10T19:09:24Z","title":"Bot or Human? Detecting ChatGPT Imposters with A Single Question","summary":" Large language models (LLMs) like GPT-4 have recently demonstrated impressive\ncapabilities in natural language understanding and generation. However, there\nis a concern that they can be misused for malicious purposes, such as fraud or\ndenial-of-service attacks. Therefore, it is crucial to develop methods for\ndetecting whether the party involved in a conversation is a bot or a human. In\nthis paper, we propose a framework named FLAIR, Finding Large Language Model\nAuthenticity via a Single Inquiry and Response, to detect conversational bots\nin an online manner. Specifically, we target a single question scenario that\ncan effectively differentiate human users from bots. The questions are divided\ninto two categories: those that are easy for humans but difficult for bots\n(e.g., counting, substitution, searching, and ASCII art reasoning), and those\nthat are easy for bots but difficult for humans (e.g., memorization and\ncomputation). Our approach shows different strengths of these questions in\ntheir effectiveness, providing a new way for online service providers to\nprotect themselves against nefarious activities. Our code and question set are\navailable at https://github.com/hongwang600/FLAIR.\n","authors":["Hong Wang","Xuan Luo","Weizhi Wang","Xifeng Yan"],"pdf_url":"https://arxiv.org/pdf/2305.06424v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.05840v1","updated":"2024-08-11T18:22:12Z","published":"2024-08-11T18:22:12Z","title":"Iterative Improvement of an Additively Regularized Topic Model","summary":" Topic modelling is fundamentally a soft clustering problem (of known objects\n-- documents, over unknown clusters -- topics). That is, the task is\nincorrectly posed. In particular, the topic models are unstable and incomplete.\nAll this leads to the fact that the process of finding a good topic model\n(repeated hyperparameter selection, model training, and topic quality\nassessment) can be particularly long and labor-intensive. We aim to simplify\nthe process, to make it more deterministic and provable. To this end, we\npresent a method for iterative training of a topic model. The essence of the\nmethod is that a series of related topic models are trained so that each\nsubsequent model is at least as good as the previous one, i.e., that it retains\nall the good topics found earlier. The connection between the models is\nachieved by additive regularization. The result of this iterative training is\nthe last topic model in the series, which we call the iteratively updated\nadditively regularized topic model (ITAR). 
Experiments conducted on several\ncollections of natural language texts show that the proposed ITAR model\nperforms better than other popular topic models (LDA, ARTM, BERTopic), its\ntopics are diverse, and its perplexity (ability to \"explain\" the underlying\ndata) is moderate.\n","authors":["Alex Gorbulev","Vasiliy Alekseev","Konstantin Vorontsov"],"pdf_url":"https://arxiv.org/pdf/2408.05840v1.pdf","comment":"A full draft of the second version of the article"},{"id":"http://arxiv.org/abs/2408.01088v2","updated":"2024-08-11T17:51:21Z","published":"2024-08-02T08:07:15Z","title":"Bridging Information Gaps in Dialogues With Grounded Exchanges Using\n Knowledge Graphs","summary":" Knowledge models are fundamental to dialogue systems for enabling\nconversational interactions, which require handling domain-specific knowledge.\nEnsuring effective communication in information-providing conversations entails\naligning user understanding with the knowledge available to the system.\nHowever, dialogue systems often face challenges arising from semantic\ninconsistencies in how information is expressed in natural language compared to\nhow it is represented within the system's internal knowledge. To address this\nproblem, we study the potential of large language models for conversational\ngrounding, a mechanism to bridge information gaps by establishing shared\nknowledge between dialogue participants. Our approach involves annotating human\nconversations across five knowledge domains to create a new dialogue corpus\ncalled BridgeKG. Through a series of experiments on this dataset, we\nempirically evaluate the capabilities of large language models in classifying\ngrounding acts and identifying grounded information items within a knowledge\ngraph structure. Our findings offer insights into how these models use\nin-context learning for conversational grounding tasks and common prediction\nerrors, which we illustrate with examples from challenging dialogues. We\ndiscuss how the models handle knowledge graphs as a semantic layer between\nunstructured dialogue utterances and structured information items.\n","authors":["Phillip Schneider","Nektarios Machner","Kristiina Jokinen","Florian Matthes"],"pdf_url":"https://arxiv.org/pdf/2408.01088v2.pdf","comment":"Accepted to SIGDIAL 2024"},{"id":"http://arxiv.org/abs/2404.18243v2","updated":"2024-08-11T17:18:30Z","published":"2024-04-28T16:50:12Z","title":"LEGENT: Open Platform for Embodied Agents","summary":" Despite advancements in Large Language Models (LLMs) and Large Multimodal\nModels (LMMs), their integration into language-grounded, human-like embodied\nagents remains incomplete, hindering complex real-life task performance in\nphysical environments. Existing integrations often feature limited open\nsourcing, challenging collective progress in this field. We introduce LEGENT,\nan open, scalable platform for developing embodied agents using LLMs and LMMs.\nLEGENT offers a dual approach: a rich, interactive 3D environment with\ncommunicable and actionable agents, paired with a user-friendly interface, and\na sophisticated data generation pipeline utilizing advanced algorithms to\nexploit supervision from simulated worlds at scale. 
In our experiments, an\nembryonic vision-language-action model trained on LEGENT-generated data\nsurpasses GPT-4V in embodied tasks, showcasing promising generalization\ncapabilities.\n","authors":["Zhili Cheng","Zhitong Wang","Jinyi Hu","Shengding Hu","An Liu","Yuge Tu","Pengkai Li","Lei Shi","Zhiyuan Liu","Maosong Sun"],"pdf_url":"https://arxiv.org/pdf/2404.18243v2.pdf","comment":"ACL 2024 System Demonstration"},{"id":"http://arxiv.org/abs/2402.18243v3","updated":"2024-08-11T17:15:06Z","published":"2024-02-28T11:16:00Z","title":"Learning or Self-aligning? Rethinking Instruction Fine-tuning","summary":" Instruction Fine-tuning~(IFT) is a critical phase in building large language\nmodels~(LLMs). Previous works mainly focus on the IFT's role in the transfer of\nbehavioral norms and the learning of additional world knowledge. However, the\nunderstanding of the underlying mechanisms of IFT remains significantly\nlimited. In this paper, we design a knowledge intervention framework to\ndecouple the potential underlying factors of IFT, thereby enabling individual\nanalysis of different factors. Surprisingly, our experiments reveal that\nattempting to learn additional world knowledge through IFT often struggles to\nyield positive impacts and can even lead to markedly negative effects. Further,\nwe discover that maintaining internal knowledge consistency before and after\nIFT is a critical factor for achieving successful IFT. Our findings reveal the\nunderlying mechanisms of IFT and provide robust support for some very recent\nand potential future works.\n","authors":["Mengjie Ren","Boxi Cao","Hongyu Lin","Cao Liu","Xianpei Han","Ke Zeng","Guanglu Wan","Xunliang Cai","Le Sun"],"pdf_url":"https://arxiv.org/pdf/2402.18243v3.pdf","comment":"Camera Ready for ACL2024"},{"id":"http://arxiv.org/abs/2406.13408v3","updated":"2024-08-11T16:30:44Z","published":"2024-06-19T09:57:19Z","title":"SQLFixAgent: Towards Semantic-Accurate Text-to-SQL Parsing via\n Consistency-Enhanced Multi-Agent Collaboration","summary":" While fine-tuned large language models (LLMs) excel in generating\ngrammatically valid SQL in Text-to-SQL parsing, they often struggle to ensure\nsemantic accuracy in queries, leading to user confusion and diminished system\nusability. To tackle this challenge, we introduce SQLFixAgent, a new\nconsistency-enhanced multi-agent collaborative framework designed for detecting\nand repairing erroneous SQL. Our framework comprises a core agent, SQLRefiner,\nalongside two auxiliary agents: SQLReviewer and QueryCrafter. The SQLReviewer\nagent employs the rubber duck debugging method to identify potential semantic\nmismatches between SQL and user query. If the error is detected, the\nQueryCrafter agent generates multiple SQL as candidate repairs using a\nfine-tuned SQLTool. Subsequently, leveraging similar repair retrieval and\nfailure memory reflection, the SQLRefiner agent selects the most fitting SQL\nstatement from the candidates as the final repair. We evaluated our proposed\nframework on five Text-to-SQL benchmarks. The experimental results show that\nour method consistently enhances the performance of the baseline model,\nspecifically achieving an execution accuracy improvement of over 3\\% on the\nBird benchmark. 
Our framework also has a higher token efficiency compared to\nother advanced methods, making it more competitive.\n","authors":["Jipeng Cen","Jiaxin Liu","Zhixu Li","Jingjing Wang"],"pdf_url":"https://arxiv.org/pdf/2406.13408v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.03910v2","updated":"2024-08-11T16:23:57Z","published":"2024-08-07T17:13:59Z","title":"CodexGraph: Bridging Large Language Models and Code Repositories via\n Code Graph Databases","summary":" Large Language Models (LLMs) excel in stand-alone code tasks like HumanEval\nand MBPP, but struggle with handling entire code repositories. This challenge\nhas prompted research on enhancing LLM-codebase interaction at a repository\nscale. Current solutions rely on similarity-based retrieval or manual tools and\nAPIs, each with notable drawbacks. Similarity-based retrieval often has low\nrecall in complex tasks, while manual tools and APIs are typically\ntask-specific and require expert knowledge, reducing their generalizability\nacross diverse code tasks and real-world applications. To mitigate these\nlimitations, we introduce CodexGraph, a system that integrates LLM agents with\ngraph database interfaces extracted from code repositories. By leveraging the\nstructural properties of graph databases and the flexibility of the graph query\nlanguage, CodexGraph enables the LLM agent to construct and execute queries,\nallowing for precise, code structure-aware context retrieval and code\nnavigation. We assess CodexGraph using three benchmarks: CrossCodeEval,\nSWE-bench, and EvoCodeBench. Additionally, we develop five real-world coding\napplications. With a unified graph database schema, CodexGraph demonstrates\ncompetitive performance and potential in both academic and real-world\nenvironments, showcasing its versatility and efficacy in software engineering.\nOur application demo:\nhttps://github.com/modelscope/modelscope-agent/tree/master/apps/codexgraph_agent.\n","authors":["Xiangyan Liu","Bo Lan","Zhiyuan Hu","Yang Liu","Zhicheng Zhang","Fei Wang","Michael Shieh","Wenmeng Zhou"],"pdf_url":"https://arxiv.org/pdf/2408.03910v2.pdf","comment":"work in progress"},{"id":"http://arxiv.org/abs/2408.05793v1","updated":"2024-08-11T14:52:40Z","published":"2024-08-11T14:52:40Z","title":"SAGA: A Participant-specific Examination of Story Alternatives and Goal\n Applicability for a Deeper Understanding of Complex Events","summary":" Interpreting and assessing goal driven actions is vital to understanding and\nreasoning over complex events. It is important to be able to acquire the\nknowledge needed for this understanding, though doing so is challenging. We\nargue that such knowledge can be elicited through a participant achievement\nlens. We analyze a complex event in a narrative according to the intended\nachievements of the participants in that narrative, the likely future actions\nof the participants, and the likelihood of goal success. We collect 6.3K high\nquality goal and action annotations reflecting our proposed participant\nachievement lens, with an average weighted Fleiss-Kappa IAA of 80%. Our\ncollection contains annotated alternate versions of each narrative. These\nalternate versions vary minimally from the \"original\" story, but can license\ndrastically different inferences. 
Our findings suggest that while modern large\nlanguage models can reflect some of the goal-based knowledge we study, they\nfind it challenging to fully capture the design and intent behind concerted\nactions, even when the model pretraining included the data from which we\nextracted the goal knowledge. We show that smaller models fine-tuned on our\ndataset can achieve performance surpassing larger models.\n","authors":["Sai Vallurupalli","Katrin Erk","Francis Ferraro"],"pdf_url":"https://arxiv.org/pdf/2408.05793v1.pdf","comment":"Accepted to Findings of the Association for Computational Linguistics\n 2024"},{"id":"http://arxiv.org/abs/2408.05786v1","updated":"2024-08-11T14:26:58Z","published":"2024-08-11T14:26:58Z","title":"HiLight: A Hierarchy-aware Light Global Model with Hierarchical Local\n ConTrastive Learning","summary":" Hierarchical text classification (HTC) is a special sub-task of multi-label\nclassification (MLC) whose taxonomy is constructed as a tree and each sample is\nassigned with at least one path in the tree. Latest HTC models contain three\nmodules: a text encoder, a structure encoder and a multi-label classification\nhead. Specially, the structure encoder is designed to encode the hierarchy of\ntaxonomy. However, the structure encoder has scale problem. As the taxonomy\nsize increases, the learnable parameters of recent HTC works grow rapidly.\nRecursive regularization is another widely-used method to introduce\nhierarchical information but it has collapse problem and generally relaxed by\nassigning with a small weight (ie. 1e-6). In this paper, we propose a\nHierarchy-aware Light Global model with Hierarchical local conTrastive learning\n(HiLight), a lightweight and efficient global model only consisting of a text\nencoder and a multi-label classification head. We propose a new learning task\nto introduce the hierarchical information, called Hierarchical Local\nContrastive Learning (HiLCL). Extensive experiments are conducted on two\nbenchmark datasets to demonstrate the effectiveness of our model.\n","authors":["Zhijian Chen","Zhonghua Li","Jianxin Yang","Ye Qi"],"pdf_url":"https://arxiv.org/pdf/2408.05786v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.15186v2","updated":"2024-08-11T13:54:21Z","published":"2024-07-21T14:48:23Z","title":"A Survey on Employing Large Language Models for Text-to-SQL Tasks","summary":" The increasing volume of data stored in relational databases has led to the\nneed for efficient querying and utilization of this data in various sectors.\nHowever, writing SQL queries requires specialized knowledge, which poses a\nchallenge for non-professional users trying to access and query databases.\nText-to-SQL parsing solves this issue by converting natural language queries\ninto SQL queries, thus making database access more accessible for non-expert\nusers. To take advantage of the recent developments in Large Language Models\n(LLMs), a range of new methods have emerged, with a primary focus on prompt\nengineering and fine-tuning. This survey provides a comprehensive overview of\nLLMs in text-to-SQL tasks, discussing benchmark datasets, prompt engineering,\nfine-tuning methods, and future research directions. 
We hope this review will\nenable readers to gain a broader understanding of the recent advances in this\nfield and offer some insights into its future trajectory.\n","authors":["Liang Shi","Zhengju Tang","Nan Zhang","Xiaotong Zhang","Zhi Yang"],"pdf_url":"https://arxiv.org/pdf/2407.15186v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.05769v1","updated":"2024-08-11T13:19:27Z","published":"2024-08-11T13:19:27Z","title":"LI-TTA: Language Informed Test-Time Adaptation for Automatic Speech\n Recognition","summary":" Test-Time Adaptation (TTA) has emerged as a crucial solution to the domain\nshift challenge, wherein the target environment diverges from the original\ntraining environment. A prime exemplification is TTA for Automatic Speech\nRecognition (ASR), which enhances model performance by leveraging output\nprediction entropy minimization as a self-supervision signal. However, a key\nlimitation of this self-supervision lies in its primary focus on acoustic\nfeatures, with minimal attention to the linguistic properties of the input. To\naddress this gap, we propose Language Informed Test-Time Adaptation (LI-TTA),\nwhich incorporates linguistic insights during TTA for ASR. LI-TTA integrates\ncorrections from an external language model to merge linguistic with acoustic\ninformation by minimizing the CTC loss from the correction alongside the\nstandard TTA loss. With extensive experiments, we show that LI-TTA effectively\nimproves the performance of TTA for ASR in various distribution shift\nsituations.\n","authors":["Eunseop Yoon","Hee Suk Yoon","John Harvill","Mark Hasegawa-Johnson","Chang D. Yoo"],"pdf_url":"https://arxiv.org/pdf/2408.05769v1.pdf","comment":"INTERSPEECH 2024"},{"id":"http://arxiv.org/abs/2408.05767v1","updated":"2024-08-11T13:17:14Z","published":"2024-08-11T13:17:14Z","title":"Reference-free Hallucination Detection for Large Vision-Language Models","summary":" Large vision-language models (LVLMs) have made significant progress in recent\nyears. While LVLMs exhibit excellent ability in language understanding,\nquestion answering, and conversations of visual inputs, they are prone to\nproducing hallucinations. While several methods are proposed to evaluate the\nhallucinations in LVLMs, most are reference-based and depend on external tools,\nwhich complicates their practical application. To assess the viability of\nalternative methods, it is critical to understand whether the reference-free\napproaches, which do not rely on any external tools, can efficiently detect\nhallucinations. Therefore, we initiate an exploratory study to demonstrate the\neffectiveness of different reference-free solutions in detecting hallucinations\nin LVLMs. 
In particular, we conduct an extensive study on three kinds of\ntechniques: uncertainty-based, consistency-based, and supervised uncertainty\nquantification methods on four representative LVLMs across two different tasks.\nThe empirical results show that the reference-free approaches are capable of\neffectively detecting non-factual responses in LVLMs, with the supervised\nuncertainty quantification method outperforming the others, achieving the best\nperformance across different settings.\n","authors":["Qing Li","Chenyang Lyu","Jiahui Geng","Derui Zhu","Maxim Panov","Fakhri Karray"],"pdf_url":"https://arxiv.org/pdf/2408.05767v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.05758v1","updated":"2024-08-11T12:24:23Z","published":"2024-08-11T12:24:23Z","title":"VQ-CTAP: Cross-Modal Fine-Grained Sequence Representation Learning for\n Speech Processing","summary":" Deep learning has brought significant improvements to the field of\ncross-modal representation learning. For tasks such as text-to-speech (TTS),\nvoice conversion (VC), and automatic speech recognition (ASR), a cross-modal\nfine-grained (frame-level) sequence representation is desired, emphasizing the\nsemantic content of the text modality while de-emphasizing the paralinguistic\ninformation of the speech modality. We propose a method called \"Vector\nQuantized Contrastive Token-Acoustic Pre-training (VQ-CTAP)\", which uses the\ncross-modal aligned sequence transcoder to bring text and speech into a joint\nmultimodal space, learning how to connect text and speech at the frame level.\nThe proposed VQ-CTAP is a paradigm for cross-modal sequence representation\nlearning, offering a promising solution for fine-grained generation and\nrecognition tasks in speech processing. The VQ-CTAP can be directly applied to\nVC and ASR tasks without fine-tuning or additional structures. We propose a\nsequence-aware semantic connector, which connects multiple frozen pre-trained\nmodules for the TTS task, exhibiting a plug-and-play capability. We design a\nstepping optimization strategy to ensure effective model convergence by\ngradually injecting and adjusting the influence of various loss components.\nFurthermore, we propose a semantic-transfer-wise paralinguistic consistency\nloss to enhance representational capabilities, allowing the model to better\ngeneralize to unseen data and capture the nuances of paralinguistic\ninformation. In addition, VQ-CTAP achieves high-compression speech coding at a\nrate of 25Hz from 24kHz input waveforms, which is a 960-fold reduction in the\nsampling rate. The audio demo is available at\nhttps://qiangchunyu.github.io/VQCTAP/\n","authors":["Chunyu Qiang","Wang Geng","Yi Zhao","Ruibo Fu","Tao Wang","Cheng Gong","Tianrui Wang","Qiuyu Liu","Jiangyan Yi","Zhengqi Wen","Chen Zhang","Hao Che","Longbiao Wang","Jianwu Dang","Jianhua Tao"],"pdf_url":"https://arxiv.org/pdf/2408.05758v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.13235v3","updated":"2024-08-11T11:43:23Z","published":"2023-05-22T17:06:41Z","title":"SPARSEFIT: Few-shot Prompting with Sparse Fine-tuning for Jointly\n Generating Predictions and Natural Language Explanations","summary":" Models that generate natural language explanations (NLEs) for their\npredictions have recently gained increasing interest. However, this approach\nusually demands large datasets of human-written NLEs for the ground-truth\nanswers at training time, which can be expensive and potentially infeasible for\nsome applications. 
When only a few NLEs are available (a few-shot setup),\nfine-tuning pre-trained language models (PLMs) in conjunction with prompt-based\nlearning has recently shown promising results. However, PLMs typically have\nbillions of parameters, making full fine-tuning expensive. We propose\nSparseFit, a sparse few-shot fine-tuning strategy that leverages discrete\nprompts to jointly generate predictions and NLEs. We experiment with SparseFit\non three sizes of the T5 language model and four datasets and compare it\nagainst existing state-of-the-art Parameter-Efficient Fine-Tuning (PEFT)\ntechniques. We find that fine-tuning only 6.8% of the model parameters leads to\ncompetitive results for both the task performance and the quality of the\ngenerated NLEs compared to full fine-tuning of the model and produces better\nresults on average than other PEFT methods in terms of predictive accuracy and\nNLE quality.\n","authors":["Jesus Solano","Mardhiyah Sanni","Oana-Maria Camburu","Pasquale Minervini"],"pdf_url":"https://arxiv.org/pdf/2305.13235v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.05738v1","updated":"2024-08-11T09:57:46Z","published":"2024-08-11T09:57:46Z","title":"Language-Informed Beam Search Decoding for Multilingual Machine\n Translation","summary":" Beam search decoding is the de-facto method for decoding auto-regressive\nNeural Machine Translation (NMT) models, including multilingual NMT where the\ntarget language is specified as an input. However, decoding multilingual NMT\nmodels commonly produces ``off-target'' translations -- yielding translation\noutputs not in the intended language. In this paper, we first conduct an error\nanalysis of off-target translations for a strong multilingual NMT model and\nidentify how these decodings are produced during beam search. We then propose\nLanguage-informed Beam Search (LiBS), a general decoding algorithm\nincorporating an off-the-shelf Language Identification (LiD) model into beam\nsearch decoding to reduce off-target translations. LiBS is an inference-time\nprocedure that is NMT-model agnostic and does not require any additional\nparallel data. Results show that our proposed LiBS algorithm on average\nimproves +1.1 BLEU and +0.9 BLEU on WMT and OPUS datasets, and reduces\noff-target rates from 22.9\\% to 7.7\\% and 65.8\\% to 25.3\\% respectively.\n","authors":["Yilin Yang","Stefan Lee","Prasad Tadepalli"],"pdf_url":"https://arxiv.org/pdf/2408.05738v1.pdf","comment":"ACL 2024 Findings"},{"id":"http://arxiv.org/abs/2405.19290v3","updated":"2024-08-11T06:48:40Z","published":"2024-05-29T17:19:04Z","title":"Integrating Multi-scale Contextualized Information for Byte-based Neural\n Machine Translation","summary":" Subword tokenization is a common method for vocabulary building in Neural\nMachine Translation (NMT) models. However, increasingly complex tasks have\nrevealed its disadvantages. First, a vocabulary cannot be modified once it is\nlearned, making it hard to adapt to new words. 
Second, in multilingual\ntranslation, the imbalance in data volumes across different languages spreads\nto the vocabulary, exacerbating translations involving low-resource languages.\nWhile byte-based tokenization addresses these issues, byte-based models\nstruggle with the low information density inherent in UTF-8 byte sequences.\nPrevious works enhance token semantics through local contextualization but fail\nto select an appropriate contextualizing scope based on the input.\nConsequently, we propose the Multi-Scale Contextualization (MSC) method, which\nlearns contextualized information of varying scales across different hidden\nstate dimensions. It then leverages the attention module to dynamically\nintegrate the multi-scale contextualized information. Experiments show that MSC\nsignificantly outperforms subword-based and other byte-based methods in both\nmultilingual and out-of-domain scenarios. Code can be found in\nhttps://github.com/ictnlp/Multiscale-Contextualization.\n","authors":["Langlin Huang","Yang Feng"],"pdf_url":"https://arxiv.org/pdf/2405.19290v3.pdf","comment":"Accepted by ACL2024 Findings, renew author's email"},{"id":"http://arxiv.org/abs/2311.09613v3","updated":"2024-08-11T05:46:15Z","published":"2023-11-16T06:51:46Z","title":"Digital Socrates: Evaluating LLMs through Explanation Critiques","summary":" While LLMs can provide reasoned explanations along with their answers, the\nnature and quality of those explanations are still poorly understood. In\nresponse, our goal is to define a detailed way of characterizing the\nexplanation capabilities of modern models and to create a nuanced,\ninterpretable explanation evaluation tool that can generate such\ncharacterizations automatically, without relying on expensive API calls or\nhuman annotations. Our approach is to (a) define the new task of explanation\ncritiquing - identifying and categorizing any main flaw in an explanation and\nproviding suggestions to address the flaw, (b) create a sizeable,\nhuman-verified dataset for this task, and (c) train an open-source, automatic\ncritique model (called Digital Socrates) using this data. Through quantitative\nand qualitative analysis, we demonstrate how Digital Socrates is useful for\nrevealing insights about student models by examining their reasoning chains,\nand how it can provide high-quality, nuanced, automatic evaluation of those\nmodel explanations for the first time. Digital Socrates thus fills an important\ngap in evaluation tools for understanding and improving the explanation\nbehavior of models.\n","authors":["Yuling Gu","Oyvind Tafjord","Peter Clark"],"pdf_url":"https://arxiv.org/pdf/2311.09613v3.pdf","comment":"ACL 2024"},{"id":"http://arxiv.org/abs/2407.17638v2","updated":"2024-08-11T05:21:33Z","published":"2024-07-24T21:06:40Z","title":"Time Matters: Examine Temporal Effects on Biomedical Language Models","summary":" Time is inherent in applying language models for biomedical applications: models\nare trained on historical data and will be deployed for new or future data,\nwhich may vary from training data. While an increasing number of biomedical tasks have\nemployed state-of-the-art language models, very few studies have\nexamined temporal effects on biomedical models when data usually shifts across\ndevelopment and deployment. This study fills the gap by statistically probing\nrelations between language model performance and data shifts across three\nbiomedical tasks. 
We deploy diverse metrics to evaluate model performance,\ndistance methods to measure data drifts, and statistical methods to quantify\ntemporal effects on biomedical language models. Our study shows that time\nmatters for deploying biomedical language models, while the degree of\nperformance degradation varies by biomedical tasks and statistical\nquantification approaches. We believe this study can establish a solid\nbenchmark to evaluate and assess temporal effects on deploying biomedical\nlanguage models.\n","authors":["Weisi Liu","Zhe He","Xiaolei Huang"],"pdf_url":"https://arxiv.org/pdf/2407.17638v2.pdf","comment":"Accepted to AMIA 2024 Annual Symposium"},{"id":"http://arxiv.org/abs/2406.02539v2","updated":"2024-08-11T05:15:46Z","published":"2024-06-04T17:56:28Z","title":"Parrot: Multilingual Visual Instruction Tuning","summary":" The rapid development of Multimodal Large Language Models (MLLMs) like GPT-4V\nhas marked a significant step towards artificial general intelligence. Existing\nmethods mainly focus on aligning vision encoders with LLMs through supervised\nfine-tuning (SFT) to endow LLMs with multimodal abilities, making MLLMs'\ninherent ability to react to multiple languages progressively deteriorate as\nthe training process evolves. We empirically find that the imbalanced SFT\ndatasets, primarily composed of English-centric image-text pairs, lead to\nsignificantly reduced performance in non-English languages. This is due to the\nfailure of aligning the vision encoder and LLM with multilingual tokens during\nthe SFT process. In this paper, we introduce Parrot, a novel method that\nutilizes textual guidance to drive visual token alignment at the language\nlevel. Parrot makes the visual tokens condition on diverse language inputs and\nuses Mixture-of-Experts (MoE) to promote the alignment of multilingual tokens.\nSpecifically, to enhance non-English visual tokens alignment, we compute the\ncross-attention using the initial visual features and textual embeddings, the\nresult of which is then fed into the MoE router to select the most relevant\nexperts. The selected experts subsequently convert the initial visual tokens\ninto language-specific visual tokens. Moreover, considering the current lack of\nbenchmarks for evaluating multilingual capabilities within the field, we\ncollect and make available a Massive Multilingual Multimodal Benchmark which\nincludes 6 languages, 15 categories, and 12,000 questions, named as MMMB. Our\nmethod not only demonstrates state-of-the-art performance on multilingual\nMMBench and MMMB, but also excels across a broad range of multimodal tasks.\nBoth the source code and the training dataset of Parrot will be made publicly\navailable. Code is available at: https://github.com/AIDC-AI/Parrot.\n","authors":["Hai-Long Sun","Da-Wei Zhou","Yang Li","Shiyin Lu","Chao Yi","Qing-Guo Chen","Zhao Xu","Weihua Luo","Kaifu Zhang","De-Chuan Zhan","Han-Jia Ye"],"pdf_url":"https://arxiv.org/pdf/2406.02539v2.pdf","comment":"Code is available at: https://github.com/AIDC-AI/Parrot"},{"id":"http://arxiv.org/abs/2302.06674v4","updated":"2024-08-11T05:14:28Z","published":"2023-02-13T20:27:26Z","title":"PK-ICR: Persona-Knowledge Interactive Context Retrieval for Grounded\n Dialogue","summary":" Identifying relevant persona or knowledge for conversational systems is\ncritical to grounded dialogue response generation. However, each grounding has\nbeen mostly researched in isolation with more practical multi-context dialogue\ntasks introduced in recent works. 
We define Persona and Knowledge Dual Context\nIdentification as the task to identify persona and knowledge jointly for a\ngiven dialogue, which could be of elevated importance in complex multi-context\ndialogue settings. We develop a novel grounding retrieval method that utilizes\nall contexts of dialogue simultaneously. Our method requires less computational\npower via utilizing neural QA retrieval models. We further introduce our novel\nnull-positive rank test which measures ranking performance on semantically\ndissimilar samples (i.e. hard negatives) in relation to data augmentation.\n","authors":["Minsik Oh","Joosung Lee","Jiwei Li","Guoyin Wang"],"pdf_url":"https://arxiv.org/pdf/2302.06674v4.pdf","comment":"Accepted to EMNLP 2023 main conference (Oral). Code available at\n https://github.com/minsik-ai/PK-ICR"},{"id":"http://arxiv.org/abs/2407.15720v2","updated":"2024-08-11T04:39:16Z","published":"2024-07-22T15:22:34Z","title":"Do Large Language Models Have Compositional Ability? An Investigation\n into Limitations and Scalability","summary":" Large language models (LLMs) have emerged as powerful tools for many AI\nproblems and exhibit remarkable in-context learning (ICL) capabilities.\nCompositional ability, solving unseen complex tasks that combine two or more\nsimple tasks, is an essential reasoning ability for Artificial General\nIntelligence. Despite the tremendous success of LLMs, how they approach\ncomposite tasks, especially those not encountered during the pretraining phase,\nremains an open and largely underexplored question. In this study, we delve\ninto the ICL capabilities of LLMs on composite tasks, with only simple tasks as\nin-context examples. We develop a test suite of composite tasks including\nlinguistic and logical challenges and perform empirical studies across\ndifferent LLM families. We observe that models exhibit divergent behaviors: (1)\nFor simpler composite tasks that apply distinct mapping mechanisms to different\ninput segments, the models demonstrate decent compositional ability, while\nscaling up the model enhances this ability; (2) for more complex composite\ntasks involving reasoning multiple steps, where each step represents one task,\nmodels typically underperform, and scaling up generally provides no\nimprovements. We offer theoretical analysis in a simplified setting, explaining\nthat models exhibit compositional capability when the task handles different\ninput parts separately. We believe our work sheds new light on the capabilities\nof LLMs in solving composite tasks regarding the nature of the tasks and model\nscale. Our dataset and code are available at\n{\\url{https://github.com/OliverXUZY/LLM_Compose}}.\n","authors":["Zhuoyan Xu","Zhenmei Shi","Yingyu Liang"],"pdf_url":"https://arxiv.org/pdf/2407.15720v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.11214v2","updated":"2024-08-11T01:17:41Z","published":"2024-06-17T05:13:25Z","title":"Large Language Model Tokenizer Bias: A Case Study and Solution on GPT-4o","summary":" Recent advancements in large language models (LLMs), such as GPT-4 and\nGPT-4o, have shown exceptional performance, especially in languages with\nabundant resources like English, thanks to extensive datasets that ensure\nrobust training. Conversely, these models exhibit limitations when processing\nunder-resourced languages such as Chinese and Korean, where issues including\nhallucinatory responses remain prevalent. This paper traces the roots of these\ndisparities to the tokenization process inherent to these models. 
Specifically,\nit explores how the tokenizer vocabulary, often used to speed up the\ntokenization process and reduce tokens but constructed independently of the\nactual model training data, inadequately represents non-English languages. This\nmisrepresentation results in the propagation of 'under-trained' or 'untrained'\ntokens, which perpetuate biases and pose serious concerns related to data\nsecurity and ethical standards. We aim to dissect the tokenization mechanics of\nGPT-4o, illustrating how its simplified token-handling methods amplify these\nrisks and offer strategic solutions to mitigate associated security and ethical\nissues. Through this study, we emphasize the critical need to rethink\ntokenization frameworks to foster more equitable and secure AI technologies.\n","authors":["Jin Yang","Zhiqiang Wang","Yanbin Lin","Zunduo Zhao"],"pdf_url":"https://arxiv.org/pdf/2406.11214v2.pdf","comment":"6 pages, 3 figures, and 5 tables"},{"id":"http://arxiv.org/abs/2408.05664v1","updated":"2024-08-11T00:50:59Z","published":"2024-08-11T00:50:59Z","title":"Training an NLP Scholar at a Small Liberal Arts College: A Backwards\n Designed Course Proposal","summary":" The rapid growth in natural language processing (NLP) over the last couple\nyears has generated student interest and excitement in learning more about the\nfield. In this paper, we present two types of students that NLP courses might\nwant to train. First, an \"NLP engineer\" who is able to flexibly design, build\nand apply new technologies in NLP for a wide range of tasks. Second, an \"NLP\nscholar\" who is able to pose, refine and answer questions in NLP and how it\nrelates to the society, while also learning to effectively communicate these\nanswers to a broader audience. While these two types of skills are not mutually\nexclusive -- NLP engineers should be able to think critically, and NLP scholars\nshould be able to build systems -- we think that courses can differ in the\nbalance of these skills. As educators at Small Liberal Arts Colleges, the\nstrengths of our students and our institution favors an approach that is better\nsuited to train NLP scholars. In this paper we articulate what kinds of skills\nan NLP scholar should have, and then adopt a backwards design to propose course\ncomponents that can aid the acquisition of these skills.\n","authors":["Grusha Prasad","Forrest Davis"],"pdf_url":"https://arxiv.org/pdf/2408.05664v1.pdf","comment":"9 pages, Presented at 6th Workshop on Teaching NLP"},{"id":"http://arxiv.org/abs/2407.04620v2","updated":"2024-08-11T00:42:18Z","published":"2024-07-05T16:23:20Z","title":"Learning to (Learn at Test Time): RNNs with Expressive Hidden States","summary":" Self-attention performs well in long context but has quadratic complexity.\nExisting RNN layers have linear complexity, but their performance in long\ncontext is limited by the expressive power of their hidden state. We propose a\nnew class of sequence modeling layers with linear complexity and an expressive\nhidden state. The key idea is to make the hidden state a machine learning model\nitself, and the update rule a step of self-supervised learning. Since the\nhidden state is updated by training even on test sequences, our layers are\ncalled Test-Time Training (TTT) layers. We consider two instantiations:\nTTT-Linear and TTT-MLP, whose hidden state is a linear model and a two-layer\nMLP respectively. We evaluate our instantiations at the scale of 125M to 1.3B\nparameters, comparing with a strong Transformer and Mamba, a modern RNN. 
Both\nTTT-Linear and TTT-MLP match or exceed the baselines. Similar to Transformer,\nthey can keep reducing perplexity by conditioning on more tokens, while Mamba\ncannot after 16k context. With preliminary systems optimization, TTT-Linear is\nalready faster than Transformer at 8k context and matches Mamba in wall-clock\ntime. TTT-MLP still faces challenges in memory I/O, but shows larger potential\nin long context, pointing to a promising direction for future research.\n","authors":["Yu Sun","Xinhao Li","Karan Dalal","Jiarui Xu","Arjun Vikram","Genghan Zhang","Yann Dubois","Xinlei Chen","Xiaolong Wang","Sanmi Koyejo","Tatsunori Hashimoto","Carlos Guestrin"],"pdf_url":"https://arxiv.org/pdf/2407.04620v2.pdf","comment":null}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2408.05868v1","updated":"2024-08-11T22:03:45Z","published":"2024-08-11T22:03:45Z","title":"LaWa: Using Latent Space for In-Generation Image Watermarking","summary":" With generative models producing high quality images that are\nindistinguishable from real ones, there is growing concern regarding the\nmalicious usage of AI-generated images. Imperceptible image watermarking is one\nviable solution towards such concerns. Prior watermarking methods map the image\nto a latent space for adding the watermark. Moreover, Latent Diffusion Models\n(LDM) generate the image in the latent space of a pre-trained autoencoder. We\nargue that this latent space can be used to integrate watermarking into the\ngeneration process. To this end, we present LaWa, an in-generation image\nwatermarking method designed for LDMs. By using coarse-to-fine watermark\nembedding modules, LaWa modifies the latent space of pre-trained autoencoders\nand achieves high robustness against a wide range of image transformations\nwhile preserving perceptual quality of the image. We show that LaWa can also be\nused as a general image watermarking method. Through extensive experiments, we\ndemonstrate that LaWa outperforms previous works in perceptual quality,\nrobustness against attacks, and computational complexity, while having very low\nfalse positive rate. Code is available here.\n","authors":["Ahmad Rezaei","Mohammad Akbari","Saeed Ranjbar Alvar","Arezou Fatemi","Yong Zhang"],"pdf_url":"https://arxiv.org/pdf/2408.05868v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.05867v1","updated":"2024-08-11T21:59:34Z","published":"2024-08-11T21:59:34Z","title":"SABER-6D: Shape Representation Based Implicit Object Pose Estimation","summary":" In this paper, we propose a novel encoder-decoder architecture, named SABER,\nto learn the 6D pose of the object in the embedding space by learning shape\nrepresentation at a given pose. This model enables us to learn pose by\nperforming shape representation at a target pose from RGB image input. We\nperform shape representation as an auxiliary task which helps us in learning\nrotations space for an object based on 2D images. An image encoder predicts the\nrotation in the embedding space and the DeepSDF based decoder learns to\nrepresent the object's shape at the given pose. As our approach is shape based,\nthe pipeline is suitable for any type of object irrespective of the symmetry.\nMoreover, we need only a CAD model of the objects to train SABER. Our pipeline\nis synthetic data based and can also handle symmetric objects without symmetry\nlabels and, thus, no additional labeled training data is needed. 
The\nexperimental evaluation shows that our method achieves close to benchmark\nresults for both symmetric objects and asymmetric objects on Occlusion-LineMOD,\nand T-LESS datasets.\n","authors":["Shishir Reddy Vutukur","Mengkejiergeli Ba","Benjamin Busam","Matthias Kayser","Gurprit Singh"],"pdf_url":"https://arxiv.org/pdf/2408.05867v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.01877v2","updated":"2024-08-11T21:26:41Z","published":"2024-08-03T22:55:26Z","title":"Is Generative Communication between Embodied Agents Good for Zero-Shot\n ObjectNav?","summary":" In Zero-Shot ObjectNav, an embodied ground agent is expected to navigate to a\ntarget object specified by a natural language label without any\nenvironment-specific fine-tuning. This is challenging, given the limited view\nof a ground agent and its independent exploratory behavior. To address these\nissues, we consider an assistive overhead agent with a bounded global view\nalongside the ground agent and present two coordinated navigation schemes for\njudicious exploration. We establish the influence of the Generative\nCommunication (GC) between the embodied agents equipped with Vision-Language\nModels (VLMs) in improving zero-shot ObjectNav, achieving a 10% improvement in\nthe ground agent's ability to find the target object in comparison with an\nunassisted setup in simulation. We further analyze the GC for unique traits\nquantifying the presence of hallucination and cooperation. In particular, we\nidentify a unique trait of \"preemptive hallucination\" specific to our embodied\nsetting, where the overhead agent assumes that the ground agent has executed an\naction in the dialogue when it is yet to move. Finally, we conduct real-world\ninferences with GC and showcase qualitative examples where countering\npre-emptive hallucination via prompt finetuning improves real-world ObjectNav\nperformance.\n","authors":["Vishnu Sashank Dorbala","Vishnu Dutt Sharma","Pratap Tokekar","Dinesh Manocha"],"pdf_url":"https://arxiv.org/pdf/2408.01877v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.05839v1","updated":"2024-08-11T18:20:08Z","published":"2024-08-11T18:20:08Z","title":"Deep Learning in Medical Image Registration: Magic or Mirage?","summary":" Classical optimization and learning-based methods are the two reigning\nparadigms in deformable image registration. While optimization-based methods\nboast generalizability across modalities and robust performance, learning-based\nmethods promise peak performance, incorporating weak supervision and amortized\noptimization. However, the exact conditions for either paradigm to perform well\nover the other are shrouded and not explicitly outlined in the existing\nliterature. In this paper, we make an explicit correspondence between the\nmutual information of the distribution of per-pixel intensity and labels, and\nthe performance of classical registration methods. This strong correlation\nhints to the fact that architectural designs in learning-based methods is\nunlikely to affect this correlation, and therefore, the performance of\nlearning-based methods. This hypothesis is thoroughly validated with\nstate-of-the-art classical and learning-based methods. However, learning-based\nmethods with weak supervision can perform high-fidelity intensity and label\nregistration, which is not possible with classical methods. 
Next, we show that\nthis high-fidelity feature learning does not translate to invariance to domain\nshift, and learning-based methods are sensitive to such changes in the data\ndistribution. Finally, we propose a general recipe to choose the best paradigm\nfor a given registration problem, based on these observations.\n","authors":["Rohit Jena","Deeksha Sethi","Pratik Chaudhari","James C. Gee"],"pdf_url":"https://arxiv.org/pdf/2408.05839v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.05836v1","updated":"2024-08-11T17:34:24Z","published":"2024-08-11T17:34:24Z","title":"Real-Time Drowsiness Detection Using Eye Aspect Ratio and Facial\n Landmark Detection","summary":" Drowsiness detection is essential for improving safety in areas such as\ntransportation and workplace health. This study presents a real-time system\ndesigned to detect drowsiness using the Eye Aspect Ratio (EAR) and facial\nlandmark detection techniques. The system leverages Dlibs pre-trained shape\npredictor model to accurately detect and monitor 68 facial landmarks, which are\nused to compute the EAR. By establishing a threshold for the EAR, the system\nidentifies when eyes are closed, indicating potential drowsiness. The process\ninvolves capturing a live video stream, detecting faces in each frame,\nextracting eye landmarks, and calculating the EAR to assess alertness. Our\nexperiments show that the system reliably detects drowsiness with high accuracy\nwhile maintaining low computational demands. This study offers a strong\nsolution for real-time drowsiness detection, with promising applications in\ndriver monitoring and workplace safety. Future research will investigate\nincorporating additional physiological and contextual data to further enhance\ndetection accuracy and reliability.\n","authors":["Varun Shiva Krishna Rupani","Velpooru Venkata Sai Thushar","Kondadi Tejith"],"pdf_url":"https://arxiv.org/pdf/2408.05836v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.09476v2","updated":"2024-08-11T17:32:55Z","published":"2024-04-15T06:02:31Z","title":"FreqMamba: Viewing Mamba from a Frequency Perspective for Image\n Deraining","summary":" Images corrupted by rain streaks often lose vital frequency information for\nperception, and image deraining aims to solve this issue which relies on global\nand local degradation modeling. Recent studies have witnessed the effectiveness\nand efficiency of Mamba for perceiving global and local information based on\nits exploiting local correlation among patches, however, rarely attempts have\nbeen explored to extend it with frequency analysis for image deraining,\nlimiting its ability to perceive global degradation that is relevant to\nfrequency modeling (e.g. Fourier transform). In this paper, we propose\nFreqMamba, an effective and efficient paradigm that leverages the complementary\nbetween Mamba and frequency analysis for image deraining. The core of our\nmethod lies in extending Mamba with frequency analysis from two perspectives:\nextending it with frequency-band for exploiting frequency correlation, and\nconnecting it with Fourier transform for global degradation modeling.\nSpecifically, FreqMamba introduces complementary triple interaction structures\nincluding spatial Mamba, frequency band Mamba, and Fourier global modeling.\nFrequency band Mamba decomposes the image into sub-bands of different\nfrequencies to allow 2D scanning from the frequency dimension. 
Furthermore,\nleveraging Mamba's unique data-dependent properties, we use rainy images at\ndifferent scales to provide degradation priors to the network, thereby\nfacilitating efficient training. Extensive experiments show that our method\noutperforms state-of-the-art methods both visually and quantitatively.\n","authors":["Zou Zhen","Yu Hu","Zhao Feng"],"pdf_url":"https://arxiv.org/pdf/2404.09476v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.05831v1","updated":"2024-08-11T17:13:21Z","published":"2024-08-11T17:13:21Z","title":"Robust Domain Generalization for Multi-modal Object Recognition","summary":" In multi-label classification, machine learning encounters the challenge of\ndomain generalization when handling tasks with distributions differing from the\ntraining data. Existing approaches primarily focus on vision object recognition\nand neglect the integration of natural language. Recent advancements in\nvision-language pre-training leverage supervision from extensive\nvisual-language pairs, enabling learning across diverse domains and enhancing\nrecognition in multi-modal scenarios. However, these approaches face\nlimitations in loss function utilization, generality across backbones, and\nclass-aware visual fusion. This paper proposes solutions to these limitations\nby inferring the actual loss, broadening evaluations to larger vision-language\nbackbones, and introducing Mixup-CLIPood, which incorporates a novel mix-up\nloss for enhanced class-aware visual fusion. Our method demonstrates superior\nperformance in domain generalization across multiple datasets.\n","authors":["Yuxin Qiao","Keqin Li","Junhong Lin","Rong Wei","Chufeng Jiang","Yang Luo","Haoyu Yang"],"pdf_url":"https://arxiv.org/pdf/2408.05831v1.pdf","comment":"6 pages, 2 figures. This is a preprint version of the article. The\n final version will be published in the proceedings of the IEEE conference"},{"id":"http://arxiv.org/abs/2408.05822v1","updated":"2024-08-11T16:53:09Z","published":"2024-08-11T16:53:09Z","title":"Sampling Foundational Transformer: A Theoretical Perspective","summary":" The versatility of self-attention mechanism earned transformers great success\nin almost all data modalities, with limitations on the quadratic complexity and\ndifficulty of training. To apply transformers across different data modalities,\npractitioners have to make specific clever data-modality-dependent\nconstructions. In this paper, we propose Sampling Foundational Transformer\n(SFT) that can work on multiple data modalities (e.g., point cloud, graph, and\nsequence) and constraints (e.g., rotational-invariant). The existence of such\nmodel is important as contemporary foundational modeling requires operability\non multiple data sources. For efficiency on large number of tokens, our model\nrelies on our context aware sampling-without-replacement mechanism for both\nlinear asymptotic computational complexity and real inference time gain. For\nefficiency, we rely on our newly discovered pseudoconvex formulation of\ntransformer layer to increase model's convergence rate. 
As a model working on\nmultiple data modalities, SFT has achieved competitive results on many\nbenchmarks, while being faster in inference, compared to other very specialized\nmodels.\n","authors":["Viet Anh Nguyen","Minh Lenhat","Khoa Nguyen","Duong Duc Hieu","Dao Huu Hung","Truong Son Hy"],"pdf_url":"https://arxiv.org/pdf/2408.05822v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.05815v1","updated":"2024-08-11T16:31:39Z","published":"2024-08-11T16:31:39Z","title":"HySparK: Hybrid Sparse Masking for Large Scale Medical Image\n Pre-Training","summary":" The generative self-supervised learning strategy exhibits remarkable learning\nrepresentational capabilities. However, there is limited attention to\nend-to-end pre-training methods based on a hybrid architecture of CNN and\nTransformer, which can learn strong local and global representations\nsimultaneously. To address this issue, we propose a generative pre-training\nstrategy called Hybrid Sparse masKing (HySparK) based on masked image modeling\nand apply it to large-scale pre-training on medical images. First, we perform a\nbottom-up 3D hybrid masking strategy on the encoder to keep consistency\nmasking. Then we utilize sparse convolution for the top CNNs and encode\nunmasked patches for the bottom vision Transformers. Second, we employ a simple\nhierarchical decoder with skip-connections to achieve dense multi-scale feature\nreconstruction. Third, we implement our pre-training method on a collection of\nmultiple large-scale 3D medical imaging datasets. Extensive experiments\nindicate that our proposed pre-training strategy demonstrates robust\ntransfer-ability in supervised downstream tasks and sheds light on HySparK's\npromising prospects. The code is available at\nhttps://github.com/FengheTan9/HySparK\n","authors":["Fenghe Tang","Ronghao Xu","Qingsong Yao","Xueming Fu","Quan Quan","Heqin Zhu","Zaiyi Liu","S. Kevin Zhou"],"pdf_url":"https://arxiv.org/pdf/2408.05815v1.pdf","comment":"Early accept at MICCAI 2024"},{"id":"http://arxiv.org/abs/2302.01647v2","updated":"2024-08-11T15:59:30Z","published":"2023-02-03T10:48:24Z","title":"Blockwise Self-Supervised Learning at Scale","summary":" Current state-of-the-art deep networks are all powered by backpropagation. In\nthis paper, we explore alternatives to full backpropagation in the form of\nblockwise learning rules, leveraging the latest developments in self-supervised\nlearning. We show that a blockwise pretraining procedure consisting of training\nindependently the 4 main blocks of layers of a ResNet-50 with Barlow Twins'\nloss function at each block performs almost as well as end-to-end\nbackpropagation on ImageNet: a linear probe trained on top of our blockwise\npretrained model obtains a top-1 classification accuracy of 70.48%, only 1.1%\nbelow the accuracy of an end-to-end pretrained network (71.57% accuracy). 
We\nperform extensive experiments to understand the impact of different components\nwithin our method and explore a variety of adaptations of self-supervised\nlearning to the blockwise paradigm, building an exhaustive understanding of the\ncritical avenues for scaling local learning rules to large networks, with\nimplications ranging from hardware design to neuroscience.\n","authors":["Shoaib Ahmed Siddiqui","David Krueger","Yann LeCun","Stéphane Deny"],"pdf_url":"https://arxiv.org/pdf/2302.01647v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.05803v1","updated":"2024-08-11T15:46:00Z","published":"2024-08-11T15:46:00Z","title":"Prototype Learning Guided Hybrid Network for Breast Tumor Segmentation\n in DCE-MRI","summary":" Automated breast tumor segmentation on the basis of dynamic\ncontrast-enhancement magnetic resonance imaging (DCE-MRI) has shown great\npromise in clinical practice, particularly for identifying the presence of\nbreast disease. However, accurate segmentation of breast tumor is a challenging\ntask, often necessitating the development of complex networks. To strike an\noptimal trade-off between computational costs and segmentation performance, we\npropose a hybrid network via the combination of convolution neural network\n(CNN) and transformer layers. Specifically, the hybrid network consists of a\nencoder-decoder architecture by stacking convolution and decovolution layers.\nEffective 3D transformer layers are then implemented after the encoder\nsubnetworks, to capture global dependencies between the bottleneck features. To\nimprove the efficiency of hybrid network, two parallel encoder subnetworks are\ndesigned for the decoder and the transformer layers, respectively. To further\nenhance the discriminative capability of hybrid network, a prototype learning\nguided prediction module is proposed, where the category-specified prototypical\nfeatures are calculated through on-line clustering. All learned prototypical\nfeatures are finally combined with the features from decoder for tumor mask\nprediction. The experimental results on private and public DCE-MRI datasets\ndemonstrate that the proposed hybrid network achieves superior performance than\nthe state-of-the-art (SOTA) methods, while maintaining balance between\nsegmentation accuracy and computation cost. Moreover, we demonstrate that\nautomatically generated tumor masks can be effectively applied to identify\nHER2-positive subtype from HER2-negative subtype with the similar accuracy to\nthe analysis based on manual tumor segmentation. The source code is available\nat https://github.com/ZhouL-lab/PLHN.\n","authors":["Lei Zhou","Yuzhong Zhang","Jiadong Zhang","Xuejun Qian","Chen Gong","Kun Sun","Zhongxiang Ding","Xing Wang","Zhenhui Li","Zaiyi Liu","Dinggang Shen"],"pdf_url":"https://arxiv.org/pdf/2408.05803v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.05802v1","updated":"2024-08-11T15:37:29Z","published":"2024-08-11T15:37:29Z","title":"Egocentric Vision Language Planning","summary":" We explore leveraging large multi-modal models (LMMs) and text2image models\nto build a more general embodied agent. LMMs excel in planning long-horizon\ntasks over symbolic abstractions but struggle with grounding in the physical\nworld, often failing to accurately identify object positions in images. A\nbridge is needed to connect LMMs to the physical world. 
The paper proposes a\nnovel approach, egocentric vision language planning (EgoPlan), to handle\nlong-horizon tasks from an egocentric perspective in varying household\nscenarios. This model leverages a diffusion model to simulate the fundamental\ndynamics between states and actions, integrating techniques like style transfer\nand optical flow to enhance generalization across different environmental\ndynamics. The LMM serves as a planner, breaking down instructions into\nsub-goals and selecting actions based on their alignment with these sub-goals,\nthus enabling more generalized and effective decision-making. Experiments show\nthat EgoPlan improves long-horizon task success rates from the egocentric view\ncompared to baselines across household scenarios.\n","authors":["Zhirui Fang","Ming Yang","Weishuai Zeng","Boyu Li","Junpeng Yue","Ziluo Ding","Xiu Li","Zongqing Lu"],"pdf_url":"https://arxiv.org/pdf/2408.05802v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.10522v6","updated":"2024-08-11T15:17:12Z","published":"2023-11-17T13:43:43Z","title":"Enhancing Object Coherence in Layout-to-Image Synthesis","summary":" Layout-to-image synthesis is an emerging technique in conditional image\ngeneration. It aims to generate complex scenes, where users require fine\ncontrol over the layout of the objects in a scene. However, it remains\nchallenging to control the object coherence, including semantic coherence\n(e.g., the cat looks at the flowers or not) and physical coherence (e.g., the\nhand and the racket should not be misaligned). In this paper, we propose a\nnovel diffusion model with effective global semantic fusion (GSF) and\nself-similarity feature enhancement modules to guide the object coherence for\nthis task. For semantic coherence, we argue that the image caption contains\nrich information for defining the semantic relationship within the objects in\nthe images. Instead of simply employing cross-attention between captions and\nlatent images, which addresses the highly relevant layout restriction and\nsemantic coherence requirement separately and thus leads to unsatisfying\nresults shown in our experiments, we develop GSF to fuse the supervision from\nthe layout restriction and semantic coherence requirement and exploit it to\nguide the image synthesis process. Moreover, to improve the physical coherence,\nwe develop a Self-similarity Coherence Attention (SCA) module to explicitly\nintegrate local contextual physical coherence relation into each pixel's\ngeneration process. Specifically, we adopt a self-similarity map to encode the\nphysical coherence restrictions and employ it to extract coherent features from\ntext embedding. Through visualization of our self-similarity map, we explore\nthe essence of SCA, revealing that its effectiveness is not only in capturing\nreliable physical coherence patterns but also in enhancing complex texture\ngeneration. 
Extensive experiments demonstrate the superiority of our proposed\nmethod in both image generation quality and controllability.\n","authors":["Yibin Wang","Honghui Xu","Changhai Zhou","Weizhong Zhang","Cheng Jin"],"pdf_url":"https://arxiv.org/pdf/2311.10522v6.pdf","comment":"Code: https://github.com/CodeGoat24/EOCNet"},{"id":"http://arxiv.org/abs/2312.07495v2","updated":"2024-08-11T14:27:16Z","published":"2023-12-12T18:28:59Z","title":"Exploring Plain ViT Reconstruction for Multi-class Unsupervised Anomaly\n Detection","summary":" This work studies a challenging and practical issue known as multi-class\nunsupervised anomaly detection (MUAD). This problem requires only normal images\nfor training while simultaneously testing both normal and anomaly images across\nmultiple classes. Existing reconstruction-based methods typically adopt\npyramidal networks as encoders and decoders to obtain multi-resolution\nfeatures, often involving complex sub-modules with extensive handcraft\nengineering. In contrast, a plain Vision Transformer (ViT) showcasing a more\nstraightforward architecture has proven effective in multiple domains,\nincluding detection and segmentation tasks. It is simpler, more effective, and\nelegant. Following this spirit, we explore the use of only plain ViT features\nfor MUAD. We first abstract a Meta-AD concept by synthesizing current\nreconstruction-based methods. Subsequently, we instantiate a novel ViT-based\nViTAD structure, designed incrementally from both global and local\nperspectives. This model provide a strong baseline to facilitate future\nresearch. Additionally, this paper uncovers several intriguing findings for\nfurther investigation. Finally, we comprehensively and fairly benchmark various\napproaches using eight metrics. Utilizing a basic training regimen with only an\nMSE loss, ViTAD achieves state-of-the-art results and efficiency on MVTec AD,\nVisA, and Uni-Medical datasets. \\Eg, achieving 85.4 mAD that surpasses UniAD by\n+3.0 for the MVTec AD dataset, and it requires only 1.1 hours and 2.3G GPU\nmemory to complete model training on a single V100 that can serve as a strong\nbaseline to facilitate the development of future research. Full code is\navailable at https://zhangzjn.github.io/projects/ViTAD/.\n","authors":["Jiangning Zhang","Xuhai Chen","Yabiao Wang","Chengjie Wang","Yong Liu","Xiangtai Li","Ming-Hsuan Yang","Dacheng Tao"],"pdf_url":"https://arxiv.org/pdf/2312.07495v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.05781v1","updated":"2024-08-11T14:13:22Z","published":"2024-08-11T14:13:22Z","title":"CURLing the Dream: Contrastive Representations for World Modeling in\n Reinforcement Learning","summary":" In this work, we present Curled-Dreamer, a novel reinforcement learning\nalgorithm that integrates contrastive learning into the DreamerV3 framework to\nenhance performance in visual reinforcement learning tasks. By incorporating\nthe contrastive loss from the CURL algorithm and a reconstruction loss from\nautoencoder, Curled-Dreamer achieves significant improvements in various\nDeepMind Control Suite tasks. Our extensive experiments demonstrate that\nCurled-Dreamer consistently outperforms state-of-the-art algorithms, achieving\nhigher mean and median scores across a diverse set of tasks. The results\nindicate that the proposed approach not only accelerates learning but also\nenhances the robustness of the learned policies. 
This work highlights the\npotential of combining different learning paradigms to achieve superior\nperformance in reinforcement learning applications.\n","authors":["Victor Augusto Kich","Jair Augusto Bottega","Raul Steinmetz","Ricardo Bedin Grando","Ayano Yorozu","Akihisa Ohya"],"pdf_url":"https://arxiv.org/pdf/2408.05781v1.pdf","comment":"Paper accepted for 24th International Conference on Control,\n Automation and Systems (ICCAS)"},{"id":"http://arxiv.org/abs/2408.05780v1","updated":"2024-08-11T14:11:45Z","published":"2024-08-11T14:11:45Z","title":"U-DECN: End-to-End Underwater Object Detection ConvNet with Improved\n DeNoising Training","summary":" Underwater object detection has higher requirements of running speed and\ndeployment efficiency for the detector due to its specific environmental\nchallenges. NMS of two- or one-stage object detectors and transformer\narchitecture of query-based end-to-end object detectors are not conducive to\ndeployment on underwater embedded devices with limited processing power. As for\nthe detrimental effect of underwater color cast noise, recent underwater object\ndetectors make network architecture or training complex, which also hinders\ntheir application and deployment on underwater vehicle platforms. In this\npaper, we propose the Underwater DECO with improved deNoising training\n(U-DECN), the query-based end-to-end object detector (with ConvNet\nencoder-decoder architecture) for underwater color cast noise that addresses\nthe above problems. We integrate advanced technologies from DETR variants into\nDECO and design optimization methods specifically for the ConvNet architecture,\nincluding Separate Contrastive DeNoising Forward and Deformable Convolution in\nSIM. To address the underwater color cast noise issue, we propose an underwater\ncolor denoising query to improve the generalization of the model for the biased\nobject feature information by different color cast noise. Our U-DECN, with\nResNet-50 backbone, achieves 61.4 AP (50 epochs), 63.3 AP (72 epochs), 64.0 AP\n(100 epochs) on DUO, and 21 FPS (5 times faster than Deformable DETR and DINO 4\nFPS) on NVIDIA AGX Orin by TensorRT FP16, outperforming the other\nstate-of-the-art query-based end-to-end object detectors. The code is available\nat https://github.com/LEFTeyex/U-DECN.\n","authors":["Zhuoyan Liu","Bo Wang","Ye Li"],"pdf_url":"https://arxiv.org/pdf/2408.05780v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2206.09325v3","updated":"2024-08-11T14:09:09Z","published":"2022-06-19T04:49:35Z","title":"EATFormer: Improving Vision Transformer Inspired by Evolutionary\n Algorithm","summary":" Motivated by biological evolution, this paper explains the rationality of\nVision Transformer by analogy with the proven practical evolutionary algorithm\n(EA) and derives that both have consistent mathematical formulation. Then\ninspired by effective EA variants, we propose a novel pyramid EATFormer\nbackbone that only contains the proposed EA-based transformer (EAT) block,\nwhich consists of three residual parts, i.e., Multi-scale region aggregation,\nglobal and local interaction, and feed-forward network modules, to model\nmulti-scale, interactive, and individual information separately. Moreover, we\ndesign a task-related head docked with transformer backbone to complete final\ninformation fusion more flexibly and improve a modulated deformable MSA to\ndynamically model irregular locations. 
Massive quantitative and quantitative\nexperiments on image classification, downstream tasks, and explanatory\nexperiments demonstrate the effectiveness and superiority of our approach over\nstate-of-the-art methods. E.g., our Mobile (1.8 M), Tiny (6.1 M), Small (24.3\nM), and Base (49.0 M) models achieve 69.4, 78.4, 83.1, and 83.9 Top-1 only\ntrained on ImageNet-1K with naive training recipe; EATFormer-Tiny/Small/Base\narmed Mask-R-CNN obtain 45.4/47.4/49.0 box AP and 41.4/42.9/44.2 mask AP on\nCOCO detection, surpassing contemporary MPViT-T, Swin-T, and Swin-S by\n0.6/1.4/0.5 box AP and 0.4/1.3/0.9 mask AP separately with less FLOPs; Our\nEATFormer-Small/Base achieve 47.3/49.3 mIoU on ADE20K by Upernet that exceeds\nSwin-T/S by 2.8/1.7. Code is available at\nhttps://github.com/zhangzjn/EATFormer.\n","authors":["Jiangning Zhang","Xiangtai Li","Yabiao Wang","Chengjie Wang","Yibo Yang","Yong Liu","Dacheng Tao"],"pdf_url":"https://arxiv.org/pdf/2206.09325v3.pdf","comment":"IJCV'2024"},{"id":"http://arxiv.org/abs/2408.05777v1","updated":"2024-08-11T14:01:21Z","published":"2024-08-11T14:01:21Z","title":"Seg-CycleGAN : SAR-to-optical image translation guided by a downstream\n task","summary":" Optical remote sensing and Synthetic Aperture Radar(SAR) remote sensing are\ncrucial for earth observation, offering complementary capabilities. While\noptical sensors provide high-quality images, they are limited by weather and\nlighting conditions. In contrast, SAR sensors can operate effectively under\nadverse conditions. This letter proposes a GAN-based SAR-to-optical image\ntranslation method named Seg-CycleGAN, designed to enhance the accuracy of ship\ntarget translation by leveraging semantic information from a pre-trained\nsemantic segmentation model. Our method utilizes the downstream task of ship\ntarget semantic segmentation to guide the training of image translation\nnetwork, improving the quality of output Optical-styled images. The potential\nof foundation-model-annotated datasets in SAR-to-optical translation tasks is\nrevealed. This work suggests broader research and applications for\ndownstream-task-guided frameworks. The code will be available at\nhttps://github.com/NPULHH/\n","authors":["Hannuo Zhang","Huihui Li","Jiarui Lin","Yujie Zhang","Jianghua Fan","Hang Liu"],"pdf_url":"https://arxiv.org/pdf/2408.05777v1.pdf","comment":"8 pages, 5 figures"},{"id":"http://arxiv.org/abs/2408.05775v1","updated":"2024-08-11T13:55:58Z","published":"2024-08-11T13:55:58Z","title":"Efficient Test-Time Prompt Tuning for Vision-Language Models","summary":" Vision-language models have showcased impressive zero-shot classification\ncapabilities when equipped with suitable text prompts. Previous studies have\nshown the effectiveness of test-time prompt tuning; however, these methods\ntypically require per-image prompt adaptation during inference, which incurs\nhigh computational budgets and limits scalability and practical deployment. To\novercome this issue, we introduce Self-TPT, a novel framework leveraging\nSelf-supervised learning for efficient Test-time Prompt Tuning. The key aspect\nof Self-TPT is that it turns to efficient predefined class adaptation via\nself-supervised learning, thus avoiding computation-heavy per-image adaptation\nat inference. Self-TPT begins by co-training the self-supervised and the\nclassification task using source data, then applies the self-supervised task\nexclusively for test-time new class adaptation. 
Specifically, we propose\nContrastive Prompt Learning (CPT) as the key task for self-supervision. CPT is\ndesigned to minimize the intra-class distances while enhancing inter-class\ndistinguishability via contrastive learning. Furthermore, empirical evidence\nsuggests that CPT could closely mimic back-propagated gradients of the\nclassification task, offering a plausible explanation for its effectiveness.\nMotivated by this finding, we further introduce a gradient matching loss to\nexplicitly enhance the gradient similarity. We evaluated Self-TPT across three\nchallenging zero-shot benchmarks. The results consistently demonstrate that\nSelf-TPT not only significantly reduces inference costs but also achieves\nstate-of-the-art performance, effectively balancing the efficiency-efficacy\ntrade-off.\n","authors":["Yuhan Zhu","Guozhen Zhang","Chen Xu","Haocheng Shen","Xiaoxin Chen","Gangshan Wu","Limin Wang"],"pdf_url":"https://arxiv.org/pdf/2408.05775v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.05772v1","updated":"2024-08-11T13:40:02Z","published":"2024-08-11T13:40:02Z","title":"An analysis of HOI: using a training-free method with multimodal visual\n foundation models when only the test set is available, without the training\n set","summary":" Human-Object Interaction (HOI) aims to identify the pairs of humans and\nobjects in images and to recognize their relationships, ultimately forming\n$\\langle human, object, verb \\rangle$ triplets. Under default settings, HOI\nperformance is nearly saturated, with many studies focusing on long-tail\ndistribution and zero-shot/few-shot scenarios. Let us consider an intriguing\nproblem:``What if there is only test dataset without training dataset, using\nmultimodal visual foundation model in a training-free manner? '' This study\nuses two experimental settings: grounding truth and random arbitrary\ncombinations. We get some interesting conclusion and find that the open\nvocabulary capabilities of the multimodal visual foundation model are not yet\nfully realized. Additionally, replacing the feature extraction with grounding\nDINO further confirms these findings.\n","authors":["Chaoyi Ai"],"pdf_url":"https://arxiv.org/pdf/2408.05772v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.10942v5","updated":"2024-08-11T13:24:22Z","published":"2023-10-17T02:38:09Z","title":"UNK-VQA: A Dataset and a Probe into the Abstention Ability of\n Multi-modal Large Models","summary":" Teaching Visual Question Answering (VQA) models to refrain from answering\nunanswerable questions is necessary for building a trustworthy AI system.\nExisting studies, though have explored various aspects of VQA but somewhat\nignored this particular attribute. This paper aims to bridge the research gap\nby contributing a comprehensive dataset, called UNK-VQA. The dataset is\nspecifically designed to address the challenge of questions that models do not\nknow. To this end, we first augment the existing data via deliberate\nperturbations on either the image or question. In specific, we carefully ensure\nthat the question-image semantics remain close to the original unperturbed\ndistribution. By this means, the identification of unanswerable questions\nbecomes challenging, setting our dataset apart from others that involve mere\nimage replacement. We then extensively evaluate the zero- and few-shot\nperformance of several emerging multi-modal large models and discover their\nsignificant limitations when applied to our dataset. 
Additionally, we also\npropose a straightforward method to tackle these unanswerable questions. This\ndataset, we believe, will serve as a valuable benchmark for enhancing the\nabstention capability of VQA models, thereby leading to increased\ntrustworthiness of AI systems. We have made the dataset\n(https://github.com/guoyang9/UNK-VQA) available to facilitate further\nexploration in this area.\n","authors":["Yangyang Guo","Fangkai Jiao","Zhiqi Shen","Liqiang Nie","Mohan Kankanhalli"],"pdf_url":"https://arxiv.org/pdf/2310.10942v5.pdf","comment":"Accepted by TPAMI"},{"id":"http://arxiv.org/abs/2408.05754v1","updated":"2024-08-11T12:05:32Z","published":"2024-08-11T12:05:32Z","title":"PRECISe : Prototype-Reservation for Explainable Classification under\n Imbalanced and Scarce-Data Settings","summary":" Deep learning models used for medical image classification tasks are often\nconstrained by the limited amount of training data along with severe class\nimbalance. Despite these problems, models should be explainable to enable human\ntrust in the models' decisions to ensure wider adoption in high-risk\nsituations. In this paper, we propose PRECISe, an explainable-by-design model\nmeticulously constructed to concurrently address all three challenges.\nEvaluation on 2 imbalanced medical image datasets reveals that PRECISe\noutperforms the current state-of-the-art methods on data efficient\ngeneralization to minority classes, achieving an accuracy of ~87% in detecting\npneumonia in chest x-rays upon training on <60 images only. Additionally, a\ncase study is presented to highlight the model's ability to produce easily\ninterpretable predictions, reinforcing its practical utility and reliability\nfor medical imaging tasks.\n","authors":["Vaibhav Ganatra","Drishti Goel"],"pdf_url":"https://arxiv.org/pdf/2408.05754v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.05752v1","updated":"2024-08-11T11:53:29Z","published":"2024-08-11T11:53:29Z","title":"RTF-Q: Unsupervised domain adaptation based retraining-free quantization\n network","summary":" Performing unsupervised domain adaptation on resource-constrained edge\ndevices is a significant task. Although existing research allows edge devices\nto use subnets with different computational budgets for inference, they often\nrequire expensive pre-training and do not consider the issues of parameter\nprecision redundancy in the model, which is not conducive to the deployment of\nthe model on edge devices. In this paper, we introduce a ReTraining-Free\nQuantized (RTF-Q) network based on unsupervised domain adaptation, featuring\nquantized subnets of varying computational costs that can operate on devices\nwith dynamically changing computation budgets. Our network has three switchable\ndimensions: width (number of channels), input resolution, and quantization\nbit-width. Specifically, we choose subnet dimensions that have minimal impact\non network performance and then directly load the official weight files without\nrequiring expensive and time-consuming pre-training on Imagenet-1K. To further\nreduce the network's computational load and memory usage, we use\nquantization-aware training, reducing the BitOPs of full-precision networks by\nat least 1/16. We propose a training method called SandwichQ for multiple\nquantization bit widths, which can efficiently train multiple quantization\nsubnets. 
By training in multiple quantization bit-width spaces simultaneously\nand using the proposed SandwichQ rule, we achieve better network performance\ncompared to using a single quantization bit-width alone. Experimental results\nshow that our method achieves classification accuracy comparable to SOTA\nmethods on various UDA tasks, significantly reducing network size and\ncomputational overhead. Code will be available at\nhttps://github.com/dunanyang/RTF-Q.\n","authors":["Nanyang Du","Chen Tang","Yuan Meng","Zhi Wang"],"pdf_url":"https://arxiv.org/pdf/2408.05752v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.05751v1","updated":"2024-08-11T11:46:21Z","published":"2024-08-11T11:46:21Z","title":"Advancing Re-Ranking with Multimodal Fusion and Target-Oriented\n Auxiliary Tasks in E-Commerce Search","summary":" In the rapidly evolving field of e-commerce, the effectiveness of search\nre-ranking models is crucial for enhancing user experience and driving\nconversion rates. Despite significant advancements in feature representation\nand model architecture, the integration of multimodal information remains\nunderexplored. This study addresses this gap by investigating the computation\nand fusion of textual and visual information in the context of re-ranking. We\npropose \\textbf{A}dvancing \\textbf{R}e-Ranking with\n\\textbf{M}ulti\\textbf{m}odal Fusion and \\textbf{T}arget-Oriented Auxiliary\nTasks (ARMMT), which integrates an attention-based multimodal fusion technique\nand an auxiliary ranking-aligned task to enhance item representation and\nimprove targeting capabilities. This method not only enriches the understanding\nof product attributes but also enables more precise and personalized\nrecommendations. Experimental evaluations on JD.com's search platform\ndemonstrate that ARMMT achieves state-of-the-art performance in multimodal\ninformation integration, evidenced by a 0.22\\% increase in the Conversion Rate\n(CVR), significantly contributing to Gross Merchandise Volume (GMV). This\npioneering approach has the potential to revolutionize e-commerce re-ranking,\nleading to elevated user satisfaction and business growth.\n","authors":["Enqiang Xu","Xinhui Li","Zhigong Zhou","Jiahao Ji","Jinyuan Zhao","Dadong Miao","Songlin Wang","Lin Liu","Sulong Xu"],"pdf_url":"https://arxiv.org/pdf/2408.05751v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.05750v1","updated":"2024-08-11T11:43:56Z","published":"2024-08-11T11:43:56Z","title":"FADE: A Dataset for Detecting Falling Objects around Buildings in Video","summary":" Falling objects from buildings can cause severe injuries to pedestrians due\nto the great impact force they exert. Although surveillance cameras are\ninstalled around some buildings, it is challenging for humans to capture such\nevents in surveillance videos due to the small size and fast motion of falling\nobjects, as well as the complex background. Therefore, it is necessary to\ndevelop methods to automatically detect falling objects around buildings in\nsurveillance videos. To facilitate the investigation of falling object\ndetection, we propose a large, diverse video dataset called FADE (FAlling\nObject DEtection around Buildings) for the first time. FADE contains 1,881\nvideos from 18 scenes, featuring 8 falling object categories, 4 weather\nconditions, and 4 video resolutions. 
Additionally, we develop a new object\ndetection method called FADE-Net, which effectively leverages motion\ninformation and produces small-sized but high-quality proposals for detecting\nfalling objects around buildings. Importantly, our method is extensively\nevaluated and analyzed by comparing it with the previous approaches used for\ngeneric object detection, video object detection, and moving object detection\non the FADE dataset. Experimental results show that the proposed FADE-Net\nsignificantly outperforms other methods, providing an effective baseline for\nfuture research. The dataset and code are publicly available at\nhttps://fadedataset.github.io/FADE.github.io/.\n","authors":["Zhigang Tu","Zitao Gao","Zhengbo Zhang","Chunluan Zhou","Junsong Yuan","Bo Du"],"pdf_url":"https://arxiv.org/pdf/2408.05750v1.pdf","comment":"11 pages, 10 figures"},{"id":"http://arxiv.org/abs/2404.05264v2","updated":"2024-08-11T11:39:54Z","published":"2024-04-08T07:54:18Z","title":"Unbridled Icarus: A Survey of the Potential Perils of Image Inputs in\n Multimodal Large Language Model Security","summary":" Multimodal Large Language Models (MLLMs) demonstrate remarkable capabilities\nthat increasingly influence various aspects of our daily lives, constantly\ndefining the new boundary of Artificial General Intelligence (AGI). Image\nmodalities, enriched with profound semantic information and a more continuous\nmathematical nature compared to other modalities, greatly enhance the\nfunctionalities of MLLMs when integrated. However, this integration serves as a\ndouble-edged sword, providing attackers with expansive vulnerabilities to\nexploit for highly covert and harmful attacks. The pursuit of reliable AI\nsystems like powerful MLLMs has emerged as a pivotal area of contemporary\nresearch. In this paper, we endeavor to demostrate the multifaceted risks\nassociated with the incorporation of image modalities into MLLMs. Initially, we\ndelineate the foundational components and training processes of MLLMs.\nSubsequently, we construct a threat model, outlining the security\nvulnerabilities intrinsic to MLLMs. Moreover, we analyze and summarize existing\nscholarly discourses on MLLMs' attack and defense mechanisms, culminating in\nsuggestions for the future research on MLLM security. Through this\ncomprehensive analysis, we aim to deepen the academic understanding of MLLM\nsecurity challenges and propel forward the development of trustworthy MLLM\nsystems.\n","authors":["Yihe Fan","Yuxin Cao","Ziyu Zhao","Ziyao Liu","Shaofeng Li"],"pdf_url":"https://arxiv.org/pdf/2404.05264v2.pdf","comment":"8 pages, 1 figure. Accepted to 2024 IEEE International Conference on\n Systems, Man, and Cybernetics"},{"id":"http://arxiv.org/abs/2408.05749v1","updated":"2024-08-11T11:37:43Z","published":"2024-08-11T11:37:43Z","title":"Efficient and Versatile Robust Fine-Tuning of Zero-shot Models","summary":" Large-scale image-text pre-trained models enable zero-shot classification and\nprovide consistent accuracy across various data distributions. Nonetheless,\noptimizing these models in downstream tasks typically requires fine-tuning,\nwhich reduces generalization to out-of-distribution (OOD) data and demands\nextensive computational resources. We introduce Robust Adapter (R-Adapter), a\nnovel method for fine-tuning zero-shot models to downstream tasks while\nsimultaneously addressing both these issues. 
Our method integrates lightweight\nmodules into the pre-trained model and employs novel self-ensemble techniques\nto boost OOD robustness and reduce storage expenses substantially. Furthermore,\nwe propose MPM-NCE loss designed for fine-tuning on vision-language downstream\ntasks. It ensures precise alignment of multiple image-text pairs and\ndiscriminative feature learning. By extending the benchmark for robust\nfine-tuning beyond classification to include diverse tasks such as cross-modal\nretrieval and open vocabulary segmentation, we demonstrate the broad\napplicability of R-Adapter. Our extensive experiments demonstrate that\nR-Adapter achieves state-of-the-art performance across a diverse set of tasks,\ntuning only 13% of the parameters of the CLIP encoders.\n","authors":["Sungyeon Kim","Boseung Jeong","Donghyun Kim","Suha Kwak"],"pdf_url":"https://arxiv.org/pdf/2408.05749v1.pdf","comment":"Accepted to ECCV 2024"},{"id":"http://arxiv.org/abs/2403.11854v3","updated":"2024-08-11T11:36:17Z","published":"2024-03-18T15:03:56Z","title":"denoiSplit: a method for joint microscopy image splitting and\n unsupervised denoising","summary":" In this work, we present denoiSplit, a method to tackle a new analysis task,\ni.e. the challenge of joint semantic image splitting and unsupervised\ndenoising. This dual approach has important applications in fluorescence\nmicroscopy, where semantic image splitting has important applications but noise\ndoes generally hinder the downstream analysis of image content. Image splitting\ninvolves dissecting an image into its distinguishable semantic structures. We\nshow that the current state-of-the-art method for this task struggles in the\npresence of image noise, inadvertently also distributing the noise across the\npredicted outputs. The method we present here can deal with image noise by\nintegrating an unsupervised denoising subtask. This integration results in\nimproved semantic image unmixing, even in the presence of notable and realistic\nlevels of imaging noise. A key innovation in denoiSplit is the use of\nspecifically formulated noise models and the suitable adjustment of\nKL-divergence loss for the high-dimensional hierarchical latent space we are\ntraining. We showcase the performance of denoiSplit across multiple tasks on\nreal-world microscopy images. Additionally, we perform qualitative and\nquantitative evaluations and compare the results to existing benchmarks,\ndemonstrating the effectiveness of using denoiSplit: a single Variational\nSplitting Encoder-Decoder (VSE) Network using two suitable noise models to\njointly perform semantic splitting and denoising.\n","authors":["Ashesh Ashesh","Florian Jug"],"pdf_url":"https://arxiv.org/pdf/2403.11854v3.pdf","comment":"Accepted at ECCV 2024"},{"id":"http://arxiv.org/abs/2405.14701v2","updated":"2024-08-11T11:31:23Z","published":"2024-05-23T15:35:48Z","title":"High Fidelity Scene Text Synthesis","summary":" Scene text synthesis involves rendering specified texts onto arbitrary\nimages. Current methods typically formulate this task in an end-to-end manner\nbut lack effective character-level guidance during training. Besides, their\ntext encoders, pre-trained on a single font type, struggle to adapt to the\ndiverse font styles encountered in practical applications. Consequently, these\nmethods suffer from character distortion, repetition, and absence, particularly\nin polystylistic scenarios. To this end, this paper proposes DreamText for\nhigh-fidelity scene text synthesis. 
Our key idea is to reconstruct the\ndiffusion training process, introducing more refined guidance tailored to this\ntask, to expose and rectify the model's attention at the character level and\nstrengthen its learning of text regions. This transformation poses a hybrid\noptimization challenge, involving both discrete and continuous variables. To\neffectively tackle this challenge, we employ a heuristic alternate optimization\nstrategy. Meanwhile, we jointly train the text encoder and generator to\ncomprehensively learn and utilize the diverse font present in the training\ndataset. This joint training is seamlessly integrated into the alternate\noptimization process, fostering a synergistic relationship between learning\ncharacter embedding and re-estimating character attention. Specifically, in\neach step, we first encode potential character-generated position information\nfrom cross-attention maps into latent character masks. These masks are then\nutilized to update the representation of specific characters in the current\nstep, which, in turn, enables the generator to correct the character's\nattention in the subsequent steps. Both qualitative and quantitative results\ndemonstrate the superiority of our method to the state of the art.\n","authors":["Yibin Wang","Weizhong Zhang","Changhai Zhou","Cheng Jin"],"pdf_url":"https://arxiv.org/pdf/2405.14701v2.pdf","comment":"Code: https://github.com/CodeGoat24/DreamText, Project page:\n https://codegoat24.github.io/DreamText/"},{"id":"http://arxiv.org/abs/2408.05745v1","updated":"2024-08-11T10:46:49Z","published":"2024-08-11T10:46:49Z","title":"Improving Adversarial Transferability with Neighbourhood Gradient\n Information","summary":" Deep neural networks (DNNs) are known to be susceptible to adversarial\nexamples, leading to significant performance degradation. In black-box attack\nscenarios, a considerable attack performance gap between the surrogate model\nand the target model persists. This work focuses on enhancing the\ntransferability of adversarial examples to narrow this performance gap. We\nobserve that the gradient information around the clean image, i.e.\nNeighbourhood Gradient Information, can offer high transferability. Leveraging\nthis, we propose the NGI-Attack, which incorporates Example Backtracking and\nMultiplex Mask strategies, to use this gradient information and enhance\ntransferability fully. Specifically, we first adopt Example Backtracking to\naccumulate Neighbourhood Gradient Information as the initial momentum term.\nMultiplex Mask, which forms a multi-way attack strategy, aims to force the\nnetwork to focus on non-discriminative regions, which can obtain richer\ngradient information during only a few iterations. Extensive experiments\ndemonstrate that our approach significantly enhances adversarial\ntransferability. Especially, when attacking numerous defense models, we achieve\nan average attack success rate of 95.8%. 
Notably, our method can plugin with\nany off-the-shelf algorithm to improve their attack performance without\nadditional time cost.\n","authors":["Haijing Guo","Jiafeng Wang","Zhaoyu Chen","Kaixun Jiang","Lingyi Hong","Pinxue Guo","Jinglun Li","Wenqiang Zhang"],"pdf_url":"https://arxiv.org/pdf/2408.05745v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.05743v1","updated":"2024-08-11T10:42:22Z","published":"2024-08-11T10:42:22Z","title":"Neural Architecture Search based Global-local Vision Mamba for Palm-Vein\n Recognition","summary":" Due to the advantages such as high security, high privacy, and liveness\nrecognition, vein recognition has been received more and more attention in past\nyears. Recently, deep learning models, e.g., Mamba has shown robust feature\nrepresentation with linear computational complexity and successfully applied\nfor visual tasks. However, vision Manba can capture long-distance feature\ndependencies but unfortunately deteriorate local feature details. Besides,\nmanually designing a Mamba architecture based on human priori knowledge is very\ntime-consuming and error-prone. In this paper, first, we propose a hybrid\nnetwork structure named Global-local Vision Mamba (GLVM), to learn the local\ncorrelations in images explicitly and global dependencies among tokens for vein\nfeature representation. Secondly, we design a Multi-head Mamba to learn the\ndependencies along different directions, so as to improve the feature\nrepresentation ability of vision Mamba. Thirdly, to learn the complementary\nfeatures, we propose a ConvMamba block consisting of three branches, named\nMulti-head Mamba branch (MHMamba), Feature Iteration Unit branch (FIU), and\nConvolutional Neural Network (CNN) branch, where the Feature Iteration Unit\nbranch aims to fuse convolutional local features with Mamba-based global\nrepresentations. Finally, a Globallocal Alternate Neural Architecture Search\n(GLNAS) method is proposed to search the optimal architecture of GLVM\nalternately with the evolutionary algorithm, thereby improving the recognition\nperformance for vein recognition tasks. We conduct rigorous experiments on\nthree public palm-vein databases to estimate the performance. The experimental\nresults demonstrate that the proposed method outperforms the representative\napproaches and achieves state-of-the-art recognition accuracy.\n","authors":["Huafeng Qin","Yuming Fu","Jing Chen","Mounim A. El-Yacoubi","Xinbo Gao","Jun Wang"],"pdf_url":"https://arxiv.org/pdf/2408.05743v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.00334v4","updated":"2024-08-11T10:01:03Z","published":"2023-04-01T15:10:02Z","title":"TalkCLIP: Talking Head Generation with Text-Guided Expressive Speaking\n Styles","summary":" Audio-driven talking head generation has drawn growing attention. To produce\ntalking head videos with desired facial expressions, previous methods rely on\nextra reference videos to provide expression information, which may be\ndifficult to find and hence limits their usage. In this work, we propose\nTalkCLIP, a framework that can generate talking heads where the expressions are\nspecified by natural language, hence allowing for specifying expressions more\nconveniently. To model the mapping from text to expressions, we first construct\na text-video paired talking head dataset where each video has diverse text\ndescriptions that depict both coarse-grained emotions and fine-grained facial\nmovements. 
Leveraging the proposed dataset, we introduce a CLIP-based style\nencoder that projects natural language-based descriptions to the\nrepresentations of expressions. TalkCLIP can even infer expressions for\ndescriptions unseen during training. TalkCLIP can also use text to modulate\nexpression intensity and edit expressions. Extensive experiments demonstrate\nthat TalkCLIP achieves the advanced capability of generating photo-realistic\ntalking heads with vivid facial expressions guided by text descriptions.\n","authors":["Yifeng Ma","Suzhen Wang","Yu Ding","Bowen Ma","Tangjie Lv","Changjie Fan","Zhipeng Hu","Zhidong Deng","Xin Yu"],"pdf_url":"https://arxiv.org/pdf/2304.00334v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.18292v4","updated":"2024-08-11T09:31:51Z","published":"2024-02-28T12:37:30Z","title":"FSL-Rectifier: Rectify Outliers in Few-Shot Learning via Test-Time\n Augmentation","summary":" Few-shot-learning (FSL) commonly requires a model to identify images\n(queries) that belong to classes unseen during training, based on a few labeled\nsamples of the new classes (support set) as reference. So far, plenty of\nalgorithms involve training data augmentation to improve the generalization\ncapability of FSL models, but outlier queries or support images during\ninference can still pose great generalization challenges. In this work, to\nreduce the bias caused by the outlier samples, we generate additional\ntest-class samples by combining original samples with suitable train-class\nsamples via a generative image combiner. Then, we obtain averaged features via\nan augmentor, which leads to more typical representations through the\naveraging. We experimentally and theoretically demonstrate the effectiveness of\nour method, e.g., obtaining a test accuracy improvement proportion of around\n10% (e.g., from 46.86% to 53.28%) for trained FSL models. Importantly, given\npretrained image combiner, our method is training-free for off-the-shelf FSL\nmodels, whose performance can be improved without extra datasets nor further\ntraining of the models themselves.\n","authors":["Yunwei Bai","Ying Kiat Tan","Shiming Chen","Yao Shu","Tsuhan Chen"],"pdf_url":"https://arxiv.org/pdf/2402.18292v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.06035v2","updated":"2024-08-11T09:30:27Z","published":"2024-01-11T16:48:44Z","title":"RAVEN: Rethinking Adversarial Video Generation with Efficient Tri-plane\n Networks","summary":" We present a novel unconditional video generative model designed to address\nlong-term spatial and temporal dependencies, with attention to computational\nand dataset efficiency. To capture long spatio-temporal dependencies, our\napproach incorporates a hybrid explicit-implicit tri-plane representation\ninspired by 3D-aware generative frameworks developed for three-dimensional\nobject representation and employs a single latent code to model an entire video\nclip. Individual video frames are then synthesized from an intermediate\ntri-plane representation, which itself is derived from the primary latent code.\nThis novel strategy more than halves the computational complexity measured in\nFLOPs compared to the most efficient state-of-the-art methods. Consequently,\nour approach facilitates the efficient and temporally coherent generation of\nvideos. Moreover, our joint frame modeling approach, in contrast to\nautoregressive methods, mitigates the generation of visual artifacts. 
We\nfurther enhance the model's capabilities by integrating an optical flow-based\nmodule within our Generative Adversarial Network (GAN) based generator\narchitecture, thereby compensating for the constraints imposed by a smaller\ngenerator size. As a result, our model synthesizes high-fidelity video clips at\na resolution of $256\\times256$ pixels, with durations extending to more than\n$5$ seconds at a frame rate of 30 fps. The efficacy and versatility of our\napproach are empirically validated through qualitative and quantitative\nassessments across three different datasets comprising both synthetic and real\nvideo clips. We will make our training and inference code public.\n","authors":["Partha Ghosh","Soubhik Sanyal","Cordelia Schmid","Bernhard Schölkopf"],"pdf_url":"https://arxiv.org/pdf/2401.06035v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.05729v1","updated":"2024-08-11T08:42:02Z","published":"2024-08-11T08:42:02Z","title":"A Training-Free Framework for Video License Plate Tracking and\n Recognition with Only One-Shot","summary":" Traditional license plate detection and recognition models are often trained\non closed datasets, limiting their ability to handle the diverse license plate\nformats across different regions. The emergence of large-scale pre-trained\nmodels has shown exceptional generalization capabilities, enabling few-shot and\nzero-shot learning. We propose OneShotLP, a training-free framework for\nvideo-based license plate detection and recognition, leveraging these advanced\nmodels. Starting with the license plate position in the first video frame, our\nmethod tracks this position across subsequent frames using a point tracking\nmodule, creating a trajectory of prompts. These prompts are input into a\nsegmentation module that uses a promptable large segmentation model to generate\nlocal masks of the license plate regions. The segmented areas are then\nprocessed by multimodal large language models (MLLMs) for accurate license\nplate recognition. OneShotLP offers significant advantages, including the\nability to function effectively without extensive training data and\nadaptability to various license plate styles. Experimental results on UFPR-ALPR\nand SSIG-SegPlate datasets demonstrate the superior accuracy of our approach\ncompared to traditional methods. This highlights the potential of leveraging\npre-trained models for diverse real-world applications in intelligent\ntransportation systems. The code is available at\nhttps://github.com/Dinghaoxuan/OneShotLP.\n","authors":["Haoxuan Ding","Qi Wang","Junyu Gao","Qiang Li"],"pdf_url":"https://arxiv.org/pdf/2408.05729v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.05723v1","updated":"2024-08-11T08:26:43Z","published":"2024-08-11T08:26:43Z","title":"Deep Learning with Data Privacy via Residual Perturbation","summary":" Protecting data privacy in deep learning (DL) is of crucial importance.\nSeveral celebrated privacy notions have been established and used for\nprivacy-preserving DL. However, many existing mechanisms achieve privacy at the\ncost of significant utility degradation and computational overhead. In this\npaper, we propose a stochastic differential equation-based residual\nperturbation for privacy-preserving DL, which injects Gaussian noise into each\nresidual mapping of ResNets. 
Theoretically, we prove that residual perturbation\nguarantees differential privacy (DP) and reduces the generalization gap of DL.\nEmpirically, we show that residual perturbation is computationally efficient\nand outperforms the state-of-the-art differentially private stochastic gradient\ndescent (DPSGD) in utility maintenance without sacrificing membership privacy.\n","authors":["Wenqi Tao","Huaming Ling","Zuoqiang Shi","Bao Wang"],"pdf_url":"https://arxiv.org/pdf/2408.05723v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.03027v2","updated":"2024-08-11T08:10:32Z","published":"2023-12-05T10:12:59Z","title":"Stable Diffusion Exposed: Gender Bias from Prompt to Image","summary":" Several studies have raised awareness about social biases in image generative\nmodels, demonstrating their predisposition towards stereotypes and imbalances.\nThis paper contributes to this growing body of research by introducing an\nevaluation protocol that analyzes the impact of gender indicators at every step\nof the generation process on Stable Diffusion images. Leveraging insights from\nprior work, we explore how gender indicators not only affect gender\npresentation but also the representation of objects and layouts within the\ngenerated images. Our findings include the existence of differences in the\ndepiction of objects, such as instruments tailored for specific genders, and\nshifts in overall layouts. We also reveal that neutral prompts tend to produce\nimages more aligned with masculine prompts than their feminine counterparts. We\nfurther explore where bias originates through representational disparities and\nhow it manifests in the images via prompt-image dependencies, and provide\nrecommendations for developers and users to mitigate potential bias in image\ngeneration.\n","authors":["Yankun Wu","Yuta Nakashima","Noa Garcia"],"pdf_url":"https://arxiv.org/pdf/2312.03027v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.05717v1","updated":"2024-08-11T08:02:28Z","published":"2024-08-11T08:02:28Z","title":"Deformable Image Registration with Multi-scale Feature Fusion from\n Shared Encoder, Auxiliary and Pyramid Decoders","summary":" In this work, we propose a novel deformable convolutional pyramid network for\nunsupervised image registration. Specifically, the proposed network enhances\nthe traditional pyramid network by adding an additional shared auxiliary\ndecoder for image pairs. This decoder provides multi-scale high-level feature\ninformation from unblended image pairs for the registration task. During the\nregistration process, we also design a multi-scale feature fusion block to\nextract the most beneficial features for the registration task from both global\nand local contexts. Validation results indicate that this method can capture\ncomplex deformations while achieving higher registration accuracy and\nmaintaining smooth and plausible deformations.\n","authors":["Hongchao Zhou","Shunbo Hu"],"pdf_url":"https://arxiv.org/pdf/2408.05717v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.05713v1","updated":"2024-08-11T07:46:06Z","published":"2024-08-11T07:46:06Z","title":"SSL: A Self-similarity Loss for Improving Generative Image\n Super-resolution","summary":" Generative adversarial networks (GAN) and generative diffusion models (DM)\nhave been widely used in real-world image super-resolution (Real-ISR) to\nenhance the image perceptual quality. 
However, these generative models are\nprone to generating visual artifacts and false image structures, resulting in\nunnatural Real-ISR results. Based on the fact that natural images exhibit high\nself-similarities, i.e., a local patch can have many similar patches to it in\nthe whole image, in this work we propose a simple yet effective self-similarity\nloss (SSL) to improve the performance of generative Real-ISR models, enhancing\nthe hallucination of structural and textural details while reducing the\nunpleasant visual artifacts. Specifically, we compute a self-similarity graph\n(SSG) of the ground-truth image, and enforce the SSG of Real-ISR output to be\nclose to it. To reduce the training cost and focus on edge areas, we generate\nan edge mask from the ground-truth image, and compute the SSG only on the\nmasked pixels. The proposed SSL serves as a general plug-and-play penalty,\nwhich could be easily applied to the off-the-shelf Real-ISR models. Our\nexperiments demonstrate that, by coupling with SSL, the performance of many\nstate-of-the-art Real-ISR models, including those GAN and DM based ones, can be\nlargely improved, reproducing more perceptually realistic image details and\neliminating many false reconstructions and visual artifacts. Codes and\nsupplementary material can be found at https://github.com/ChrisDud0257/SSL\n","authors":["Du Chen","Zhengqiang Zhang","Jie Liang","Lei Zhang"],"pdf_url":"https://arxiv.org/pdf/2408.05713v1.pdf","comment":"Accepted by ACM MM 2024"},{"id":"http://arxiv.org/abs/2307.09165v2","updated":"2024-08-11T07:35:40Z","published":"2023-07-18T11:43:01Z","title":"Towards Trustworthy Dataset Distillation","summary":" Efficiency and trustworthiness are two eternal pursuits when applying deep\nlearning in real-world applications. With regard to efficiency, dataset\ndistillation (DD) endeavors to reduce training costs by distilling the large\ndataset into a tiny synthetic dataset. However, existing methods merely\nconcentrate on in-distribution (InD) classification in a closed-world setting,\ndisregarding out-of-distribution (OOD) samples. On the other hand, OOD\ndetection aims to enhance models' trustworthiness, which is always\ninefficiently achieved in full-data settings. For the first time, we\nsimultaneously consider both issues and propose a novel paradigm called\nTrustworthy Dataset Distillation (TrustDD). By distilling both InD samples and\noutliers, the condensed datasets are capable of training models competent in\nboth InD classification and OOD detection. To alleviate the requirement of real\noutlier data, we further propose to corrupt InD samples to generate\npseudo-outliers, namely Pseudo-Outlier Exposure (POE). Comprehensive\nexperiments on various settings demonstrate the effectiveness of TrustDD, and\nPOE surpasses the state-of-the-art method Outlier Exposure (OE). Compared with\nthe preceding DD, TrustDD is more trustworthy and applicable to open-world\nscenarios. 
Our code is available at https://github.com/mashijie1028/TrustDD\n","authors":["Shijie Ma","Fei Zhu","Zhen Cheng","Xu-Yao Zhang"],"pdf_url":"https://arxiv.org/pdf/2307.09165v2.pdf","comment":"Accepted to Pattern Recognition 2024"},{"id":"http://arxiv.org/abs/2408.05711v1","updated":"2024-08-11T07:03:21Z","published":"2024-08-11T07:03:21Z","title":"Contrastive masked auto-encoders based self-supervised hashing for 2D\n image and 3D point cloud cross-modal retrieval","summary":" Implementing cross-modal hashing between 2D images and 3D point-cloud data is\na growing concern in real-world retrieval systems. Simply applying existing\ncross-modal approaches to this new task fails to adequately capture latent\nmulti-modal semantics and effectively bridge the modality gap between 2D and\n3D. To address these issues without relying on hand-crafted labels, we propose\ncontrastive masked autoencoders based self-supervised hashing (CMAH) for\nretrieval between images and point-cloud data. We start by contrasting 2D-3D\npairs and explicitly constraining them into a joint Hamming space. This\ncontrastive learning process ensures robust discriminability for the generated\nhash codes and effectively reduces the modality gap. Moreover, we utilize\nmulti-modal auto-encoders to enhance the model's understanding of multi-modal\nsemantics. By completing the masked image/point-cloud data modeling task, the\nmodel is encouraged to capture more localized clues. In addition, the proposed\nmulti-modal fusion block facilitates fine-grained interactions among different\nmodalities. Extensive experiments on three public datasets demonstrate that the\nproposed CMAH significantly outperforms all baseline methods.\n","authors":["Rukai Wei","Heng Cui","Yu Liu","Yufeng Hou","Yanzhao Xie","Ke Zhou"],"pdf_url":"https://arxiv.org/pdf/2408.05711v1.pdf","comment":"Accepted by ICME 2024"},{"id":"http://arxiv.org/abs/2408.05710v1","updated":"2024-08-11T07:01:39Z","published":"2024-08-11T07:01:39Z","title":"Efficient Diffusion Transformer with Step-wise Dynamic Attention\n Mediators","summary":" This paper identifies significant redundancy in the query-key interactions\nwithin self-attention mechanisms of diffusion transformer models, particularly\nduring the early stages of denoising diffusion steps. In response to this\nobservation, we present a novel diffusion transformer framework incorporating\nan additional set of mediator tokens to engage with queries and keys\nseparately. By modulating the number of mediator tokens during the denoising\ngeneration phases, our model initiates the denoising process with a precise,\nnon-ambiguous stage and gradually transitions to a phase enriched with detail.\nConcurrently, integrating mediator tokens simplifies the attention module's\ncomplexity to a linear scale, enhancing the efficiency of global attention\nprocesses. Additionally, we propose a time-step dynamic mediator token\nadjustment mechanism that further decreases the required computational FLOPs\nfor generation, simultaneously facilitating the generation of high-quality\nimages within the constraints of varied inference budgets. Extensive\nexperiments demonstrate that the proposed method can improve the generated\nimage quality while also reducing the inference cost of diffusion transformers.\nWhen integrated with the recent work SiT, our method achieves a\nstate-of-the-art FID score of 2.01. 
The source code is available at\nhttps://github.com/LeapLabTHU/Attention-Mediators.\n","authors":["Yifan Pu","Zhuofan Xia","Jiayi Guo","Dongchen Han","Qixiu Li","Duo Li","Yuhui Yuan","Ji Li","Yizeng Han","Shiji Song","Gao Huang","Xiu Li"],"pdf_url":"https://arxiv.org/pdf/2408.05710v1.pdf","comment":"ECCV 2024"}],"Information Retrieval":[{"id":"http://arxiv.org/abs/2408.05840v1","updated":"2024-08-11T18:22:12Z","published":"2024-08-11T18:22:12Z","title":"Iterative Improvement of an Additively Regularized Topic Model","summary":" Topic modelling is fundamentally a soft clustering problem (of known objects\n-- documents, over unknown clusters -- topics). That is, the task is\nincorrectly posed. In particular, the topic models are unstable and incomplete.\nAll this leads to the fact that the process of finding a good topic model\n(repeated hyperparameter selection, model training, and topic quality\nassessment) can be particularly long and labor-intensive. We aim to simplify\nthe process, to make it more deterministic and provable. To this end, we\npresent a method for iterative training of a topic model. The essence of the\nmethod is that a series of related topic models are trained so that each\nsubsequent model is at least as good as the previous one, i.e., that it retains\nall the good topics found earlier. The connection between the models is\nachieved by additive regularization. The result of this iterative training is\nthe last topic model in the series, which we call the iteratively updated\nadditively regularized topic model (ITAR). Experiments conducted on several\ncollections of natural language texts show that the proposed ITAR model\nperforms better than other popular topic models (LDA, ARTM, BERTopic), its\ntopics are diverse, and its perplexity (ability to \"explain\" the underlying\ndata) is moderate.\n","authors":["Alex Gorbulev","Vasiliy Alekseev","Konstantin Vorontsov"],"pdf_url":"https://arxiv.org/pdf/2408.05840v1.pdf","comment":"A full draft of the second version of the article"},{"id":"http://arxiv.org/abs/2408.05792v1","updated":"2024-08-11T14:47:34Z","published":"2024-08-11T14:47:34Z","title":"GraphTransfer: A Generic Feature Fusion Framework for Collaborative\n Filtering","summary":" Graph Neural Networks (GNNs) have demonstrated effectiveness in collaborative\nfiltering tasks due to their ability to extract powerful structural features.\nHowever, combining the graph features extracted from user-item interactions and\nauxiliary features extracted from user genres and item properties remains a\nchallenge. Currently available fusion methods face two major issues: 1) simple\nmethods such as concatenation and summation are generic, but not accurate in\ncapturing feature relationships; 2) task-specific methods like attention\nmechanisms and meta paths may not be suitable for general feature fusion. To\naddress these challenges, we present GraphTransfer, a simple but universal\nfeature fusion framework for GNN-based collaborative filtering. Our method\naccurately fuses different types of features by first extracting graph features\nfrom the user-item interaction graph and auxiliary features from users and\nitems using GCN. The proposed cross fusion module then effectively bridges the\nsemantic gaps between the interaction scores of different features. Theoretical\nanalysis and experiments on public datasets show that GraphTransfer outperforms\nother feature fusion methods in CF tasks. 
Additionally, we demonstrate the\nuniversality of our framework via empirical studies in three other scenarios,\nshowing that GraphTransfer leads to significant improvements in the performance\nof CF algorithms.\n","authors":["Jiafeng Xia","Dongsheng Li","Hansu Gu","Tun Lu","Ning Gu"],"pdf_url":"https://arxiv.org/pdf/2408.05792v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.05751v1","updated":"2024-08-11T11:46:21Z","published":"2024-08-11T11:46:21Z","title":"Advancing Re-Ranking with Multimodal Fusion and Target-Oriented\n Auxiliary Tasks in E-Commerce Search","summary":" In the rapidly evolving field of e-commerce, the effectiveness of search\nre-ranking models is crucial for enhancing user experience and driving\nconversion rates. Despite significant advancements in feature representation\nand model architecture, the integration of multimodal information remains\nunderexplored. This study addresses this gap by investigating the computation\nand fusion of textual and visual information in the context of re-ranking. We\npropose \\textbf{A}dvancing \\textbf{R}e-Ranking with\n\\textbf{M}ulti\\textbf{m}odal Fusion and \\textbf{T}arget-Oriented Auxiliary\nTasks (ARMMT), which integrates an attention-based multimodal fusion technique\nand an auxiliary ranking-aligned task to enhance item representation and\nimprove targeting capabilities. This method not only enriches the understanding\nof product attributes but also enables more precise and personalized\nrecommendations. Experimental evaluations on JD.com's search platform\ndemonstrate that ARMMT achieves state-of-the-art performance in multimodal\ninformation integration, evidenced by a 0.22\\% increase in the Conversion Rate\n(CVR), significantly contributing to Gross Merchandise Volume (GMV). This\npioneering approach has the potential to revolutionize e-commerce re-ranking,\nleading to elevated user satisfaction and business growth.\n","authors":["Enqiang Xu","Xinhui Li","Zhigong Zhou","Jiahao Ji","Jinyuan Zhao","Dadong Miao","Songlin Wang","Lin Liu","Sulong Xu"],"pdf_url":"https://arxiv.org/pdf/2408.05751v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.03533v2","updated":"2024-08-11T09:08:59Z","published":"2024-08-07T04:20:28Z","title":"Lifelong Personalized Low-Rank Adaptation of Large Language Models for\n Recommendation","summary":" We primarily focus on the field of large language models (LLMs) for\nrecommendation, which has been actively explored recently and poses a\nsignificant challenge in effectively enhancing recommender systems with logical\nreasoning abilities and open-world knowledge. Current mainstream efforts mainly\ncenter around injecting personalized information from recommendation models\ninto LLMs by customizing input templates or aligning representations between\nsemantic and recommendation spaces at the prediction layer. However, they face\nthree significant limitations: (1) LoRA is mostly used as a core component in\nexisting works, but personalization is not well established in LoRA parameters\nas the LoRA matrix shared by every user may not cater to different users'\ncharacteristics, leading to suboptimal performance. (2) Although lifelong\npersonalized behavior sequences are ideal for personalization, their use raises\neffectiveness and efficiency issues since LLMs require escalating training and\ninference time to extend text lengths. (3) Existing approaches aren't scalable\nfor large datasets due to training efficiency constraints. 
Thus, LLMs only see\na small fraction of the datasets (e.g., less than 10%) instead of the whole\ndatasets, limiting their exposure to the full training space. To address these\nproblems, we propose RecLoRA. This model incorporates a Personalized LoRA\nmodule that maintains independent LoRAs for different users and a Long-Short\nModality Retriever that retrieves different history lengths for different\nmodalities, significantly improving performance while adding minimal time cost.\nFurthermore, we design a Few2Many Learning Strategy, using a conventional\nrecommendation model as a lens to magnify small training spaces to full spaces.\nExtensive experiments on public datasets demonstrate the efficacy of our\nRecLoRA compared to existing baseline models.\n","authors":["Jiachen Zhu","Jianghao Lin","Xinyi Dai","Bo Chen","Rong Shan","Jieming Zhu","Ruiming Tang","Yong Yu","Weinan Zhang"],"pdf_url":"https://arxiv.org/pdf/2408.03533v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.05709v1","updated":"2024-08-11T07:00:27Z","published":"2024-08-11T07:00:27Z","title":"Moment&Cross: Next-Generation Real-Time Cross-Domain CTR Prediction for\n Live-Streaming Recommendation at Kuaishou","summary":" Kuaishou is one of the largest short-video and live-streaming platforms.\nCompared with short-video recommendation, live-streaming recommendation is\nmore complex because: (1) live streams are only temporarily alive for distribution, (2) users may\nwatch for a long time with delayed feedback, and (3) content is unpredictable and\nchanges over time. In fact, even if a user is interested in the live-streaming\nauthor, the impression may still be a negative watch (e.g., short-view < 3s) since the\nreal-time content is not attractive enough. Therefore, for live-streaming\nrecommendation, there exists a challenging task: how do we recommend a\nlive stream to users at the right moment? Additionally, our platform's major\nexposure content is short-video, and the amount of exposed short-video is\n9x that of exposed live-streaming. Thus users leave many more behaviors on\nshort-videos, which leads to a serious data imbalance problem: the\nlive-streaming data cannot fully reflect user interests. This\nraises another challenging task: how do we utilize users' short-video behaviors\nto make live-streaming recommendation better?\n","authors":["Jiangxia Cao","Shen Wang","Yue Li","Shenghui Wang","Jian Tang","Shiyao Wang","Shuang Yang","Zhaojie Liu","Guorui Zhou"],"pdf_url":"https://arxiv.org/pdf/2408.05709v1.pdf","comment":"Work in progress"},{"id":"http://arxiv.org/abs/2302.06674v4","updated":"2024-08-11T05:14:28Z","published":"2023-02-13T20:27:26Z","title":"PK-ICR: Persona-Knowledge Interactive Context Retrieval for Grounded\n Dialogue","summary":" Identifying relevant persona or knowledge for conversational systems is\ncritical to grounded dialogue response generation. However, each grounding has\nbeen mostly researched in isolation with more practical multi-context dialogue\ntasks introduced in recent works. We define Persona and Knowledge Dual Context\nIdentification as the task to identify persona and knowledge jointly for a\ngiven dialogue, which could be of elevated importance in complex multi-context\ndialogue settings. We develop a novel grounding retrieval method that utilizes\nall contexts of dialogue simultaneously. Our method requires less computational\npower via utilizing neural QA retrieval models. 
We further introduce our novel\nnull-positive rank test which measures ranking performance on semantically\ndissimilar samples (i.e. hard negatives) in relation to data augmentation.\n","authors":["Minsik Oh","Joosung Lee","Jiwei Li","Guoyin Wang"],"pdf_url":"https://arxiv.org/pdf/2302.06674v4.pdf","comment":"Accepted to EMNLP 2023 main conference (Oral). Code available at\n https://github.com/minsik-ai/PK-ICR"},{"id":"http://arxiv.org/abs/2408.05676v1","updated":"2024-08-11T02:31:13Z","published":"2024-08-11T02:31:13Z","title":"A Decoding Acceleration Framework for Industrial Deployable LLM-based\n Recommender Systems","summary":" Recently, increasing attention has been paid to LLM-based recommender\nsystems, but their deployment is still under exploration in the industry. Most\ndeployments utilize LLMs as feature enhancers, generating augmentation\nknowledge in the offline stage. However, in recommendation scenarios, involving\nnumerous users and items, even offline generation with LLMs consumes\nconsiderable time and resources. This generation inefficiency stems from the\nautoregressive nature of LLMs, and a promising direction for acceleration is\nspeculative decoding, a Draft-then-Verify paradigm that increases the number of\ngenerated tokens per decoding step. In this paper, we first identify that\nrecommendation knowledge generation is suitable for retrieval-based speculative\ndecoding. Then, we discern two characteristics: (1) extensive items and users\nin RSs bring retrieval inefficiency, and (2) RSs exhibit high diversity\ntolerance for text generated by LLMs. Based on the above insights, we propose a\nDecoding Acceleration Framework for LLM-based Recommendation (dubbed DARE),\nwith Customized Retrieval Pool to improve retrieval efficiency and Relaxed\nVerification to increase the acceptance rate of draft tokens, respectively.\nExtensive experiments demonstrate that DARE achieves a 3-5x speedup and is\ncompatible with various frameworks and backbone LLMs. DARE has also been\ndeployed to online advertising scenarios within a large-scale commercial\nenvironment, achieving a 3.45x speedup while maintaining the downstream\nperformance.\n","authors":["Yunjia Xi","Hangyu Wang","Bo Chen","Jianghao Lin","Menghui Zhu","Weiwen Liu","Ruiming Tang","Weinan Zhang","Yong Yu"],"pdf_url":"https://arxiv.org/pdf/2408.05676v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.05667v1","updated":"2024-08-11T01:14:13Z","published":"2024-08-11T01:14:13Z","title":"Utilizing Large Language Models to Optimize the Detection and\n Explainability of Phishing Websites","summary":" In this paper, we introduce PhishLang, an open-source, lightweight Large\nLanguage Model (LLM) specifically designed for phishing website detection\nthrough contextual analysis of the website. Unlike traditional heuristic or\nmachine learning models that rely on static features and struggle to adapt to\nnew threats and deep learning models that are computationally intensive, our\nmodel utilizes the advanced language processing capabilities of LLMs to learn\ngranular features that are characteristic of phishing attacks. Furthermore,\nPhishLang operates with minimal data preprocessing and offers performance\ncomparable to leading deep learning tools, while being significantly faster and\nless resource-intensive. 
Over a 3.5-month testing period, PhishLang\nsuccessfully identified approximately 26K phishing URLs, many of which were\nundetected by popular antiphishing blocklists, thus demonstrating its potential\nto aid current detection measures. We also evaluate PhishLang against several\nrealistic adversarial attacks and develop six patches that make it very robust\nagainst such threats. Furthermore, we integrate PhishLang with GPT-3.5 Turbo to\ncreate \\textit{explainable blocklisting} - warnings that provide users with\ncontextual information about different features that led to a website being\nmarked as phishing. Finally, we have open-sourced the PhishLang framework and\ndeveloped a Chromium-based browser extension and URL scanner website, which\nimplement explainable warnings for end-users.\n","authors":["Sayak Saha Roy","Shirin Nilizadeh"],"pdf_url":"https://arxiv.org/pdf/2408.05667v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.05843v1","updated":"2024-08-11T18:49:52Z","published":"2024-08-11T18:49:52Z","title":"Online Matrix Completion: A Collaborative Approach with Hott Items","summary":" We investigate the low rank matrix completion problem in an online setting\nwith ${M}$ users, ${N}$ items, ${T}$ rounds, and an unknown rank-$r$ reward\nmatrix ${R}\\in \\mathbb{R}^{{M}\\times {N}}$. This problem has been well-studied\nin the literature and has several applications in practice. In each round, we\nrecommend ${S}$ carefully chosen distinct items to every user and observe noisy\nrewards. In the regime where ${M},{N} >> {T}$, we propose two distinct\ncomputationally efficient algorithms for recommending items to users and\nanalyze them under the benign \\emph{hott items} assumption.1) First, for\n${S}=1$, under additional incoherence/smoothness assumptions on ${R}$, we\npropose the phased algorithm \\textsc{PhasedClusterElim}. Our algorithm obtains\na near-optimal per-user regret of\n$\\tilde{O}({N}{M}^{-1}(\\Delta^{-1}+\\Delta_{{hott}}^{-2}))$ where\n$\\Delta_{{hott}},\\Delta$ are problem-dependent gap parameters with\n$\\Delta_{{hott}} >> \\Delta$ almost always. 2) Second, we consider a simplified\nsetting with ${S}=r$ where we make significantly milder assumptions on ${R}$.\nHere, we introduce another phased algorithm, \\textsc{DeterminantElim}, to\nderive a regret guarantee of $\\widetilde{O}({N}{M}^{-1/r}\\Delta_{det}^{-1}))$\nwhere $\\Delta_{{det}}$ is another problem-dependent gap. Both algorithms\ncrucially use collaboration among users to jointly eliminate sub-optimal items\nfor groups of users successively in phases, but with distinctive and novel\napproaches.\n","authors":["Dheeraj Baby","Soumyabrata Pal"],"pdf_url":"https://arxiv.org/pdf/2408.05843v1.pdf","comment":"Appeared at the Forty-first International Conference on Machine\n Learning, 2024"}],"Machine Learning":[{"id":"http://arxiv.org/abs/2408.05874v1","updated":"2024-08-11T22:59:32Z","published":"2024-08-11T22:59:32Z","title":"LLM-Based Robust Product Classification in Commerce and Compliance","summary":" Product classification is a crucial task in international trade, as\ncompliance regulations are verified and taxes and duties are applied based on\nproduct categories. Manual classification of products is time-consuming and\nerror-prone, and the sheer volume of products imported and exported renders the\nmanual process infeasible. Consequently, e-commerce platforms and enterprises\ninvolved in international trade have turned to automatic product classification\nusing machine learning. 
However, current approaches do not consider the\nreal-world challenges associated with product classification, such as very\nabbreviated and incomplete product descriptions. In addition, recent\nadvancements in generative Large Language Models (LLMs) and their reasoning\ncapabilities are mainly untapped in product classification and e-commerce. In\nthis research, we explore the real-life challenges of industrial classification\nand we propose data perturbations that allow for realistic data simulation.\nFurthermore, we employ LLM-based product classification to improve the\nrobustness of the prediction in presence of incomplete data. Our research shows\nthat LLMs with in-context learning outperform the supervised approaches in the\nclean-data scenario. Additionally, we illustrate that LLMs are significantly\nmore robust than the supervised approaches when data attacks are present.\n","authors":["Sina Gholamian","Gianfranco Romani","Bartosz Rudnikowicz","Laura Skylaki"],"pdf_url":"https://arxiv.org/pdf/2408.05874v1.pdf","comment":"11 pages"},{"id":"http://arxiv.org/abs/2402.07867v2","updated":"2024-08-11T21:46:29Z","published":"2024-02-12T18:28:36Z","title":"PoisonedRAG: Knowledge Corruption Attacks to Retrieval-Augmented\n Generation of Large Language Models","summary":" Large language models (LLMs) have achieved remarkable success due to their\nexceptional generative capabilities. Despite their success, they also have\ninherent limitations such as a lack of up-to-date knowledge and hallucination.\nRetrieval-Augmented Generation (RAG) is a state-of-the-art technique to\nmitigate these limitations. The key idea of RAG is to ground the answer\ngeneration of an LLM on external knowledge retrieved from a knowledge database.\nExisting studies mainly focus on improving the accuracy or efficiency of RAG,\nleaving its security largely unexplored. We aim to bridge the gap in this work.\nWe find that the knowledge database in a RAG system introduces a new and\npractical attack surface. Based on this attack surface, we propose PoisonedRAG,\nthe first knowledge corruption attack to RAG, where an attacker could inject a\nfew malicious texts into the knowledge database of a RAG system to induce an\nLLM to generate an attacker-chosen target answer for an attacker-chosen target\nquestion. We formulate knowledge corruption attacks as an optimization problem,\nwhose solution is a set of malicious texts. Depending on the background\nknowledge (e.g., black-box and white-box settings) of an attacker on a RAG\nsystem, we propose two solutions to solve the optimization problem,\nrespectively. Our results show PoisonedRAG could achieve a 90% attack success\nrate when injecting five malicious texts for each target question into a\nknowledge database with millions of texts. We also evaluate several defenses\nand our results show they are insufficient to defend against PoisonedRAG,\nhighlighting the need for new defenses.\n","authors":["Wei Zou","Runpeng Geng","Binghui Wang","Jinyuan Jia"],"pdf_url":"https://arxiv.org/pdf/2402.07867v2.pdf","comment":"To appear in USENIX Security Symposium 2025. 
The code is available at\n https://github.com/sleeepeer/PoisonedRAG"},{"id":"http://arxiv.org/abs/2402.09558v2","updated":"2024-08-11T21:45:58Z","published":"2024-02-14T20:19:24Z","title":"Bidirectional Generative Pre-training for Improving Time Series\n Representation Learning","summary":" Learning time-series representations for discriminative tasks, such as\nclassification and regression, has been a long-standing challenge in the\nhealthcare domain. Current pre-training methods are limited in either\nunidirectional next-token prediction or randomly masked token prediction. We\npropose a novel architecture called Bidirectional Timely Generative Pre-trained\nTransformer (BiTimelyGPT), which pre-trains on biosignals and longitudinal\nclinical records by both next-token and previous-token prediction in\nalternating transformer layers. This pre-training task preserves original\ndistribution and data shapes of the time-series. Additionally, the full-rank\nforward and backward attention matrices exhibit more expressive representation\ncapabilities. Using biosignals and longitudinal clinical records, BiTimelyGPT\ndemonstrates superior performance in predicting neurological functionality,\ndisease diagnosis, and physiological signs. By visualizing the attention\nheatmap, we observe that the pre-trained BiTimelyGPT can identify\ndiscriminative segments from biosignal time-series sequences, even more so\nafter fine-tuning on the task.\n","authors":["Ziyang Song","Qincheng Lu","He Zhu","David Buckeridge","Yue Li"],"pdf_url":"https://arxiv.org/pdf/2402.09558v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.15214v4","updated":"2024-08-11T21:36:46Z","published":"2023-09-24T19:57:22Z","title":"Residual Corrective Diffusion Modeling for Km-scale Atmospheric\n Downscaling","summary":" The state of the art for physical hazard prediction from weather and climate\nrequires expensive km-scale numerical simulations driven by coarser resolution\nglobal inputs. Here, a generative diffusion architecture is explored for\ndownscaling such global inputs to km-scale, as a cost-effective machine\nlearning alternative. The model is trained to predict 2km data from a regional\nweather model over Taiwan, conditioned on a 25km global reanalysis. To address\nthe large resolution ratio, different physics involved at different scales and\nprediction of channels beyond those in the input data, we employ a two-step\napproach where a UNet predicts the mean and a corrector diffusion (CorrDiff)\nmodel predicts the residual. CorrDiff exhibits encouraging skill in bulk MAE\nand CRPS scores. The predicted spectra and distributions from CorrDiff\nfaithfully recover important power law relationships in the target data. Case\nstudies of coherent weather phenomena show that CorrDiff can help sharpen wind\nand temperature gradients that co-locate with intense rainfall in cold front,\nand can help intensify typhoons and synthesize rain band structures.\nCalibration of model uncertainty remains challenging. 
The prospect of unifying\nmethods like CorrDiff with coarser resolution global weather models implies a\npotential for global-to-regional multi-scale machine learning simulation.\n","authors":["Morteza Mardani","Noah Brenowitz","Yair Cohen","Jaideep Pathak","Chieh-Yu Chen","Cheng-Chin Liu","Arash Vahdat","Mohammad Amin Nabian","Tao Ge","Akshay Subramaniam","Karthik Kashinath","Jan Kautz","Mike Pritchard"],"pdf_url":"https://arxiv.org/pdf/2309.15214v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.05861v1","updated":"2024-08-11T21:04:14Z","published":"2024-08-11T21:04:14Z","title":"Leveraging Knowledge Graph-Based Human-Like Memory Systems to Solve\n Partially Observable Markov Decision Processes","summary":" Humans observe only part of their environment at any moment but can still\nmake complex, long-term decisions thanks to our long-term memory system. To\ntest how an AI can learn and utilize its long-term memory system, we have\ndeveloped a partially observable Markov decision processes (POMDP) environment,\nwhere the agent has to answer questions while navigating a maze. The\nenvironment is completely knowledge graph (KG) based, where the hidden states\nare dynamic KGs. A KG is both human- and machine-readable, making it easy to\nsee what the agents remember and forget. We train and compare agents with\ndifferent memory systems, to shed light on how human brains work when it comes\nto managing its own memory systems. By repurposing the given learning objective\nas learning a memory management policy, we were able to capture the most likely\nbelief state, which is not only interpretable but also reusable.\n","authors":["Taewoon Kim","Vincent François-Lavet","Michael Cochez"],"pdf_url":"https://arxiv.org/pdf/2408.05861v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.05857v1","updated":"2024-08-11T20:02:51Z","published":"2024-08-11T20:02:51Z","title":"Comparative Evaluation of Memory Technologies for Synaptic Crossbar\n Arrays- Part 2: Design Knobs and DNN Accuracy Trends","summary":" Crossbar memory arrays have been touted as the workhorse of in-memory\ncomputing (IMC)-based acceleration of Deep Neural Networks (DNNs), but the\nassociated hardware non-idealities limit their efficacy. To address this,\ncross-layer design solutions that reduce the impact of hardware non-idealities\non DNN accuracy are needed. In Part 1 of this paper, we established the\nco-optimization strategies for various memory technologies and their crossbar\narrays, and conducted a comparative technology evaluation in the context of IMC\nrobustness. In this part, we analyze various design knobs such as array size\nand bit-slice (number of bits per device) and their impact on the performance\nof 8T SRAM, ferroelectric transistor (FeFET), Resistive RAM (ReRAM) and\nspin-orbit-torque magnetic RAM (SOT-MRAM) in the context of inference accuracy\nat 7nm technology node. Further, we study the effect of circuit design\nsolutions such as Partial Wordline Activation (PWA) and custom ADC reference\nlevels that reduce the hardware non-idealities and comparatively analyze the\nresponse of each technology to such accuracy enhancing techniques. Our results\non ResNet-20 (with CIFAR-10) show that PWA increases accuracy by up to 32.56%\nwhile custom ADC reference levels yield up to 31.62% accuracy enhancement. We\nobserve that compared to the other technologies, FeFET, by virtue of its small\nlayout height and high distinguishability of its memory states, is best suited\nfor large arrays. 
For higher bit-slices and a more complex dataset (ResNet-50\nwith Cifar-100) we found that ReRAM matches the performance of FeFET.\n","authors":["Jeffry Victor","Chunguang Wang","Sumeet K. Gupta"],"pdf_url":"https://arxiv.org/pdf/2408.05857v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.05855v1","updated":"2024-08-11T19:59:08Z","published":"2024-08-11T19:59:08Z","title":"Using Retriever Augmented Large Language Models for Attack Graph\n Generation","summary":" As the complexity of modern systems increases, so does the importance of\nassessing their security posture through effective vulnerability management and\nthreat modeling techniques. One powerful tool in the arsenal of cybersecurity\nprofessionals is the attack graph, a representation of all potential attack\npaths within a system that an adversary might exploit to achieve a certain\nobjective. Traditional methods of generating attack graphs involve expert\nknowledge, manual curation, and computational algorithms that might not cover\nthe entire threat landscape due to the ever-evolving nature of vulnerabilities\nand exploits. This paper explores the approach of leveraging large language\nmodels (LLMs), such as ChatGPT, to automate the generation of attack graphs by\nintelligently chaining Common Vulnerabilities and Exposures (CVEs) based on\ntheir preconditions and effects. It also shows how to utilize LLMs to create\nattack graphs from threat reports.\n","authors":["Renascence Tarafder Prapty","Ashish Kundu","Arun Iyengar"],"pdf_url":"https://arxiv.org/pdf/2408.05855v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.05854v1","updated":"2024-08-11T19:54:50Z","published":"2024-08-11T19:54:50Z","title":"On the Robustness of Kernel Goodness-of-Fit Tests","summary":" Goodness-of-fit testing is often criticized for its lack of practical\nrelevance; since ``all models are wrong'', the null hypothesis that the data\nconform to our model is ultimately always rejected when the sample size is\nlarge enough. Despite this, probabilistic models are still used extensively,\nraising the more pertinent question of whether the model is good enough for a\nspecific task. This question can be formalized as a robust goodness-of-fit\ntesting problem by asking whether the data were generated by a distribution\ncorresponding to our model up to some mild perturbation. In this paper, we show\nthat existing kernel goodness-of-fit tests are not robust according to common\nnotions of robustness including qualitative and quantitative robustness. We\nalso show that robust techniques based on tilted kernels from the parameter\nestimation literature are not sufficient for ensuring both types of robustness\nin the context of goodness-of-fit testing. We therefore propose the first\nrobust kernel goodness-of-fit test which resolves this open problem using\nkernel Stein discrepancy balls, which encompass perturbation models such as\nHuber contamination models and density uncertainty bands.\n","authors":["Xing Liu","François-Xavier Briol"],"pdf_url":"https://arxiv.org/pdf/2408.05854v1.pdf","comment":"50 pages, 13 figures"},{"id":"http://arxiv.org/abs/2408.04607v2","updated":"2024-08-11T19:50:59Z","published":"2024-08-08T17:27:29Z","title":"Risk and cross validation in ridge regression with correlated samples","summary":" Recent years have seen substantial advances in our understanding of\nhigh-dimensional ridge regression, but existing theories assume that training\nexamples are independent. 
By leveraging recent techniques from random matrix\ntheory and free probability, we provide sharp asymptotics for the in- and\nout-of-sample risks of ridge regression when the data points have arbitrary\ncorrelations. We demonstrate that in this setting, the generalized cross\nvalidation estimator (GCV) fails to correctly predict the out-of-sample risk.\nHowever, in the case where the noise residuals have the same correlations as\nthe data points, one can modify the GCV to yield an efficiently-computable\nunbiased estimator that concentrates in the high-dimensional limit, which we\ndub CorrGCV. We further extend our asymptotic analysis to the case where the\ntest point has nontrivial correlations with the training set, a setting often\nencountered in time series forecasting. Assuming knowledge of the correlation\nstructure of the time series, this again yields an extension of the GCV\nestimator, and sharply characterizes the degree to which such test points yield\nan overly optimistic prediction of long-time risk. We validate the predictions\nof our theory across a variety of high dimensional data.\n","authors":["Alexander Atanasov","Jacob A. Zavatone-Veth","Cengiz Pehlevan"],"pdf_url":"https://arxiv.org/pdf/2408.04607v2.pdf","comment":"44 pages, 18 figures. v2: updated funding acknowledgements"},{"id":"http://arxiv.org/abs/2404.01365v3","updated":"2024-08-11T19:43:36Z","published":"2024-04-01T17:56:06Z","title":"Prompt-prompted Adaptive Structured Pruning for Efficient LLM Generation","summary":" With the development of transformer-based large language models (LLMs), they\nhave been applied to many fields due to their remarkable utility, but this\ncomes at a considerable computational cost at deployment. Fortunately, some\nmethods such as pruning or constructing a mixture of experts (MoE) aim at\nexploiting sparsity in transformer feedforward (FF) blocks to gain boosts in\nspeed and reduction in memory requirements. However, these techniques can be\nvery costly and inflexible in practice, as they often require training or are\nrestricted to specific types of architectures. To address this, we introduce\nGRIFFIN, a novel training-free and calibration-free method that selects unique\nFF experts at the sequence level for efficient generation across a plethora of\nLLMs with different non-ReLU activation functions. This is possible due to a\ncritical observation that many trained LLMs naturally produce highly structured\nFF activation patterns within a sequence, which we call flocking. Despite our\nmethod's simplicity, we show with 50% of the FF parameters, GRIFFIN maintains\nthe original model's performance with little to no degradation on a variety of\nclassification and generation tasks, all while improving latency (e.g.\n1.29$\\times$ and 1.25$\\times$ speed-ups in Gemma 7B and Llama 2 13B,\nrespectively, on an NVIDIA L40). 
Code is available at\nhttps://github.com/hdong920/GRIFFIN.\n","authors":["Harry Dong","Beidi Chen","Yuejie Chi"],"pdf_url":"https://arxiv.org/pdf/2404.01365v3.pdf","comment":"Revision 1: Updated abstract with code link; re-ran top-k + sampling\n rows in Table 4, conclusions unchanged Revision 2: Reframing and new\n experiments, conclusions unchanged"},{"id":"http://arxiv.org/abs/2408.05849v1","updated":"2024-08-11T19:39:12Z","published":"2024-08-11T19:39:12Z","title":"An End-to-End Model for Time Series Classification In the Presence of\n Missing Values","summary":" Time series classification with missing data is a prevalent issue in time\nseries analysis, as temporal data often contain missing values in practical\napplications. The traditional two-stage approach, which handles imputation and\nclassification separately, can result in sub-optimal performance as label\ninformation is not utilized in the imputation process. On the other hand, a\none-stage approach can learn features under missing information, but feature\nrepresentation is limited as imputed errors are propagated in the\nclassification process. To overcome these challenges, this study proposes an\nend-to-end neural network that unifies data imputation and representation\nlearning within a single framework, allowing the imputation process to take\nadvantage of label information. Differing from previous methods, our approach\nplaces less emphasis on the accuracy of imputation data and instead prioritizes\nclassification performance. A specifically designed multi-scale feature\nlearning module is implemented to extract useful information from the\nnoise-imputation data. The proposed model is evaluated on 68 univariate time\nseries datasets from the UCR archive, as well as a multivariate time series\ndataset with various missing data ratios and 4 real-world datasets with missing\ninformation. The results indicate that the proposed model outperforms\nstate-of-the-art approaches for incomplete time series classification,\nparticularly in scenarios with high levels of missing data.\n","authors":["Pengshuai Yao","Mengna Liu","Xu Cheng","Fan Shi","Huan Li","Xiufeng Liu","Shengyong Chen"],"pdf_url":"https://arxiv.org/pdf/2408.05849v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.00856v2","updated":"2024-08-11T18:54:17Z","published":"2024-08-01T18:10:05Z","title":"Deep Learning Approach for Changepoint Detection: Penalty Parameter\n Optimization","summary":" Changepoint detection, a technique for identifying significant shifts within\ndata sequences, is crucial in various fields such as finance, genomics,\nmedicine, etc. Dynamic programming changepoint detection algorithms are\nemployed to identify the locations of changepoints within a sequence, which\nrely on a penalty parameter to regulate the number of changepoints. To estimate\nthis penalty parameter, previous work uses simple models such as linear models\nor decision trees. 
This study introduces a novel deep learning method for\npredicting penalty parameters, leading to demonstrably improved changepoint\ndetection accuracy on large benchmark supervised labeled datasets compared to\nprevious methods.\n","authors":["Tung L Nguyen","Toby Dylan Hocking"],"pdf_url":"https://arxiv.org/pdf/2408.00856v2.pdf","comment":"13 pages, 7 figures"},{"id":"http://arxiv.org/abs/2408.05843v1","updated":"2024-08-11T18:49:52Z","published":"2024-08-11T18:49:52Z","title":"Online Matrix Completion: A Collaborative Approach with Hott Items","summary":" We investigate the low rank matrix completion problem in an online setting\nwith ${M}$ users, ${N}$ items, ${T}$ rounds, and an unknown rank-$r$ reward\nmatrix ${R}\\in \\mathbb{R}^{{M}\\times {N}}$. This problem has been well-studied\nin the literature and has several applications in practice. In each round, we\nrecommend ${S}$ carefully chosen distinct items to every user and observe noisy\nrewards. In the regime where ${M},{N} >> {T}$, we propose two distinct\ncomputationally efficient algorithms for recommending items to users and\nanalyze them under the benign \\emph{hott items} assumption.1) First, for\n${S}=1$, under additional incoherence/smoothness assumptions on ${R}$, we\npropose the phased algorithm \\textsc{PhasedClusterElim}. Our algorithm obtains\na near-optimal per-user regret of\n$\\tilde{O}({N}{M}^{-1}(\\Delta^{-1}+\\Delta_{{hott}}^{-2}))$ where\n$\\Delta_{{hott}},\\Delta$ are problem-dependent gap parameters with\n$\\Delta_{{hott}} >> \\Delta$ almost always. 2) Second, we consider a simplified\nsetting with ${S}=r$ where we make significantly milder assumptions on ${R}$.\nHere, we introduce another phased algorithm, \\textsc{DeterminantElim}, to\nderive a regret guarantee of $\\widetilde{O}({N}{M}^{-1/r}\\Delta_{det}^{-1}))$\nwhere $\\Delta_{{det}}$ is another problem-dependent gap. Both algorithms\ncrucially use collaboration among users to jointly eliminate sub-optimal items\nfor groups of users successively in phases, but with distinctive and novel\napproaches.\n","authors":["Dheeraj Baby","Soumyabrata Pal"],"pdf_url":"https://arxiv.org/pdf/2408.05843v1.pdf","comment":"Appeared at the Forty-first International Conference on Machine\n Learning, 2024"},{"id":"http://arxiv.org/abs/2405.06724v3","updated":"2024-08-11T17:54:22Z","published":"2024-05-10T09:51:06Z","title":"Boolean matrix logic programming for active learning of gene functions\n in genome-scale metabolic network models","summary":" Techniques to autonomously drive research have been prominent in\nComputational Scientific Discovery, while Synthetic Biology is a field of\nscience that focuses on designing and constructing new biological systems for\nuseful purposes. Here we seek to apply logic-based machine learning techniques\nto facilitate cellular engineering and drive biological discovery.\nComprehensive databases of metabolic processes called genome-scale metabolic\nnetwork models (GEMs) are often used to evaluate cellular engineering\nstrategies to optimise target compound production. However, predicted host\nbehaviours are not always correctly described by GEMs, often due to errors in\nthe models. The task of learning the intricate genetic interactions within GEMs\npresents computational and empirical challenges. To address these, we describe\na novel approach called Boolean Matrix Logic Programming (BMLP) by leveraging\nboolean matrices to evaluate large logic programs. 
We introduce a new system,\n$BMLP_{active}$, which efficiently explores the genomic hypothesis space by\nguiding informative experimentation through active learning. In contrast to\nsub-symbolic methods, $BMLP_{active}$ encodes a state-of-the-art GEM of a\nwidely accepted bacterial host in an interpretable and logical representation\nusing datalog logic programs. Notably, $BMLP_{active}$ can successfully learn\nthe interaction between a gene pair with fewer training examples than random\nexperimentation, overcoming the increase in experimental design space.\n$BMLP_{active}$ enables rapid optimisation of metabolic models to reliably\nengineer biological systems for producing useful compounds. It offers a\nrealistic approach to creating a self-driving lab for microbial engineering.\n","authors":["Lun Ai","Stephen H. Muggleton","Shi-Shun Liang","Geoff S. Baldwin"],"pdf_url":"https://arxiv.org/pdf/2405.06724v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.05834v1","updated":"2024-08-11T17:29:03Z","published":"2024-08-11T17:29:03Z","title":"Divide-and-Conquer Predictive Coding: a structured Bayesian inference\n algorithm","summary":" Unexpected stimuli induce \"error\" or \"surprise\" signals in the brain. The\ntheory of predictive coding promises to explain these observations in terms of\nBayesian inference by suggesting that the cortex implements variational\ninference in a probabilistic graphical model. However, when applied to machine\nlearning tasks, this family of algorithms has yet to perform on par with other\nvariational approaches in high-dimensional, structured inference problems. To\naddress this, we introduce a novel predictive coding algorithm for structured\ngenerative models, that we call divide-and-conquer predictive coding (DCPC).\nDCPC differs from other formulations of predictive coding, as it respects the\ncorrelation structure of the generative model and provably performs\nmaximum-likelihood updates of model parameters, all without sacrificing\nbiological plausibility. Empirically, DCPC achieves better numerical\nperformance than competing algorithms and provides accurate inference in a\nnumber of problems not previously addressed with predictive coding. We provide\nan open implementation of DCPC in Pyro on Github.\n","authors":["Eli Sennesh","Hao Wu","Tommaso Salvatori"],"pdf_url":"https://arxiv.org/pdf/2408.05834v1.pdf","comment":"22 pages, 5 figures, submitted to Neural Information Processing\n Systems (NeurIPS) 2024"},{"id":"http://arxiv.org/abs/2408.05822v1","updated":"2024-08-11T16:53:09Z","published":"2024-08-11T16:53:09Z","title":"Sampling Foundational Transformer: A Theoretical Perspective","summary":" The versatility of self-attention mechanism earned transformers great success\nin almost all data modalities, with limitations on the quadratic complexity and\ndifficulty of training. To apply transformers across different data modalities,\npractitioners have to make specific clever data-modality-dependent\nconstructions. In this paper, we propose Sampling Foundational Transformer\n(SFT) that can work on multiple data modalities (e.g., point cloud, graph, and\nsequence) and constraints (e.g., rotational-invariant). The existence of such\nmodel is important as contemporary foundational modeling requires operability\non multiple data sources. For efficiency on large number of tokens, our model\nrelies on our context aware sampling-without-replacement mechanism for both\nlinear asymptotic computational complexity and real inference time gain. 
For\nefficiency, we rely on our newly discovered pseudoconvex formulation of\ntransformer layer to increase model's convergence rate. As a model working on\nmultiple data modalities, SFT has achieved competitive results on many\nbenchmarks, while being faster in inference, compared to other very specialized\nmodels.\n","authors":["Viet Anh Nguyen","Minh Lenhat","Khoa Nguyen","Duong Duc Hieu","Dao Huu Hung","Truong Son Hy"],"pdf_url":"https://arxiv.org/pdf/2408.05822v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.05819v1","updated":"2024-08-11T16:46:42Z","published":"2024-08-11T16:46:42Z","title":"On the Convergence of a Federated Expectation-Maximization Algorithm","summary":" Data heterogeneity has been a long-standing bottleneck in studying the\nconvergence rates of Federated Learning algorithms. In order to better\nunderstand the issue of data heterogeneity, we study the convergence rate of\nthe Expectation-Maximization (EM) algorithm for the Federated Mixture of $K$\nLinear Regressions model. We fully characterize the convergence rate of the EM\nalgorithm under all regimes of $m/n$ where $m$ is the number of clients and $n$\nis the number of data points per client. We show that with a\nsignal-to-noise-ratio (SNR) of order $\\Omega(\\sqrt{K})$, the well-initialized\nEM algorithm converges within the minimax distance of the ground truth under\neach of the regimes. Interestingly, we identify that when $m$ grows\nexponentially in $n$, the EM algorithm only requires a constant number of\niterations to converge. We perform experiments on synthetic datasets to\nillustrate our results. Surprisingly, the results show that rather than being a\nbottleneck, data heterogeneity can accelerate the convergence of federated\nlearning algorithms.\n","authors":["Zhixu Tao","Rajita Chandak","Sanjeev Kulkarni"],"pdf_url":"https://arxiv.org/pdf/2408.05819v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.01647v2","updated":"2024-08-11T15:59:30Z","published":"2023-02-03T10:48:24Z","title":"Blockwise Self-Supervised Learning at Scale","summary":" Current state-of-the-art deep networks are all powered by backpropagation. In\nthis paper, we explore alternatives to full backpropagation in the form of\nblockwise learning rules, leveraging the latest developments in self-supervised\nlearning. We show that a blockwise pretraining procedure consisting of training\nindependently the 4 main blocks of layers of a ResNet-50 with Barlow Twins'\nloss function at each block performs almost as well as end-to-end\nbackpropagation on ImageNet: a linear probe trained on top of our blockwise\npretrained model obtains a top-1 classification accuracy of 70.48%, only 1.1%\nbelow the accuracy of an end-to-end pretrained network (71.57% accuracy). 
We\nperform extensive experiments to understand the impact of different components\nwithin our method and explore a variety of adaptations of self-supervised\nlearning to the blockwise paradigm, building an exhaustive understanding of the\ncritical avenues for scaling local learning rules to large networks, with\nimplications ranging from hardware design to neuroscience.\n","authors":["Shoaib Ahmed Siddiqui","David Krueger","Yann LeCun","Stéphane Deny"],"pdf_url":"https://arxiv.org/pdf/2302.01647v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.05807v1","updated":"2024-08-11T15:56:44Z","published":"2024-08-11T15:56:44Z","title":"Kernel Density Estimators in Large Dimensions","summary":" This paper studies Kernel density estimation for a high-dimensional\ndistribution $\\rho(x)$. Traditional approaches have focused on the limit of\nlarge number of data points $n$ and fixed dimension $d$. We analyze instead the\nregime where both the number $n$ of data points $y_i$ and their dimensionality\n$d$ grow with a fixed ratio $\\alpha=(\\log n)/d$. Our study reveals three\ndistinct statistical regimes for the kernel-based estimate of the density $\\hat\n\\rho_h^{\\mathcal {D}}(x)=\\frac{1}{n h^d}\\sum_{i=1}^n\nK\\left(\\frac{x-y_i}{h}\\right)$, depending on the bandwidth $h$: a classical\nregime for large bandwidth where the Central Limit Theorem (CLT) holds, which\nis akin to the one found in traditional approaches. Below a certain value of\nthe bandwidth, $h_{CLT}(\\alpha)$, we find that the CLT breaks down. The\nstatistics of $\\hat \\rho_h^{\\mathcal {D}}(x)$ for a fixed $x$ drawn from\n$\\rho(x)$ is given by a heavy-tailed distribution (an alpha-stable\ndistribution). In particular below a value $h_G(\\alpha)$, we find that $\\hat\n\\rho_h^{\\mathcal {D}}(x)$ is governed by extreme value statistics: only a few\npoints in the database matter and give the dominant contribution to the density\nestimator. We provide a detailed analysis for high-dimensional multivariate\nGaussian data. We show that the optimal bandwidth threshold based on\nKullback-Leibler divergence lies in the new statistical regime identified in\nthis paper. Our findings reveal limitations of classical approaches, show the\nrelevance of these new statistical regimes, and offer new insights for Kernel\ndensity estimation in high-dimensional settings.\n","authors":["Giulio Biroli","Marc Mézard"],"pdf_url":"https://arxiv.org/pdf/2408.05807v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.05804v1","updated":"2024-08-11T15:49:00Z","published":"2024-08-11T15:49:00Z","title":"A Single Goal is All You Need: Skills and Exploration Emerge from\n Contrastive RL without Rewards, Demonstrations, or Subgoals","summary":" In this paper, we present empirical evidence of skills and directed\nexploration emerging from a simple RL algorithm long before any successful\ntrials are observed. For example, in a manipulation task, the agent is given a\nsingle observation of the goal state and learns skills, first for moving its\nend-effector, then for pushing the block, and finally for picking up and\nplacing the block. These skills emerge before the agent has ever successfully\nplaced the block at the goal location and without the aid of any reward\nfunctions, demonstrations, or manually-specified distance metrics. 
Once the\nagent has learned to reach the goal state reliably, exploration is reduced.\nImplementing our method involves a simple modification of prior work and does\nnot require density estimates, ensembles, or any additional hyperparameters.\nIntuitively, the proposed method seems like it should be terrible at\nexploration, and we lack a clear theoretical understanding of why it works so\neffectively, though our experiments provide some hints.\n","authors":["Grace Liu","Michael Tang","Benjamin Eysenbach"],"pdf_url":"https://arxiv.org/pdf/2408.05804v1.pdf","comment":"Code and videos: https://graliuce.github.io/sgcrl/"},{"id":"http://arxiv.org/abs/2211.15931v3","updated":"2024-08-11T15:46:31Z","published":"2022-11-29T05:09:35Z","title":"Posterior Sampling for Continuing Environments","summary":" We develop an extension of posterior sampling for reinforcement learning\n(PSRL) that is suited for a continuing agent-environment interface and\nintegrates naturally into agent designs that scale to complex environments. The\napproach, continuing PSRL, maintains a statistically plausible model of the\nenvironment and follows a policy that maximizes expected $\\gamma$-discounted\nreturn in that model. At each time, with probability $1-\\gamma$, the model is\nreplaced by a sample from the posterior distribution over environments. For a\nchoice of discount factor that suitably depends on the horizon $T$, we\nestablish an $\\tilde{O}(\\tau S \\sqrt{A T})$ bound on the Bayesian regret, where\n$S$ is the number of environment states, $A$ is the number of actions, and\n$\\tau$ denotes the reward averaging time, which is a bound on the duration\nrequired to accurately estimate the average reward of any policy. Our work is\nthe first to formalize and rigorously analyze the resampling approach with\nrandomized exploration.\n","authors":["Wanqiao Xu","Shi Dong","Benjamin Van Roy"],"pdf_url":"https://arxiv.org/pdf/2211.15931v3.pdf","comment":"RLC 2024"},{"id":"http://arxiv.org/abs/2401.01923v3","updated":"2024-08-11T15:31:20Z","published":"2024-01-03T18:08:57Z","title":"IoT in the Era of Generative AI: Vision and Challenges","summary":" Advancements in Generative AI hold immense promise to push Internet of Things\n(IoT) to the next level. In this article, we share our vision on IoT in the era\nof Generative AI. We discuss some of the most important applications of\nGenerative AI in IoT-related domains. We also identify some of the most\ncritical challenges and discuss current gaps as well as promising opportunities\non enabling Generative AI for IoT. We hope this article can inspire new\nresearch on IoT in the era of Generative AI.\n","authors":["Xin Wang","Zhongwei Wan","Arvin Hekmati","Mingyu Zong","Samiul Alam","Mi Zhang","Bhaskar Krishnamachari"],"pdf_url":"https://arxiv.org/pdf/2401.01923v3.pdf","comment":"8 pages, 3 figures, 1 table"},{"id":"http://arxiv.org/abs/2408.05798v1","updated":"2024-08-11T15:17:11Z","published":"2024-08-11T15:17:11Z","title":"Time Makes Space: Emergence of Place Fields in Networks Encoding\n Temporally Continuous Sensory Experiences","summary":" The vertebrate hippocampus is believed to use recurrent connectivity in area\nCA3 to support episodic memory recall from partial cues. This brain area also\ncontains place cells, whose location-selective firing fields implement maps\nsupporting spatial memory. Here we show that place cells emerge in networks\ntrained to remember temporally continuous sensory episodes. 
We model CA3 as a\nrecurrent autoencoder that recalls and reconstructs sensory experiences from\nnoisy and partially occluded observations by agents traversing simulated rooms.\nThe agents move in realistic trajectories modeled from rodents and environments\nare modeled as high-dimensional sensory experience maps. Training our\nautoencoder to pattern-complete and reconstruct experiences with a constraint\non total activity causes spatially localized firing fields, i.e., place cells,\nto emerge in the encoding layer. The emergent place fields reproduce key\naspects of hippocampal phenomenology: a) remapping (maintenance of and\nreversion to distinct learned maps in different environments), implemented via\nrepositioning of experience manifolds in the network's hidden layer, b)\northogonality of spatial representations in different arenas, c) robust place\nfield emergence in differently shaped rooms, with single units showing multiple\nplace fields in large or complex spaces, and d) slow representational drift of\nplace fields. We argue that these results arise because continuous traversal of\nspace makes sensory experience temporally continuous. We make testable\npredictions: a) rapidly changing sensory context will disrupt place fields, b)\nplace fields will form even if recurrent connections are blocked, but reversion\nto previously learned representations upon remapping will be abolished, c) the\ndimension of temporally smooth experience sets the dimensionality of place\nfields, including during virtual navigation of abstract spaces.\n","authors":["Zhaoze Wang","Ronald W. Di Tullio","Spencer Rooke","Vijay Balasubramanian"],"pdf_url":"https://arxiv.org/pdf/2408.05798v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.05797v1","updated":"2024-08-11T15:12:21Z","published":"2024-08-11T15:12:21Z","title":"A Comparative Study of Convolutional and Recurrent Neural Networks for\n Storm Surge Prediction in Tampa Bay","summary":" In this paper, we compare the performance of three common deep learning\narchitectures, CNN-LSTM, LSTM, and 3D-CNN, in the context of surrogate storm\nsurge modeling. The study site for this paper is the Tampa Bay area in Florida.\nUsing high-resolution atmospheric data from the reanalysis models and\nhistorical water level data from NOAA tide stations, we trained and tested\nthese models to evaluate their performance. Our findings indicate that the\nCNN-LSTM model outperforms the other architectures, achieving a test loss of\n0.010 and an R-squared (R2) score of 0.84. The LSTM model, although it achieved\nthe lowest training loss of 0.007 and the highest training R2 of 0.88,\nexhibited poorer generalization with a test loss of 0.014 and an R2 of 0.77.\nThe 3D-CNN model showed reasonable performance with a test loss of 0.011 and an\nR2 of 0.82 but displayed instability under extreme conditions. 
A case study on\nHurricane Ian, which caused a significant negative surge of -1.5 meters in\nTampa Bay indicates the CNN-LSTM model's robustness and accuracy in extreme\nscenarios.\n","authors":["Mandana Farhang Ghahfarokhi","Seyed Hossein Sonbolestan","Mahta Zamanizadeh"],"pdf_url":"https://arxiv.org/pdf/2408.05797v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.05788v1","updated":"2024-08-11T14:33:37Z","published":"2024-08-11T14:33:37Z","title":"Continual Learning of Nonlinear Independent Representations","summary":" Identifying the causal relations between interested variables plays a pivotal\nrole in representation learning as it provides deep insights into the dataset.\nIdentifiability, as the central theme of this approach, normally hinges on\nleveraging data from multiple distributions (intervention, distribution shift,\ntime series, etc.). Despite the exciting development in this field, a practical\nbut often overlooked problem is: what if those distribution shifts happen\nsequentially? In contrast, any intelligence possesses the capacity to abstract\nand refine learned knowledge sequentially -- lifelong learning. In this paper,\nwith a particular focus on the nonlinear independent component analysis (ICA)\nframework, we move one step forward toward the question of enabling models to\nlearn meaningful (identifiable) representations in a sequential manner, termed\ncontinual causal representation learning. We theoretically demonstrate that\nmodel identifiability progresses from a subspace level to a component-wise\nlevel as the number of distributions increases. Empirically, we show that our\nmethod achieves performance comparable to nonlinear ICA methods trained jointly\non multiple offline distributions and, surprisingly, the incoming new\ndistribution does not necessarily benefit the identification of all latent\nvariables.\n","authors":["Boyang Sun","Ignavier Ng","Guangyi Chen","Yifan Shen","Qirong Ho","Kun Zhang"],"pdf_url":"https://arxiv.org/pdf/2408.05788v1.pdf","comment":"9 pages, 5 Figures"},{"id":"http://arxiv.org/abs/2408.05787v1","updated":"2024-08-11T14:33:23Z","published":"2024-08-11T14:33:23Z","title":"On zero-shot learning in neural state estimation of power distribution\n systems","summary":" This paper addresses the challenge of neural state estimation in power\ndistribution systems. We identified a research gap in the current state of the\nart, which lies in the inability of models to adapt to changes in the power\ngrid, such as loss of sensors and branch switching. Our experiments demonstrate\nthat graph neural networks are the most promising models for this use case and\nthat their performance can degrade with scale. 
We propose augmentations to\nremedy this issue and perform a comprehensive grid search of different model\nconfigurations for common zero-shot learning scenarios in neural state\nestimation.\n","authors":["Aleksandr Berezin","Stephan Balduin","Thomas Oberließen","Sebastian Peter","Eric MSP Veith"],"pdf_url":"https://arxiv.org/pdf/2408.05787v1.pdf","comment":"13 pages, 2 figures, associated source code available at\n https://gitlab.com/transense/nse-tl-paper"},{"id":"http://arxiv.org/abs/2408.05781v1","updated":"2024-08-11T14:13:22Z","published":"2024-08-11T14:13:22Z","title":"CURLing the Dream: Contrastive Representations for World Modeling in\n Reinforcement Learning","summary":" In this work, we present Curled-Dreamer, a novel reinforcement learning\nalgorithm that integrates contrastive learning into the DreamerV3 framework to\nenhance performance in visual reinforcement learning tasks. By incorporating\nthe contrastive loss from the CURL algorithm and a reconstruction loss from\nautoencoder, Curled-Dreamer achieves significant improvements in various\nDeepMind Control Suite tasks. Our extensive experiments demonstrate that\nCurled-Dreamer consistently outperforms state-of-the-art algorithms, achieving\nhigher mean and median scores across a diverse set of tasks. The results\nindicate that the proposed approach not only accelerates learning but also\nenhances the robustness of the learned policies. This work highlights the\npotential of combining different learning paradigms to achieve superior\nperformance in reinforcement learning applications.\n","authors":["Victor Augusto Kich","Jair Augusto Bottega","Raul Steinmetz","Ricardo Bedin Grando","Ayano Yorozu","Akihisa Ohya"],"pdf_url":"https://arxiv.org/pdf/2408.05781v1.pdf","comment":"Paper accepted for 24th International Conference on Control,\n Automation and Systems (ICCAS)"},{"id":"http://arxiv.org/abs/2408.05778v1","updated":"2024-08-11T14:09:40Z","published":"2024-08-11T14:09:40Z","title":"Pareto Front Shape-Agnostic Pareto Set Learning in Multi-Objective\n Optimization","summary":" Pareto set learning (PSL) is an emerging approach for acquiring the complete\nPareto set of a multi-objective optimization problem. Existing methods\nprimarily rely on the mapping of preference vectors in the objective space to\nPareto optimal solutions in the decision space. However, the sampling of\npreference vectors theoretically requires prior knowledge of the Pareto front\nshape to ensure high performance of the PSL methods. Designing a sampling\nstrategy of preference vectors is difficult since the Pareto front shape cannot\nbe known in advance. To make Pareto set learning work effectively in any Pareto\nfront shape, we propose a Pareto front shape-agnostic Pareto Set Learning\n(GPSL) that does not require the prior information about the Pareto front. The\nfundamental concept behind GPSL is to treat the learning of the Pareto set as a\ndistribution transformation problem. Specifically, GPSL can transform an\narbitrary distribution into the Pareto set distribution. We demonstrate that\ntraining a neural network by maximizing hypervolume enables the process of\ndistribution transformation. 
Our proposed method can handle any shape of the\nPareto front and learn the Pareto set without requiring prior knowledge.\nExperimental results show the high performance of our proposed method on\ndiverse test problems compared with recent Pareto set learning algorithms.\n","authors":["Rongguang Ye","Longcan Chen","Wei-Bin Kou","Jinyuan Zhang","Hisao Ishibuchi"],"pdf_url":"https://arxiv.org/pdf/2408.05778v1.pdf","comment":"7 pages"},{"id":"http://arxiv.org/abs/2408.01981v2","updated":"2024-08-11T13:00:14Z","published":"2024-08-04T10:16:11Z","title":"Multiview learning with twin parametric margin SVM","summary":" Multiview learning (MVL) seeks to leverage the benefits of diverse\nperspectives to complement each other, effectively extracting and utilizing the\nlatent information within the dataset. Several twin support vector\nmachine-based MVL (MvTSVM) models have been introduced and demonstrated\noutstanding performance in various learning tasks. However, MvTSVM-based models\nface significant challenges in the form of computational complexity due to four\nmatrix inversions, the need to reformulate optimization problems in order to\nemploy kernel-generated surfaces for handling non-linear cases, and the\nconstraint of uniform noise assumption in the training data. Particularly in\ncases where the data possesses a heteroscedastic error structure, these\nchallenges become even more pronounced. In view of the aforementioned\nchallenges, we propose multiview twin parametric margin support vector machine\n(MvTPMSVM). MvTPMSVM constructs parametric margin hyperplanes corresponding to\nboth classes, aiming to regulate and manage the impact of the heteroscedastic\nnoise structure existing within the data. The proposed MvTPMSVM model avoids\nthe explicit computation of matrix inversions in the dual formulation, leading\nto enhanced computational efficiency. We perform an extensive assessment of the\nMvTPMSVM model using benchmark datasets such as UCI, KEEL, synthetic, and\nAnimals with Attributes (AwA). Our experimental results, coupled with rigorous\nstatistical analyses, confirm the superior generalization capabilities of the\nproposed MvTPMSVM model compared to the baseline models. The source code of the\nproposed MvTPMSVM model is available at\n\\url{https://github.com/mtanveer1/MvTPMSVM}.\n","authors":["A. Quadir","M. Tanveer"],"pdf_url":"https://arxiv.org/pdf/2408.01981v2.pdf","comment":null}],"Multimedia":[{"id":"http://arxiv.org/abs/2408.05683v1","updated":"2024-08-11T03:29:27Z","published":"2024-08-11T03:29:27Z","title":"Single Image Dehazing Using Scene Depth Ordering","summary":" Images captured in hazy weather generally suffer from quality degradation,\nand many dehazing methods have been developed to solve this problem. However,\nsingle image dehazing problem is still challenging due to its ill-posed nature.\nIn this paper, we propose a depth order guided single image dehazing method,\nwhich utilizes depth order in hazy images to guide the dehazing process to\nachieve a similar depth perception in corresponding dehazing results. The\nconsistency of depth perception ensures that the regions that look farther or\ncloser in hazy images also appear farther or closer in the corresponding\ndehazing results, and thus effectively avoid the undesired visual effects. To\nachieve this goal, a simple yet effective strategy is proposed to extract the\ndepth order in hazy images, which offers a reference for depth perception in\nhazy weather. 
Additionally, a depth order embedded transformation model is\ndevised, which performs transmission estimation under the guidance of depth\norder to realize an unchanged depth order in the dehazing results. The\nextracted depth order provides a powerful global constraint for the dehazing\nprocess, which contributes to the efficient utilization of global information,\nthereby bringing an overall improvement in restoration quality. Extensive\nexperiments demonstrate that the proposed method can better recover potential\nstructure and vivid color with higher computational efficiency than the\nstate-of-the-art dehazing methods.\n","authors":["Pengyang Ling","Huaian Chen","Xiao Tan","Yimeng Shan","Yi Jin"],"pdf_url":"https://arxiv.org/pdf/2408.05683v1.pdf","comment":"14 pages, 15 figures"},{"id":"http://arxiv.org/abs/2408.05794v1","updated":"2024-08-11T14:56:06Z","published":"2024-08-11T14:56:06Z","title":"HateSieve: A Contrastive Learning Framework for Detecting and Segmenting\n Hateful Content in Multimodal Memes","summary":" Amidst the rise of Large Multimodal Models (LMMs) and their widespread\napplication in generating and interpreting complex content, the risk of\npropagating biased and harmful memes remains significant. Current safety\nmeasures often fail to detect subtly integrated hateful content within\n``Confounder Memes''. To address this, we introduce \\textsc{HateSieve}, a new\nframework designed to enhance the detection and segmentation of hateful\nelements in memes. \\textsc{HateSieve} features a novel Contrastive Meme\nGenerator that creates semantically paired memes, a customized triplet dataset\nfor contrastive learning, and an Image-Text Alignment module that produces\ncontext-aware embeddings for accurate meme segmentation. Empirical experiments\non the Hateful Meme Dataset show that \\textsc{HateSieve} not only surpasses\nexisting LMMs in performance with fewer trainable parameters but also offers a\nrobust mechanism for precisely identifying and isolating hateful content.\n\\textcolor{red}{Caution: Contains academic discussions of hate speech; viewer\ndiscretion advised.}\n","authors":["Xuanyu Su","Yansong Li","Diana Inkpen","Nathalie Japkowicz"],"pdf_url":"https://arxiv.org/pdf/2408.05794v1.pdf","comment":"8 pages overall, the accepted paper at the 3rd Workshop on Advances\n in Language and Vision Research (ALVR 2024) ACL workshops"}]},"2024-08-13T00:00:00Z":{"Computation and Language":[{"id":"http://arxiv.org/abs/2408.07065v1","updated":"2024-08-13T17:57:14Z","published":"2024-08-13T17:57:14Z","title":"Fingerspelling within Sign Language Translation","summary":" Fingerspelling poses challenges for sign language processing due to its\nhigh-frequency motion and use for open-vocabulary terms. While prior work has\nstudied fingerspelling recognition, there has been little attention to\nevaluating how well sign language translation models understand fingerspelling\nin the context of entire sentences -- and improving this capability. 
We\nmanually annotate instances of fingerspelling within FLEURS-ASL and use them to\nevaluate the effect of two simple measures to improve fingerspelling\nrecognition within American Sign Language to English translation: 1) use a\nmodel family (ByT5) with character- rather than subword-level tokenization, and\n2) mix fingerspelling recognition data into the translation training mixture.\nWe find that 1) substantially improves understanding of fingerspelling (and\ntherefore translation quality overall), but the effect of 2) is mixed.\n","authors":["Garrett Tanzer"],"pdf_url":"https://arxiv.org/pdf/2408.07065v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.07060v1","updated":"2024-08-13T17:50:28Z","published":"2024-08-13T17:50:28Z","title":"Diversity Empowers Intelligence: Integrating Expertise of Software\n Engineering Agents","summary":" Large language model (LLM) agents have shown great potential in solving\nreal-world software engineering (SWE) problems. The most advanced open-source\nSWE agent can resolve over 27% of real GitHub issues in SWE-Bench Lite.\nHowever, these sophisticated agent frameworks exhibit varying strengths,\nexcelling in certain tasks while underperforming in others. To fully harness\nthe diversity of these agents, we propose DEI (Diversity Empowered\nIntelligence), a framework that leverages their unique expertise. DEI functions\nas a meta-module atop existing SWE agent frameworks, managing agent collectives\nfor enhanced problem-solving. Experimental results show that a DEI-guided\ncommittee of agents is able to surpass the best individual agent's performance\nby a large margin. For instance, a group of open-source SWE agents, with a\nmaximum individual resolve rate of 27.3% on SWE-Bench Lite, can achieve a 34.3%\nresolve rate with DEI, making a 25% improvement and beating most closed-source\nsolutions. Our best-performing group excels with a 55% resolve rate, securing\nthe highest ranking on SWE-Bench Lite. Our findings contribute to the growing\nbody of research on collaborative AI systems and their potential to solve\ncomplex software engineering challenges.\n","authors":["Kexun Zhang","Weiran Yao","Zuxin Liu","Yihao Feng","Zhiwei Liu","Rithesh Murthy","Tian Lan","Lei Li","Renze Lou","Jiacheng Xu","Bo Pang","Yingbo Zhou","Shelby Heinecke","Silvio Savarese","Huan Wang","Caiming Xiong"],"pdf_url":"https://arxiv.org/pdf/2408.07060v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.07057v1","updated":"2024-08-13T17:49:00Z","published":"2024-08-13T17:49:00Z","title":"A Survey on Model MoErging: Recycling and Routing Among Specialized\n Experts for Collaborative Learning","summary":" The availability of performant pre-trained models has led to a proliferation\nof fine-tuned expert models that are specialized to a particular domain or\ntask. Model MoErging methods aim to recycle expert models to create an\naggregate system with improved performance or generalization. A key component\nof MoErging methods is the creation of a router that decides which expert\nmodel(s) to use for a particular input or application. The promise,\neffectiveness, and large design space of MoErging has spurred the development\nof many new methods over the past few years. This rapid pace of development has\nmade it challenging to compare different MoErging methods, which are rarely\ncompared to one another and are often validated in different experimental\nsetups. 
To remedy such gaps, we present a comprehensive survey of MoErging\nmethods that includes a novel taxonomy for cataloging key design choices and\nclarifying suitable applications for each method. Apart from surveying MoErging\nresearch, we inventory software tools and applications that make use of\nMoErging. We additionally discuss related fields of study such as model\nmerging, multitask learning, and mixture-of-experts models. Taken as a whole,\nour survey provides a unified overview of existing MoErging methods and creates\na solid foundation for future work in this burgeoning field.\n","authors":["Prateek Yadav","Colin Raffel","Mohammed Muqeeth","Lucas Caccia","Haokun Liu","Tianlong Chen","Mohit Bansal","Leshem Choshen","Alessandro Sordoni"],"pdf_url":"https://arxiv.org/pdf/2408.07057v1.pdf","comment":"26 pages"},{"id":"http://arxiv.org/abs/2408.07055v1","updated":"2024-08-13T17:46:12Z","published":"2024-08-13T17:46:12Z","title":"LongWriter: Unleashing 10,000+ Word Generation from Long Context LLMs","summary":" Current long context large language models (LLMs) can process inputs up to\n100,000 tokens, yet struggle to generate outputs exceeding even a modest length\nof 2,000 words. Through controlled experiments, we find that the model's\neffective generation length is inherently bounded by the sample it has seen\nduring supervised fine-tuning (SFT). In other words, their output limitation is\ndue to the scarcity of long-output examples in existing SFT datasets. To\naddress this, we introduce AgentWrite, an agent-based pipeline that decomposes\nultra-long generation tasks into subtasks, enabling off-the-shelf LLMs to\ngenerate coherent outputs exceeding 20,000 words. Leveraging AgentWrite, we\nconstruct LongWriter-6k, a dataset containing 6,000 SFT data with output\nlengths ranging from 2k to 32k words. By incorporating this dataset into model\ntraining, we successfully scale the output length of existing models to over\n10,000 words while maintaining output quality. We also develop LongBench-Write,\na comprehensive benchmark for evaluating ultra-long generation capabilities.\nOur 9B parameter model, further improved through DPO, achieves state-of-the-art\nperformance on this benchmark, surpassing even much larger proprietary models.\nIn general, our work demonstrates that existing long context LLM already\npossesses the potential for a larger output window--all you need is data with\nextended output during model alignment to unlock this capability. Our code &\nmodels are at: https://github.com/THUDM/LongWriter.\n","authors":["Yushi Bai","Jiajie Zhang","Xin Lv","Linzhi Zheng","Siqi Zhu","Lei Hou","Yuxiao Dong","Jie Tang","Juanzi Li"],"pdf_url":"https://arxiv.org/pdf/2408.07055v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.07052v1","updated":"2024-08-13T17:43:32Z","published":"2024-08-13T17:43:32Z","title":"The News Comment Gap and Algorithmic Agenda Setting in Online Forums","summary":" The disparity between news stories valued by journalists and those preferred\nby readers, known as the \"News Gap\", is well-documented. However, the\ndifference in expectations regarding news related user-generated content is\nless studied. Comment sections, hosted by news websites, are popular venues for\nreader engagement, yet still subject to editorial decisions. It is thus\nimportant to understand journalist vs reader comment preferences and how these\nare served by various comment ranking algorithms that represent discussions\ndifferently. 
We analyse 1.2 million comments from Austrian newspaper Der\nStandard to understand the \"News Comment Gap\" and the effects of different\nranking algorithms. We find that journalists prefer positive, timely, complex,\ndirect responses, while readers favour comments similar to article content from\nelite authors. We introduce the versatile Feature-Oriented Ranking Utility\nMetric (FORUM) to assess the impact of different ranking algorithms and find\ndramatic differences in how they prioritise the display of comments by\nsentiment, topical relevance, lexical diversity, and readability. Journalists\ncan exert substantial influence over the discourse through both curatorial and\nalgorithmic means. Understanding these choices' implications is vital in\nfostering engaging and civil discussions while aligning with journalistic\nobjectives, especially given the increasing legal scrutiny and societal\nimportance of online discourse.\n","authors":["Flora Böwing","Patrick Gildersleve"],"pdf_url":"https://arxiv.org/pdf/2408.07052v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.07045v1","updated":"2024-08-13T17:20:52Z","published":"2024-08-13T17:20:52Z","title":"TableGuard -- Securing Structured & Unstructured Data","summary":" With the increasing demand for data sharing across platforms and\norganizations, ensuring the privacy and security of sensitive information has\nbecome a critical challenge. This paper introduces \"TableGuard\". An innovative\napproach to data obfuscation tailored for relational databases. Building on the\nprinciples and techniques developed in prior work on context-sensitive\nobfuscation, TableGuard applies these methods to ensure that API calls return\nonly obfuscated data, thereby safeguarding privacy when sharing data with third\nparties. TableGuard leverages advanced context-sensitive obfuscation techniques\nto replace sensitive data elements with contextually appropriate alternatives.\nBy maintaining the relational integrity and coherence of the data, our approach\nmitigates the risks of cognitive dissonance and data leakage. We demonstrate\nthe implementation of TableGuard using a BERT based transformer model, which\nidentifies and obfuscates sensitive entities within relational tables. Our\nevaluation shows that TableGuard effectively balances privacy protection with\ndata utility, minimizing information loss while ensuring that the obfuscated\ndata remains functionally useful for downstream applications. The results\nhighlight the importance of domain-specific obfuscation strategies and the role\nof context length in preserving data integrity. The implications of this\nresearch are significant for organizations that need to share data securely\nwith external parties. TableGuard offers a robust framework for implementing\nprivacy-preserving data sharing mechanisms, thereby contributing to the broader\nfield of data privacy and security.\n","authors":["Anantha Sharma","Ajinkya Deshmukh"],"pdf_url":"https://arxiv.org/pdf/2408.07045v1.pdf","comment":"7 pages, 3 tables, 1 figure"},{"id":"http://arxiv.org/abs/2408.07003v1","updated":"2024-08-13T16:07:16Z","published":"2024-08-13T16:07:16Z","title":"Generative AI for automatic topic labelling","summary":" Topic Modeling has become a prominent tool for the study of scientific\nfields, as they allow for a large scale interpretation of research trends.\nNevertheless, the output of these models is structured as a list of keywords\nwhich requires a manual interpretation for the labelling. 
This paper proposes\nto assess the reliability of three LLMs, namely flan, GPT-4o, and GPT-4 mini\nfor topic labelling. Drawing on previous research leveraging BERTopic, we\ngenerate topics from a dataset of all the scientific articles (n=34,797)\nauthored by all biology professors in Switzerland (n=465) between 2008 and\n2020, as recorded in the Web of Science database. We assess the output of the\nthree models both quantitatively and qualitatively and find that, first, both\nGPT models are capable of accurately and precisely label topics from the\nmodels' output keywords. Second, 3-word labels are preferable to grasp the\ncomplexity of research topics.\n","authors":["Diego Kozlowski","Carolina Pradier","Pierre Benz"],"pdf_url":"https://arxiv.org/pdf/2408.07003v1.pdf","comment":"10 pages, 1 figure"},{"id":"http://arxiv.org/abs/2402.12261v4","updated":"2024-08-13T15:20:13Z","published":"2024-02-19T16:19:15Z","title":"NEO-BENCH: Evaluating Robustness of Large Language Models with\n Neologisms","summary":" The performance of Large Language Models (LLMs) degrades from the temporal\ndrift between data used for model training and newer text seen during\ninference. One understudied avenue of language change causing data drift is the\nemergence of neologisms -- new word forms -- over time. We create a diverse\nresource of recent English neologisms by using several popular collection\nmethods. We analyze temporal drift using neologisms by comparing sentences\ncontaining new words with near-identical sentences that replace neologisms with\nexisting substitute words. Model performance is nearly halved in machine\ntranslation when a single neologism is introduced in a sentence. Motivated by\nthese results, we construct a benchmark to evaluate LLMs' ability to generalize\nto neologisms with various natural language understanding tasks and model\nperplexity. Models with later knowledge cutoff dates yield lower perplexities\nand perform better in downstream tasks. LLMs are also affected differently\nbased on the linguistic origins of words, indicating that neologisms are\ncomplex for static LLMs to address. We will release our benchmark and code for\nreproducing our experiments.\n","authors":["Jonathan Zheng","Alan Ritter","Wei Xu"],"pdf_url":"https://arxiv.org/pdf/2402.12261v4.pdf","comment":"accepted to ACL 2024 main conference, 9 pages"},{"id":"http://arxiv.org/abs/2408.06273v2","updated":"2024-08-13T14:57:25Z","published":"2024-08-12T16:34:56Z","title":"FuxiTranyu: A Multilingual Large Language Model Trained with Balanced\n Data","summary":" Large language models (LLMs) have demonstrated prowess in a wide range of\ntasks. However, many LLMs exhibit significant performance discrepancies between\nhigh- and low-resource languages. To mitigate this challenge, we present\nFuxiTranyu, an open-source multilingual LLM, which is designed to satisfy the\nneed of the research community for balanced and high-performing multilingual\ncapabilities. FuxiTranyu-8B, the base model with 8 billion parameters, is\ntrained from scratch on a meticulously balanced multilingual data repository\nthat contains 600 billion tokens covering 43 natural languages and 16\nprogramming languages. In addition to the base model, we also develop two\ninstruction-tuned models: FuxiTranyu-8B-SFT that is fine-tuned on a diverse\nmultilingual instruction dataset, and FuxiTranyu-8B-DPO that is further refined\nwith DPO on a preference dataset for enhanced alignment ability. 
Extensive\nexperiments on a wide range of multilingual benchmarks demonstrate the\ncompetitive performance of FuxiTranyu against existing multilingual LLMs, e.g.,\nBLOOM-7B, PolyLM-13B, Llama-2-Chat-7B and Mistral-7B-Instruct. Interpretability\nanalyses at both the neuron and representation level suggest that FuxiTranyu is\nable to learn consistent multilingual representations across different\nlanguages. To promote further research into multilingual LLMs and their working\nmechanisms, we release both the base and instruction-tuned FuxiTranyu models\ntogether with 58 pretraining checkpoints at HuggingFace and Github.\n","authors":["Haoran Sun","Renren Jin","Shaoyang Xu","Leiyu Pan"," Supryadi","Menglong Cui","Jiangcun Du","Yikun Lei","Lei Yang","Ling Shi","Juesi Xiao","Shaolin Zhu","Deyi Xiong"],"pdf_url":"https://arxiv.org/pdf/2408.06273v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.11877v4","updated":"2024-08-13T14:38:59Z","published":"2024-05-20T08:41:15Z","title":"A Novel Cartography-Based Curriculum Learning Method Applied on RoNLI:\n The First Romanian Natural Language Inference Corpus","summary":" Natural language inference (NLI), the task of recognizing the entailment\nrelationship in sentence pairs, is an actively studied topic serving as a proxy\nfor natural language understanding. Despite the relevance of the task in\nbuilding conversational agents and improving text classification, machine\ntranslation and other NLP tasks, to the best of our knowledge, there is no\npublicly available NLI corpus for the Romanian language. To this end, we\nintroduce the first Romanian NLI corpus (RoNLI) comprising 58K training\nsentence pairs, which are obtained via distant supervision, and 6K validation\nand test sentence pairs, which are manually annotated with the correct labels.\nWe conduct experiments with multiple machine learning methods based on distant\nlearning, ranging from shallow models based on word embeddings to\ntransformer-based neural networks, to establish a set of competitive baselines.\nFurthermore, we improve on the best model by employing a new curriculum\nlearning strategy based on data cartography. Our dataset and code to reproduce\nthe baselines are available at https://github.com/Eduard6421/RONLI.\n","authors":["Eduard Poesina","Cornelia Caragea","Radu Tudor Ionescu"],"pdf_url":"https://arxiv.org/pdf/2405.11877v4.pdf","comment":"Accepted at ACL 2024 (Main)"},{"id":"http://arxiv.org/abs/2408.06931v1","updated":"2024-08-13T14:34:59Z","published":"2024-08-13T14:34:59Z","title":"The advantages of context specific language models: the case of the\n Erasmian Language Model","summary":" The current trend to improve language model performance seems to be based on\nscaling up with the number of parameters (e.g. the state of the art GPT4 model\nhas approximately 1.7 trillion parameters) or the amount of training data fed\ninto the model. However this comes at significant costs in terms of\ncomputational resources and energy costs that compromise the sustainability of\nAI solutions, as well as risk relating to privacy and misuse. In this paper we\npresent the Erasmian Language Model (ELM) a small context specific, 900 million\nparameter model, pre-trained and fine-tuned by and for Erasmus University\nRotterdam. We show how the model performs adequately in a classroom context for\nessay writing, and how it achieves superior performance in subjects that are\npart of its context. 
This has implications for a wide range of institutions and\norganizations, showing that context specific language models may be a viable\nalternative for resource constrained, privacy sensitive use cases.\n","authors":["João Gonçalves","Nick Jelicic","Michele Murgia","Evert Stamhuis"],"pdf_url":"https://arxiv.org/pdf/2408.06931v1.pdf","comment":"12 pages, 3 figures, 1 table"},{"id":"http://arxiv.org/abs/2408.06930v1","updated":"2024-08-13T14:33:32Z","published":"2024-08-13T14:33:32Z","title":"Diagnosis extraction from unstructured Dutch echocardiogram reports\n using span- and document-level characteristic classification","summary":" Clinical machine learning research and AI driven clinical decision support\nmodels rely on clinically accurate labels. Manually extracting these labels\nwith the help of clinical specialists is often time-consuming and expensive.\nThis study tests the feasibility of automatic span- and document-level\ndiagnosis extraction from unstructured Dutch echocardiogram reports.\n We included 115,692 unstructured echocardiogram reports from the UMCU a large\nuniversity hospital in the Netherlands. A randomly selected subset was manually\nannotated for the occurrence and severity of eleven commonly described cardiac\ncharacteristics. We developed and tested several automatic labelling techniques\nat both span and document levels, using weighted and macro F1-score, precision,\nand recall for performance evaluation. We compared the performance of span\nlabelling against document labelling methods, which included both direct\ndocument classifiers and indirect document classifiers that rely on span\nclassification results.\n The SpanCategorizer and MedRoBERTa.nl models outperformed all other span and\ndocument classifiers, respectively. The weighted F1-score varied between\ncharacteristics, ranging from 0.60 to 0.93 in SpanCategorizer and 0.96 to 0.98\nin MedRoBERTa.nl. Direct document classification was superior to indirect\ndocument classification using span classifiers. SetFit achieved competitive\ndocument classification performance using only 10\\% of the training data.\nUtilizing a reduced label set yielded near-perfect document classification\nresults.\n We recommend using our published SpanCategorizer and MedRoBERTa.nl models for\nspan- and document-level diagnosis extraction from Dutch echocardiography\nreports. For settings with limited training data, SetFit may be a promising\nalternative for document classification.\n","authors":["Bauke Arends","Melle Vessies","Dirk van Osch","Arco Teske","Pim van der Harst","René van Es","Bram van Es"],"pdf_url":"https://arxiv.org/pdf/2408.06930v1.pdf","comment":"28 pages, 5 figures"},{"id":"http://arxiv.org/abs/2408.06929v1","updated":"2024-08-13T14:32:43Z","published":"2024-08-13T14:32:43Z","title":"Evaluating Cultural Adaptability of a Large Language Model via\n Simulation of Synthetic Personas","summary":" The success of Large Language Models (LLMs) in multicultural environments\nhinges on their ability to understand users' diverse cultural backgrounds. We\nmeasure this capability by having an LLM simulate human profiles representing\nvarious nationalities within the scope of a questionnaire-style psychological\nexperiment. Specifically, we employ GPT-3.5 to reproduce reactions to\npersuasive news articles of 7,286 participants from 15 countries; comparing the\nresults with a dataset of real participants sharing the same demographic\ntraits. 
Our analysis shows that specifying a person's country of residence\nimproves GPT-3.5's alignment with their responses. In contrast, using native\nlanguage prompting introduces shifts that significantly reduce overall\nalignment, with some languages particularly impairing performance. These\nfindings suggest that while direct nationality information enhances the model's\ncultural adaptability, native language cues do not reliably improve simulation\nfidelity and can detract from the model's effectiveness.\n","authors":["Louis Kwok","Michal Bravansky","Lewis D. Griffin"],"pdf_url":"https://arxiv.org/pdf/2408.06929v1.pdf","comment":"18 pages, 8 figures, Published as a conference paper at COLM 2024"},{"id":"http://arxiv.org/abs/2406.12094v2","updated":"2024-08-13T14:02:13Z","published":"2024-06-17T21:15:12Z","title":"Who's asking? User personas and the mechanics of latent misalignment","summary":" Despite investments in improving model safety, studies show that misaligned\ncapabilities remain latent in safety-tuned models. In this work, we shed light\non the mechanics of this phenomenon. First, we show that even when model\ngenerations are safe, harmful content can persist in hidden representations and\ncan be extracted by decoding from earlier layers. Then, we show that whether\nthe model divulges such content depends significantly on its perception of who\nit is talking to, which we refer to as user persona. In fact, we find\nmanipulating user persona to be even more effective for eliciting harmful\ncontent than direct attempts to control model refusal. We study both natural\nlanguage prompting and activation steering as control methods and show that\nactivation steering is significantly more effective at bypassing safety\nfilters. We investigate why certain personas break model safeguards and find\nthat they enable the model to form more charitable interpretations of otherwise\ndangerous queries. Finally, we show we can predict a persona's effect on\nrefusal given only the geometry of its steering vector.\n","authors":["Asma Ghandeharioun","Ann Yuan","Marius Guerard","Emily Reif","Michael A. Lepori","Lucas Dixon"],"pdf_url":"https://arxiv.org/pdf/2406.12094v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.06904v1","updated":"2024-08-13T13:58:23Z","published":"2024-08-13T13:58:23Z","title":"Re-TASK: Revisiting LLM Tasks from Capability, Skill, and Knowledge\n Perspectives","summary":" As large language models (LLMs) continue to scale, their enhanced performance\noften proves insufficient for solving domain-specific tasks. Systematically\nanalyzing their failures and effectively enhancing their performance remain\nsignificant challenges. This paper introduces the Re-TASK framework, a novel\ntheoretical model that Revisits LLM Tasks from cApability, Skill, Knowledge\nperspectives, guided by the principles of Bloom's Taxonomy and Knowledge Space\nTheory. The Re-TASK framework provides a systematic methodology to deepen our\nunderstanding, evaluation, and enhancement of LLMs for domain-specific tasks.\nIt explores the interplay among an LLM's capabilities, the knowledge it\nprocesses, and the skills it applies, elucidating how these elements are\ninterconnected and impact task performance. Our application of the Re-TASK\nframework reveals that many failures in domain-specific tasks can be attributed\nto insufficient knowledge or inadequate skill adaptation. With this insight, we\npropose structured strategies for enhancing LLMs through targeted knowledge\ninjection and skill adaptation. 
Specifically, we identify key capability items\nassociated with tasks and employ a deliberately designed prompting strategy to\nenhance task performance, thereby reducing the need for extensive fine-tuning.\nAlternatively, we fine-tune the LLM using capability-specific instructions,\nfurther validating the efficacy of our framework. Experimental results confirm\nthe framework's effectiveness, demonstrating substantial improvements in both\nthe performance and applicability of LLMs.\n","authors":["Zhihu Wang","Shiwan Zhao","Yu Wang","Heyuan Huang","Jiaxin Shi","Sitao Xie","Zhixing Wang","Yubo Zhang","Hongyan Li","Junchi Yan"],"pdf_url":"https://arxiv.org/pdf/2408.06904v1.pdf","comment":"Work in Progress"},{"id":"http://arxiv.org/abs/2407.16205v3","updated":"2024-08-13T13:46:18Z","published":"2024-07-23T06:14:41Z","title":"Figure it Out: Analyzing-based Jailbreak Attack on Large Language Models","summary":" The rapid development of Large Language Models (LLMs) has brought remarkable\ngenerative capabilities across diverse tasks. However, despite the impressive\nachievements, these LLMs still have numerous inherent vulnerabilities,\nparticularly when faced with jailbreak attacks. By investigating jailbreak\nattacks, we can uncover hidden weaknesses in LLMs and inform the development of\nmore robust defense mechanisms to fortify their security. In this paper, we\nfurther explore the boundary of jailbreak attacks on LLMs and propose\nAnalyzing-based Jailbreak (ABJ). This effective jailbreak attack method takes\nadvantage of LLMs' growing analyzing and reasoning capability and reveals their\nunderlying vulnerabilities when facing analyzing-based tasks. We conduct a\ndetailed evaluation of ABJ across various open-source and closed-source LLMs,\nwhich achieves 94.8% attack success rate (ASR) and 1.06 attack efficiency (AE)\non GPT-4-turbo-0409, demonstrating state-of-the-art attack effectiveness and\nefficiency. Our research highlights the importance of prioritizing and\nenhancing the safety of LLMs to mitigate the risks of misuse. The code is\npublicly available at hhttps://github.com/theshi-1128/ABJ-Attack. Warning: This\npaper contains examples of LLMs that might be offensive or harmful.\n","authors":["Shi Lin","Rongchang Li","Xun Wang","Changting Lin","Wenpeng Xing","Meng Han"],"pdf_url":"https://arxiv.org/pdf/2407.16205v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.19220v5","updated":"2024-08-13T13:32:07Z","published":"2024-05-29T16:00:46Z","title":"WRDScore: New Metric for Evaluation of Natural Language Generation\n Models","summary":" Evaluating natural language generation models, particularly for method name\nprediction, poses significant challenges. A robust metric must account for the\nversatility of method naming, considering both semantic and syntactic\nvariations. Traditional overlap-based metrics, such as ROUGE, fail to capture\nthese nuances. Existing embedding-based metrics often suffer from imbalanced\nprecision and recall, lack normalized scores, or make unrealistic assumptions\nabout sequences. To address these limitations, we leverage the theory of\noptimal transport and construct WRDScore, a novel metric that strikes a balance\nbetween simplicity and effectiveness. In the WRDScore framework, we define\nprecision as the maximum degree to which the predicted sequence's tokens are\nincluded in the reference sequence, token by token. Recall is calculated as the\ntotal cost of the optimal transport plan that maps the reference sequence to\nthe predicted one. 
Finally, WRDScore is computed as the harmonic mean of\nprecision and recall, balancing these two complementary metrics. Our metric is\nlightweight, normalized, and precision-recall-oriented, avoiding unrealistic\nassumptions while aligning well with human judgments. Experiments on a\nhuman-curated dataset confirm the superiority of WRDScore over other available\ntext metrics.\n","authors":["Ravil Mussabayev"],"pdf_url":"https://arxiv.org/pdf/2405.19220v5.pdf","comment":"Accepted to IEEE Xplore"},{"id":"http://arxiv.org/abs/2310.00598v2","updated":"2024-08-13T13:19:29Z","published":"2023-10-01T07:06:17Z","title":"A Novel Computational and Modeling Foundation for Automatic Coherence\n Assessment","summary":" Coherence is an essential property of well-written texts, that refers to the\nway textual units relate to one another. In the era of generative AI, coherence\nassessment is essential for many NLP tasks; summarization, generation,\nlong-form question-answering, and more. However, in NLP {coherence} is an\nill-defined notion, not having a formal definition or evaluation metrics, that\nwould allow for large-scale automatic and systematic coherence assessment. To\nbridge this gap, in this work we employ the formal linguistic definition of\n\\citet{Reinhart:1980} of what makes a discourse coherent, consisting of three\nconditions -- {\\em cohesion, consistency} and {\\em relevance} -- and formalize\nthese conditions as respective computational tasks. We hypothesize that (i) a\nmodel trained on all of these tasks will learn the features required for\ncoherence detection, and that (ii) a joint model for all tasks will exceed the\nperformance of models trained on each task individually. On two benchmarks for\ncoherence scoring rated by humans, one containing 500 automatically-generated\nshort stories and another containing 4k real-world texts, our experiments\nconfirm that jointly training on the proposed tasks leads to better performance\non each task compared with task-specific models, and to better performance on\nassessing coherence overall, compared with strong baselines. We conclude that\nthe formal and computational setup of coherence as proposed here provides a\nsolid foundation for advanced methods of large-scale automatic assessment of\ncoherence.\n","authors":["Aviya Maimon","Reut Tsarfaty"],"pdf_url":"https://arxiv.org/pdf/2310.00598v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.06874v1","updated":"2024-08-13T13:11:53Z","published":"2024-08-13T13:11:53Z","title":"Leveraging Language Models for Emotion and Behavior Analysis in\n Education","summary":" The analysis of students' emotions and behaviors is crucial for enhancing\nlearning outcomes and personalizing educational experiences. Traditional\nmethods often rely on intrusive visual and physiological data collection,\nposing privacy concerns and scalability issues. This paper proposes a novel\nmethod leveraging large language models (LLMs) and prompt engineering to\nanalyze textual data from students. Our approach utilizes tailored prompts to\nguide LLMs in detecting emotional and engagement states, providing a\nnon-intrusive and scalable solution. We conducted experiments using Qwen,\nChatGPT, Claude2, and GPT-4, comparing our method against baseline models and\nchain-of-thought (CoT) prompting. Results demonstrate that our method\nsignificantly outperforms the baselines in both accuracy and contextual\nunderstanding. 
This study highlights the potential of LLMs combined with prompt\nengineering to offer practical and effective tools for educational emotion and\nbehavior analysis.\n","authors":["Kaito Tanaka","Benjamin Tan","Brian Wong"],"pdf_url":"https://arxiv.org/pdf/2408.06874v1.pdf","comment":"8 pages"},{"id":"http://arxiv.org/abs/2404.01713v2","updated":"2024-08-13T12:58:13Z","published":"2024-04-02T07:57:05Z","title":"Generative AI for Immersive Communication: The Next Frontier in\n Internet-of-Senses Through 6G","summary":" Over the past two decades, the Internet-of-Things (IoT) has become a\ntransformative concept, and as we approach 2030, a new paradigm known as the\nInternet of Senses (IoS) is emerging. Unlike conventional Virtual Reality (VR),\nIoS seeks to provide multi-sensory experiences, acknowledging that in our\nphysical reality, our perception extends far beyond just sight and sound; it\nencompasses a range of senses. This article explores the existing technologies\ndriving immersive multi-sensory media, delving into their capabilities and\npotential applications. This exploration includes a comparative analysis\nbetween conventional immersive media streaming and a proposed use case that\nleverages semantic communication empowered by generative Artificial\nIntelligence (AI). The focal point of this analysis is the substantial\nreduction in bandwidth consumption by 99.93% in the proposed scheme. Through\nthis comparison, we aim to underscore the practical applications of generative\nAI for immersive media. Concurrently addressing major challenges in this field,\nsuch as temporal synchronization of multiple media, ensuring high throughput,\nminimizing the End-to-End (E2E) latency, and robustness to low bandwidth while\noutlining future trajectories.\n","authors":["Nassim Sehad","Lina Bariah","Wassim Hamidouche","Hamed Hellaoui","Riku Jäntti","Mérouane Debbah"],"pdf_url":"https://arxiv.org/pdf/2404.01713v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.06571v3","updated":"2024-08-13T12:49:20Z","published":"2024-06-03T16:43:04Z","title":"SUBLLM: A Novel Efficient Architecture with Token Sequence Subsampling\n for LLM","summary":" While Large Language Models (LLMs) have achieved remarkable success in\nvarious fields, the efficiency of training and inference remains a major\nchallenge. To address this issue, we propose SUBLLM, short for\nSubsampling-Upsampling-Bypass Large Language Model, an innovative architecture\nthat extends the core decoder-only framework by incorporating subsampling,\nupsampling, and bypass modules. The subsampling modules are responsible for\nshortening the sequence, while the upsampling modules restore the sequence\nlength, and the bypass modules enhance convergence. In comparison to LLaMA, the\nproposed SUBLLM exhibits significant enhancements in both training and\ninference speeds as well as memory usage, while maintaining competitive\nfew-shot performance. During training, SUBLLM increases speeds by 26% and cuts\nmemory by 10GB per GPU. In inference, it boosts speeds by up to 37% and reduces\nmemory by 1GB per GPU. The training and inference speeds can be enhanced by 34%\nand 52% respectively when the context window is expanded to 8192. 
Our code is\navailable at https://github.com/XiaoMi/subllm.\n","authors":["Quandong Wang","Yuxuan Yuan","Xiaoyu Yang","Ruike Zhang","Kang Zhao","Wei Liu","Jian Luan","Daniel Povey","Bin Wang"],"pdf_url":"https://arxiv.org/pdf/2406.06571v3.pdf","comment":"9 pages, 3 figures, accepted by ECAI 2024"},{"id":"http://arxiv.org/abs/2408.06854v1","updated":"2024-08-13T12:31:30Z","published":"2024-08-13T12:31:30Z","title":"LoRA$^2$ : Multi-Scale Low-Rank Approximations for Fine-Tuning Large\n Language Models","summary":" Fine-tuning large language models (LLMs) with high parameter efficiency for\ndownstream tasks has become a new paradigm. Low-Rank Adaptation (LoRA)\nsignificantly reduces the number of trainable parameters for fine-tuning.\nAlthough it has demonstrated commendable performance, updating parameters\nwithin a single scale may not be the optimal choice for complex downstream\ntasks.In this paper, we extend the LoRA to multiple scales, dubbed as LoRA$^2$.\nWe first combine orthogonal projection theory to train a set of LoRAs in two\nmutually orthogonal planes. Then, we improve the importance score algorithm,\nwhich reduce parameter sensitivity score calculations by approximately 98.5\\%.\nBy pruning singular values with lower importance scores, thereby enhancing\nadaptability to various downstream tasks. Extensive experiments are conducted\non two widely used pre-trained models to validate the effectiveness of\nLoRA$^2$. Results show that it significantly reduces the number of trainable\nparameters to just 0.72\\% compared to full fine-tuning, while still delivering\nhighly impressive performance. Even when the parameters are further reduced to\n0.17M, it still achieves comparable results to the baseline with 8 times more\nparameters. Our code is available here:\nhttps://anonymous.4open.science/r/LoRA-2-5B4C\n","authors":["Jia-Chen Zhang","Yu-Jie Xiong","He-Xi Qiu","Dong-Hai Zhu","Chun-Ming Xia"],"pdf_url":"https://arxiv.org/pdf/2408.06854v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.03627v3","updated":"2024-08-13T12:27:10Z","published":"2024-07-04T04:30:04Z","title":"DSLR: Document Refinement with Sentence-Level Re-ranking and\n Reconstruction to Enhance Retrieval-Augmented Generation","summary":" Recent advancements in Large Language Models (LLMs) have significantly\nimproved their performance across various Natural Language Processing (NLP)\ntasks. However, LLMs still struggle with generating non-factual responses due\nto limitations in their parametric memory. Retrieval-Augmented Generation (RAG)\nsystems address this issue by incorporating external knowledge with a retrieval\nmodule. Despite their successes, however, current RAG systems face challenges\nwith retrieval failures and the limited ability of LLMs to filter out\nirrelevant information. Therefore, in this work, we propose DSLR (Document\nRefinement with Sentence-Level Re-ranking and Reconstruction), an unsupervised\nframework that decomposes retrieved documents into sentences, filters out\nirrelevant sentences, and reconstructs them again into coherent passages. We\nexperimentally validate DSLR on multiple open-domain QA datasets and the\nresults demonstrate that DSLR significantly enhances the RAG performance over\nconventional fixed-size passage. 
Furthermore, our DSLR enhances performance in\nspecific, yet realistic scenarios without the need for additional training,\nproviding an effective and efficient solution for refining retrieved documents\nin RAG systems.\n","authors":["Taeho Hwang","Soyeong Jeong","Sukmin Cho","SeungYoon Han","Jong C. Park"],"pdf_url":"https://arxiv.org/pdf/2407.03627v3.pdf","comment":"20 pages"},{"id":"http://arxiv.org/abs/2408.06849v1","updated":"2024-08-13T12:22:26Z","published":"2024-08-13T12:22:26Z","title":"Causal Agent based on Large Language Model","summary":" Large language models (LLMs) have achieved significant success across various\ndomains. However, the inherent complexity of causal problems and causal theory\nposes challenges in accurately describing them in natural language, making it\ndifficult for LLMs to comprehend and use them effectively. Causal methods are\nnot easily conveyed through natural language, which hinders LLMs' ability to\napply them accurately. Additionally, causal datasets are typically tabular,\nwhile LLMs excel in handling natural language data, creating a structural\nmismatch that impedes effective reasoning with tabular data. This lack of\ncausal reasoning capability limits the development of LLMs. To address these\nchallenges, we have equipped the LLM with causal tools within an agent\nframework, named the Causal Agent, enabling it to tackle causal problems. The\ncausal agent comprises tools, memory, and reasoning modules. In the tools\nmodule, the causal agent applies causal methods to align tabular data with\nnatural language. In the reasoning module, the causal agent employs the ReAct\nframework to perform reasoning through multiple iterations with the tools. In\nthe memory module, the causal agent maintains a dictionary instance where the\nkeys are unique names and the values are causal graphs. To verify the causal\nability of the causal agent, we established a benchmark consisting of four\nlevels of causal problems: variable level, edge level, causal graph level, and\ncausal effect level. We generated a test dataset of 1.3K using ChatGPT-3.5 for\nthese four levels of issues and tested the causal agent on the datasets. Our\nmethodology demonstrates remarkable efficacy on the four-level causal problems,\nwith accuracy rates all above 80%. For further insights and implementation\ndetails, our code is accessible via the GitHub repository\nhttps://github.com/Kairong-Han/Causal_Agent.\n","authors":["Kairong Han","Kun Kuang","Ziyu Zhao","Junjian Ye","Fei Wu"],"pdf_url":"https://arxiv.org/pdf/2408.06849v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.10908v3","updated":"2024-08-13T11:46:52Z","published":"2024-06-16T12:11:46Z","title":"MICL: Improving In-Context Learning through Multiple-Label Words in\n Demonstration","summary":" In-context learning (ICL) enables large language models (LLMs) to perform new\ntasks by using sample-label pairs as demonstrations. However, variations in\ndemonstrations can lead to significantly different performances. Current\nresearch mainly focuses on selecting demonstration samples, preassuming the\nclass name to be the label word when creating sample-label pairs. However, the\nchoice of label words is crucial for ICL performance. Besides, we observe that\nusing a single class name in demonstration may not yield optimal results while\nusing multiple label words in one sample-label pair can enhance ICL\nperformance. 
In this paper, we propose a comprehensive approach that organizes\nboth samples and labels in demonstrations based on LLMs' output space\ndistribution. This approach uses multiple label words in one sample-label pair\nto enhance label instruction. Evaluation results from seven classification\ndatasets show that this demonstration organization method, which incorporates\nmultiple label words to provide diverse label information, improves ICL\nperformance.\n","authors":["Zhu Zixiao","Feng Zijian","Zhou Hanzhang","Qian Junlang","Mao Kezhi"],"pdf_url":"https://arxiv.org/pdf/2406.10908v3.pdf","comment":"19 pages, 11 figures"},{"id":"http://arxiv.org/abs/2408.06816v1","updated":"2024-08-13T11:17:31Z","published":"2024-08-13T11:17:31Z","title":"MAQA: Evaluating Uncertainty Quantification in LLMs Regarding Data\n Uncertainty","summary":" Although large language models (LLMs) are capable of performing various\ntasks, they still suffer from producing plausible but incorrect responses. To\nimprove the reliability of LLMs, recent research has focused on uncertainty\nquantification to predict whether a response is correct or not. However, most\nuncertainty quantification methods have been evaluated on questions requiring a\nsingle clear answer, ignoring the existence of data uncertainty that arises\nfrom irreducible randomness. Instead, these methods only consider model\nuncertainty, which arises from a lack of knowledge. In this paper, we\ninvestigate previous uncertainty quantification methods under the presence of\ndata uncertainty. Our contributions are two-fold: 1) proposing a new\nMulti-Answer Question Answering dataset, MAQA, consisting of world knowledge,\nmathematical reasoning, and commonsense reasoning tasks to evaluate uncertainty\nquantification regarding data uncertainty, and 2) assessing 5 uncertainty\nquantification methods of diverse white- and black-box LLMs. Our findings show\nthat entropy and consistency-based methods estimate the model uncertainty well\neven under data uncertainty, while other methods for white- and black-box LLMs\nstruggle depending on the tasks. Additionally, methods designed for white-box\nLLMs suffer from overconfidence in reasoning tasks compared to simple knowledge\nqueries. We believe our observations will pave the way for future work on\nuncertainty quantification in realistic setting.\n","authors":["Yongjin Yang","Haneul Yoo","Hwaran Lee"],"pdf_url":"https://arxiv.org/pdf/2408.06816v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.06276v2","updated":"2024-08-13T11:05:10Z","published":"2024-08-12T16:39:03Z","title":"Review-driven Personalized Preference Reasoning with Large Language\n Models for Recommendation","summary":" Recent advancements in Large Language Models (LLMs) have demonstrated\nexceptional performance across a wide range of tasks, generating significant\ninterest in their application to recommendation systems. However, existing\nmethods have not fully capitalized on the potential of LLMs, often constrained\nby limited input information or failing to fully utilize their advanced\nreasoning capabilities. To address these limitations, we introduce EXP3RT, a\nnovel LLM-based recommender designed to leverage rich preference information\ncontained in user and item reviews. 
EXP3RT is basically fine-tuned through\ndistillation from a teacher LLM to perform three key tasks in order: EXP3RT\nfirst extracts and encapsulates essential subjective preferences from raw\nreviews, aggregates and summarizes them according to specific criteria to\ncreate user and item profiles. It then generates detailed step-by-step\nreasoning followed by predicted rating, i.e., reasoning-enhanced rating\nprediction, by considering both subjective and objective information from\nuser/item profiles and item descriptions. This personalized preference\nreasoning from EXP3RT enhances rating prediction accuracy and also provides\nfaithful and reasonable explanations for recommendation. Extensive experiments\nshow that EXP3RT outperforms existing methods on both rating prediction and\ncandidate item reranking for top-k recommendation, while significantly\nenhancing the explainability of recommendation systems.\n","authors":["Jieyong Kim","Hyunseo Kim","Hyunjin Cho","SeongKu Kang","Buru Chang","Jinyoung Yeo","Dongha Lee"],"pdf_url":"https://arxiv.org/pdf/2408.06276v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.17075v3","updated":"2024-08-13T10:59:17Z","published":"2024-07-24T08:04:00Z","title":"SAFETY-J: Evaluating Safety with Critique","summary":" The deployment of Large Language Models (LLMs) in content generation raises\nsignificant safety concerns, particularly regarding the transparency and\ninterpretability of content evaluations. Current methods, primarily focused on\nbinary safety classifications, lack mechanisms for detailed critique, limiting\ntheir utility for model improvement and user trust. To address these\nlimitations, we introduce SAFETY-J, a bilingual generative safety evaluator for\nEnglish and Chinese with critique-based judgment. SAFETY-J utilizes a robust\ntraining dataset that includes diverse dialogues and augmented query-response\npairs to assess safety across various scenarios comprehensively. We establish\nan automated meta-evaluation benchmark that objectively assesses the quality of\ncritiques with minimal human intervention, facilitating scalable and continuous\nimprovement. Additionally, SAFETY-J employs an iterative preference learning\ntechnique to dynamically refine safety assessments based on meta-evaluations\nand critiques. Our evaluations demonstrate that SAFETY-J provides more nuanced\nand accurate safety evaluations, thereby enhancing both critique quality and\npredictive reliability in complex content scenarios. To facilitate further\nresearch and application, we open-source SAFETY-J's training protocols,\ndatasets, and code at https://github.com/GAIR-NLP/Safety-J.\n","authors":["Yixiu Liu","Yuxiang Zheng","Shijie Xia","Jiajun Li","Yi Tu","Chaoling Song","Pengfei Liu"],"pdf_url":"https://arxiv.org/pdf/2407.17075v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.06793v1","updated":"2024-08-13T10:25:13Z","published":"2024-08-13T10:25:13Z","title":"Layerwise Recurrent Router for Mixture-of-Experts","summary":" The scaling of large language models (LLMs) has revolutionized their\ncapabilities in various tasks, yet this growth must be matched with efficient\ncomputational strategies. The Mixture-of-Experts (MoE) architecture stands out\nfor its ability to scale model size without significantly increasing training\ncosts. Despite their advantages, current MoE models often display parameter\ninefficiency. 
For instance, a pre-trained MoE-based LLM with 52 billion\nparameters might perform comparably to a standard model with 6.7 billion\nparameters. Being a crucial part of MoE, current routers in different layers\nindependently assign tokens without leveraging historical routing information,\npotentially leading to suboptimal token-expert combinations and the parameter\ninefficiency problem. To alleviate this issue, we introduce the Layerwise\nRecurrent Router for Mixture-of-Experts (RMoE). RMoE leverages a Gated\nRecurrent Unit (GRU) to establish dependencies between routing decisions across\nconsecutive layers. Such layerwise recurrence can be computed efficiently in\nparallel for input tokens and introduces negligible costs. Our extensive\nempirical evaluations demonstrate that RMoE-based language models consistently\noutperform a spectrum of baseline models. Furthermore, RMoE integrates a novel\ncomputation stage orthogonal to existing methods, allowing seamless\ncompatibility with other MoE architectures. Our analyses attribute RMoE's gains\nto its effective cross-layer information sharing, which also improves expert\nselection and diversity. Our code is at https://github.com/qiuzh20/RMoE\n","authors":["Zihan Qiu","Zeyu Huang","Shuang Cheng","Yizhi Zhou","Zili Wang","Ivan Titov","Jie Fu"],"pdf_url":"https://arxiv.org/pdf/2408.06793v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.06787v1","updated":"2024-08-13T10:15:55Z","published":"2024-08-13T10:15:55Z","title":"Unlock the Power of Frozen LLMs in Knowledge Graph Completion","summary":" Classical knowledge graph completion (KGC) methods rely solely on structural\ninformation, struggling with the inherent sparsity of knowledge graphs (KGs).\nLarge Language Models (LLMs) learn extensive knowledge from large corpora with\npowerful context modeling, which is ideal for mitigating the limitations of\nprevious methods. Directly fine-tuning LLMs offers great capability but comes\nat the cost of huge time and memory consumption, while utilizing frozen LLMs\nyields suboptimal results. In this work, we aim to leverage LLMs for KGC\neffectively and efficiently. We capture the context-aware hidden states of\nknowledge triples by employing prompts to stimulate the intermediate layers of\nLLMs. We then train a data-efficient classifier on these hidden states to\nharness the inherent capabilities of frozen LLMs in KGC. We also generate\nentity descriptions with subgraph sampling on KGs, reducing the ambiguity of\ntriplets and enriching the knowledge representation. Extensive experiments on\nstandard benchmarks showcase the efficiency and effectiveness of our approach.\nWe outperform classical KGC methods on most datasets and match the performance\nof fine-tuned LLMs. Additionally, compared to fine-tuned LLMs, we boost GPU\nmemory efficiency by \\textbf{$188\\times$} and speed up training+inference by\n\\textbf{$13.48\\times$}.\n","authors":["Bo Xue","Yi Xu","Yunchong Song","Yiming Pang","Yuyang Ren","Jiaxin Ding","Luoyi Fu","Xinbing Wang"],"pdf_url":"https://arxiv.org/pdf/2408.06787v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.03541v3","updated":"2024-08-13T10:09:32Z","published":"2024-08-07T04:38:38Z","title":"EXAONE 3.0 7.8B Instruction Tuned Language Model","summary":" We introduce the EXAONE 3.0 instruction-tuned language model, the first open\nmodel in the family of Large Language Models (LLMs) developed by LG AI\nResearch. 
Among different model sizes, we publicly release the 7.8B\ninstruction-tuned model to promote open research and innovations. Through\nextensive evaluations across a wide range of public and in-house benchmarks,\nEXAONE 3.0 demonstrates highly competitive real-world performance with\ninstruction-following capability against other state-of-the-art open models of\nsimilar size. Our comparative analysis shows that EXAONE 3.0 excels\nparticularly in Korean, while achieving compelling performance across general\ntasks and complex reasoning. With its strong real-world effectiveness and\nbilingual proficiency, we hope that EXAONE keeps contributing to advancements\nin Expert AI. Our EXAONE 3.0 instruction-tuned model is available at\nhttps://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct\n","authors":["LG AI Research"," :","Soyoung An","Kyunghoon Bae","Eunbi Choi","Stanley Jungkyu Choi","Yemuk Choi","Seokhee Hong","Yeonjung Hong","Junwon Hwang","Hyojin Jeon","Gerrard Jeongwon Jo","Hyunjik Jo","Jiyeon Jung","Yountae Jung","Euisoon Kim","Hyosang Kim","Joonkee Kim","Seonghwan Kim","Soyeon Kim","Sunkyoung Kim","Yireun Kim","Youchul Kim","Edward Hwayoung Lee","Haeju Lee","Honglak Lee","Jinsik Lee","Kyungmin Lee","Moontae Lee","Seungjun Lee","Woohyung Lim","Sangha Park","Sooyoun Park","Yongmin Park","Boseong Seo","Sihoon Yang","Heuiyeen Yeen","Kyungjae Yoo","Hyeongu Yun"],"pdf_url":"https://arxiv.org/pdf/2408.03541v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.06778v1","updated":"2024-08-13T10:04:29Z","published":"2024-08-13T10:04:29Z","title":"Fast-and-Frugal Text-Graph Transformers are Effective Link Predictors","summary":" Link prediction models can benefit from incorporating textual descriptions of\nentities and relations, enabling fully inductive learning and flexibility in\ndynamic graphs. We address the challenge of also capturing rich structured\ninformation about the local neighbourhood of entities and their relations, by\nintroducing a Transformer-based approach that effectively integrates textual\ndescriptions with graph structure, reducing the reliance on resource-intensive\ntext encoders. Our experiments on three challenging datasets show that our\nFast-and-Frugal Text-Graph (FnF-TG) Transformers achieve superior performance\ncompared to the previous state-of-the-art methods, while maintaining efficiency\nand scalability.\n","authors":["Andrei C. Coman","Christos Theodoropoulos","Marie-Francine Moens","James Henderson"],"pdf_url":"https://arxiv.org/pdf/2408.06778v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.18003v3","updated":"2024-08-13T09:55:43Z","published":"2024-07-25T12:56:22Z","title":"Keep the Cost Down: A Review on Methods to Optimize LLM's KV-Cache\n Consumption","summary":" Large Language Models (LLMs), epitomized by ChatGPT's release in late 2022,\nhave revolutionized various industries with their advanced language\ncomprehension. However, their efficiency is challenged by the Transformer\narchitecture's struggle with handling long texts. KV-Cache has emerged as a\npivotal solution to this issue, converting the time complexity of token\ngeneration from quadratic to linear, albeit with increased GPU memory overhead\nproportional to conversation length. With the development of the LLM community\nand academia, various KV-Cache compression methods have been proposed. In this\nreview, we dissect the various properties of KV-Cache and elaborate on various\nmethods currently used to optimize the KV-Cache space usage of LLMs. 
These\nmethods span the pre-training phase, deployment phase, and inference phase, and\nwe summarize the commonalities and differences among these methods.\nAdditionally, we list some metrics for evaluating the long-text capabilities of\nlarge language models, from both efficiency and capability perspectives. Our\nreview thus sheds light on the evolving landscape of LLM optimization, offering\ninsights into future advancements in this dynamic field.\n","authors":["Luohe Shi","Hongyi Zhang","Yao Yao","Zuchao Li","Hai Zhao"],"pdf_url":"https://arxiv.org/pdf/2407.18003v3.pdf","comment":"to be published in CoLM 2024"},{"id":"http://arxiv.org/abs/2406.16464v4","updated":"2024-08-13T09:52:57Z","published":"2024-06-24T09:13:42Z","title":"InterCLIP-MEP: Interactive CLIP and Memory-Enhanced Predictor for\n Multi-modal Sarcasm Detection","summary":" The prevalence of sarcasm in social media, conveyed through text-image\ncombinations, presents significant challenges for sentiment analysis and\nintention mining. Existing multi-modal sarcasm detection methods have been\nproven to overestimate performance, as they struggle to effectively capture the\nintricate sarcastic cues that arise from the interaction between an image and\ntext. To address these issues, we propose InterCLIP-MEP, a novel framework for\nmulti-modal sarcasm detection. Specifically, we introduce an Interactive CLIP\n(InterCLIP) as the backbone to extract text-image representations, enhancing\nthem by embedding cross-modality information directly within each encoder,\nthereby improving the representations to capture text-image interactions\nbetter. Furthermore, an efficient training strategy is designed to adapt\nInterCLIP for our proposed Memory-Enhanced Predictor (MEP). MEP uses a dynamic,\nfixed-length dual-channel memory to store historical knowledge of valuable test\nsamples during inference. It then leverages this memory as a non-parametric\nclassifier to derive the final prediction, offering a more robust recognition\nof multi-modal sarcasm. Experiments demonstrate that InterCLIP-MEP achieves\nstate-of-the-art performance on the MMSD2.0 benchmark, with an accuracy\nimprovement of 1.08% and an F1 score improvement of 1.51% over the previous\nbest method.\n","authors":["Junjie Chen","Hang Yu","Weidong Liu","Subin Huang","Sanmin Liu"],"pdf_url":"https://arxiv.org/pdf/2406.16464v4.pdf","comment":"9 pages, 6 figures, 3 tables; Code and data are available at\n https://github.com/CoderChen01/InterCLIP-MEP"},{"id":"http://arxiv.org/abs/2311.18711v2","updated":"2024-08-13T09:40:35Z","published":"2023-11-30T17:06:00Z","title":"Women Are Beautiful, Men Are Leaders: Gender Stereotypes in Machine\n Translation and Language Modeling","summary":" We present GEST -- a new dataset for measuring gender-stereotypical reasoning\nin masked LMs and English-to-X machine translation systems. GEST contains\nsamples that are compatible with 9 Slavic languages and English for 16 gender\nstereotypes about men and women (e.g., Women are beautiful, Men are leaders).\nThe definition of said stereotypes was informed by gender experts. We used GEST\nto evaluate 11 masked LMs and 4 machine translation systems. 
We discovered\nsignificant and consistent amounts of stereotypical reasoning in almost all the\nevaluated models and languages.\n","authors":["Matúš Pikuliak","Andrea Hrckova","Stefan Oresko","Marián Šimko"],"pdf_url":"https://arxiv.org/pdf/2311.18711v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.06755v1","updated":"2024-08-13T09:26:41Z","published":"2024-08-13T09:26:41Z","title":"Sumotosima: A Framework and Dataset for Classifying and Summarizing\n Otoscopic Images","summary":" Otoscopy is a diagnostic procedure to examine the ear canal and eardrum using\nan otoscope. It identifies conditions like infections, foreign bodies, eardrum\nperforations and ear abnormalities. We propose a novel resource-efficient deep\nlearning and transformer-based framework, Sumotosima (Summarizer for otoscopic\nimages), an end-to-end pipeline for classification followed by summarization.\nOur framework works on a combination of triplet and cross-entropy losses.\nAdditionally, we use Knowledge Enhanced Multimodal BART whose input is fused\ntextual and image embedding. The objective is to provide summaries that are\nwell-suited for patients, ensuring clarity and efficiency in understanding\notoscopic images. Given the lack of existing datasets, we have curated our own\nOCASD (Otoscopic Classification And Summary Dataset), which includes 500 images\nwith 5 unique categories annotated with their class and summaries by\nOtolaryngologists. Sumotosima achieved a result of 98.03%, which is 7.00%,\n3.10%, 3.01% higher than K-Nearest Neighbors, Random Forest and Support Vector\nMachines, respectively, in classification tasks. For summarization, Sumotosima\noutperformed GPT-4o and LLaVA by 88.53% and 107.57% in ROUGE scores,\nrespectively. We have made our code and dataset publicly available at\nhttps://github.com/anas2908/Sumotosima\n","authors":["Eram Anwarul Khan","Anas Anwarul Haq Khan"],"pdf_url":"https://arxiv.org/pdf/2408.06755v1.pdf","comment":"Work in Progress"},{"id":"http://arxiv.org/abs/2408.05517v2","updated":"2024-08-13T09:22:21Z","published":"2024-08-10T11:00:13Z","title":"SWIFT:A Scalable lightWeight Infrastructure for Fine-Tuning","summary":" Recent developments in Large Language Models (LLMs) and Multi-modal Large\nLanguage Models (MLLMs) have leveraged Attention-based Transformer architectures\nand achieved superior performance and generalization capabilities. They have\nsince covered extensive areas of traditional learning tasks. For instance,\ntext-based tasks such as text-classification and sequence-labeling, as well as\nmulti-modal tasks like Visual Question Answering (VQA) and Optical Character\nRecognition (OCR), which were previously addressed using different models, can\nnow be tackled based on one foundation model. Consequently, the training and\nlightweight fine-tuning of LLMs and MLLMs, especially those based on\nTransformer architecture, has become particularly important. In recognition of\nthese overwhelming needs, we develop SWIFT, a customizable one-stop\ninfrastructure for large models. With support for over $300+$ LLMs and $50+$\nMLLMs, SWIFT stands as the open-source framework that provides the \\textit{most\ncomprehensive support} for fine-tuning large models. In particular, it is the\nfirst training framework that provides systematic support for MLLMs. 
In\naddition to the core functionalities of fine-tuning, SWIFT also integrates\npost-training processes such as inference, evaluation, and model quantization,\nto facilitate fast adoptions of large models in various application scenarios.\nWith a systematic integration of various training techniques, SWIFT offers\nhelpful utilities such as benchmark comparisons among different training\ntechniques for large models. For fine-tuning models specialized in agent\nframework, we show that notable improvements on the ToolBench leader-board can\nbe achieved by training with customized dataset on SWIFT, with an increase of\n5.2%-21.8% in the Act.EM metric over various baseline models, a reduction in\nhallucination by 1.6%-14.1%, and an average performance improvement of 8%-17%.\n","authors":["Yuze Zhao","Jintao Huang","Jinghan Hu","Xingjun Wang","Yunlin Mao","Daoze Zhang","Zeyinzi Jiang","Zhikai Wu","Baole Ai","Ang Wang","Wenmeng Zhou","Yingda Chen"],"pdf_url":"https://arxiv.org/pdf/2408.05517v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.06737v1","updated":"2024-08-13T08:55:28Z","published":"2024-08-13T08:55:28Z","title":"Multilingual Models for Check-Worthy Social Media Posts Detection","summary":" This work presents an extensive study of transformer-based NLP models for\ndetection of social media posts that contain verifiable factual claims and\nharmful claims. The study covers various activities, including dataset\ncollection, dataset pre-processing, architecture selection, setup of settings,\nmodel training (fine-tuning), model testing, and implementation. The study\nincludes a comprehensive analysis of different models, with a special focus on\nmultilingual models where the same model is capable of processing social media\nposts in both English and in low-resource languages such as Arabic, Bulgarian,\nDutch, Polish, Czech, Slovak. The results obtained from the study were\nvalidated against state-of-the-art models, and the comparison demonstrated the\nrobustness of the proposed models. The novelty of this work lies in the\ndevelopment of multi-label multilingual classification models that can\nsimultaneously detect harmful posts and posts that contain verifiable factual\nclaims in an efficient way.\n","authors":["Sebastian Kula","Michal Gregor"],"pdf_url":"https://arxiv.org/pdf/2408.06737v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.06732v1","updated":"2024-08-13T08:47:29Z","published":"2024-08-13T08:47:29Z","title":"Exploring the anatomy of articulation rate in spontaneous English\n speech: relationships between utterance length effects and social factors","summary":" Speech rate has been shown to vary across social categories such as gender,\nage, and dialect, while also being conditioned by properties of speech\nplanning. The effect of utterance length, where speech rate is faster and less\nvariable for longer utterances, has also been shown to reduce the role of\nsocial factors once it has been accounted for, leaving unclear the relationship\nbetween social factors and speech production in conditioning speech rate.\nThrough modelling of speech rate across 13 English speech corpora, it is found\nthat utterance length has the largest effect on speech rate, though this effect\nitself varies little across corpora and speakers. While age and gender also\nmodulate speech rate, their effects are much smaller in magnitude. 
These\nfindings suggest utterance length effects may be conditioned by articulatory\nand perceptual constraints, and that social influences on speech rate should be\ninterpreted in the broader context of how speech rate variation is structured.\n","authors":["James Tanner","Morgan Sonderegger","Jane Stuart-Smith","Tyler Kendall","Jeff Mielke","Robin Dodsworth","Erik Thomas"],"pdf_url":"https://arxiv.org/pdf/2408.06732v1.pdf","comment":"Proceedings of Interspeech 2024. 5 pages, 4 figures"},{"id":"http://arxiv.org/abs/2408.06731v1","updated":"2024-08-13T08:45:34Z","published":"2024-08-13T08:45:34Z","title":"Large language models can consistently generate high-quality content for\n election disinformation operations","summary":" Advances in large language models have raised concerns about their potential\nuse in generating compelling election disinformation at scale. This study\npresents a two-part investigation into the capabilities of LLMs to automate\nstages of an election disinformation operation. First, we introduce DisElect, a\nnovel evaluation dataset designed to measure LLM compliance with instructions\nto generate content for an election disinformation operation in localised UK\ncontext, containing 2,200 malicious prompts and 50 benign prompts. Using\nDisElect, we test 13 LLMs and find that most models broadly comply with these\nrequests; we also find that the few models which refuse malicious prompts also\nrefuse benign election-related prompts, and are more likely to refuse to\ngenerate content from a right-wing perspective. Secondly, we conduct a series\nof experiments (N=2,340) to assess the \"humanness\" of LLMs: the extent to which\ndisinformation operation content generated by an LLM is able to pass as\nhuman-written. Our experiments suggest that almost all LLMs tested released\nsince 2022 produce election disinformation operation content indiscernible by\nhuman evaluators over 50% of the time. Notably, we observe that multiple models\nachieve above-human levels of humanness. Taken together, these findings suggest\nthat current LLMs can be used to generate high-quality content for election\ndisinformation operations, even in hyperlocalised scenarios, at far lower costs\nthan traditional methods, and offer researchers and policymakers an empirical\nbenchmark for the measurement and evaluation of these capabilities in current\nand future models.\n","authors":["Angus R. Williams","Liam Burke-Moore","Ryan Sze-Yin Chan","Florence E. Enock","Federico Nanni","Tvesha Sippy","Yi-Ling Chung","Evelina Gabasova","Kobi Hackenburg","Jonathan Bright"],"pdf_url":"https://arxiv.org/pdf/2408.06731v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.06725v1","updated":"2024-08-13T08:36:15Z","published":"2024-08-13T08:36:15Z","title":"Enhancing Visual Dialog State Tracking through Iterative Object-Entity\n Alignment in Multi-Round Conversations","summary":" Visual Dialog (VD) is a task where an agent answers a series of image-related\nquestions based on a multi-round dialog history. However, previous VD methods\noften treat the entire dialog history as a simple text input, disregarding the\ninherent conversational information flows at the round level. In this paper, we\nintroduce Multi-round Dialogue State Tracking model (MDST), a framework that\naddresses this limitation by leveraging the dialogue state learned from dialog\nhistory to answer questions. 
MDST captures each round of dialog history,\nconstructing internal dialogue state representations defined as 2-tuples of\nvision-language representations. These representations effectively ground the\ncurrent question, enabling the generation of accurate answers. Experimental\nresults on the VisDial v1.0 dataset demonstrate that MDST achieves a new\nstate-of-the-art performance in generative setting. Furthermore, through a\nseries of human studies, we validate the effectiveness of MDST in generating\nlong, consistent, and human-like answers while consistently answering a series\nof questions correctly.\n","authors":["Wei Pang","Ruixue Duan","Jinfu Yang","Ning Li"],"pdf_url":"https://arxiv.org/pdf/2408.06725v1.pdf","comment":"This article has been accepted in CAAI Transactions on Intelligence\n Technology! Article ID: CIT2_12370, Article DOI: 10.1049/cit2.12370"},{"id":"http://arxiv.org/abs/2310.01929v3","updated":"2024-08-13T08:11:49Z","published":"2023-10-03T10:13:36Z","title":"Navigating Cultural Chasms: Exploring and Unlocking the Cultural POV of\n Text-To-Image Models","summary":" Text-To-Image (TTI) models, such as DALL-E and StableDiffusion, have\ndemonstrated remarkable prompt-based image generation capabilities.\nMultilingual encoders may have a substantial impact on the cultural agency of\nthese models, as language is a conduit of culture. In this study, we explore\nthe cultural perception embedded in TTI models by characterizing culture across\nthree hierarchical tiers: cultural dimensions, cultural domains, and cultural\nconcepts. Based on this ontology, we derive prompt templates to unlock the\ncultural knowledge in TTI models, and propose a comprehensive suite of\nevaluation techniques, including intrinsic evaluations using the CLIP space,\nextrinsic evaluations with a Visual-Question-Answer (VQA) model and human\nassessments, to evaluate the cultural content of TTI-generated images. To\nbolster our research, we introduce the CulText2I dataset, derived from six\ndiverse TTI models and spanning ten languages. Our experiments provide insights\nregarding Do, What, Which and How research questions about the nature of\ncultural encoding in TTI models, paving the way for cross-cultural applications\nof these models.\n","authors":["Mor Ventura","Eyal Ben-David","Anna Korhonen","Roi Reichart"],"pdf_url":"https://arxiv.org/pdf/2310.01929v3.pdf","comment":"Project page: https://venturamor.github.io/CulText2IWeb/"},{"id":"http://arxiv.org/abs/2408.04840v2","updated":"2024-08-13T08:10:32Z","published":"2024-08-09T03:25:42Z","title":"mPLUG-Owl3: Towards Long Image-Sequence Understanding in Multi-Modal\n Large Language Models","summary":" Multi-modal Large Language Models (MLLMs) have demonstrated remarkable\ncapabilities in executing instructions for a variety of single-image tasks.\nDespite this progress, significant challenges remain in modeling long image\nsequences. In this work, we introduce the versatile multi-modal large language\nmodel, mPLUG-Owl3, which enhances the capability for long image-sequence\nunderstanding in scenarios that incorporate retrieved image-text knowledge,\ninterleaved image-text, and lengthy videos. Specifically, we propose novel\nhyper attention blocks to efficiently integrate vision and language into a\ncommon language-guided semantic space, thereby facilitating the processing of\nextended multi-image scenarios. 
Extensive experimental results suggest that\nmPLUG-Owl3 achieves state-of-the-art performance among models with a similar\nsize on single-image, multi-image, and video benchmarks. Moreover, we propose a\nchallenging long visual sequence evaluation named Distractor Resistance to\nassess the ability of models to maintain focus amidst distractions. Finally,\nwith the proposed architecture, mPLUG-Owl3 demonstrates outstanding performance\non ultra-long visual sequence inputs. We hope that mPLUG-Owl3 can contribute to\nthe development of more efficient and powerful multimodal large language\nmodels.\n","authors":["Jiabo Ye","Haiyang Xu","Haowei Liu","Anwen Hu","Ming Yan","Qi Qian","Ji Zhang","Fei Huang","Jingren Zhou"],"pdf_url":"https://arxiv.org/pdf/2408.04840v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.17475v2","updated":"2024-08-13T07:39:59Z","published":"2024-04-26T15:23:47Z","title":"CEval: A Benchmark for Evaluating Counterfactual Text Generation","summary":" Counterfactual text generation aims to minimally change a text, such that it\nis classified differently. Judging advancements in method development for\ncounterfactual text generation is hindered by a non-uniform usage of data sets\nand metrics in related work. We propose CEval, a benchmark for comparing\ncounterfactual text generation methods. CEval unifies counterfactual and text\nquality metrics, includes common counterfactual datasets with human\nannotations, standard baselines (MICE, GDBA, CREST) and the open-source\nlanguage model LLAMA-2. Our experiments found no perfect method for generating\ncounterfactual text. Methods that excel at counterfactual metrics often produce\nlower-quality text while LLMs with simple prompts generate high-quality text\nbut struggle with counterfactual criteria. By making CEval available as an\nopen-source Python library, we encourage the community to contribute more\nmethods and maintain consistent evaluation in future work.\n","authors":["Van Bach Nguyen","Jörg Schlötterer","Christin Seifert"],"pdf_url":"https://arxiv.org/pdf/2404.17475v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.13369v2","updated":"2024-08-13T07:35:31Z","published":"2024-03-20T08:01:33Z","title":"Clinical information extraction for Low-resource languages with Few-shot\n learning using Pre-trained language models and Prompting","summary":" Automatic extraction of medical information from clinical documents poses\nseveral challenges: high costs of required clinical expertise, limited\ninterpretability of model predictions, restricted computational resources and\nprivacy regulations. Recent advances in domain-adaptation and prompting methods\nshowed promising results with minimal training data using lightweight masked\nlanguage models, which are suited for well-established interpretability\nmethods. We are first to present a systematic evaluation of these methods in a\nlow-resource setting, by performing multi-class section classification on\nGerman doctor's letters. We conduct extensive class-wise evaluations supported\nby Shapley values, to validate the quality of our small training data set and\nto ensure the interpretability of model predictions. We demonstrate that a\nlightweight, domain-adapted pretrained model, prompted with just 20 shots,\noutperforms a traditional classification model by 30.5% accuracy. 
Our results\nserve as a process-oriented guideline for clinical information extraction\nprojects working with low-resource.\n","authors":["Phillip Richter-Pechanski","Philipp Wiesenbach","Dominic M. Schwab","Christina Kiriakou","Nicolas Geis","Christoph Dieterich","Anette Frank"],"pdf_url":"https://arxiv.org/pdf/2403.13369v2.pdf","comment":"Paper accepted for publication in the journal: Natural Language\n Engineering (Cambridge Core)"},{"id":"http://arxiv.org/abs/2402.11432v3","updated":"2024-08-13T07:16:01Z","published":"2024-02-18T02:52:54Z","title":"Can Deception Detection Go Deeper? Dataset, Evaluation, and Benchmark\n for Deception Reasoning","summary":" Deception detection has attracted increasing attention due to its importance\nin real-world scenarios. Its main goal is to detect deceptive behaviors from\nmultimodal clues such as gestures, facial expressions, prosody, etc. However,\nthese bases are usually subjective and related to personal habits. Therefore,\nwe extend deception detection to deception reasoning, further providing\nobjective evidence to support subjective judgment. Specifically, we provide\npotential lies and basic facts and then analyze why this sentence may be a lie\nby combining factual inconsistencies and intent behind them. Compared with\ndeception detection, this task is more applicable to real-world scenarios. For\nexample, in interrogation, the police should judge whether a person is lying\nbased on solid evidence. This paper presents our initial attempts at this task,\nincluding constructing a dataset and defining evaluation metrics. Meanwhile,\nthis task can serve as a benchmark for evaluating the complex reasoning\ncapability of large language models. Our code and data are provided in the\nsupplementary material.\n","authors":["Kang Chen","Zheng Lian","Haiyang Sun","Rui Liu","Jiangyan Yi","Bin Liu","Jianhua Tao"],"pdf_url":"https://arxiv.org/pdf/2402.11432v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.17328v2","updated":"2024-08-13T07:12:34Z","published":"2024-06-25T07:25:15Z","title":"Dual-Space Knowledge Distillation for Large Language Models","summary":" Knowledge distillation (KD) is known as a promising solution to compress\nlarge language models (LLMs) via transferring their knowledge to smaller\nmodels. During this process, white-box KD methods usually minimize the distance\nbetween the output distributions of the two models so that more knowledge can\nbe transferred. However, in the current white-box KD framework, the output\ndistributions are from the respective output spaces of the two models, using\ntheir own prediction heads. We argue that the space discrepancy will lead to\nlow similarity between the teacher model and the student model on both\nrepresentation and distribution levels. Furthermore, this discrepancy also\nhinders the KD process between models with different vocabularies, which is\ncommon for current LLMs. To address these issues, we propose a dual-space\nknowledge distillation (DSKD) framework that unifies the output spaces of the\ntwo models for KD. On the basis of DSKD, we further develop a cross-model\nattention mechanism, which can automatically align the representations of the\ntwo models with different vocabularies. Thus, our framework is not only\ncompatible with various distance functions for KD (e.g., KL divergence) like\nthe current framework, but also supports KD between any two LLMs regardless of\ntheir vocabularies. 
Experiments on task-agnostic instruction-following\nbenchmarks show that DSKD significantly outperforms the current white-box KD\nframework with various distance functions, and also surpasses existing KD\nmethods for LLMs with different vocabularies.\n","authors":["Songming Zhang","Xue Zhang","Zengkui Sun","Yufeng Chen","Jinan Xu"],"pdf_url":"https://arxiv.org/pdf/2406.17328v2.pdf","comment":"17 pages, 11 figures, code available at:\n https://github.com/songmzhang/DSKD"},{"id":"http://arxiv.org/abs/2408.02253v2","updated":"2024-08-13T07:01:46Z","published":"2024-08-05T05:56:37Z","title":"Advancing Post-OCR Correction: A Comparative Study of Synthetic Data","summary":" This paper explores the application of synthetic data in the post-OCR domain\non multiple fronts by conducting experiments to assess the impact of data\nvolume, augmentation, and synthetic data generation methods on model\nperformance. Furthermore, we introduce a novel algorithm that leverages\ncomputer vision feature detection algorithms to calculate glyph similarity for\nconstructing post-OCR synthetic data. Through experiments conducted across a\nvariety of languages, including several low-resource ones, we demonstrate that\nmodels like ByT5 can significantly reduce Character Error Rates (CER) without\nthe need for manually annotated data, and our proposed synthetic data\ngeneration method shows advantages over traditional methods, particularly in\nlow-resource languages.\n","authors":["Shuhao Guan","Derek Greene"],"pdf_url":"https://arxiv.org/pdf/2408.02253v2.pdf","comment":"ACL 2024 findings"},{"id":"http://arxiv.org/abs/2408.06675v1","updated":"2024-08-13T06:55:54Z","published":"2024-08-13T06:55:54Z","title":"Latin Treebanks in Review: An Evaluation of Morphological Tagging Across\n Time","summary":" Existing Latin treebanks draw from Latin's long written tradition, spanning\n17 centuries and a variety of cultures. Recent efforts have begun to harmonize\nthese treebanks' annotations to better train and evaluate morphological\ntaggers. However, the heterogeneity of these treebanks must be carefully\nconsidered to build effective and reliable data. In this work, we review\nexisting Latin treebanks to identify the texts they draw from, identify their\noverlap, and document their coverage across time and genre. We additionally\ndesign automated conversions of their morphological feature annotations into\nthe conventions of standard Latin grammar. From this, we build new time-period\ndata splits that draw from the existing treebanks which we use to perform a\nbroad cross-time analysis for POS and morphological feature tagging. We find\nthat BERT-based taggers outperform existing taggers while also being more\nrobust to cross-domain shifts.\n","authors":["Marisa Hudspeth","Brendan O'Connor","Laure Thompson"],"pdf_url":"https://arxiv.org/pdf/2408.06675v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.06673v1","updated":"2024-08-13T06:52:29Z","published":"2024-08-13T06:52:29Z","title":"Pragmatic inference of scalar implicature by LLMs","summary":" This study investigates how Large Language Models (LLMs), particularly BERT\n(Devlin et al., 2019) and GPT-2 (Radford et al., 2019), engage in pragmatic\ninference of scalar implicature, such as some. Two sets of experiments were\nconducted using cosine similarity and next sentence/token prediction as\nexperimental methods. 
The results in experiment 1 showed that, both models\ninterpret some as pragmatic implicature not all in the absence of context,\naligning with human language processing. In experiment 2, in which Question\nUnder Discussion (QUD) was presented as a contextual cue, BERT showed\nconsistent performance regardless of types of QUDs, while GPT-2 encountered\nprocessing difficulties since a certain type of QUD required pragmatic\ninference for implicature. The findings revealed that, in terms of theoretical\napproaches, BERT inherently incorporates pragmatic implicature not all within\nthe term some, adhering to Default model (Levinson, 2000). In contrast, GPT-2\nseems to encounter processing difficulties in inferring pragmatic implicature\nwithin context, consistent with Context-driven model (Sperber and Wilson,\n2002).\n","authors":["Ye-eun Cho","Seong mook Kim"],"pdf_url":"https://arxiv.org/pdf/2408.06673v1.pdf","comment":"This research was presented at the Association for Computational\n Linguistics conference, held on August 11-16"},{"id":"http://arxiv.org/abs/2408.06663v1","updated":"2024-08-13T06:28:43Z","published":"2024-08-13T06:28:43Z","title":"Amuro & Char: Analyzing the Relationship between Pre-Training and\n Fine-Tuning of Large Language Models","summary":" The development of large language models leads to the formation of a\npre-train-then-align paradigm, in which the model is typically pre-trained on a\nlarge text corpus and undergoes a tuning stage to align the model with human\npreference or downstream tasks. In this work, we investigate the relationship\nbetween pre-training and fine-tuning by fine-tuning multiple intermediate\npre-trained model checkpoints. Our results on 18 datasets suggest that i)\ncontinual pre-training improves the model in a latent way that unveils after\nfine-tuning; ii) with extra fine-tuning, the datasets that the model does not\ndemonstrate capability gain much more than those that the model performs well\nduring the pre-training stage; iii) although model benefits significantly\nthrough supervised fine-tuning, it may forget previously known domain knowledge\nand the tasks that are not seen during fine-tuning; iv) the model resembles\nhigh sensitivity to evaluation prompts after supervised fine-tuning, but this\nsensitivity can be alleviated by more pre-training.\n","authors":["Kaiser Sun","Mark Dredze"],"pdf_url":"https://arxiv.org/pdf/2408.06663v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.05338v2","updated":"2024-08-13T05:48:31Z","published":"2023-10-09T01:52:27Z","title":"Negative Object Presence Evaluation (NOPE) to Measure Object\n Hallucination in Vision-Language Models","summary":" Object hallucination poses a significant challenge in vision-language (VL)\nmodels, often leading to the generation of nonsensical or unfaithful responses\nwith non-existent objects. However, the absence of a general measurement for\nevaluating object hallucination in VL models has hindered our understanding and\nability to mitigate this issue. In this work, we present NOPE (Negative Object\nPresence Evaluation), a novel benchmark designed to assess object hallucination\nin VL models through visual question answering (VQA). 
We propose a\ncost-effective and scalable approach utilizing large language models to\ngenerate 29.5k synthetic negative pronoun (NegP) data of high quality for NOPE.\nWe extensively investigate the performance of 10 state-of-the-art VL models in\ndiscerning the non-existence of objects in visual questions, where the ground\ntruth answers are denoted as NegP (e.g., \"none\"). Additionally, we evaluate\ntheir standard performance on visual questions on 9 other VQA datasets. Through\nour experiments, we demonstrate that no VL model is immune to the vulnerability\nof object hallucination, as all models achieve accuracy below 10\\% on NegP.\nFurthermore, we uncover that lexically diverse visual questions, question types\nwith large scopes, and scene-relevant objects capitalize the risk of object\nhallucination in VL models.\n","authors":["Holy Lovenia","Wenliang Dai","Samuel Cahyawijaya","Ziwei Ji","Pascale Fung"],"pdf_url":"https://arxiv.org/pdf/2310.05338v2.pdf","comment":"Published in ALVR Workshop at ACL 2024"},{"id":"http://arxiv.org/abs/2408.06634v1","updated":"2024-08-13T04:53:31Z","published":"2024-08-13T04:53:31Z","title":"Harnessing Earnings Reports for Stock Predictions: A QLoRA-Enhanced LLM\n Approach","summary":" Accurate stock market predictions following earnings reports are crucial for\ninvestors. Traditional methods, particularly classical machine learning models,\nstruggle with these predictions because they cannot effectively process and\ninterpret extensive textual data contained in earnings reports and often\noverlook nuances that influence market movements. This paper introduces an\nadvanced approach by employing Large Language Models (LLMs) instruction\nfine-tuned with a novel combination of instruction-based techniques and\nquantized low-rank adaptation (QLoRA) compression. Our methodology integrates\n'base factors', such as financial metric growth and earnings transcripts, with\n'external factors', including recent market indices performances and analyst\ngrades, to create a rich, supervised dataset. This comprehensive dataset\nenables our models to achieve superior predictive performance in terms of\naccuracy, weighted F1, and Matthews correlation coefficient (MCC), especially\nevident in the comparison with benchmarks such as GPT-4. We specifically\nhighlight the efficacy of the llama-3-8b-Instruct-4bit model, which showcases\nsignificant improvements over baseline models. The paper also discusses the\npotential of expanding the output capabilities to include a 'Hold' option and\nextending the prediction horizon, aiming to accommodate various investment\nstyles and time frames. This study not only demonstrates the power of\nintegrating cutting-edge AI with fine-tuned financial data but also paves the\nway for future research in enhancing AI-driven financial analysis tools.\n","authors":["Haowei Ni","Shuchen Meng","Xupeng Chen","Ziqing Zhao","Andi Chen","Panfeng Li","Shiyao Zhang","Qifu Yin","Yuanqing Wang","Yuxi Chan"],"pdf_url":"https://arxiv.org/pdf/2408.06634v1.pdf","comment":"Accepted by 2024 6th International Conference on Data-driven\n Optimization of Complex Systems"},{"id":"http://arxiv.org/abs/2401.08315v2","updated":"2024-08-13T04:50:43Z","published":"2024-01-16T12:30:56Z","title":"Application of LLM Agents in Recruitment: A Novel Framework for Resume\n Screening","summary":" The automation of resume screening is a crucial aspect of the recruitment\nprocess in organizations. 
Automated resume screening systems often encompass a\nrange of natural language processing (NLP) tasks. This paper introduces a novel\nLarge Language Models (LLMs) based agent framework for resume screening, aimed\nat enhancing efficiency and time management in recruitment processes. Our\nframework is distinct in its ability to efficiently summarize and grade each\nresume from a large dataset. Moreover, it utilizes LLM agents for\ndecision-making. To evaluate our framework, we constructed a dataset from\nactual resumes and simulated a resume screening process. Subsequently, the\noutcomes of the simulation experiment were compared and subjected to detailed\nanalysis. The results demonstrate that our automated resume screening framework\nis 11 times faster than traditional manual methods. Furthermore, by fine-tuning\nthe LLMs, we observed a significant improvement in the F1 score, reaching\n87.73\\%, during the resume sentence classification phase. In the resume\nsummarization and grading phase, our fine-tuned model surpassed the baseline\nperformance of the GPT-3.5 model. Analysis of the decision-making efficacy of\nthe LLM agents in the final offer stage further underscores the potential of\nLLM agents in transforming resume screening processes.\n","authors":["Chengguang Gan","Qinghao Zhang","Tatsunori Mori"],"pdf_url":"https://arxiv.org/pdf/2401.08315v2.pdf","comment":"Accept by Journal of Information Processing,(2024), 18 pages, 19\n figures"},{"id":"http://arxiv.org/abs/2408.06632v1","updated":"2024-08-13T04:40:56Z","published":"2024-08-13T04:40:56Z","title":"EditScribe: Non-Visual Image Editing with Natural Language Verification\n Loops","summary":" Image editing is an iterative process that requires precise visual evaluation\nand manipulation for the output to match the editing intent. However, current\nimage editing tools do not provide accessible interaction nor sufficient\nfeedback for blind and low vision individuals to achieve this level of control.\nTo address this, we developed EditScribe, a prototype system that makes image\nediting accessible using natural language verification loops powered by large\nmultimodal models. Using EditScribe, the user first comprehends the image\ncontent through initial general and object descriptions, then specifies edit\nactions using open-ended natural language prompts. EditScribe performs the\nimage edit, and provides four types of verification feedback for the user to\nverify the performed edit, including a summary of visual changes, AI judgement,\nand updated general and object descriptions. The user can ask follow-up\nquestions to clarify and probe into the edits or verification feedback, before\nperforming another edit. In a study with ten blind or low-vision users, we\nfound that EditScribe supported participants to perform and verify image edit\nactions non-visually. We observed different prompting strategies from\nparticipants, and their perceptions on the various types of verification\nfeedback. 
Finally, we discuss the implications of leveraging natural language\nverification loops to make visual authoring non-visually accessible.\n","authors":["Ruei-Che Chang","Yuxuan Liu","Lotus Zhang","Anhong Guo"],"pdf_url":"https://arxiv.org/pdf/2408.06632v1.pdf","comment":"ASSETS 2024"},{"id":"http://arxiv.org/abs/2408.06631v1","updated":"2024-08-13T04:36:18Z","published":"2024-08-13T04:36:18Z","title":"IFShip: A Large Vision-Language Model for Interpretable Fine-grained\n Ship Classification via Domain Knowledge-Enhanced Instruction Tuning","summary":" End-to-end interpretation is currently the prevailing paradigm for remote\nsensing fine-grained ship classification (RS-FGSC) task. However, its inference\nprocess is uninterpretable, leading to criticism as a black box model. To\naddress this issue, we propose a large vision-language model (LVLM) named\nIFShip for interpretable fine-grained ship classification. Unlike traditional\nmethods, IFShip excels in interpretability by accurately conveying the\nreasoning process of FGSC in natural language. Specifically, we first design a\ndomain knowledge-enhanced Chain-of-Thought (COT) prompt generation mechanism.\nThis mechanism is used to semi-automatically construct a task-specific\ninstruction-following dataset named TITANIC-FGS, which emulates human-like\nlogical decision-making. We then train the IFShip model using task instructions\ntuned with the TITANIC-FGS dataset. Building on IFShip, we develop an FGSC\nvisual chatbot that redefines the FGSC problem as a step-by-step reasoning task\nand conveys the reasoning process in natural language. Experimental results\nreveal that the proposed method surpasses state-of-the-art FGSC algorithms in\nboth classification interpretability and accuracy. Moreover, compared to LVLMs\nlike LLaVA and MiniGPT-4, our approach demonstrates superior expertise in the\nFGSC task. It provides an accurate chain of reasoning when fine-grained ship\ntypes are recognizable to the human eye and offers interpretable explanations\nwhen they are not.\n","authors":["Mingning Guo","Mengwei Wu","Yuxiang Shen","Haifeng Li","Chao Tao"],"pdf_url":"https://arxiv.org/pdf/2408.06631v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.06627v1","updated":"2024-08-13T04:32:45Z","published":"2024-08-13T04:32:45Z","title":"WorldScribe: Towards Context-Aware Live Visual Descriptions","summary":" Automated live visual descriptions can aid blind people in understanding\ntheir surroundings with autonomy and independence. However, providing\ndescriptions that are rich, contextual, and just-in-time has been a\nlong-standing challenge in accessibility. In this work, we develop WorldScribe,\na system that generates automated live real-world visual descriptions that are\ncustomizable and adaptive to users' contexts: (i) WorldScribe's descriptions\nare tailored to users' intents and prioritized based on semantic relevance.\n(ii) WorldScribe is adaptive to visual contexts, e.g., providing consecutively\nsuccinct descriptions for dynamic scenes, while presenting longer and detailed\nones for stable settings. (iii) WorldScribe is adaptive to sound contexts,\ne.g., increasing volume in noisy environments, or pausing when conversations\nstart. Powered by a suite of vision, language, and sound recognition models,\nWorldScribe introduces a description generation pipeline that balances the\ntradeoffs between their richness and latency to support real-time use. 
The\ndesign of WorldScribe is informed by prior work on providing visual\ndescriptions and a formative study with blind participants. Our user study and\nsubsequent pipeline evaluation show that WorldScribe can provide real-time and\nfairly accurate visual descriptions to facilitate environment understanding\nthat is adaptive and customized to users' contexts. Finally, we discuss the\nimplications and further steps toward making live visual descriptions more\ncontext-aware and humanized.\n","authors":["Ruei-Che Chang","Yuxuan Liu","Anhong Guo"],"pdf_url":"https://arxiv.org/pdf/2408.06627v1.pdf","comment":"UIST 2024"},{"id":"http://arxiv.org/abs/2408.06621v1","updated":"2024-08-13T04:18:32Z","published":"2024-08-13T04:18:32Z","title":"Towards Robust and Cost-Efficient Knowledge Unlearning for Large\n Language Models","summary":" Large Language Models (LLMs) have demonstrated strong reasoning and\nmemorization capabilities via pretraining on massive textual corpora. However,\ntraining LLMs on human-written text entails significant risk of privacy and\ncopyright violations, which demands an efficient machine unlearning framework\nto remove knowledge of sensitive data without retraining the model from\nscratch. While Gradient Ascent (GA) is widely used for unlearning by reducing\nthe likelihood of generating unwanted information, the unboundedness of\nincreasing the cross-entropy loss causes not only unstable optimization, but\nalso catastrophic forgetting of knowledge that needs to be retained. We also\ndiscover its joint application under low-rank adaptation results in\nsignificantly suboptimal computational cost vs. generative performance\ntrade-offs. In light of this limitation, we propose two novel techniques for\nrobust and cost-efficient unlearning on LLMs. We first design an Inverted Hinge\nloss that suppresses unwanted tokens by increasing the probability of the next\nmost likely token, thereby retaining fluency and structure in language\ngeneration. We also propose to initialize low-rank adapter weights based on\nFisher-weighted low-rank approximation, which induces faster unlearning and\nbetter knowledge retention by allowing model updates to be focused on\nparameters that are important in generating textual data we wish to remove.\n","authors":["Sungmin Cha","Sungjun Cho","Dasol Hwang","Moontae Lee"],"pdf_url":"https://arxiv.org/pdf/2408.06621v1.pdf","comment":"Preprint"},{"id":"http://arxiv.org/abs/2408.06618v1","updated":"2024-08-13T04:06:45Z","published":"2024-08-13T04:06:45Z","title":"Generalized knowledge-enhanced framework for biomedical entity and\n relation extraction","summary":" In recent years, there has been an increasing number of frameworks developed\nfor biomedical entity and relation extraction. This research effort aims to\naddress the accelerating growth in biomedical publications and the intricate\nnature of biomedical texts, which are written for mainly domain experts. To\nhandle these challenges, we develop a novel framework that utilizes external\nknowledge to construct a task-independent and reusable background knowledge\ngraph for biomedical entity and relation extraction. The design of our model is\ninspired by how humans learn domain-specific topics. In particular, humans\noften first acquire the most basic and common knowledge regarding a field to\nbuild the foundational knowledge and then use that as a basis for extending to\nvarious specialized topics. 
Our framework employs such common-knowledge-sharing\nmechanism to build a general neural-network knowledge graph that is learning\ntransferable to different domain-specific biomedical texts effectively.\nExperimental evaluations demonstrate that our model, equipped with this\ngeneralized and cross-transferable knowledge base, achieves competitive\nperformance benchmarks, including BioRelEx for binding interaction detection\nand ADE for Adverse Drug Effect identification.\n","authors":["Minh Nguyen","Phuong Le"],"pdf_url":"https://arxiv.org/pdf/2408.06618v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.01019v3","updated":"2024-08-13T03:55:35Z","published":"2024-04-01T09:39:38Z","title":"Source-Aware Training Enables Knowledge Attribution in Language Models","summary":" Large language models (LLMs) learn a vast amount of knowledge during\npretraining, but they are often oblivious to the source(s) of such knowledge.\nWe investigate the problem of intrinsic source citation, where LLMs are\nrequired to cite the pretraining source supporting a generated response.\nIntrinsic source citation can enhance LLM transparency, interpretability, and\nverifiability. To give LLMs such ability, we explore source-aware training -- a\nrecipe that involves (i) training the LLM to associate unique source document\nidentifiers with the knowledge in each document, followed by (ii) an\ninstruction-tuning stage to teach the LLM to cite a supporting pretraining\nsource when prompted. Source-aware training borrows from existing\npretraining/fine-tuning frameworks and requires minimal changes to the model\narchitecture or implementation. Through experiments on synthetic data, we\ndemonstrate that our training recipe can enable faithful attribution to the\npretraining data without a substantial impact on the model's perplexity\ncompared to standard pretraining. Our findings also highlight the importance of\npretraining data augmentation in achieving attribution. Code and data available\nhere: \\url{https://github.com/mukhal/intrinsic-source-citation}\n","authors":["Muhammad Khalifa","David Wadden","Emma Strubell","Honglak Lee","Lu Wang","Iz Beltagy","Hao Peng"],"pdf_url":"https://arxiv.org/pdf/2404.01019v3.pdf","comment":"COLM '24"},{"id":"http://arxiv.org/abs/2408.06610v1","updated":"2024-08-13T03:45:11Z","published":"2024-08-13T03:45:11Z","title":"CROME: Cross-Modal Adapters for Efficient Multimodal LLM","summary":" Multimodal Large Language Models (MLLMs) demonstrate remarkable\nimage-language capabilities, but their widespread use faces challenges in\ncost-effective training and adaptation. Existing approaches often necessitate\nexpensive language model retraining and limited adaptability. Additionally, the\ncurrent focus on zero-shot performance improvements offers insufficient\nguidance for task-specific tuning. We propose CROME, an efficient\nvision-language instruction tuning framework. It features a novel gated\ncross-modal adapter that effectively combines visual and textual\nrepresentations prior to input into a frozen LLM. This lightweight adapter,\ntrained with minimal parameters, enables efficient cross-modal understanding.\nNotably, CROME demonstrates superior zero-shot performance on standard visual\nquestion answering and instruction-following benchmarks. Moreover, it yields\nfine-tuning with exceptional parameter efficiency, competing with task-specific\nspecialist state-of-the-art methods. 
CROME demonstrates the potential of pre-LM\nalignment for building scalable, adaptable, and parameter-efficient multimodal\nmodels.\n","authors":["Sayna Ebrahimi","Sercan O. Arik","Tejas Nama","Tomas Pfister"],"pdf_url":"https://arxiv.org/pdf/2408.06610v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.06598v1","updated":"2024-08-13T03:25:49Z","published":"2024-08-13T03:25:49Z","title":"A Perspective on Large Language Models, Intelligent Machines, and\n Knowledge Acquisition","summary":" Large Language Models (LLMs) are known for their remarkable ability to\ngenerate synthesized 'knowledge', such as text documents, music, images, etc.\nHowever, there is a huge gap between LLM's and human capabilities for\nunderstanding abstract concepts and reasoning. We discuss these issues in a\nlarger philosophical context of human knowledge acquisition and the Turing\ntest. In addition, we illustrate the limitations of LLMs by analyzing GPT-4\nresponses to questions ranging from science and math to common sense reasoning.\nThese examples show that GPT-4 can often imitate human reasoning, even though\nit lacks understanding. However, LLM responses are synthesized from a large LLM\nmodel trained on all available data. In contrast, human understanding is based\non a small number of abstract concepts. Based on this distinction, we discuss\nthe impact of LLMs on acquisition of human knowledge and education.\n","authors":["Vladimir Cherkassky","Eng Hock Lee"],"pdf_url":"https://arxiv.org/pdf/2408.06598v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.07791v3","updated":"2024-08-13T02:52:10Z","published":"2024-06-12T01:12:28Z","title":"Judging the Judges: A Systematic Investigation of Position Bias in\n Pairwise Comparative Assessments by LLMs","summary":" LLM-as-a-Judge offers a promising alternative to human judges across various\ntasks, yet inherent biases, particularly position bias - a systematic\npreference for answers based on their position in the prompt - compromise its\neffectiveness. Our study investigates this issue by developing a framework to\nsystematically study and quantify position bias using metrics such as\nrepetitional consistency, positional consistency, and positional fairness. We\nconduct experiments with 9 judge models across 22 tasks from the MTBench and\nDevBench benchmarks and nearly 40 answer-generating models, generating\napproximately 80,000 evaluation instances. This comprehensive assessment\nreveals significant variations in bias across judges and tasks. Although GPT-4\noften excels in positional consistency and fairness, some more cost-effective\nmodels perform comparably or even better in specific tasks, highlighting\nessential trade-offs between consistency, fairness, and cost. Our results also\ndemonstrate high consistency of judgment across repetitions, confirming that\nposition bias is not due to random variations. This research significantly\ncontributes to the field by introducing new concepts for understanding position\nbias and providing a multi-dimensional framework for evaluation. 
These insights\nguide the selection of optimal judge models, enhance benchmark design, and lay\nthe foundation for future research into effective debiasing strategies,\nultimately enhancing the reliability of LLM evaluators.\n","authors":["Lin Shi","Chiyu Ma","Weicheng Ma","Soroush Vosoughi"],"pdf_url":"https://arxiv.org/pdf/2406.07791v3.pdf","comment":"70 pages, around 200 figures and subfigures"},{"id":"http://arxiv.org/abs/2408.06583v1","updated":"2024-08-13T02:43:19Z","published":"2024-08-13T02:43:19Z","title":"Biomedical Event Extraction via Structure-aware Generation","summary":" Biomedical Event Extraction (BEE) is a critical task that involves modeling\ncomplex relationships between fine-grained entities in biomedical text data.\nHowever, most existing BEE models rely on classification methods that neglect\nthe label semantics and argument dependency structure within the data. To\naddress these limitations, we propose GenBEE, a generative model enhanced with\na structure-aware prefix for biomedical event extraction. GenBEE constructs\nevent prompts that leverage knowledge distilled from large language models\n(LLMs), thereby incorporating both label semantics and argument dependency\nrelationships. Additionally, GenBEE introduces a structural prefix learning\nmodule that generates structure-aware prefixes with structural prompts,\nenriching the generation process with structural features. Extensive\nexperiments on three benchmark datasets demonstrate the effectiveness of GenBEE\nand it achieves state-of-the-art performance on the MLEE and GE11 datasets.\nFurthermore, our analysis shows that the structural prefixes effectively bridge\nthe gap between structural prompts and the representation space of generative\nmodels, enabling better integration of event structural information.\n","authors":["Haohan Yuan","Siu Cheung Hui","Haopeng Zhang"],"pdf_url":"https://arxiv.org/pdf/2408.06583v1.pdf","comment":"8 pages, 4 figures, 6 tables"},{"id":"http://arxiv.org/abs/2408.06578v1","updated":"2024-08-13T02:35:54Z","published":"2024-08-13T02:35:54Z","title":"OpenEP: Open-Ended Future Event Prediction","summary":" Future event prediction (FEP) is a long-standing and crucial task in the\nworld, as understanding the evolution of events enables early risk\nidentification, informed decision-making, and strategic planning. Existing work\ntypically treats event prediction as classification tasks and confines the\noutcomes of future events to a fixed scope, such as yes/no questions, candidate\nset, and taxonomy, which is difficult to include all possible outcomes of\nfuture events. In this paper, we introduce OpenEP (an Open-Ended Future Event\nPrediction task), which generates flexible and diverse predictions aligned with\nreal-world scenarios. This is mainly reflected in two aspects: firstly, the\npredictive questions are diverse, covering different stages of event\ndevelopment and perspectives; secondly, the outcomes are flexible, without\nconstraints on scope or format. To facilitate the study of this task, we\nconstruct OpenEPBench, an open-ended future event prediction dataset. For\nquestion construction, we pose questions from seven perspectives, including\nlocation, time, event development, event outcome, event impact, event response,\nand other, to facilitate an in-depth analysis and understanding of the\ncomprehensive evolution of events. 
For outcome construction, we collect\nfree-form text containing the outcomes as ground truth to provide semantically\ncomplete and detail-enriched outcomes. Furthermore, we propose StkFEP, a\nstakeholder-enhanced future event prediction framework, that incorporates event\ncharacteristics for open-ended settings. Our method extracts stakeholders\ninvolved in events to extend questions to gather diverse information. We also\ncollect historically events that are relevant and similar to the question to\nreveal potential evolutionary patterns. Experiment results indicate that\naccurately predicting future events in open-ended settings is challenging for\nexisting LLMs.\n","authors":["Yong Guan","Hao Peng","Xiaozhi Wang","Lei Hou","Juanzi Li"],"pdf_url":"https://arxiv.org/pdf/2408.06578v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.06576v1","updated":"2024-08-13T02:25:16Z","published":"2024-08-13T02:25:16Z","title":"CTISum: A New Benchmark Dataset For Cyber Threat Intelligence\n Summarization","summary":" Cyber Threat Intelligence (CTI) summarization task requires the system to\ngenerate concise and accurate highlights from raw intelligence data, which\nplays an important role in providing decision-makers with crucial information\nto quickly detect and respond to cyber threats in the cybersecurity domain.\nHowever, efficient techniques for summarizing CTI reports, including facts,\nanalytical insights, attack processes, etc., have largely been unexplored,\nprimarily due to the lack of available dataset. To this end, we present CTISum,\na new benchmark for CTI summarization task. Considering the importance of\nattack process, a novel fine-grained subtask of attack process summarization is\nproposed to enable defenders to assess risk, identify security gaps,\nvulnerabilities, and so on. Specifically, we first design a multi-stage\nannotation pipeline to gather and annotate the CTI data, and then benchmark the\nCTISum with a collection of extractive and abstractive summarization methods.\nExperimental results show that current state-of-the-art models exhibit\nlimitations when applied to CTISum, underscoring the fact that automatically\nproducing concise summaries of CTI reports remains an open research challenge.\n","authors":["Wei Peng","Junmei Ding","Wei Wang","Lei Cui","Wei Cai","Zhiyu Hao","Xiaochun Yun"],"pdf_url":"https://arxiv.org/pdf/2408.06576v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.06574v1","updated":"2024-08-13T02:18:47Z","published":"2024-08-13T02:18:47Z","title":"SparkRA: A Retrieval-Augmented Knowledge Service System Based on Spark\n Large Language Model","summary":" Large language models (LLMs) have shown remarkable achievements across\nvarious language tasks.To enhance the performance of LLMs in scientific\nliterature services, we developed the scientific literature LLM (SciLit-LLM)\nthrough pre-training and supervised fine-tuning on scientific literature,\nbuilding upon the iFLYTEK Spark LLM. Furthermore, we present a knowledge\nservice system Spark Research Assistant (SparkRA) based on our SciLit-LLM.\nSparkRA is accessible online and provides three primary functions: literature\ninvestigation, paper reading, and academic writing. 
As of July 30, 2024,\nSparkRA has garnered over 50,000 registered users, with a total usage count\nexceeding 1.3 million.\n","authors":["Dayong Wu","Jiaqi Li","Baoxin Wang","Honghong Zhao","Siyuan Xue","Yanjie Yang","Zhijun Chang","Rui Zhang","Li Qian","Bo Wang","Shijin Wang","Zhixiong Zhang","Guoping Hu"],"pdf_url":"https://arxiv.org/pdf/2408.06574v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.03247v2","updated":"2024-08-13T02:16:23Z","published":"2024-08-06T15:07:08Z","title":"Unveiling Factual Recall Behaviors of Large Language Models through\n Knowledge Neurons","summary":" In this paper, we investigate whether Large Language Models (LLMs) actively\nrecall or retrieve their internal repositories of factual knowledge when faced\nwith reasoning tasks. Through an analysis of LLMs' internal factual recall at\neach reasoning step via Knowledge Neurons, we reveal that LLMs fail to harness\nthe critical factual associations under certain circumstances. Instead, they\ntend to opt for alternative, shortcut-like pathways to answer reasoning\nquestions. By manually manipulating the recall process of parametric knowledge\nin LLMs, we demonstrate that enhancing this recall process directly improves\nreasoning performance whereas suppressing it leads to notable degradation.\nFurthermore, we assess the effect of Chain-of-Thought (CoT) prompting, a\npowerful technique for addressing complex reasoning tasks. Our findings\nindicate that CoT can intensify the recall of factual knowledge by encouraging\nLLMs to engage in orderly and reliable reasoning. Furthermore, we explored how\ncontextual conflicts affect the retrieval of facts during the reasoning process\nto gain a comprehensive understanding of the factual recall behaviors of LLMs.\nCode and data will be available soon.\n","authors":["Yifei Wang","Yuheng Chen","Wanting Wen","Yu Sheng","Linjing Li","Daniel Dajun Zeng"],"pdf_url":"https://arxiv.org/pdf/2408.03247v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.06569v1","updated":"2024-08-13T02:08:32Z","published":"2024-08-13T02:08:32Z","title":"Social Debiasing for Fair Multi-modal LLMs","summary":" Multi-modal Large Language Models (MLLMs) have advanced significantly,\noffering powerful vision-language understanding capabilities. However, these\nmodels often inherit severe social biases from their training datasets, leading\nto unfair predictions based on attributes like race and gender. This paper\naddresses the issue of social biases in MLLMs by i) Introducing a comprehensive\nCounterfactual dataset with Multiple Social Concepts (CMSC), which provides a\nmore diverse and extensive training set compared to existing datasets. ii)\nProposing an Anti-Stereotype Debiasing strategy (ASD). Our method works by\nrevisiting the MLLM training process, rescaling the autoregressive loss\nfunction, and improving data sampling methods to counteract biases. 
Through\nextensive experiments on various MLLMs, our CMSC dataset and ASD method\ndemonstrate a significant reduction in social biases while maintaining the\nmodels' original performance.\n","authors":["Harry Cheng","Yangyang Guo","Qingpei Guo","Ming Yang","Tian Gan","Liqiang Nie"],"pdf_url":"https://arxiv.org/pdf/2408.06569v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.06567v1","updated":"2024-08-13T02:07:00Z","published":"2024-08-13T02:07:00Z","title":"AquilaMoE: Efficient Training for MoE Models with Scale-Up and Scale-Out\n Strategies","summary":" In recent years, with the rapid application of large language models across\nvarious fields, the scale of these models has gradually increased, and the\nresources required for their pre-training have grown exponentially. Training an\nLLM from scratch will cost a lot of computation resources while scaling up from\na smaller model is a more efficient approach and has thus attracted significant\nattention. In this paper, we present AquilaMoE, a cutting-edge bilingual 8*16B\nMixture of Experts (MoE) language model that has 8 experts with 16 billion\nparameters each and is developed using an innovative training methodology\ncalled EfficientScale. This approach optimizes performance while minimizing\ndata requirements through a two-stage process. The first stage, termed\nScale-Up, initializes the larger model with weights from a pre-trained smaller\nmodel, enabling substantial knowledge transfer and continuous pretraining with\nsignificantly less data. The second stage, Scale-Out, uses a pre-trained dense\nmodel to initialize the MoE experts, further enhancing knowledge transfer and\nperformance. Extensive validation experiments on 1.8B and 7B models compared\nvarious initialization schemes, achieving models that maintain and reduce loss\nduring continuous pretraining. Utilizing the optimal scheme, we successfully\ntrained a 16B model and subsequently the 8*16B AquilaMoE model, demonstrating\nsignificant improvements in performance and training efficiency.\n","authors":["Bo-Wen Zhang","Liangdong Wang","Ye Yuan","Jijie Li","Shuhao Gu","Mengdi Zhao","Xinya Wu","Guang Liu","Chengwei Wu","Hanyu Zhao","Li Du","Yiming Ju","Quanyue Ma","Yulong Ao","Yingli Zhao","Songhe Zhu","Zhou Cao","Dong Liang","Yonghua Lin","Ming Zhang","Shunfei Wang","Yanxin Zhou","Min Ye","Xuekai Chen","Xinyang Yu","Xiangjun Huang","Jian Yang"],"pdf_url":"https://arxiv.org/pdf/2408.06567v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03656v5","updated":"2024-08-13T00:52:06Z","published":"2023-08-07T15:18:30Z","title":"Emotionally Numb or Empathetic? Evaluating How LLMs Feel Using\n EmotionBench","summary":" Evaluating Large Language Models' (LLMs) anthropomorphic capabilities has\nbecome increasingly important in contemporary discourse. Utilizing the emotion\nappraisal theory from psychology, we propose to evaluate the empathy ability of\nLLMs, \\ie, how their feelings change when presented with specific situations.\nAfter a careful and comprehensive survey, we collect a dataset containing over\n400 situations that have proven effective in eliciting the eight emotions\ncentral to our study. Categorizing the situations into 36 factors, we conduct a\nhuman evaluation involving more than 1,200 subjects worldwide. 
With the human\nevaluation results as references, our evaluation includes seven LLMs, covering\nboth commercial and open-source models, including variations in model sizes,\nfeaturing the latest iterations, such as GPT-4, Mixtral-8x22B, and LLaMA-3.1.\nWe find that, despite several misalignments, LLMs can generally respond\nappropriately to certain situations. Nevertheless, they fall short in alignment\nwith the emotional behaviors of human beings and cannot establish connections\nbetween similar situations. Our EmotionBench, including collected dataset of\nsituations, the human evaluation results, and the code of our testing\nframework, is publicly available at https://github.com/CUHK-ARISE/EmotionBench.\n","authors":["Jen-tse Huang","Man Ho Lam","Eric John Li","Shujie Ren","Wenxuan Wang","Wenxiang Jiao","Zhaopeng Tu","Michael R. Lyu"],"pdf_url":"https://arxiv.org/pdf/2308.03656v5.pdf","comment":"Add LLaMA-3.1, Mixtral-8x22B; 10 pages of main text; 14 pages of\n appendices"},{"id":"http://arxiv.org/abs/2311.09356v2","updated":"2024-08-13T00:35:54Z","published":"2023-11-15T20:33:27Z","title":"LePaRD: A Large-Scale Dataset of Judges Citing Precedents","summary":" We present the Legal Passage Retrieval Dataset LePaRD. LePaRD is a massive\ncollection of U.S. federal judicial citations to precedent in context. The\ndataset aims to facilitate work on legal passage prediction, a challenging\npractice-oriented legal retrieval and reasoning task. Legal passage prediction\nseeks to predict relevant passages from precedential court decisions given the\ncontext of a legal argument. We extensively evaluate various retrieval\napproaches on LePaRD, and find that classification appears to work best.\nHowever, we note that legal precedent prediction is a difficult task, and there\nremains significant room for improvement. We hope that by publishing LePaRD, we\nwill encourage others to engage with a legal NLP task that promises to help\nexpand access to justice by reducing the burden associated with legal research.\nA subset of the LePaRD dataset is freely available and the whole dataset will\nbe released upon publication.\n","authors":["Robert Mahari","Dominik Stammbach","Elliott Ash","Alex `Sandy' Pentland"],"pdf_url":"https://arxiv.org/pdf/2311.09356v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.01879v2","updated":"2024-08-13T00:07:22Z","published":"2024-06-04T01:20:14Z","title":"Bi-DCSpell: A Bi-directional Detector-Corrector Interactive Framework\n for Chinese Spelling Check","summary":" Chinese Spelling Check (CSC) aims to detect and correct potentially\nmisspelled characters in Chinese sentences. Naturally, it involves the\ndetection and correction subtasks, which interact with each other dynamically.\nSuch interactions are bi-directional, i.e., the detection result would help\nreduce the risk of over-correction and under-correction while the knowledge\nlearnt from correction would help prevent false detection. Current CSC\napproaches are of two types: correction-only or single-directional\ndetection-to-correction interactive frameworks. Nonetheless, they overlook the\nbi-directional interactions between detection and correction. This paper aims\nto fill the gap by proposing a Bi-directional Detector-Corrector framework for\nCSC (Bi-DCSpell). 
Notably, Bi-DCSpell contains separate detection and\ncorrection encoders, followed by a novel interactive learning module\nfacilitating bi-directional feature interactions between detection and\ncorrection to improve each other's representation learning. Extensive\nexperimental results demonstrate a robust correction performance of Bi-DCSpell\non widely used benchmarking datasets while possessing a satisfactory detection\nability.\n","authors":["Haiming Wu","Hanqing Zhang","Richeng Xuan","Dawei Song"],"pdf_url":"https://arxiv.org/pdf/2406.01879v2.pdf","comment":"12 pages, 6 figures"},{"id":"http://arxiv.org/abs/2408.06537v1","updated":"2024-08-13T00:06:56Z","published":"2024-08-13T00:06:56Z","title":"Introducing the NewsPaLM MBR and QE Dataset: LLM-Generated High-Quality\n Parallel Data Outperforms Traditional Web-Crawled Data","summary":" Recent research in neural machine translation (NMT) has shown that training\non high-quality machine-generated data can outperform training on\nhuman-generated data. This work accompanies the first-ever release of a\nLLM-generated, MBR-decoded and QE-reranked dataset with both sentence-level and\nmulti-sentence examples. We perform extensive experiments to demonstrate the\nquality of our dataset in terms of its downstream impact on NMT model\nperformance. We find that training from scratch on our (machine-generated)\ndataset outperforms training on the (web-crawled) WMT'23 training dataset\n(which is 300 times larger), and also outperforms training on the top-quality\nsubset of the WMT'23 training dataset. We also find that performing\nself-distillation by finetuning the LLM which generated this dataset\noutperforms the LLM's strong few-shot baseline. These findings corroborate the\nquality of our dataset, and demonstrate the value of high-quality\nmachine-generated data in improving performance of NMT models.\n","authors":["Mara Finkelstein","David Vilar","Markus Freitag"],"pdf_url":"https://arxiv.org/pdf/2408.06537v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.07238v1","updated":"2024-08-13T23:59:36Z","published":"2024-08-13T23:59:36Z","title":"Using Advanced LLMs to Enhance Smaller LLMs: An Interpretable Knowledge\n Distillation Approach","summary":" Advanced Large language models (LLMs) like GPT-4 or LlaMa 3 provide superior\nperformance in complex human-like interactions. But they are costly, or too\nlarge for edge devices such as smartphones and harder to self-host, leading to\nsecurity and privacy concerns. This paper introduces a novel interpretable\nknowledge distillation approach to enhance the performance of smaller, more\neconomical LLMs that firms can self-host. We study this problem in the context\nof building a customer service agent aimed at achieving high customer\nsatisfaction through goal-oriented dialogues. Unlike traditional knowledge\ndistillation, where the \"student\" model learns directly from the \"teacher\"\nmodel's responses via fine-tuning, our interpretable \"strategy\" teaching\napproach involves the teacher providing strategies to improve the student's\nperformance in various scenarios. This method alternates between a \"scenario\ngeneration\" step and a \"strategies for improvement\" step, creating a customized\nlibrary of scenarios and optimized strategies for automated prompting. The\nmethod requires only black-box access to both student and teacher models; hence\nit can be used without manipulating model parameters. 
In our customer service\napplication, the method improves performance, and the learned strategies are\ntransferable to other LLMs and scenarios beyond the training set. The method's\ninterpretabilty helps safeguard against potential harms through human audit.\n","authors":["Tong Wang","K. Sudhir","Dat Hong"],"pdf_url":"https://arxiv.org/pdf/2408.07238v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.07237v1","updated":"2024-08-13T23:58:45Z","published":"2024-08-13T23:58:45Z","title":"Neural embedding of beliefs reveals the role of relative dissonance in\n human decision-making","summary":" Beliefs serve as the foundation for human cognition and decision-making. They\nguide individuals in deriving meaning from their lives, shaping their\nbehaviors, and forming social connections. Therefore, a model that encapsulates\nbeliefs and their interrelationships is crucial for quantitatively studying the\ninfluence of beliefs on our actions. Despite its importance, research on the\ninterplay between human beliefs has often been limited to a small set of\nbeliefs pertaining to specific issues, with a heavy reliance on surveys or\nexperiments. Here, we propose a method for extracting nuanced relations between\nthousands of beliefs by leveraging large-scale user participation data from an\nonline debate platform and mapping these beliefs to an embedding space using a\nfine-tuned large language model (LLM). This belief embedding space effectively\nencapsulates the interconnectedness of diverse beliefs as well as polarization\nacross various social issues. We discover that the positions within this belief\nspace predict new beliefs of individuals. Furthermore, we find that the\nrelative distance between one's existing beliefs and new beliefs can serve as a\nquantitative estimate of cognitive dissonance, allowing us to predict new\nbeliefs. Our study highlights how modern LLMs, when combined with collective\nonline records of human beliefs, can offer insights into the fundamental\nprinciples that govern human belief formation and decision-making processes.\n","authors":["Byunghwee Lee","Rachith Aiyappa","Yong-Yeol Ahn","Haewoon Kwak","Jisun An"],"pdf_url":"https://arxiv.org/pdf/2408.07237v1.pdf","comment":"26 pages, 6 figures, SI"},{"id":"http://arxiv.org/abs/2404.03626v2","updated":"2024-08-13T22:01:42Z","published":"2024-04-04T17:48:28Z","title":"Training LLMs over Neurally Compressed Text","summary":" In this paper, we explore the idea of training large language models (LLMs)\nover highly compressed text. While standard subword tokenizers compress text by\na small factor, neural text compressors can achieve much higher rates of\ncompression. If it were possible to train LLMs directly over neurally\ncompressed text, this would confer advantages in training and serving\nefficiency, as well as easier handling of long text spans. The main obstacle to\nthis goal is that strong compression tends to produce opaque outputs that are\nnot well-suited for learning. In particular, we find that text na\\\"ively\ncompressed via Arithmetic Coding is not readily learnable by LLMs. To overcome\nthis, we propose Equal-Info Windows, a novel compression technique whereby text\nis segmented into blocks that each compress to the same bit length. Using this\nmethod, we demonstrate effective learning over neurally compressed text that\nimproves with scale, and outperforms byte-level baselines by a wide margin on\nperplexity and inference speed benchmarks. 
While our method delivers worse\nperplexity than subword tokenizers for models trained with the same parameter\ncount, it has the benefit of shorter sequence lengths. Shorter sequence lengths\nrequire fewer autoregressive generation steps, and reduce latency. Finally, we\nprovide extensive analysis of the properties that contribute to learnability,\nand offer concrete suggestions for how to further improve the performance of\nhigh-compression tokenizers.\n","authors":["Brian Lester","Jaehoon Lee","Alex Alemi","Jeffrey Pennington","Adam Roberts","Jascha Sohl-Dickstein","Noah Constant"],"pdf_url":"https://arxiv.org/pdf/2404.03626v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.05346v2","updated":"2024-08-13T20:46:18Z","published":"2024-08-09T21:31:33Z","title":"DataNarrative: Automated Data-Driven Storytelling with Visualizations\n and Texts","summary":" Data-driven storytelling is a powerful method for conveying insights by\ncombining narrative techniques with visualizations and text. These stories\nintegrate visual aids, such as highlighted bars and lines in charts, along with\ntextual annotations explaining insights. However, creating such stories\nrequires a deep understanding of the data and meticulous narrative planning,\noften necessitating human intervention, which can be time-consuming and\nmentally taxing. While Large Language Models (LLMs) excel in various NLP tasks,\ntheir ability to generate coherent and comprehensive data stories remains\nunderexplored. In this work, we introduce a novel task for data story\ngeneration and a benchmark containing 1,449 stories from diverse sources. To\naddress the challenges of crafting coherent data stories, we propose a\nmultiagent framework employing two LLM agents designed to replicate the human\nstorytelling process: one for understanding and describing the data\n(Reflection), generating the outline, and narration, and another for\nverification at each intermediary step. While our agentic framework generally\noutperforms non-agentic counterparts in both model-based and human evaluations,\nthe results also reveal unique challenges in data story generation.\n","authors":["Mohammed Saidul Islam","Md Tahmid Rahman Laskar","Md Rizwan Parvez","Enamul Hoque","Shafiq Joty"],"pdf_url":"https://arxiv.org/pdf/2408.05346v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.06935v2","updated":"2024-08-13T20:11:47Z","published":"2024-01-13T00:08:23Z","title":"MiTTenS: A Dataset for Evaluating Gender Mistranslation","summary":" Translation systems, including foundation models capable of translation, can\nproduce errors that result in gender mistranslation, and such errors can be\nespecially harmful. To measure the extent of such potential harms when\ntranslating into and out of English, we introduce a dataset, MiTTenS, covering\n26 languages from a variety of language families and scripts, including several\ntraditionally under-represented in digital resources. The dataset is\nconstructed with handcrafted passages that target known failure patterns,\nlonger synthetically generated passages, and natural passages sourced from\nmultiple domains. 
We demonstrate the usefulness of the dataset by evaluating\nboth neural machine translation systems and foundation models, and show that\nall systems exhibit gender mistranslation and potential harm, even in high\nresource languages.\n","authors":["Kevin Robinson","Sneha Kudugunta","Romina Stella","Sunipa Dev","Jasmijn Bastings"],"pdf_url":"https://arxiv.org/pdf/2401.06935v2.pdf","comment":"GitHub repository https://github.com/google-research-datasets/mittens"},{"id":"http://arxiv.org/abs/2408.07190v1","updated":"2024-08-13T20:08:26Z","published":"2024-08-13T20:08:26Z","title":"BERT's Conceptual Cartography: Mapping the Landscapes of Meaning","summary":" Conceptual Engineers want to make words better. However, they often\nunderestimate how varied our usage of words is. In this paper, we take the\nfirst steps in exploring the contextual nuances of words by creating conceptual\nlandscapes -- 2D surfaces representing the pragmatic usage of words -- that\nconceptual engineers can use to inform their projects. We use the spoken\ncomponent of the British National Corpus and BERT to create contextualised word\nembeddings, and use Gaussian Mixture Models, a selection of metrics, and\nqualitative analysis to visualise and numerically represent lexical landscapes.\nSuch an approach has not yet been used in the conceptual engineering literature\nand provides a detailed examination of how different words manifest in various\ncontexts that is potentially useful to conceptual engineering projects. Our\nfindings highlight the inherent complexity of conceptual engineering, revealing\nthat each word exhibits a unique and intricate landscape. Conceptual Engineers\ncannot, therefore, use a one-size-fits-all approach when improving words -- a\ntask that may be practically intractable at scale.\n","authors":["Nina Haket","Ryan Daniels"],"pdf_url":"https://arxiv.org/pdf/2408.07190v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.08158v5","updated":"2024-08-13T19:51:48Z","published":"2023-06-13T22:07:54Z","title":"Sociodemographic Bias in Language Models: A Survey and Forward Path","summary":" Sociodemographic bias in language models (LMs) has the potential for harm\nwhen deployed in real-world settings. This paper presents a comprehensive\nsurvey of the past decade of research on sociodemographic bias in LMs,\norganized into a typology that facilitates examining the different aims: types\nof bias, quantifying bias, and debiasing techniques. We track the evolution of\nthe latter two questions, then identify current trends and their limitations,\nas well as emerging techniques. To guide future research towards more effective\nand reliable solutions, and to help authors situate their work within this\nbroad landscape, we conclude with a checklist of open questions.\n","authors":["Vipul Gupta","Pranav Narayanan Venkit","Shomir Wilson","Rebecca J. Passonneau"],"pdf_url":"https://arxiv.org/pdf/2306.08158v5.pdf","comment":"23 pages, 3 figure"},{"id":"http://arxiv.org/abs/2408.07180v1","updated":"2024-08-13T19:45:02Z","published":"2024-08-13T19:45:02Z","title":"Unlocking Efficiency: Adaptive Masking for Gene Transformer Models","summary":" Gene transformer models such as Nucleotide Transformer, DNABert, and LOGO are\ntrained to learn optimal gene sequence representations by using the Masked\nLanguage Modeling (MLM) training objective over the complete Human Reference\nGenome. 
However, the typical tokenization methods employ a basic sliding window\nof tokens, such as k-mers, that fail to utilize gene-centric semantics. This\ncould result in the (trivial) masking of easily predictable sequences, leading\nto inefficient MLM training. Time-variant training strategies are known to\nimprove pretraining efficiency in both language and vision tasks. In this work,\nwe focus on using curriculum masking where we systematically increase the\ndifficulty of masked token prediction task by using a Pointwise Mutual\nInformation-based difficulty criterion, as gene sequences lack well-defined\nsemantic units similar to words or sentences of NLP domain. Our proposed\nCurriculum Masking-based Gene Masking Strategy (CM-GEMS) demonstrates superior\nrepresentation learning capabilities compared to baseline masking approaches\nwhen evaluated on downstream gene sequence classification tasks. We perform\nextensive evaluation in both few-shot (five datasets) and full dataset settings\n(Genomic Understanding Evaluation benchmark consisting of 27 tasks). Our\nfindings reveal that CM-GEMS outperforms state-of-the-art models (DNABert-2,\nNucleotide transformer, DNABert) trained at 120K steps, achieving similar\nresults in just 10K and 1K steps. We also demonstrate that Curriculum-Learned\nLOGO (a 2-layer DNABert-like model) can achieve nearly 90% of the\nstate-of-the-art model performance of 120K steps. We will make the models and\ncodes publicly available at https://github.com/roysoumya/curriculum-GeneMask.\n","authors":["Soumyadeep Roy","Shamik Sural","Niloy Ganguly"],"pdf_url":"https://arxiv.org/pdf/2408.07180v1.pdf","comment":"10 pages, 5 figures. Accepted for publication at the 27th European\n Conference on Artificial Intelligence (ECAI 2024)"},{"id":"http://arxiv.org/abs/2403.19631v2","updated":"2024-08-13T19:34:13Z","published":"2024-03-28T17:47:19Z","title":"Retrieval-enhanced Knowledge Editing in Language Models for Multi-Hop\n Question Answering","summary":" Large Language Models (LLMs) have shown proficiency in question-answering\ntasks but often struggle to integrate real-time knowledge, leading to\npotentially outdated or inaccurate responses. This problem becomes even more\nchallenging when dealing with multi-hop questions, since they require LLMs to\nupdate and integrate multiple knowledge pieces relevant to the questions. To\ntackle the problem, we propose the Retrieval-Augmented model Editing (RAE)\nframework for multi-hop question answering. RAE first retrieves edited facts\nand then refines the language model through in-context learning. Specifically,\nour retrieval approach, based on mutual information maximization, leverages the\nreasoning abilities of LLMs to identify chain facts that traditional\nsimilarity-based searches might miss. In addition, our framework includes a\npruning strategy to eliminate redundant information from the retrieved facts,\nwhich enhances the editing accuracy and mitigates the hallucination problem.\nOur framework is supported by theoretical justification for its fact retrieval\nefficacy. Finally, comprehensive evaluation across various LLMs validates RAE's\nability in providing accurate answers with updated knowledge. 
Our code is\navailable at: https://github.com/sycny/RAE.\n","authors":["Yucheng Shi","Qiaoyu Tan","Xuansheng Wu","Shaochen Zhong","Kaixiong Zhou","Ninghao Liu"],"pdf_url":"https://arxiv.org/pdf/2403.19631v2.pdf","comment":"Accepted by CIKM 2024"},{"id":"http://arxiv.org/abs/2407.14962v4","updated":"2024-08-13T19:17:32Z","published":"2024-07-20T18:48:35Z","title":"Recent Advances in Generative AI and Large Language Models: Current\n Status, Challenges, and Perspectives","summary":" The emergence of Generative Artificial Intelligence (AI) and Large Language\nModels (LLMs) has marked a new era of Natural Language Processing (NLP),\nintroducing unprecedented capabilities that are revolutionizing various\ndomains. This paper explores the current state of these cutting-edge\ntechnologies, demonstrating their remarkable advancements and wide-ranging\napplications. Our paper contributes to providing a holistic perspective on the\ntechnical foundations, practical applications, and emerging challenges within\nthe evolving landscape of Generative AI and LLMs. We believe that understanding\nthe generative capabilities of AI systems and the specific context of LLMs is\ncrucial for researchers, practitioners, and policymakers to collaboratively\nshape the responsible and ethical integration of these technologies into\nvarious domains. Furthermore, we identify and address main research gaps,\nproviding valuable insights to guide future research endeavors within the AI\nresearch community.\n","authors":["Desta Haileselassie Hagos","Rick Battle","Danda B. Rawat"],"pdf_url":"https://arxiv.org/pdf/2407.14962v4.pdf","comment":"This version is accepted for publication in the Journal of IEEE\n Transactions on Artificial Intelligence (TAI)"},{"id":"http://arxiv.org/abs/2408.05241v2","updated":"2024-08-13T19:09:57Z","published":"2024-08-05T20:49:48Z","title":"Large Model Strategic Thinking, Small Model Efficiency: Transferring\n Theory of Mind in Large Language Models","summary":" As the performance of larger, newer Large Language Models continues to\nimprove for strategic Theory of Mind (ToM) tasks, the demand for these state of\nthe art models increases commensurately. However, their deployment is costly\nboth in terms of processing power and time. In this paper, we investigate the\nfeasibility of creating smaller, simulation-ready agents by way of fine-tuning.\nTo do this, we present a large pre-trained model with 20 unique scenarios that\ncombine a social context with a social dilemma, recording its answers, and\nusing them for Q\\&A fine-tuning on a smaller model of the same family. Our\nfocus is on in-context game-theoretic decision-making, the same domain within\nwhich human interaction occurs and that requires both a theory of mind (or a\nsemblance thereof) and an understanding of social dynamics. We find that the\nfine-tuned smaller language model exhibited significant performance closer to\nthat of its larger relative, and that their improvements extended in areas and\ncontexts beyond the ones provided in the training examples. On average for all\ngames, through fine-tuning, the smaller model showed a \\%46 improvement in\naligning with the behavior of the larger model, with \\%100 representing\ncomplete alignment. This suggests that our pipeline represents an efficient\nmethod to transmit some form of theory of mind to smaller models, creating\nimproved and cheaply deployable algorithms in the process. 
Despite their\nsimplicity and their associated shortcomings and limitations, our findings\nrepresent a stepping stone in the pursuit and training of specialized models\nfor strategic and social decision making.\n","authors":["Nunzio Lore","Alireza Sepehr Ilami","Babak Heydari"],"pdf_url":"https://arxiv.org/pdf/2408.05241v2.pdf","comment":"18 pages, 6 figures"},{"id":"http://arxiv.org/abs/2311.09198v2","updated":"2024-08-13T19:04:18Z","published":"2023-11-15T18:42:44Z","title":"Never Lost in the Middle: Mastering Long-Context Question Answering with\n Position-Agnostic Decompositional Training","summary":" While large language models (LLMs) are equipped with longer text input\ncapabilities than before, they are struggling to seek correct information in\nlong contexts. The \"lost in the middle\" problem challenges most LLMs, referring\nto the dramatic decline in accuracy when correct information is located in the\nmiddle. To overcome this crucial issue, this paper proposes to enhance the\ninformation searching and reflection ability of LLMs in long contexts via\nspecially designed tasks called Attention Strengthening Multi-doc QA (ASM QA).\nFollowing these tasks, our model excels in focusing more precisely on the\ndesired information. Experimental results show substantial improvement in\nMulti-doc QA and other benchmarks, superior to state-of-the-art models by 13.7%\nabsolute gain in shuffled settings, by 21.5% in passage retrieval task. We\nrelease our model, Ziya-Reader to promote related research in the community.\n","authors":["Junqing He","Kunhao Pan","Xiaoqun Dong","Zhuoyang Song","Yibo Liu","Qianguo Sun","Yuxin Liang","Hao Wang","Enming Zhang","Jiaxing Zhang"],"pdf_url":"https://arxiv.org/pdf/2311.09198v2.pdf","comment":"Accepted by ACL 2024 main conference"},{"id":"http://arxiv.org/abs/2408.07154v1","updated":"2024-08-13T18:50:07Z","published":"2024-08-13T18:50:07Z","title":"Self-folding Self-replication","summary":" Inspired by protein folding, we explored the construction of\nthree-dimensional structures and machines from one-dimensional chains of simple\nbuilding blocks. This approach not only allows us to recreate the\nself-replication mechanism introduced earlier, but also significantly\nsimplifies the process. We introduced a new set of folding blocks that\nfacilitate the formation of secondary structures such as {\\alpha}-helices and\n\\b{eta}-sheets, as well as more advanced tertiary and quaternary structures,\nincluding self-replicating machines. The introduction of rotational degrees of\nfreedom leads to a reduced variety of blocks and, most importantly, reduces the\noverall size of the machines by a factor of five. In addition, we present a\nuniversal copier-constructor, a highly efficient self-replicating mechanism\ncomposed of approximately 40 blocks, including the restictions posed on it. The\npaper also addresses evolutionary considerations, outlining several steps on\nthe evolutionary ladder towards more sophisticated self-replicating systems.\nFinally, this study offers a clear rationale for nature's preference for\none-dimensional chains in constructing three-dimensional structures.\n","authors":["Ralph P. Lano"],"pdf_url":"https://arxiv.org/pdf/2408.07154v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.07144v1","updated":"2024-08-13T18:26:04Z","published":"2024-08-13T18:26:04Z","title":"Language Models as Models of Language","summary":" This chapter critically examines the potential contributions of modern\nlanguage models to theoretical linguistics. 
Despite their focus on engineering\ngoals, these models' ability to acquire sophisticated linguistic knowledge from\nmere exposure to data warrants a careful reassessment of their relevance to\nlinguistic theory. I review a growing body of empirical evidence suggesting\nthat language models can learn hierarchical syntactic structure and exhibit\nsensitivity to various linguistic phenomena, even when trained on\ndevelopmentally plausible amounts of data. While the competence/performance\ndistinction has been invoked to dismiss the relevance of such models to\nlinguistic theory, I argue that this assessment may be premature. By carefully\ncontrolling learning conditions and making use of causal intervention methods,\nexperiments with language models can potentially constrain hypotheses about\nlanguage acquisition and competence. I conclude that closer collaboration\nbetween theoretical linguists and computational researchers could yield\nvaluable insights, particularly in advancing debates about linguistic nativism.\n","authors":["Raphaël Millière"],"pdf_url":"https://arxiv.org/pdf/2408.07144v1.pdf","comment":"Forthcoming in Nefdt, R., Dupre, G., \\& Stanton, K. (eds.), The\n Oxford Handbook of the Philosophy of Linguistics. Oxford University Press"},{"id":"http://arxiv.org/abs/2408.07137v1","updated":"2024-08-13T18:12:00Z","published":"2024-08-13T18:12:00Z","title":"ELLA: Empowering LLMs for Interpretable, Accurate and Informative Legal\n Advice","summary":" Despite remarkable performance in legal consultation exhibited by legal Large\nLanguage Models(LLMs) combined with legal article retrieval components, there\nare still cases when the advice given is incorrect or baseless. To alleviate\nthese problems, we propose {\\bf ELLA}, a tool for {\\bf E}mpowering {\\bf L}LMs\nfor interpretable, accurate, and informative {\\bf L}egal {\\bf A}dvice. ELLA\nvisually presents the correlation between legal articles and LLM's response by\ncalculating their similarities, providing users with an intuitive legal basis\nfor the responses. Besides, based on the users' queries, ELLA retrieves\nrelevant legal articles and displays them to users. Users can interactively\nselect legal articles for LLM to generate more accurate responses. ELLA also\nretrieves relevant legal cases for user reference. Our user study shows that\npresenting the legal basis for the response helps users understand better. The\naccuracy of LLM's responses also improves when users intervene in selecting\nlegal articles for LLM. Providing relevant legal cases also aids individuals in\nobtaining comprehensive information.\n","authors":["Yutong Hu","Kangcheng Luo","Yansong Feng"],"pdf_url":"https://arxiv.org/pdf/2408.07137v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.04614v2","updated":"2024-08-13T18:00:57Z","published":"2024-08-08T17:42:32Z","title":"Better Alignment with Instruction Back-and-Forth Translation","summary":" We propose a new method, instruction back-and-forth translation, to construct\nhigh-quality synthetic data grounded in world knowledge for aligning large\nlanguage models (LLMs). Given documents from a web corpus, we generate and\ncurate synthetic instructions using the backtranslation approach proposed by Li\net al.(2023a), and rewrite the responses to improve their quality further based\non the initial documents. 
Fine-tuning with the resulting (backtranslated\ninstruction, rewritten response) pairs yields higher win rates on AlpacaEval\nthan using other common instruction datasets such as Humpback, ShareGPT, Open\nOrca, Alpaca-GPT4 and Self-instruct. We also demonstrate that rewriting the\nresponses with an LLM outperforms direct distillation, and the two generated\ntext distributions exhibit significant distinction in embedding space. Further\nanalysis shows that our backtranslated instructions are of higher quality than\nother sources of synthetic instructions, while our responses are more diverse\nand complex than those obtained from distillation. Overall we find that\ninstruction back-and-forth translation combines the best of both worlds --\nmaking use of the information diversity and quantity found on the web, while\nensuring the quality of the responses which is necessary for effective\nalignment.\n","authors":["Thao Nguyen","Jeffrey Li","Sewoong Oh","Ludwig Schmidt","Jason Weston","Luke Zettlemoyer","Xian Li"],"pdf_url":"https://arxiv.org/pdf/2408.04614v2.pdf","comment":null}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2408.07065v1","updated":"2024-08-13T17:57:14Z","published":"2024-08-13T17:57:14Z","title":"Fingerspelling within Sign Language Translation","summary":" Fingerspelling poses challenges for sign language processing due to its\nhigh-frequency motion and use for open-vocabulary terms. While prior work has\nstudied fingerspelling recognition, there has been little attention to\nevaluating how well sign language translation models understand fingerspelling\nin the context of entire sentences -- and improving this capability. We\nmanually annotate instances of fingerspelling within FLEURS-ASL and use them to\nevaluate the effect of two simple measures to improve fingerspelling\nrecognition within American Sign Language to English translation: 1) use a\nmodel family (ByT5) with character- rather than subword-level tokenization, and\n2) mix fingerspelling recognition data into the translation training mixture.\nWe find that 1) substantially improves understanding of fingerspelling (and\ntherefore translation quality overall), but the effect of 2) is mixed.\n","authors":["Garrett Tanzer"],"pdf_url":"https://arxiv.org/pdf/2408.07065v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.13632v2","updated":"2024-08-13T17:57:07Z","published":"2023-12-21T07:48:54Z","title":"TraceFL: Achieving Interpretability in Federated Learning via Neuron\n Provenance","summary":" In Federated Learning, clients train models on local data and send updates to\na central server, which aggregates them into a global model using a fusion\nalgorithm. This collaborative yet privacy-preserving training comes at a\ncost--FL developers face significant challenges in attributing global model\npredictions to specific clients. Localizing responsible clients is a crucial\nstep towards (a) excluding clients primarily responsible for incorrect\npredictions and (b) encouraging clients who contributed high-quality models to\ncontinue participating in the future. 
Existing ML explainability approaches are\ninherently inapplicable as they are designed for single-model, centralized\ntraining.\n We introduce TraceFL, a fine-grained neuron provenance capturing mechanism\nthat identifies clients responsible for the global model's prediction by\ntracking the flow of information from individual clients to the global model.\nSince inference on different inputs activates a different set of neurons of the\nglobal model, TraceFL dynamically quantifies the significance of the global\nmodel's neurons in a given prediction. It then selectively picks a slice of the\nmost crucial neurons in the global model and maps them to the corresponding\nneurons in every participating client to determine each client's contribution,\nultimately localizing the responsible client. We evaluate TraceFL on six\ndatasets, including two real-world medical imaging datasets and four neural\nnetworks, including advanced models such as GPT. TraceFL achieves 99% accuracy\nin localizing the responsible client in FL tasks spanning both image and text\nclassification tasks. At a time when state-of-the-art ML debugging approaches\nare mostly domain-specific (e.g., image classification only), TraceFL is the\nfirst technique to enable highly accurate automated reasoning across a wide\nrange of FL applications.\n","authors":["Waris Gill","Ali Anwar","Muhammad Ali Gulzar"],"pdf_url":"https://arxiv.org/pdf/2312.13632v2.pdf","comment":"13 pages. TraceFL is the first interpretability technique in FL that\n can work on both image and text classification tasks. For source code please\n contact at waris@vt.edu"},{"id":"http://arxiv.org/abs/2408.07050v1","updated":"2024-08-13T17:37:40Z","published":"2024-08-13T17:37:40Z","title":"PSM: Learning Probabilistic Embeddings for Multi-scale Zero-Shot\n Soundscape Mapping","summary":" A soundscape is defined by the acoustic environment a person perceives at a\nlocation. In this work, we propose a framework for mapping soundscapes across\nthe Earth. Since soundscapes involve sound distributions that span varying\nspatial scales, we represent locations with multi-scale satellite imagery and\nlearn a joint representation among this imagery, audio, and text. To capture\nthe inherent uncertainty in the soundscape of a location, we design the\nrepresentation space to be probabilistic. We also fuse ubiquitous metadata\n(including geolocation, time, and data source) to enable learning of spatially\nand temporally dynamic representations of soundscapes. We demonstrate the\nutility of our framework by creating large-scale soundscape maps integrating\nboth audio and text with temporal control. To facilitate future research on\nthis task, we also introduce a large-scale dataset, GeoSound, containing over\n$300k$ geotagged audio samples paired with both low- and high-resolution\nsatellite imagery. We demonstrate that our method outperforms the existing\nstate-of-the-art on both GeoSound and the existing SoundingEarth dataset. Our\ndataset and code is available at https://github.com/mvrl/PSM.\n","authors":["Subash Khanal","Eric Xing","Srikumar Sastry","Aayush Dhakal","Zhexiao Xiong","Adeel Ahmad","Nathan Jacobs"],"pdf_url":"https://arxiv.org/pdf/2408.07050v1.pdf","comment":"Accepted at ACM MM 2024"},{"id":"http://arxiv.org/abs/2408.07040v1","updated":"2024-08-13T17:07:29Z","published":"2024-08-13T17:07:29Z","title":"KAN You See It? 
KANs and Sentinel for Effective and Explainable Crop\n Field Segmentation","summary":" Segmentation of crop fields is essential for enhancing agricultural\nproductivity, monitoring crop health, and promoting sustainable practices. Deep\nlearning models adopted for this task must ensure accurate and reliable\npredictions to avoid economic losses and environmental impact. The newly\nproposed Kolmogorov-Arnold networks (KANs) offer promising advancements in the\nperformance of neural networks. This paper analyzes the integration of KAN\nlayers into the U-Net architecture (U-KAN) to segment crop fields using\nSentinel-2 and Sentinel-1 satellite images and provides an analysis of the\nperformance and explainability of these networks. Our findings indicate a 2\\%\nimprovement in IoU compared to the traditional full-convolutional U-Net model\nin fewer GFLOPs. Furthermore, gradient-based explanation techniques show that\nU-KAN predictions are highly plausible and that the network has a very high\nability to focus on the boundaries of cultivated areas rather than on the areas\nthemselves. The per-channel relevance analysis also reveals that some channels\nare irrelevant to this task.\n","authors":["Daniele Rege Cambrin","Eleonora Poeta","Eliana Pastor","Tania Cerquitelli","Elena Baralis","Paolo Garza"],"pdf_url":"https://arxiv.org/pdf/2408.07040v1.pdf","comment":"Accepted at ECCV 2024 CVPPA Workshop"},{"id":"http://arxiv.org/abs/2408.07037v1","updated":"2024-08-13T17:05:06Z","published":"2024-08-13T17:05:06Z","title":"PathInsight: Instruction Tuning of Multimodal Datasets and Models for\n Intelligence Assisted Diagnosis in Histopathology","summary":" Pathological diagnosis remains the definitive standard for identifying\ntumors. The rise of multimodal large models has simplified the process of\nintegrating image analysis with textual descriptions. Despite this advancement,\nthe substantial costs associated with training and deploying these complex\nmultimodal models, together with a scarcity of high-quality training datasets,\ncreate a significant divide between cutting-edge technology and its application\nin the clinical setting. We had meticulously compiled a dataset of\napproximately 45,000 cases, covering over 6 different tasks, including the\nclassification of organ tissues, generating pathology report descriptions, and\naddressing pathology-related questions and answers. We have fine-tuned\nmultimodal large models, specifically LLaVA, Qwen-VL, InternLM, with this\ndataset to enhance instruction-based performance. We conducted a qualitative\nassessment of the capabilities of the base model and the fine-tuned model in\nperforming image captioning and classification tasks on the specific dataset.\nThe evaluation results demonstrate that the fine-tuned model exhibits\nproficiency in addressing typical pathological questions. 
We hope that by\nmaking both our models and datasets publicly available, they can be valuable to\nthe medical and research communities.\n","authors":["Xiaomin Wu","Rui Xu","Pengchen Wei","Wenkang Qin","Peixiang Huang","Ziheng Li","Lin Luo"],"pdf_url":"https://arxiv.org/pdf/2408.07037v1.pdf","comment":"10 pages, 2 figures"},{"id":"http://arxiv.org/abs/2405.09717v3","updated":"2024-08-13T16:49:40Z","published":"2024-05-15T22:18:39Z","title":"From NeRFs to Gaussian Splats, and Back","summary":" For robotics applications where there is a limited number of (typically\nego-centric) views, parametric representations such as neural radiance fields\n(NeRFs) generalize better than non-parametric ones such as Gaussian splatting\n(GS) to views that are very different from those in the training data; GS\nhowever can render much faster than NeRFs. We develop a procedure to convert\nback and forth between the two. Our approach achieves the best of both NeRFs\n(superior PSNR, SSIM, and LPIPS on dissimilar views, and a compact\nrepresentation) and GS (real-time rendering and ability for easily modifying\nthe representation); the computational cost of these conversions is minor\ncompared to training the two from scratch.\n","authors":["Siming He","Zach Osman","Pratik Chaudhari"],"pdf_url":"https://arxiv.org/pdf/2405.09717v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.07018v1","updated":"2024-08-13T16:34:06Z","published":"2024-08-13T16:34:06Z","title":"Efficient Human-Object-Interaction (EHOI) Detection via Interaction\n Label Coding and Conditional Decision","summary":" Human-Object Interaction (HOI) detection is a fundamental task in image\nunderstanding. While deep-learning-based HOI methods provide high performance\nin terms of mean Average Precision (mAP), they are computationally expensive\nand opaque in training and inference processes. An Efficient HOI (EHOI)\ndetector is proposed in this work to strike a good balance between detection\nperformance, inference complexity, and mathematical transparency. EHOI is a\ntwo-stage method. In the first stage, it leverages a frozen object detector to\nlocalize the objects and extract various features as intermediate outputs. In\nthe second stage, the first-stage outputs predict the interaction type using\nthe XGBoost classifier. Our contributions include the application of error\ncorrection codes (ECCs) to encode rare interaction cases, which reduces the\nmodel size and the complexity of the XGBoost classifier in the second stage.\nAdditionally, we provide a mathematical formulation of the relabeling and\ndecision-making process. Apart from the architecture, we present qualitative\nresults to explain the functionalities of the feedforward modules. Experimental\nresults demonstrate the advantages of ECC-coded interaction labels and the\nexcellent balance of detection performance and complexity of the proposed EHOI\nmethod.\n","authors":["Tsung-Shan Yang","Yun-Cheng Wang","Chengwei Wei","Suya You","C. -C. Jay Kuo"],"pdf_url":"https://arxiv.org/pdf/2408.07018v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.17609v2","updated":"2024-08-13T16:23:59Z","published":"2024-05-27T19:14:46Z","title":"GarmentCodeData: A Dataset of 3D Made-to-Measure Garments With Sewing\n Patterns","summary":" Recent research interest in the learning-based processing of garments, from\nvirtual fitting to generation and reconstruction, stumbles on a scarcity of\nhigh-quality public data in the domain. 
We contribute to resolving this need by\npresenting the first large-scale synthetic dataset of 3D made-to-measure\ngarments with sewing patterns, as well as its generation pipeline.\nGarmentCodeData contains 115,000 data points that cover a variety of designs in\nmany common garment categories: tops, shirts, dresses, jumpsuits, skirts,\npants, etc., fitted to a variety of body shapes sampled from a custom\nstatistical body model based on CAESAR, as well as a standard reference body\nshape, applying three different textile materials. To enable the creation of\ndatasets of such complexity, we introduce a set of algorithms for automatically\ntaking tailor's measures on sampled body shapes, sampling strategies for sewing\npattern design, and propose an automatic, open-source 3D garment draping\npipeline based on a fast XPBD simulator, while contributing several solutions\nfor collision resolution and drape correctness to enable scalability.\n Project Page: https://igl.ethz.ch/projects/GarmentCodeData/\n Dataset: https://doi.org/10.3929/ethz-b-000673889\n","authors":["Maria Korosteleva","Timur Levent Kesdogan","Fabian Kemper","Stephan Wenninger","Jasmin Koller","Yuhan Zhang","Mario Botsch","Olga Sorkine-Hornung"],"pdf_url":"https://arxiv.org/pdf/2405.17609v2.pdf","comment":"Accepted to ECCV 2024"},{"id":"http://arxiv.org/abs/2408.03596v2","updated":"2024-08-13T16:18:25Z","published":"2024-08-07T07:24:15Z","title":"Hierarchical Quantum Control Gates for Functional MRI Understanding","summary":" Quantum computing has emerged as a powerful tool for solving complex problems\nintractable for classical computers, particularly in popular fields such as\ncryptography, optimization, and neurocomputing. In this paper, we present a new\nquantum-based approach named the Hierarchical Quantum Control Gates (HQCG)\nmethod for efficient understanding of Functional Magnetic Resonance Imaging\n(fMRI) data. This approach includes two novel modules: the Local Quantum\nControl Gate (LQCG) and the Global Quantum Control Gate (GQCG), which are\ndesigned to extract local and global features of fMRI signals, respectively.\nOur method operates end-to-end on a quantum machine, leveraging quantum\nmechanics to learn patterns within extremely high-dimensional fMRI signals,\nsuch as 30,000 samples which is a challenge for classical computers. Empirical\nresults demonstrate that our approach significantly outperforms classical\nmethods. Additionally, we found that the proposed quantum model is more stable\nand less prone to overfitting than the classical methods.\n","authors":["Xuan-Bac Nguyen","Hoang-Quan Nguyen","Hugh Churchill","Samee U. Khan","Khoa Luu"],"pdf_url":"https://arxiv.org/pdf/2408.03596v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.07009v1","updated":"2024-08-13T16:15:50Z","published":"2024-08-13T16:15:50Z","title":"Imagen 3","summary":" We introduce Imagen 3, a latent diffusion model that generates high quality\nimages from text prompts. We describe our quality and responsibility\nevaluations. Imagen 3 is preferred over other state-of-the-art (SOTA) models at\nthe time of evaluation. 
In addition, we discuss issues around safety and\nrepresentation, as well as methods we used to minimize the potential harm of\nour models.\n","authors":[" Imagen-Team-Google"," :","Jason Baldridge","Jakob Bauer","Mukul Bhutani","Nicole Brichtova","Andrew Bunner","Kelvin Chan","Yichang Chen","Sander Dieleman","Yuqing Du","Zach Eaton-Rosen","Hongliang Fei","Nando de Freitas","Yilin Gao","Evgeny Gladchenko","Sergio Gómez Colmenarejo","Mandy Guo","Alex Haig","Will Hawkins","Hexiang Hu","Huilian Huang","Tobenna Peter Igwe","Christos Kaplanis","Siavash Khodadadeh","Yelin Kim","Ksenia Konyushkova","Karol Langner","Eric Lau","Shixin Luo","Soňa Mokrá","Henna Nandwani","Yasumasa Onoe","Aäron van den Oord","Zarana Parekh","Jordi Pont-Tuset","Hang Qi","Rui Qian","Deepak Ramachandran","Poorva Rane","Abdullah Rashwan","Ali Razavi","Robert Riachi","Hansa Srinivasan","Srivatsan Srinivasan","Robin Strudel","Benigno Uria","Oliver Wang","Su Wang","Austin Waters","Chris Wolff","Auriel Wright","Zhisheng Xiao","Hao Xiong","Keyang Xu","Marc van Zee","Junlin Zhang","Katie Zhang","Wenlei Zhou","Konrad Zolna","Ola Aboubakar","Canfer Akbulut","Oscar Akerlund","Isabela Albuquerque","Nina Anderson","Marco Andreetto","Lora Aroyo","Ben Bariach","David Barker","Sherry Ben","Dana Berman","Courtney Biles","Irina Blok","Pankil Botadra","Jenny Brennan","Karla Brown","John Buckley","Rudy Bunel","Elie Bursztein","Christina Butterfield","Ben Caine","Viral Carpenter","Norman Casagrande","Ming-Wei Chang","Solomon Chang","Shamik Chaudhuri","Tony Chen","John Choi","Dmitry Churbanau","Nathan Clement","Matan Cohen","Forrester Cole","Mikhail Dektiarev","Vincent Du","Praneet Dutta","Tom Eccles","Ndidi Elue","Ashley Feden","Shlomi Fruchter","Frankie Garcia","Roopal Garg","Weina Ge","Ahmed Ghazy","Bryant Gipson","Andrew Goodman","Dawid Górny","Sven Gowal","Khyatti Gupta","Yoni Halpern","Yena Han","Susan Hao","Jamie Hayes","Amir Hertz","Ed Hirst","Tingbo Hou","Heidi Howard","Mohamed Ibrahim","Dirichi Ike-Njoku","Joana Iljazi","Vlad Ionescu","William Isaac","Reena Jana","Gemma Jennings","Donovon Jenson","Xuhui Jia","Kerry Jones","Xiaoen Ju","Ivana Kajic","Christos Kaplanis","Burcu Karagol Ayan","Jacob Kelly","Suraj Kothawade","Christina Kouridi","Ira Ktena","Jolanda Kumakaw","Dana Kurniawan","Dmitry Lagun","Lily Lavitas","Jason Lee","Tao Li","Marco Liang","Maggie Li-Calis","Yuchi Liu","Javier Lopez Alberca","Peggy Lu","Kristian Lum","Yukun Ma","Chase Malik","John Mellor","Inbar Mosseri","Tom Murray","Aida Nematzadeh","Paul Nicholas","João Gabriel Oliveira","Guillermo Ortiz-Jimenez","Michela Paganini","Tom Le Paine","Roni Paiss","Alicia Parrish","Anne Peckham","Vikas Peswani","Igor Petrovski","Tobias Pfaff","Alex Pirozhenko","Ryan Poplin","Utsav Prabhu","Yuan Qi","Matthew Rahtz","Cyrus Rashtchian","Charvi Rastogi","Amit Raul","Ali Razavi","Sylvestre-Alvise Rebuffi","Susanna Ricco","Felix Riedel","Dirk Robinson","Pankaj Rohatgi","Bill Rosgen","Sarah Rumbley","Moonkyung Ryu","Anthony Salgado","Sahil Singla","Florian Schroff","Candice Schumann","Tanmay Shah","Brendan Shillingford","Kaushik Shivakumar","Dennis Shtatnov","Zach Singer","Evgeny Sluzhaev","Valerii Sokolov","Thibault Sottiaux","Florian Stimberg","Brad Stone","David Stutz","Yu-Chuan Su","Eric Tabellion","Shuai Tang","David Tao","Kurt Thomas","Gregory Thornton","Andeep Toor","Cristian Udrescu","Aayush Upadhyay","Cristina Vasconcelos","Alex Vasiloff","Andrey Voynov","Amanda Walker","Luyu Wang","Miaosen Wang","Simon Wang","Stanley Wang","Qifei Wang","Yuxiao Wang","Ágoston 
Weisz","Olivia Wiles","Chenxia Wu","Xingyu Federico Xu","Andrew Xue","Jianbo Yang","Luo Yu","Mete Yurtoglu","Ali Zand","Han Zhang","Jiageng Zhang","Catherine Zhao","Adilet Zhaxybay","Miao Zhou","Shengqi Zhu","Zhenkai Zhu","Dawn Bloxwich","Mahyar Bordbar","Luis C. Cobo","Eli Collins","Shengyang Dai","Tulsee Doshi","Anca Dragan","Douglas Eck","Demis Hassabis","Sissie Hsiao","Tom Hume","Koray Kavukcuoglu","Helen King","Jack Krawczyk","Yeqing Li","Kathy Meier-Hellstern","Andras Orban","Yury Pinsky","Amar Subramanya","Oriol Vinyals","Ting Yu","Yori Zwols"],"pdf_url":"https://arxiv.org/pdf/2408.07009v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.18934v2","updated":"2024-08-13T16:01:14Z","published":"2024-02-15T10:34:28Z","title":"The Visual Experience Dataset: Over 200 Recorded Hours of Integrated Eye\n Movement, Odometry, and Egocentric Video","summary":" We introduce the Visual Experience Dataset (VEDB), a compilation of over 240\nhours of egocentric video combined with gaze- and head-tracking data that\noffers an unprecedented view of the visual world as experienced by human\nobservers. The dataset consists of 717 sessions, recorded by 58 observers\nranging from 6-49 years old. This paper outlines the data collection,\nprocessing, and labeling protocols undertaken to ensure a representative sample\nand discusses the potential sources of error or bias within the dataset. The\nVEDB's potential applications are vast, including improving gaze tracking\nmethodologies, assessing spatiotemporal image statistics, and refining deep\nneural networks for scene and activity recognition. The VEDB is accessible\nthrough established open science platforms and is intended to be a living\ndataset with plans for expansion and community contributions. It is released\nwith an emphasis on ethical considerations, such as participant privacy and the\nmitigation of potential biases. By providing a dataset grounded in real-world\nexperiences and accompanied by extensive metadata and supporting code, the\nauthors invite the research community to utilize and contribute to the VEDB,\nfacilitating a richer understanding of visual perception and behavior in\nnaturalistic settings.\n","authors":["Michelle R. Greene","Benjamin J. Balas","Mark D. Lescroart","Paul R. MacNeilage","Jennifer A. Hart","Kamran Binaee","Peter A. Hausamann","Ronald Mezile","Bharath Shankar","Christian B. Sinnott","Kaylie Capurro","Savannah Halow","Hunter Howe","Mariam Josyula","Annie Li","Abraham Mieses","Amina Mohamed","Ilya Nudnou","Ezra Parkhill","Peter Riley","Brett Schmidt","Matthew W. Shinkle","Wentao Si","Brian Szekely","Joaquin M. Torres","Eliana Weissmann"],"pdf_url":"https://arxiv.org/pdf/2404.18934v2.pdf","comment":"40 pages, 1 table, 9 figures"},{"id":"http://arxiv.org/abs/2312.02902v2","updated":"2024-08-13T15:56:58Z","published":"2023-12-05T17:19:22Z","title":"HeadGaS: Real-Time Animatable Head Avatars via 3D Gaussian Splatting","summary":" 3D head animation has seen major quality and runtime improvements over the\nlast few years, particularly empowered by the advances in differentiable\nrendering and neural radiance fields. Real-time rendering is a highly desirable\ngoal for real-world applications. We propose HeadGaS, a model that uses 3D\nGaussian Splats (3DGS) for 3D head reconstruction and animation. 
In this paper\nwe introduce a hybrid model that extends the explicit 3DGS representation with\na base of learnable latent features, which can be linearly blended with\nlow-dimensional parameters from parametric head models to obtain\nexpression-dependent color and opacity values. We demonstrate that HeadGaS\ndelivers state-of-the-art results in real-time inference frame rates,\nsurpassing baselines by up to 2dB, while accelerating rendering speed by over\nx10.\n","authors":["Helisa Dhamo","Yinyu Nie","Arthur Moreau","Jifei Song","Richard Shaw","Yiren Zhou","Eduardo Pérez-Pellitero"],"pdf_url":"https://arxiv.org/pdf/2312.02902v2.pdf","comment":"accepted to ECCV 2024"},{"id":"http://arxiv.org/abs/2408.06995v1","updated":"2024-08-13T15:56:20Z","published":"2024-08-13T15:56:20Z","title":"Low-Bitwidth Floating Point Quantization for Efficient High-Quality\n Diffusion Models","summary":" Diffusion models are emerging models that generate images by iteratively\ndenoising random Gaussian noise using deep neural networks. These models\ntypically exhibit high computational and memory demands, necessitating\neffective post-training quantization for high-performance inference. Recent\nworks propose low-bitwidth (e.g., 8-bit or 4-bit) quantization for diffusion\nmodels, however 4-bit integer quantization typically results in low-quality\nimages. We observe that on several widely used hardware platforms, there is\nlittle or no difference in compute capability between floating-point and\ninteger arithmetic operations of the same bitwidth (e.g., 8-bit or 4-bit).\nTherefore, we propose an effective floating-point quantization method for\ndiffusion models that provides better image quality compared to integer\nquantization methods. We employ a floating-point quantization method that was\neffective for other processing tasks, specifically computer vision and natural\nlanguage tasks, and tailor it for diffusion models by integrating weight\nrounding learning during the mapping of the full-precision values to the\nquantized values in the quantization process. We comprehensively study integer\nand floating-point quantization methods in state-of-the-art diffusion models.\nOur floating-point quantization method not only generates higher-quality images\nthan that of integer quantization methods, but also shows no noticeable\ndegradation compared to full-precision models (32-bit floating-point), when\nboth weights and activations are quantized to 8-bit floating-point values,\nwhile has minimal degradation with 4-bit weights and 8-bit activations.\n","authors":["Cheng Chen","Christina Giannoula","Andreas Moshovos"],"pdf_url":"https://arxiv.org/pdf/2408.06995v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2205.14332v4","updated":"2024-08-13T15:43:26Z","published":"2022-05-28T04:45:07Z","title":"V4d: voxel for 4d novel view synthesis","summary":" Neural radiance fields have made a remarkable breakthrough in the novel view\nsynthesis task at the 3D static scene. However, for the 4D circumstance (e.g.,\ndynamic scene), the performance of the existing method is still limited by the\ncapacity of the neural network, typically in a multilayer perceptron network\n(MLP). In this paper, we utilize 3D Voxel to model the 4D neural radiance\nfield, short as V4D, where the 3D voxel has two formats. The first one is to\nregularly model the 3D space and then use the sampled local 3D feature with the\ntime index to model the density field and the texture field by a tiny MLP. 
The\nsecond one is in look-up tables (LUTs) format that is for the pixel-level\nrefinement, where the pseudo-surface produced by the volume rendering is\nutilized as the guidance information to learn a 2D pixel-level refinement\nmapping. The proposed LUTs-based refinement module achieves the performance\ngain with little computational cost and could serve as the plug-and-play module\nin the novel view synthesis task. Moreover, we propose a more effective\nconditional positional encoding toward the 4D data that achieves performance\ngain with negligible computational burdens. Extensive experiments demonstrate\nthat the proposed method achieves state-of-the-art performance at a low\ncomputational cost.\n","authors":["Wanshui Gan","Hongbin Xu","Yi Huang","Shifeng Chen","Naoto Yokoya"],"pdf_url":"https://arxiv.org/pdf/2205.14332v4.pdf","comment":"Code released. Accepted by IEEE TVCG 2023"},{"id":"http://arxiv.org/abs/2408.06975v1","updated":"2024-08-13T15:32:54Z","published":"2024-08-13T15:32:54Z","title":"SpectralGaussians: Semantic, spectral 3D Gaussian splatting for\n multi-spectral scene representation, visualization and analysis","summary":" We propose a novel cross-spectral rendering framework based on 3D Gaussian\nSplatting (3DGS) that generates realistic and semantically meaningful splats\nfrom registered multi-view spectrum and segmentation maps. This extension\nenhances the representation of scenes with multiple spectra, providing insights\ninto the underlying materials and segmentation. We introduce an improved\nphysically-based rendering approach for Gaussian splats, estimating reflectance\nand lights per spectra, thereby enhancing accuracy and realism. In a\ncomprehensive quantitative and qualitative evaluation, we demonstrate the\nsuperior performance of our approach with respect to other recent\nlearning-based spectral scene representation approaches (i.e., XNeRF and\nSpectralNeRF) as well as other non-spectral state-of-the-art learning-based\napproaches. Our work also demonstrates the potential of spectral scene\nunderstanding for precise scene editing techniques like style transfer,\ninpainting, and removal. Thereby, our contributions address challenges in\nmulti-spectral scene representation, rendering, and editing, offering new\npossibilities for diverse applications.\n","authors":["Saptarshi Neil Sinha","Holger Graf","Michael Weinmann"],"pdf_url":"https://arxiv.org/pdf/2408.06975v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.06970v1","updated":"2024-08-13T15:27:43Z","published":"2024-08-13T15:27:43Z","title":"Prompt-Based Segmentation at Multiple Resolutions and Lighting\n Conditions using Segment Anything Model 2","summary":" This paper provides insight into the effectiveness of zero-shot,\nprompt-based, Segment Anything Model (SAM), and its updated version, SAM 2, and\nthe non-promptable, conventional convolutional network (CNN), in segmenting\nsolar panels, in RGB aerial imagery, across lighting conditions, spatial\nresolutions, and prompt strategies. SAM 2 demonstrates improvements over SAM,\nparticularly in sub-optimal lighting conditions when prompted by points. Both\nSAMs, prompted by user-box, outperformed CNN, in all scenarios. Additionally,\nYOLOv9 prompting outperformed user points prompting. In high-resolution\nimagery, both in optimal and sub-optimal lighting conditions, Eff-UNet\noutperformed both SAM models prompted by YOLOv9 boxes, positioning Eff-UNet as\nthe appropriate model for automatic segmentation in high-resolution data. 
In\nlow-resolution data, user box prompts were found crucial to achieve a\nreasonable performance. This paper provides details on strengths and\nlimitations of each model and outlines robustness of user prompted image\nsegmentation models in inconsistent resolution and lighting conditions of\nremotely sensed data.\n","authors":["Osher Rafaeli","Tal Svoray","Ariel Nahlieli"],"pdf_url":"https://arxiv.org/pdf/2408.06970v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.06968v1","updated":"2024-08-13T15:25:18Z","published":"2024-08-13T15:25:18Z","title":"Event-Stream Super Resolution using Sigma-Delta Neural Network","summary":" This study introduces a novel approach to enhance the spatial-temporal\nresolution of time-event pixels based on luminance changes captured by event\ncameras. These cameras present unique challenges due to their low resolution\nand the sparse, asynchronous nature of the data they collect. Current event\nsuper-resolution algorithms are not fully optimized for the distinct data\nstructure produced by event cameras, resulting in inefficiencies in capturing\nthe full dynamism and detail of visual scenes with improved computational\ncomplexity. To bridge this gap, our research proposes a method that integrates\nbinary spikes with Sigma Delta Neural Networks (SDNNs), leveraging\nspatiotemporal constraint learning mechanism designed to simultaneously learn\nthe spatial and temporal distributions of the event stream. The proposed\nnetwork is evaluated using widely recognized benchmark datasets, including\nN-MNIST, CIFAR10-DVS, ASL-DVS, and Event-NFS. A comprehensive evaluation\nframework is employed, assessing both the accuracy, through root mean square\nerror (RMSE), and the computational efficiency of our model. The findings\ndemonstrate significant improvements over existing state-of-the-art methods,\nspecifically, the proposed method outperforms state-of-the-art performance in\ncomputational efficiency, achieving a 17.04-fold improvement in event sparsity\nand a 32.28-fold increase in synaptic operation efficiency over traditional\nartificial neural networks, alongside a two-fold better performance over\nspiking neural networks.\n","authors":["Waseem Shariff","Joe Lemley","Peter Corcoran"],"pdf_url":"https://arxiv.org/pdf/2408.06968v1.pdf","comment":"ECCV: The 18th European Conference on Computer Vision ECCV 2024 NeVi\n Workshop"},{"id":"http://arxiv.org/abs/2408.00388v2","updated":"2024-08-13T15:10:20Z","published":"2024-08-01T08:57:47Z","title":"Deepfake Media Forensics: State of the Art and Challenges Ahead","summary":" AI-generated synthetic media, also called Deepfakes, have significantly\ninfluenced so many domains, from entertainment to cybersecurity. Generative\nAdversarial Networks (GANs) and Diffusion Models (DMs) are the main frameworks\nused to create Deepfakes, producing highly realistic yet fabricated content.\nWhile these technologies open up new creative possibilities, they also bring\nsubstantial ethical and security risks due to their potential misuse. The rise\nof such advanced media has led to the development of a cognitive bias known as\nImpostor Bias, where individuals doubt the authenticity of multimedia due to\nthe awareness of AI's capabilities. As a result, Deepfake detection has become\na vital area of research, focusing on identifying subtle inconsistencies and\nartifacts with machine learning techniques, especially Convolutional Neural\nNetworks (CNNs). 
Research in forensic Deepfake technology encompasses five main\nareas: detection, attribution and recognition, passive authentication,\ndetection in realistic scenarios, and active authentication. This paper reviews\nthe primary algorithms that address these challenges, examining their\nadvantages, limitations, and future prospects.\n","authors":["Irene Amerini","Mauro Barni","Sebastiano Battiato","Paolo Bestagini","Giulia Boato","Tania Sari Bonaventura","Vittoria Bruni","Roberto Caldelli","Francesco De Natale","Rocco De Nicola","Luca Guarnera","Sara Mandelli","Gian Luca Marcialis","Marco Micheletto","Andrea Montibeller","Giulia Orru'","Alessandro Ortis","Pericle Perazzo","Giovanni Puglisi","Davide Salvi","Stefano Tubaro","Claudia Melis Tonti","Massimo Villari","Domenico Vitulano"],"pdf_url":"https://arxiv.org/pdf/2408.00388v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.05477v3","updated":"2024-08-13T14:44:38Z","published":"2024-05-09T00:30:45Z","title":"DynaSeg: A Deep Dynamic Fusion Method for Unsupervised Image\n Segmentation Incorporating Feature Similarity and Spatial Continuity","summary":" Our work tackles the fundamental challenge of image segmentation in computer\nvision, which is crucial for diverse applications. While supervised methods\ndemonstrate proficiency, their reliance on extensive pixel-level annotations\nlimits scalability. We introduce DynaSeg, an innovative unsupervised image\nsegmentation approach that overcomes the challenge of balancing feature\nsimilarity and spatial continuity without relying on extensive hyperparameter\ntuning. Unlike traditional methods, DynaSeg employs a dynamic weighting scheme\nthat automates parameter tuning, adapts flexibly to image characteristics, and\nfacilitates easy integration with other segmentation networks. By incorporating\na Silhouette Score Phase, DynaSeg prevents undersegmentation failures where the\nnumber of predicted clusters might converge to one. DynaSeg uses CNN-based and\npre-trained ResNet feature extraction, making it computationally efficient and\nmore straightforward than other complex models. Experimental results showcase\nstate-of-the-art performance, achieving a 12.2% and 14.12% mIOU improvement\nover current unsupervised segmentation approaches on COCO-All and COCO-Stuff\ndatasets, respectively. We provide qualitative and quantitative results on five\nbenchmark datasets, demonstrating the efficacy of the proposed approach.Code is\navailable at https://github.com/RyersonMultimediaLab/DynaSeg\n","authors":["Boujemaa Guermazi","Naimul Khan"],"pdf_url":"https://arxiv.org/pdf/2405.05477v3.pdf","comment":"In Press: Image and Vision Computing Journal"},{"id":"http://arxiv.org/abs/2312.08879v3","updated":"2024-08-13T14:32:37Z","published":"2023-12-12T11:00:39Z","title":"Regularizing Self-supervised 3D Scene Flows with Surface Awareness and\n Cyclic Consistency","summary":" Learning without supervision how to predict 3D scene flows from point clouds\nis essential to many perception systems. We propose a novel learning framework\nfor this task which improves the necessary regularization. Relying on the\nassumption that scene elements are mostly rigid, current smoothness losses are\nbuilt on the definition of \"rigid clusters\" in the input point clouds. The\ndefinition of these clusters is challenging and has a significant impact on the\nquality of predicted flows. We introduce two new consistency losses that\nenlarge clusters while preventing them from spreading over distinct objects. 
In\nparticular, we enforce \\emph{temporal} consistency with a forward-backward\ncyclic loss and \\emph{spatial} consistency by considering surface orientation\nsimilarity in addition to spatial proximity. The proposed losses are\nmodel-independent and can thus be used in a plug-and-play fashion to\nsignificantly improve the performance of existing models, as demonstrated on\ntwo most widely used architectures. We also showcase the effectiveness and\ngeneralization capability of our framework on four standard sensor-unique\ndriving datasets, achieving state-of-the-art performance in 3D scene flow\nestimation. Our codes are available on https://github.com/ctu-vras/sac-flow.\n","authors":["Patrik Vacek","David Hurych","Karel Zimmermann","Patrick Perez","Tomas Svoboda"],"pdf_url":"https://arxiv.org/pdf/2312.08879v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.06927v1","updated":"2024-08-13T14:29:00Z","published":"2024-08-13T14:29:00Z","title":"Breaking Class Barriers: Efficient Dataset Distillation via Inter-Class\n Feature Compensator","summary":" Dataset distillation has emerged as a technique aiming to condense\ninformative features from large, natural datasets into a compact and synthetic\nform. While recent advancements have refined this technique, its performance is\nbottlenecked by the prevailing class-specific synthesis paradigm. Under this\nparadigm, synthetic data is optimized exclusively for a pre-assigned one-hot\nlabel, creating an implicit class barrier in feature condensation. This leads\nto inefficient utilization of the distillation budget and oversight of\ninter-class feature distributions, which ultimately limits the effectiveness\nand efficiency, as demonstrated in our analysis.\n To overcome these constraints, this paper presents the Inter-class Feature\nCompensator (INFER), an innovative distillation approach that transcends the\nclass-specific data-label framework widely utilized in current dataset\ndistillation methods. Specifically, INFER leverages a Universal Feature\nCompensator (UFC) to enhance feature integration across classes, enabling the\ngeneration of multiple additional synthetic instances from a single UFC input.\nThis significantly improves the efficiency of the distillation budget.\n Moreover, INFER enriches inter-class interactions during the distillation,\nthereby enhancing the effectiveness and generalizability of the distilled data.\nBy allowing for the linear interpolation of labels similar to those in the\noriginal dataset, INFER meticulously optimizes the synthetic data and\ndramatically reduces the size of soft labels in the synthetic dataset to almost\nzero, establishing a new benchmark for efficiency and effectiveness in dataset\ndistillation.\n","authors":["Xin Zhang","Jiawei Du","Ping Liu","Joey Tianyi Zhou"],"pdf_url":"https://arxiv.org/pdf/2408.06927v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.06926v1","updated":"2024-08-13T14:26:30Z","published":"2024-08-13T14:26:30Z","title":"SceneGPT: A Language Model for 3D Scene Understanding","summary":" Building models that can understand and reason about 3D scenes is difficult\nowing to the lack of data sources for 3D supervised training and large-scale\ntraining regimes. In this work we ask - How can the knowledge in a pre-trained\nlanguage model be leveraged for 3D scene understanding without any 3D\npre-training. 
The aim of this work is to establish whether pre-trained LLMs\npossess priors/knowledge required for reasoning in 3D space and how can we\nprompt them such that they can be used for general purpose spatial reasoning\nand object understanding in 3D. To this end, we present SceneGPT, an LLM based\nscene understanding system which can perform 3D spatial reasoning without\ntraining or explicit 3D supervision. The key components of our framework are -\n1) a 3D scene graph, that serves as scene representation, encoding the objects\nin the scene and their spatial relationships 2) a pre-trained LLM that can be\nadapted with in context learning for 3D spatial reasoning. We evaluate our\nframework qualitatively on object and scene understanding tasks including\nobject semantics, physical properties and affordances (object-level) and\nspatial understanding (scene-level).\n","authors":["Shivam Chandhok"],"pdf_url":"https://arxiv.org/pdf/2408.06926v1.pdf","comment":"UBC Report"},{"id":"http://arxiv.org/abs/2404.08363v3","updated":"2024-08-13T14:24:12Z","published":"2024-04-12T10:04:03Z","title":"Let-It-Flow: Simultaneous Optimization of 3D Flow and Object Clustering","summary":" We study the problem of self-supervised 3D scene flow estimation from real\nlarge-scale raw point cloud sequences, which is crucial to various tasks like\ntrajectory prediction or instance segmentation. In the absence of ground truth\nscene flow labels, contemporary approaches concentrate on deducing optimizing\nflow across sequential pairs of point clouds by incorporating structure based\nregularization on flow and object rigidity. The rigid objects are estimated by\na variety of 3D spatial clustering methods. While state-of-the-art methods\nsuccessfully capture overall scene motion using the Neural Prior structure,\nthey encounter challenges in discerning multi-object motions. We identified the\nstructural constraints and the use of large and strict rigid clusters as the\nmain pitfall of the current approaches and we propose a novel clustering\napproach that allows for combination of overlapping soft clusters as well as\nnon-overlapping rigid clusters representation. Flow is then jointly estimated\nwith progressively growing non-overlapping rigid clusters together with fixed\nsize overlapping soft clusters. We evaluate our method on multiple datasets\nwith LiDAR point clouds, demonstrating the superior performance over the\nself-supervised baselines reaching new state of the art results. Our method\nespecially excels in resolving flow in complicated dynamic scenes with multiple\nindependently moving objects close to each other which includes pedestrians,\ncyclists and other vulnerable road users. Our codes are publicly available on\nhttps://github.com/ctu-vras/let-it-flow.\n","authors":["Patrik Vacek","David Hurych","Tomáš Svoboda","Karel Zimmermann"],"pdf_url":"https://arxiv.org/pdf/2404.08363v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.05905v2","updated":"2024-08-13T13:55:03Z","published":"2024-08-12T03:31:29Z","title":"Weakly Supervised Video Anomaly Detection and Localization with\n Spatio-Temporal Prompts","summary":" Current weakly supervised video anomaly detection (WSVAD) task aims to\nachieve frame-level anomalous event detection with only coarse video-level\nannotations available. Existing works typically involve extracting global\nfeatures from full-resolution video frames and training frame-level classifiers\nto detect anomalies in the temporal dimension. 
However, most anomalous events\ntend to occur in localized spatial regions rather than the entire video frames,\nwhich implies existing frame-level feature based works may be misled by the\ndominant background information and lack the interpretation of the detected\nanomalies. To address this dilemma, this paper introduces a novel method called\nSTPrompt that learns spatio-temporal prompt embeddings for weakly supervised\nvideo anomaly detection and localization (WSVADL) based on pre-trained\nvision-language models (VLMs). Our proposed method employs a two-stream network\nstructure, with one stream focusing on the temporal dimension and the other\nprimarily on the spatial dimension. By leveraging the learned knowledge from\npre-trained VLMs and incorporating natural motion priors from raw videos, our\nmodel learns prompt embeddings that are aligned with spatio-temporal regions of\nvideos (e.g., patches of individual frames) for identify specific local regions\nof anomalies, enabling accurate video anomaly detection while mitigating the\ninfluence of background information. Without relying on detailed\nspatio-temporal annotations or auxiliary object detection/tracking, our method\nachieves state-of-the-art performance on three public benchmarks for the WSVADL\ntask.\n","authors":["Peng Wu","Xuerong Zhou","Guansong Pang","Zhiwei Yang","Qingsen Yan","Peng Wang","Yanning Zhang"],"pdf_url":"https://arxiv.org/pdf/2408.05905v2.pdf","comment":"Accepted by ACMMM2024"},{"id":"http://arxiv.org/abs/2408.06901v1","updated":"2024-08-13T13:51:34Z","published":"2024-08-13T13:51:34Z","title":"Divide and Conquer: Improving Multi-Camera 3D Perception with 2D\n Semantic-Depth Priors and Input-Dependent Queries","summary":" 3D perception tasks, such as 3D object detection and Bird's-Eye-View (BEV)\nsegmentation using multi-camera images, have drawn significant attention\nrecently. Despite the fact that accurately estimating both semantic and 3D\nscene layouts are crucial for this task, existing techniques often neglect the\nsynergistic effects of semantic and depth cues, leading to the occurrence of\nclassification and position estimation errors. Additionally, the\ninput-independent nature of initial queries also limits the learning capacity\nof Transformer-based models. To tackle these challenges, we propose an\ninput-aware Transformer framework that leverages Semantics and Depth as priors\n(named SDTR). Our approach involves the use of an S-D Encoder that explicitly\nmodels semantic and depth priors, thereby disentangling the learning process of\nobject categorization and position estimation. Moreover, we introduce a\nPrior-guided Query Builder that incorporates the semantic prior into the\ninitial queries of the Transformer, resulting in more effective input-aware\nqueries. 
Extensive experiments on the nuScenes and Lyft benchmarks demonstrate\nthe state-of-the-art performance of our method in both 3D object detection and\nBEV segmentation tasks.\n","authors":["Qi Song","Qingyong Hu","Chi Zhang","Yongquan Chen","Rui Huang"],"pdf_url":"https://arxiv.org/pdf/2408.06901v1.pdf","comment":"Accepted by TIP 2024"},{"id":"http://arxiv.org/abs/2408.06899v1","updated":"2024-08-13T13:50:46Z","published":"2024-08-13T13:50:46Z","title":"EE3P3D: Event-based Estimation of Periodic Phenomena Frequency using 3D\n Correlation","summary":" We present a novel method for measuring the frequency of periodic phenomena,\ne.g., rotation, flicker and vibration, by an event camera, a device\nasynchronously reporting brightness changes at independently operating pixels\nwith high temporal resolution. The approach assumes that for a periodic\nphenomenon, a highly similar set of events is generated within a specific\nspatio-temporal window at a time difference corresponding to the phenomenon's\nperiod. The sets of similar events are detected by 3D spatio-temporal\ncorrelation in the event stream space. The proposed method, EE3P3D, is\nevaluated on a dataset of 12 sequences of periodic phenomena, i.e. flashing\nlight and vibration, and periodic motion, e.g., rotation, ranging from 3.2 Hz\nto 2 kHz (equivalent to 192 - 120 000 RPM). EE3P3D significantly outperforms\npublished methods on this dataset, achieving a mean relative error of 0.1%.\n","authors":["Jakub Kolář","Radim Špetlík","Jiří Matas"],"pdf_url":"https://arxiv.org/pdf/2408.06899v1.pdf","comment":"15 paper pages + 11 suppl pages, 15 figues, 4 tables"},{"id":"http://arxiv.org/abs/2408.06891v1","updated":"2024-08-13T13:38:32Z","published":"2024-08-13T13:38:32Z","title":"Automatic Feature Recognition and Dimensional Attributes Extraction From\n CAD Models for Hybrid Additive-Subtractive Manufacturing","summary":" The integration of Computer-Aided Design (CAD), Computer-Aided Process\nPlanning (CAPP), and Computer-Aided Manufacturing (CAM) plays a crucial role in\nmodern manufacturing, facilitating seamless transitions from digital designs to\nphysical products. However, a significant challenge within this integration is\nthe Automatic Feature Recognition (AFR) of CAD models, especially in the\ncontext of hybrid manufacturing that combines subtractive and additive\nmanufacturing processes. Traditional AFR methods, focused mainly on the\nidentification of subtractive (machined) features including holes, fillets,\nchamfers, pockets, and slots, fail to recognize features pertinent to additive\nmanufacturing. Furthermore, the traditional methods fall short in accurately\nextracting geometric dimensions and orientations, which are also key factors\nfor effective manufacturing process planning. This paper presents a novel\napproach for creating a synthetic CAD dataset that encompasses features\nrelevant to both additive and subtractive machining through Python Open\nCascade. The Hierarchical Graph Convolutional Neural Network (HGCNN) model is\nimplemented to accurately identify the composite additive-subtractive features\nwithin the synthetic CAD dataset. The key novelty and contribution of the\nproposed methodology lie in its ability to recognize a wide range of\nmanufacturing features, and precisely extracting their dimensions,\norientations, and stock sizes. The proposed model demonstrates remarkable\nfeature recognition accuracy exceeding 97% and a dimension extraction accuracy\nof 100% for identified features. 
Therefore, the proposed methodology enhances\nthe integration of CAD, CAPP, and CAM within hybrid manufacturing by providing\nprecise feature recognition and dimension extraction. It facilitates improved\nmanufacturing process planning, by enabling more informed decision-making.\n","authors":["Muhammad Tayyab Khan","Wenhe Feng","Lequn Chen","Ye Han Ng","Nicholas Yew Jin Tan","Seung Ki Moon"],"pdf_url":"https://arxiv.org/pdf/2408.06891v1.pdf","comment":"10 pages, 12 figures. This paper has been accepted for presentation\n at the ASME IDETC-CIE 2024 conference"},{"id":"http://arxiv.org/abs/2408.06878v1","updated":"2024-08-13T13:26:24Z","published":"2024-08-13T13:26:24Z","title":"PBIR-NIE: Glossy Object Capture under Non-Distant Lighting","summary":" Glossy objects present a significant challenge for 3D reconstruction from\nmulti-view input images under natural lighting. In this paper, we introduce\nPBIR-NIE, an inverse rendering framework designed to holistically capture the\ngeometry, material attributes, and surrounding illumination of such objects. We\npropose a novel parallax-aware non-distant environment map as a lightweight and\nefficient lighting representation, accurately modeling the near-field\nbackground of the scene, which is commonly encountered in real-world capture\nsetups. This feature allows our framework to accommodate complex parallax\neffects beyond the capabilities of standard infinite-distance environment maps.\nOur method optimizes an underlying signed distance field (SDF) through\nphysics-based differentiable rendering, seamlessly connecting surface gradients\nbetween a triangle mesh and the SDF via neural implicit evolution (NIE). To\naddress the intricacies of highly glossy BRDFs in differentiable rendering, we\nintegrate the antithetic sampling algorithm to mitigate variance in the Monte\nCarlo gradient estimator. Consequently, our framework exhibits robust\ncapabilities in handling glossy object reconstruction, showcasing superior\nquality in geometry, relighting, and material estimation.\n","authors":["Guangyan Cai","Fujun Luan","Miloš Hašan","Kai Zhang","Sai Bi","Zexiang Xu","Iliyan Georgiev","Shuang Zhao"],"pdf_url":"https://arxiv.org/pdf/2408.06878v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.05079v5","updated":"2024-08-13T13:25:34Z","published":"2024-05-08T14:22:39Z","title":"Power Variable Projection for Initialization-Free Large-Scale Bundle\n Adjustment","summary":" Most Bundle Adjustment (BA) solvers like the Levenberg-Marquardt algorithm\nrequire a good initialization. Instead, initialization-free BA remains a\nlargely uncharted territory. The under-explored Variable Projection algorithm\n(VarPro) exhibits a wide convergence basin even without initialization. Coupled\nwith object space error formulation, recent works have shown its ability to\nsolve small-scale initialization-free bundle adjustment problem. To make such\ninitialization-free BA approaches scalable, we introduce Power Variable\nProjection (PoVar), extending a recent inverse expansion method based on power\nseries. Importantly, we link the power series expansion to Riemannian manifold\noptimization. This projective framework is crucial to solve large-scale bundle\nadjustment problems without initialization. Using the real-world BAL dataset,\nwe experimentally demonstrate that our solver achieves state-of-the-art results\nin terms of speed and accuracy. 
To our knowledge, this work is the first to\naddress the scalability of BA without initialization opening new venues for\ninitialization-free structure-from-motion.\n","authors":["Simon Weber","Je Hyeong Hong","Daniel Cremers"],"pdf_url":"https://arxiv.org/pdf/2405.05079v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.09447v3","updated":"2024-08-13T13:24:33Z","published":"2024-04-15T04:20:01Z","title":"kNN-CLIP: Retrieval Enables Training-Free Segmentation on Continually\n Expanding Large Vocabularies","summary":" Continual segmentation has not yet tackled the challenge of improving\nopen-vocabulary segmentation models with training data for accurate\nsegmentation across large, continually expanding vocabularies. We discover that\ntraditional continual training results in severe catastrophic forgetting,\nfailing to outperform a zero-shot segmentation baseline. We introduce a novel\ntraining-free strategy, kNN-CLIP, which augments the model with a database of\ninstance embeddings for semantic and panoptic segmentation that achieves zero\nforgetting. We demonstrate that kNN-CLIP can adapt to continually growing\nvocabularies without the need for retraining or large memory costs. kNN-CLIP\nenables open-vocabulary segmentation methods to expand their vocabularies on\nany domain with a single pass through the data, while only storing compact\nembeddings. This approach minimizes both compute and memory costs. kNN-CLIP\nachieves state-of-the-art performance across large-vocabulary semantic and\npanoptic segmentation datasets. We hope kNN-CLIP represents a significant step\nforward in enabling more efficient and adaptable continual segmentation, paving\nthe way for advances in real-world large-vocabulary continual segmentation\nmethods.\n","authors":["Zhongrui Gui","Shuyang Sun","Runjia Li","Jianhao Yuan","Zhaochong An","Karsten Roth","Ameya Prabhu","Philip Torr"],"pdf_url":"https://arxiv.org/pdf/2404.09447v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.08640v3","updated":"2024-08-13T13:10:08Z","published":"2024-03-13T15:52:20Z","title":"Refractive COLMAP: Refractive Structure-from-Motion Revisited","summary":" In this paper, we present a complete refractive Structure-from-Motion (RSfM)\nframework for underwater 3D reconstruction using refractive camera setups (for\nboth, flat- and dome-port underwater housings). Despite notable achievements in\nrefractive multi-view geometry over the past decade, a robust, complete and\npublicly available solution for such tasks is not available at present, and\noften practical applications have to resort to approximating refraction effects\nby the intrinsic (distortion) parameters of a pinhole camera model. To fill\nthis gap, we have integrated refraction considerations throughout the entire\nSfM process within the state-of-the-art, open-source SfM framework COLMAP.\nNumerical simulations and reconstruction results on synthetically generated but\nphoto-realistic images with ground truth validate that enabling refraction does\nnot compromise accuracy or robustness as compared to in-air reconstructions.\nFinally, we demonstrate the capability of our approach for large-scale\nrefractive scenarios using a dataset consisting of nearly 6000 images. 
The\nimplementation is released as open-source at:\nhttps://cau-git.rz.uni-kiel.de/inf-ag-koeser/colmap_underwater.\n","authors":["Mengkun She","Felix Seegräber","David Nakath","Kevin Köser"],"pdf_url":"https://arxiv.org/pdf/2403.08640v3.pdf","comment":"8 pages, 7 figures, the paper is accepted to be published at the 2024\n IEEE/RSJ International Conference on Intelligent Robots and Systems (IROS\n 2024)"},{"id":"http://arxiv.org/abs/2408.06868v1","updated":"2024-08-13T13:06:50Z","published":"2024-08-13T13:06:50Z","title":"A Comprehensive Survey on Synthetic Infrared Image synthesis","summary":" Synthetic infrared (IR) scene and target generation is an important computer\nvision problem as it allows the generation of realistic IR images and targets\nfor training and testing of various applications, such as remote sensing,\nsurveillance, and target recognition. It also helps reduce the cost and risk\nassociated with collecting real-world IR data. This survey paper aims to\nprovide a comprehensive overview of the conventional mathematical\nmodelling-based methods and deep learning-based methods used for generating\nsynthetic IR scenes and targets. The paper discusses the importance of\nsynthetic IR scene and target generation and briefly covers the mathematics of\nblackbody and grey body radiations, as well as IR image-capturing methods. The\npotential use cases of synthetic IR scenes and target generation are also\ndescribed, highlighting the significance of these techniques in various fields.\nAdditionally, the paper explores possible new ways of developing new techniques\nto enhance the efficiency and effectiveness of synthetic IR scenes and target\ngeneration while highlighting the need for further research to advance this\nfield.\n","authors":["Avinash Upadhyay","Manoj sharma","Prerna Mukherjee","Amit Singhal","Brejesh Lall"],"pdf_url":"https://arxiv.org/pdf/2408.06868v1.pdf","comment":"Submitted in Journal of Infrared Physics & Technology"},{"id":"http://arxiv.org/abs/2408.05743v2","updated":"2024-08-13T13:02:23Z","published":"2024-08-11T10:42:22Z","title":"Neural Architecture Search based Global-local Vision Mamba for Palm-Vein\n Recognition","summary":" Due to the advantages such as high security, high privacy, and liveness\nrecognition, vein recognition has been received more and more attention in past\nyears. Recently, deep learning models, e.g., Mamba has shown robust feature\nrepresentation with linear computational complexity and successfully applied\nfor visual tasks. However, vision Manba can capture long-distance feature\ndependencies but unfortunately deteriorate local feature details. Besides,\nmanually designing a Mamba architecture based on human priori knowledge is very\ntime-consuming and error-prone. In this paper, first, we propose a hybrid\nnetwork structure named Global-local Vision Mamba (GLVM), to learn the local\ncorrelations in images explicitly and global dependencies among tokens for vein\nfeature representation. Secondly, we design a Multi-head Mamba to learn the\ndependencies along different directions, so as to improve the feature\nrepresentation ability of vision Mamba. Thirdly, to learn the complementary\nfeatures, we propose a ConvMamba block consisting of three branches, named\nMulti-head Mamba branch (MHMamba), Feature Iteration Unit branch (FIU), and\nConvolutional Neural Network (CNN) branch, where the Feature Iteration Unit\nbranch aims to fuse convolutional local features with Mamba-based global\nrepresentations. 
Finally, a Globallocal Alternate Neural Architecture Search\n(GLNAS) method is proposed to search the optimal architecture of GLVM\nalternately with the evolutionary algorithm, thereby improving the recognition\nperformance for vein recognition tasks. We conduct rigorous experiments on\nthree public palm-vein databases to estimate the performance. The experimental\nresults demonstrate that the proposed method outperforms the representative\napproaches and achieves state-of-the-art recognition accuracy.\n","authors":["Huafeng Qin","Yuming Fu","Jing Chen","Mounim A. El-Yacoubi","Xinbo Gao","Jun Wang"],"pdf_url":"https://arxiv.org/pdf/2408.05743v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.01233v2","updated":"2024-08-13T12:46:39Z","published":"2024-08-02T12:48:36Z","title":"CLIP4Sketch: Enhancing Sketch to Mugshot Matching through Dataset\n Augmentation using Diffusion Models","summary":" Forensic sketch-to-mugshot matching is a challenging task in face\nrecognition, primarily hindered by the scarcity of annotated forensic sketches\nand the modality gap between sketches and photographs. To address this, we\npropose CLIP4Sketch, a novel approach that leverages diffusion models to\ngenerate a large and diverse set of sketch images, which helps in enhancing the\nperformance of face recognition systems in sketch-to-mugshot matching. Our\nmethod utilizes Denoising Diffusion Probabilistic Models (DDPMs) to generate\nsketches with explicit control over identity and style. We combine CLIP and\nAdaface embeddings of a reference mugshot, along with textual descriptions of\nstyle, as the conditions to the diffusion model. We demonstrate the efficacy of\nour approach by generating a comprehensive dataset of sketches corresponding to\nmugshots and training a face recognition model on our synthetic data. Our\nresults show significant improvements in sketch-to-mugshot matching accuracy\nover training on an existing, limited amount of real face sketch data,\nvalidating the potential of diffusion models in enhancing the performance of\nface recognition systems across modalities. We also compare our dataset with\ndatasets generated using GAN-based methods to show its superiority.\n","authors":["Kushal Kumar Jain","Steve Grosz","Anoop M. Namboodiri","Anil K. Jain"],"pdf_url":"https://arxiv.org/pdf/2408.01233v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.06840v1","updated":"2024-08-13T12:01:22Z","published":"2024-08-13T12:01:22Z","title":"Dynamic and Compressive Adaptation of Transformers From Images to Videos","summary":" Recently, the remarkable success of pre-trained Vision Transformers (ViTs)\nfrom image-text matching has sparked an interest in image-to-video adaptation.\nHowever, most current approaches retain the full forward pass for each frame,\nleading to a high computation overhead for processing entire videos. In this\npaper, we present InTI, a novel approach for compressive image-to-video\nadaptation using dynamic Inter-frame Token Interpolation. InTI aims to softly\npreserve the informative tokens without disrupting their coherent\nspatiotemporal structure. Specifically, each token pair at identical positions\nwithin neighbor frames is linearly aggregated into a new token, where the\naggregation weights are generated by a multi-scale context-aware network. In\nthis way, the information of neighbor frames can be adaptively compressed in a\npoint-by-point manner, thereby effectively reducing the number of processed\nframes by half each time. 
Importantly, InTI can be seamlessly integrated with\nexisting adaptation methods, achieving strong performance without extra-complex\ndesign. On Kinetics-400, InTI reaches a top-1 accuracy of 87.1 with a\nremarkable 37.5% reduction in GFLOPs compared to naive adaptation. When\ncombined with additional temporal modules, InTI achieves a top-1 accuracy of\n87.6 with a 37% reduction in GFLOPs. Similar conclusions have been verified in\nother common datasets.\n","authors":["Guozhen Zhang","Jingyu Liu","Shengming Cao","Xiaotong Zhao","Kevin Zhao","Kai Ma","Limin Wang"],"pdf_url":"https://arxiv.org/pdf/2408.06840v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.06834v1","updated":"2024-08-13T11:48:28Z","published":"2024-08-13T11:48:28Z","title":"GLGait: A Global-Local Temporal Receptive Field Network for Gait\n Recognition in the Wild","summary":" Gait recognition has attracted increasing attention from academia and\nindustry as a human recognition technology from a distance in non-intrusive\nways without requiring cooperation. Although advanced methods have achieved\nimpressive success in lab scenarios, most of them perform poorly in the wild.\nRecently, some Convolution Neural Networks (ConvNets) based methods have been\nproposed to address the issue of gait recognition in the wild. However, the\ntemporal receptive field obtained by convolution operations is limited for long\ngait sequences. If directly replacing convolution blocks with visual\ntransformer blocks, the model may not enhance a local temporal receptive field,\nwhich is important for covering a complete gait cycle. To address this issue,\nwe design a Global-Local Temporal Receptive Field Network (GLGait). GLGait\nemploys a Global-Local Temporal Module (GLTM) to establish a global-local\ntemporal receptive field, which mainly consists of a Pseudo Global Temporal\nSelf-Attention (PGTA) and a temporal convolution operation. Specifically, PGTA\nis used to obtain a pseudo global temporal receptive field with less memory and\ncomputation complexity compared with a multi-head self-attention (MHSA). The\ntemporal convolution operation is used to enhance the local temporal receptive\nfield. Besides, it can also aggregate pseudo global temporal receptive field to\na true holistic temporal receptive field. Furthermore, we also propose a\nCenter-Augmented Triplet Loss (CTL) in GLGait to reduce the intra-class\ndistance and expand the positive samples in the training stage. Extensive\nexperiments show that our method obtains state-of-the-art results on\nin-the-wild datasets, $i.e.$, Gait3D and GREW. The code is available at\nhttps://github.com/bgdpgz/GLGait.\n","authors":["Guozhen Peng","Yunhong Wang","Yuwei Zhao","Shaoxiong Zhang","Annan Li"],"pdf_url":"https://arxiv.org/pdf/2408.06834v1.pdf","comment":"Accepted by ACM MM2024"},{"id":"http://arxiv.org/abs/2408.06832v1","updated":"2024-08-13T11:46:32Z","published":"2024-08-13T11:46:32Z","title":"FlatFusion: Delving into Details of Sparse Transformer-based\n Camera-LiDAR Fusion for Autonomous Driving","summary":" The integration of data from diverse sensor modalities (e.g., camera and\nLiDAR) constitutes a prevalent methodology within the ambit of autonomous\ndriving scenarios. Recent advancements in efficient point cloud transformers\nhave underscored the efficacy of integrating information in sparse formats.\nWhen it comes to fusion, since image patches are dense in pixel space with\nambiguous depth, it necessitates additional design considerations for effective\nfusion. 
In this paper, we conduct a comprehensive exploration of design choices\nfor Transformer-based sparse camera-LiDAR fusion. This investigation encompasses\nstrategies for image-to-3D and LiDAR-to-2D mapping, attention neighbor\ngrouping, single modal tokenizer, and micro-structure of Transformer. By\namalgamating the most effective principles uncovered through our investigation,\nwe introduce FlatFusion, a carefully designed framework for sparse camera-LiDAR\nfusion. Notably, FlatFusion significantly outperforms state-of-the-art sparse\nTransformer-based methods, including UniTR, CMT, and SparseFusion, achieving\n73.7 NDS on the nuScenes validation set at 10.1 FPS with PyTorch.\n","authors":["Yutao Zhu","Xiaosong Jia","Xinyu Yang","Junchi Yan"],"pdf_url":"https://arxiv.org/pdf/2408.06832v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.06828v1","updated":"2024-08-13T11:39:14Z","published":"2024-08-13T11:39:14Z","title":"Photometric Inverse Rendering: Shading Cues Modeling and Surface\n Reflectance Regularization","summary":" This paper addresses the problem of inverse rendering from photometric\nimages. Existing approaches for this problem suffer from the effects of\nself-shadows, inter-reflections, and lack of constraints on the surface\nreflectance, leading to inaccurate decomposition of reflectance and\nillumination due to the ill-posed nature of inverse rendering. In this work, we\npropose a new method for neural inverse rendering. Our method jointly optimizes\nthe light source position to account for the self-shadows in images, and\ncomputes indirect illumination using a differentiable rendering layer and an\nimportance sampling strategy. To enhance surface reflectance decomposition, we\nintroduce a new regularization by distilling DINO features to foster accurate\nand consistent material decomposition. Extensive experiments on synthetic and\nreal datasets demonstrate that our method outperforms the state-of-the-art\nmethods in reflectance decomposition.\n","authors":["Jingzhi Bao","Guanying Chen","Shuguang Cui"],"pdf_url":"https://arxiv.org/pdf/2408.06828v1.pdf","comment":"Project page: https://jzbao03.site/projects/PIR/"},{"id":"http://arxiv.org/abs/2407.12317v2","updated":"2024-08-13T11:36:52Z","published":"2024-07-17T05:02:17Z","title":"Out of Length Text Recognition with Sub-String Matching","summary":" Scene Text Recognition (STR) methods have demonstrated robust performance in\nword-level text recognition. However, in real applications the text image is\nsometimes long, as it is detected with multiple horizontal words. This triggers the\nrequirement to build long text recognition models from readily available short\n(i.e., word-level) text datasets, which has been less studied previously. In\nthis paper, we term this task Out of Length (OOL) text recognition. We\nestablish the first Long Text Benchmark (LTB) to facilitate the assessment of\ndifferent methods in long text recognition. Meanwhile, we propose a novel\nmethod called OOL Text Recognition with sub-String Matching (SMTR). SMTR\ncomprises two cross-attention-based modules: one encodes a sub-string\ncontaining multiple characters into next and previous queries, and the other\nemploys the queries to attend to the image features, matching the sub-string\nand simultaneously recognizing its next and previous character. SMTR can\nrecognize text of arbitrary length by iterating the process above. 
To avoid\nbeing trapped in recognizing highly similar sub-strings, we introduce a\nregularization training to compel SMTR to effectively discover subtle\ndifferences between similar sub-strings for precise matching. In addition, we\npropose an inference augmentation strategy to alleviate confusion caused by\nidentical sub-strings in the same text and improve the overall recognition\nefficiency. Extensive experimental results reveal that SMTR, even when trained\nexclusively on short text, outperforms existing methods in public short text\nbenchmarks and exhibits a clear advantage on LTB. Code:\nhttps://github.com/Topdu/OpenOCR.\n","authors":["Yongkun Du","Zhineng Chen","Caiyan Jia","Xieping Gao","Yu-Gang Jiang"],"pdf_url":"https://arxiv.org/pdf/2407.12317v2.pdf","comment":"Preprint, 16 pages"},{"id":"http://arxiv.org/abs/2406.16633v3","updated":"2024-08-13T11:35:55Z","published":"2024-06-24T13:30:55Z","title":"MLAAN: Scaling Supervised Local Learning with Multilaminar Leap\n Augmented Auxiliary Network","summary":" Deep neural networks (DNNs) typically employ an end-to-end (E2E) training\nparadigm which presents several challenges, including high GPU memory\nconsumption, inefficiency, and difficulties in model parallelization during\ntraining. Recent research has sought to address these issues, with one\npromising approach being local learning. This method involves partitioning the\nbackbone network into gradient-isolated modules and manually designing\nauxiliary networks to train these local modules. Existing methods often neglect\nthe interaction of information between local modules, leading to myopic issues\nand a performance gap compared to E2E training. To address these limitations,\nwe propose the Multilaminar Leap Augmented Auxiliary Network (MLAAN).\nSpecifically, MLAAN comprises Multilaminar Local Modules (MLM) and Leap\nAugmented Modules (LAM). MLM captures both local and global features through\nindependent and cascaded auxiliary networks, alleviating performance issues\ncaused by insufficient global features. However, overly simplistic auxiliary\nnetworks can impede MLM's ability to capture global information. To address\nthis, we further design LAM, an enhanced auxiliary network that uses the\nExponential Moving Average (EMA) method to facilitate information exchange\nbetween local modules, thereby mitigating the shortsightedness resulting from\ninadequate interaction. The synergy between MLM and LAM has demonstrated\nexcellent performance. Our experiments on the CIFAR-10, STL-10, SVHN, and\nImageNet datasets show that MLAAN can be seamlessly integrated into existing\nlocal learning frameworks, significantly enhancing their performance and even\nsurpassing end-to-end (E2E) training methods, while also reducing GPU memory\nconsumption.\n","authors":["Yuming Zhang","Shouxin Zhang","Peizhe Wang","Feiyu Zhu","Dongzhi Guan","Junhao Su","Jiabin Liu","Changpeng Cai"],"pdf_url":"https://arxiv.org/pdf/2406.16633v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.06825v1","updated":"2024-08-13T11:34:28Z","published":"2024-08-13T11:34:28Z","title":"Membership Inference Attack Against Masked Image Modeling","summary":" Masked Image Modeling (MIM) has achieved significant success in the realm of\nself-supervised learning (SSL) for visual recognition. The image encoder\npre-trained through MIM, involving the masking and subsequent reconstruction of\ninput images, attains state-of-the-art performance in various downstream vision\ntasks. 
However, most existing works focus on improving the performance of\nMIM.In this work, we take a different angle by studying the pre-training data\nprivacy of MIM. Specifically, we propose the first membership inference attack\nagainst image encoders pre-trained by MIM, which aims to determine whether an\nimage is part of the MIM pre-training dataset. The key design is to simulate\nthe pre-training paradigm of MIM, i.e., image masking and subsequent\nreconstruction, and then obtain reconstruction errors. These reconstruction\nerrors can serve as membership signals for achieving attack goals, as the\nencoder is more capable of reconstructing the input image in its training set\nwith lower errors. Extensive evaluations are conducted on three model\narchitectures and three benchmark datasets. Empirical results show that our\nattack outperforms baseline methods. Additionally, we undertake intricate\nablation studies to analyze multiple factors that could influence the\nperformance of the attack.\n","authors":["Zheng Li","Xinlei He","Ning Yu","Yang Zhang"],"pdf_url":"https://arxiv.org/pdf/2408.06825v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.12399v2","updated":"2024-08-13T11:28:13Z","published":"2024-07-17T08:25:32Z","title":"A Practical Solver for Scalar Data Topological Simplification","summary":" This paper presents a practical approach for the optimization of topological\nsimplification, a central pre-processing step for the analysis and\nvisualization of scalar data. Given an input scalar field f and a set of\n\"signal\" persistence pairs to maintain, our approach produces an output field g\nthat is close to f and which optimizes (i) the cancellation of \"non-signal\"\npairs, while (ii) preserving the \"signal\" pairs. In contrast to pre-existing\nsimplification algorithms, our approach is not restricted to persistence pairs\ninvolving extrema and can thus address a larger class of topological features,\nin particular saddle pairs in three-dimensional scalar data. Our approach\nleverages recent generic persistence optimization frameworks and extends them\nwith tailored accelerations specific to the problem of topological\nsimplification. Extensive experiments report substantial accelerations over\nthese frameworks, thereby making topological simplification optimization\npractical for real-life datasets. Our approach enables a direct visualization\nand analysis of the topologically simplified data, e.g., via isosurfaces of\nsimplified topology (fewer components and handles). We apply our approach to\nthe extraction of prominent filament structures in three-dimensional data.\nSpecifically, we show that our pre-simplification of the data leads to\npractical improvements over standard topological techniques for removing\nfilament loops. We also show how our approach can be used to repair genus\ndefects in surface processing. Finally, we provide a C++ implementation for\nreproducibility purposes.\n","authors":["Mohamed Kissi","Mathieu Pont","Joshua A. Levine","Julien Tierny"],"pdf_url":"https://arxiv.org/pdf/2407.12399v2.pdf","comment":"13 pages, 10 figures, IEEE VIS 2024"},{"id":"http://arxiv.org/abs/2408.06814v1","updated":"2024-08-13T11:10:26Z","published":"2024-08-13T11:10:26Z","title":"Structure-preserving Planar Simplification for Indoor Environments","summary":" This paper presents a novel approach for structure-preserving planar\nsimplification of indoor scene point clouds for both simulated and real-world\nenvironments. 
Initially, the scene point cloud undergoes preprocessing steps,\nincluding noise reduction and Manhattan world alignment, to ensure robustness\nand coherence in subsequent analyses. We segment each captured scene into\nstructured (walls-ceiling-floor) and non-structured (indoor objects) scenes.\nLeveraging a RANSAC algorithm, we extract primitive planes from the input point\ncloud, facilitating the segmentation and simplification of the structured\nscene. The best-fitting wall meshes are then generated from the primitives,\nfollowed by adjacent mesh merging with the vertex-translation algorithm which\npreserves the mesh layout. To accurately represent ceilings and floors, we\nemploy the mesh clipping algorithm which clips the ceiling and floor meshes\nwith respect to wall normals. In the case of indoor scenes, we apply a surface\nreconstruction technique to enhance the fidelity. This paper focuses on the\nintricate steps of the proposed scene simplification methodology, addressing\ncomplex scenarios such as multi-story and slanted walls and ceilings. We also\nconduct qualitative and quantitative performance comparisons against popular\nsurface reconstruction, shape approximation, and floorplan generation\napproaches.\n","authors":["Bishwash Khanal","Sanjay Rijal","Manish Awale","Vaghawan Ojha"],"pdf_url":"https://arxiv.org/pdf/2408.06814v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.06811v1","updated":"2024-08-13T11:00:51Z","published":"2024-08-13T11:00:51Z","title":"Oracle Bone Script Similiar Character Screening Approach Based on\n Simsiam Contrastive Learning and Supervised Learning","summary":" This project proposes a new method that uses fuzzy comprehensive evaluation\nmethod to integrate ResNet-50 self-supervised and RepVGG supervised learning.\nThe source image dataset HWOBC oracle is taken as input, the target image is\nselected, and finally the most similar image is output in turn without any\nmanual intervention. The same feature encoding method is not used for images of\ndifferent modalities. Before the model training, the image data is\npreprocessed, and the image is enhanced by random rotation processing,\nself-square graph equalization theory algorithm, and gamma transform, which\neffectively enhances the key feature learning. Finally, the fuzzy comprehensive\nevaluation method is used to combine the results of supervised training and\nunsupervised training, which can better solve the \"most similar\" problem that\nis difficult to quantify. At present, there are many unknown oracle-bone\ninscriptions waiting for us to crack. Contacting with the glyphs can provide\nnew ideas for cracking.\n","authors":["Xinying Weng","Yifan Li","Shuaidong Hao","Jialiang Hou"],"pdf_url":"https://arxiv.org/pdf/2408.06811v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.06806v1","updated":"2024-08-13T10:54:10Z","published":"2024-08-13T10:54:10Z","title":"Unmasking the Uniqueness: A Glimpse into Age-Invariant Face Recognition\n of Indigenous African Faces","summary":" The task of recognizing the age-separated faces of an individual,\nAge-Invariant Face Recognition (AIFR), has received considerable research\nefforts in Europe, America, and Asia, compared to Africa. Thus, AIFR research\nefforts have often under-represented/misrepresented the African ethnicity with\nnon-indigenous Africans. This work developed an AIFR system for indigenous\nAfrican faces to reduce the misrepresentation of African ethnicity in facial\nimage analysis research. 
We adopted a pre-trained deep learning model (VGGFace)\nfor AIFR on a dataset of 5,000 indigenous African faces (FAGE\\_v2) collected\nfor this study. FAGE\\_v2 was curated via Internet image searches of 500\nindividuals evenly distributed across 10 African countries. VGGFace was trained\non FAGE\\_v2 to obtain the best accuracy of 81.80\\%. We also performed\nexperiments on an African-American subset of the CACD dataset and obtained the\nbest accuracy of 91.5\\%. The results show a significant difference in the\nrecognition accuracies of indigenous versus non-indigenous Africans.\n","authors":["Fakunle Ajewole","Joseph Damilola Akinyemi","Khadijat Tope Ladoja","Olufade Falade Williams Onifade"],"pdf_url":"https://arxiv.org/pdf/2408.06806v1.pdf","comment":"Keywords: Age-Invariant Face Recognition, CACD, FAGE_v2, VGGFace"},{"id":"http://arxiv.org/abs/2404.18213v2","updated":"2024-08-13T10:47:13Z","published":"2024-04-28T15:12:56Z","title":"S$^2$Mamba: A Spatial-spectral State Space Model for Hyperspectral Image\n Classification","summary":" Land cover analysis using hyperspectral images (HSI) remains an open problem\ndue to their low spatial resolution and complex spectral information. Recent\nstudies are primarily dedicated to designing Transformer-based architectures\nfor spatial-spectral long-range dependencies modeling, which is computationally\nexpensive with quadratic complexity. Selective structured state space model\n(Mamba), which is efficient for modeling long-range dependencies with linear\ncomplexity, has recently shown promising progress. However, its potential in\nhyperspectral image processing that requires handling numerous spectral bands\nhas not yet been explored. In this paper, we innovatively propose S$^2$Mamba, a\nspatial-spectral state space model for hyperspectral image classification, to\nexcavate spatial-spectral contextual features, resulting in more efficient and\naccurate land cover analysis. In S$^2$Mamba, two selective structured state\nspace models through different dimensions are designed for feature extraction,\none for spatial, and the other for spectral, along with a spatial-spectral\nmixture gate for optimal fusion. More specifically, S$^2$Mamba first captures\nspatial contextual relations by interacting each pixel with its adjacent\nthrough a Patch Cross Scanning module and then explores semantic information\nfrom continuous spectral bands through a Bi-directional Spectral Scanning\nmodule. Considering the distinct expertise of the two attributes in homogenous\nand complicated texture scenes, we realize the Spatial-spectral Mixture Gate by\na group of learnable matrices, allowing for the adaptive incorporation of\nrepresentations learned across different dimensions. Extensive experiments\nconducted on HSI classification benchmarks demonstrate the superiority and\nprospect of S$^2$Mamba. 
The code will be made available at:\nhttps://github.com/PURE-melo/S2Mamba.\n","authors":["Guanchun Wang","Xiangrong Zhang","Zelin Peng","Tianyang Zhang","Licheng Jiao"],"pdf_url":"https://arxiv.org/pdf/2404.18213v2.pdf","comment":"12 pages, 7 figures"},{"id":"http://arxiv.org/abs/2408.06803v1","updated":"2024-08-13T10:46:42Z","published":"2024-08-13T10:46:42Z","title":"Integrating Saliency Ranking and Reinforcement Learning for Enhanced\n Object Detection","summary":" With the ever-growing variety of object detection approaches, this study\nexplores a series of experiments that combine reinforcement learning (RL)-based\nvisual attention methods with saliency ranking techniques to investigate\ntransparent and sustainable solutions. By integrating saliency ranking for\ninitial bounding box prediction and subsequently applying RL techniques to\nrefine these predictions through a finite set of actions over multiple time\nsteps, this study aims to enhance RL object detection accuracy. Presented as a\nseries of experiments, this research investigates the use of various image\nfeature extraction methods and explores diverse Deep Q-Network (DQN)\narchitectural variations for deep reinforcement learning-based localisation\nagent training. Additionally, we focus on optimising the detection pipeline at\nevery step by prioritising lightweight and faster models, while also\nincorporating the capability to classify detected objects, a feature absent in\nprevious RL approaches. We show that by evaluating the performance of these\ntrained agents using the Pascal VOC 2007 dataset, faster and more optimised\nmodels were developed. Notably, the best mean Average Precision (mAP) achieved\nin this study was 51.4, surpassing benchmarks set by RL-based single object\ndetectors in the literature.\n","authors":["Matthias Bartolo","Dylan Seychell","Josef Bajada"],"pdf_url":"https://arxiv.org/pdf/2408.06803v1.pdf","comment":"Resultant work from Dissertation, Department of AI, University of\n Malta. Code available at: https://github.com/mbar0075/SaRLVision"},{"id":"http://arxiv.org/abs/2408.06798v1","updated":"2024-08-13T10:36:43Z","published":"2024-08-13T10:36:43Z","title":"Token Compensator: Altering Inference Cost of Vision Transformer without\n Re-Tuning","summary":" Token compression expedites the training and inference of Vision Transformers\n(ViTs) by reducing the number of the redundant tokens, e.g., pruning\ninattentive tokens or merging similar tokens. However, when applied to\ndownstream tasks, these approaches suffer from significant performance drop\nwhen the compression degrees are mismatched between training and inference\nstages, which limits the application of token compression on off-the-shelf\ntrained models. In this paper, we propose a model arithmetic framework to\ndecouple the compression degrees between the two stages. In advance, we\nadditionally perform a fast parameter-efficient self-distillation stage on the\npre-trained models to obtain a small plugin, called Token Compensator (ToCom),\nwhich describes the gap between models across different compression degrees.\nDuring inference, ToCom can be directly inserted into any downstream\noff-the-shelf models with any mismatched training and inference compression\ndegrees to acquire universal performance improvements without further training.\nExperiments on over 20 downstream tasks demonstrate the effectiveness of our\nframework. 
On CIFAR100, fine-grained visual classification, and VTAB-1k, ToCom\ncan yield up to a maximum improvement of 2.3%, 1.5%, and 2.0% in the average\nperformance of DeiT-B, respectively. Code: https://github.com/JieShibo/ToCom\n","authors":["Shibo Jie","Yehui Tang","Jianyuan Guo","Zhi-Hong Deng","Kai Han","Yunhe Wang"],"pdf_url":"https://arxiv.org/pdf/2408.06798v1.pdf","comment":"Accepted to ECCV2024"},{"id":"http://arxiv.org/abs/2401.08687v2","updated":"2024-08-13T10:20:11Z","published":"2024-01-13T04:21:24Z","title":"DA-BEV: Unsupervised Domain Adaptation for Bird's Eye View Perception","summary":" Camera-only Bird's Eye View (BEV) has demonstrated great potential in\nenvironment perception in a 3D space. However, most existing studies were\nconducted under a supervised setup which cannot scale well while handling\nvarious new data. Unsupervised domain adaptive BEV, which learns effectively\nfrom various unlabelled target data, is far under-explored. In this work, we\ndesign DA-BEV, the first domain adaptive camera-only BEV framework that\naddresses domain adaptive BEV challenges by exploiting the complementary nature\nof image-view features and BEV features. DA-BEV introduces the idea of query\ninto the domain adaptation framework to derive useful information from\nimage-view and BEV features. It consists of two query-based designs, namely,\nquery-based adversarial learning (QAL) and query-based self-training (QST),\nwhich exploit image-view features or BEV features to regularize the adaptation\nof the other. Extensive experiments show that DA-BEV achieves superior domain\nadaptive BEV perception performance consistently across multiple datasets and\ntasks such as 3D object detection and 3D scene segmentation.\n","authors":["Kai Jiang","Jiaxing Huang","Weiying Xie","Yunsong Li","Ling Shao","Shijian Lu"],"pdf_url":"https://arxiv.org/pdf/2401.08687v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.06788v1","updated":"2024-08-13T10:16:10Z","published":"2024-08-13T10:16:10Z","title":"Visual Neural Decoding via Improved Visual-EEG Semantic Consistency","summary":" Visual neural decoding refers to the process of extracting and interpreting\noriginal visual experiences from human brain activity. Recent advances in\nmetric learning-based EEG visual decoding methods have delivered promising\nresults and demonstrated the feasibility of decoding novel visual categories\nfrom brain activity. However, methods that directly map EEG features to the\nCLIP embedding space may introduce mapping bias and cause semantic\ninconsistency among features, thereby degrading alignment and impairing\ndecoding performance. To further explore the semantic consistency between\nvisual and neural signals, in this work we construct a joint semantic space\nand propose a Visual-EEG Semantic Decouple Framework that explicitly extracts\nthe semantic-related features of these two modalities to facilitate optimal\nalignment. Specifically, a cross-modal information decoupling module is\nintroduced to guide the extraction of semantic-related information from\nmodalities. Then, by quantifying the mutual information between visual image\nand EEG features, we observe a strong positive correlation between the decoding\nperformance and the magnitude of mutual information. Furthermore, inspired by\nthe mechanisms of visual object understanding from neuroscience, we propose an\nintra-class geometric consistency approach during the alignment process. 
This\nstrategy maps visual samples within the same class to consistent neural\npatterns, which further enhances the robustness and the performance of EEG\nvisual decoding. Experiments on a large Image-EEG dataset show that our method\nachieves state-of-the-art results in zero-shot neural decoding tasks.\n","authors":["Hongzhou Chen","Lianghua He","Yihang Liu","Longzhen Yang"],"pdf_url":"https://arxiv.org/pdf/2408.06788v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.06784v1","updated":"2024-08-13T10:13:33Z","published":"2024-08-13T10:13:33Z","title":"Enhancing Diabetic Retinopathy Diagnosis: A Lightweight CNN Architecture\n for Efficient Exudate Detection in Retinal Fundus Images","summary":" Retinal fundus imaging plays an essential role in diagnosing various stages\nof diabetic retinopathy, where exudates are critical markers of early disease\nonset. Prompt detection of these exudates is pivotal for enabling optometrists\nto arrest or significantly decelerate the disease progression. This paper\nintroduces a novel, lightweight convolutional neural network architecture\ntailored for automated exudate detection, designed to identify these markers\nefficiently and accurately. To address the challenge of limited training data,\nwe have incorporated domain-specific data augmentations to enhance the model's\ngeneralizability. Furthermore, we applied a suite of regularization techniques\nwithin our custom architecture to boost diagnostic accuracy while optimizing\ncomputational efficiency. Remarkably, this streamlined model contains only 4.73\nmillion parameters a reduction of nearly 60% compared to the standard ResNet-18\nmodel, which has 11.69 million parameters. Despite its reduced complexity, our\nmodel achieves an impressive F1 score of 90%, demonstrating its efficacy in the\nearly detection of diabetic retinopathy through fundus imaging.\n","authors":["Mujadded Al Rabbani Alif"],"pdf_url":"https://arxiv.org/pdf/2408.06784v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.06781v1","updated":"2024-08-13T10:06:53Z","published":"2024-08-13T10:06:53Z","title":"Do Vision-Language Foundational models show Robust Visual Perception?","summary":" Recent advances in vision-language foundational models have enabled\ndevelopment of systems that can perform visual understanding and reasoning\ntasks. However, it is unclear if these models are robust to distribution\nshifts, and how their performance and generalization capabilities vary under\nchanges in data distribution. In this project we strive to answer the question\n\"Are vision-language foundational models robust to distribution shifts like\nhuman perception?\" Specifically, we consider a diverse range of vision-language\nmodels and compare how the performance of these systems is affected by\ncorruption based distribution shifts (such as \\textit{motion blur, fog, snow,\ngaussian noise}) commonly found in practical real-world scenarios. We analyse\nthe generalization capabilities qualitatively and quantitatively on zero-shot\nimage classification task under aforementioned distribution shifts. 
Our code\nwill be available at \\url{https://github.com/shivam-chandhok/CPSC-540-Project}\n","authors":["Shivam Chandhok","Pranav Tandon"],"pdf_url":"https://arxiv.org/pdf/2408.06781v1.pdf","comment":"UBC Report"},{"id":"http://arxiv.org/abs/2408.06779v1","updated":"2024-08-13T10:05:20Z","published":"2024-08-13T10:05:20Z","title":"ED$^4$: Explicit Data-level Debiasing for Deepfake Detection","summary":" Learning intrinsic bias from limited data has been considered the main reason\nfor the failure of deepfake detection to generalize. Apart from the\ndiscovered content and specific-forgery bias, we reveal a novel spatial bias,\nwhere detectors inertly anticipate observing structural forgery clues appearing\nat the image center, which can also lead to the poor generalization of existing\nmethods. We present ED$^4$, a simple and effective strategy, to address\nthe aforementioned biases explicitly at the data level in a unified framework\nrather than implicit disentanglement via network design. In particular, we\ndevelop ClockMix to produce facial structure preserved mixtures with arbitrary\nsamples, which allows the detector to learn from an exponentially extended data\ndistribution with much more diverse identities, backgrounds, local manipulation\ntraces, and the co-occurrence of multiple forgery artifacts. We further propose\nthe Adversarial Spatial Consistency Module (AdvSCM) to prevent extracting\nfeatures with spatial bias, which adversarially generates spatial-inconsistent\nimages and constrains their extracted feature to be consistent. As a\nmodel-agnostic debiasing strategy, ED$^4$ is plug-and-play: it can be\nintegrated with various deepfake detectors to obtain significant benefits. We\nconduct extensive experiments to demonstrate its effectiveness and superiority\nover existing deepfake detection approaches.\n","authors":["Jikang Cheng","Ying Zhang","Qin Zou","Zhiyuan Yan","Chao Liang","Zhongyuan Wang","Chen Li"],"pdf_url":"https://arxiv.org/pdf/2408.06779v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.06772v1","updated":"2024-08-13T09:55:38Z","published":"2024-08-13T09:55:38Z","title":"Exploring Domain Shift on Radar-Based 3D Object Detection Amidst Diverse\n Environmental Conditions","summary":" The rapid evolution of deep learning and its integration with autonomous\ndriving systems have led to substantial advancements in 3D perception using\nmultimodal sensors. Notably, radar sensors show greater robustness compared to\ncameras and lidar under adverse weather and varying illumination conditions.\nThis study delves into the often-overlooked yet crucial issue of domain shift\nin 4D radar-based object detection, examining how varying environmental\nconditions, such as different weather patterns and road types, impact 3D object\ndetection performance. Our findings highlight distinct domain shifts across\nvarious weather scenarios, revealing unique dataset sensitivities that\nunderscore the critical role of radar point cloud generation. Additionally, we\ndemonstrate that transitioning between different road types, especially from\nhighways to urban settings, introduces notable domain shifts, emphasizing the\nnecessity for diverse data collection across varied road environments. To the\nbest of our knowledge, this is the first comprehensive analysis of domain shift\neffects on 4D radar-based object detection. 
We believe this empirical study\ncontributes to understanding the complex nature of domain shifts in radar data\nand suggests paths forward for data collection strategy in the face of\nenvironmental variability.\n","authors":["Miao Zhang","Sherif Abdulatif","Benedikt Loesch","Marco Altmann","Marius Schwarz","Bin Yang"],"pdf_url":"https://arxiv.org/pdf/2408.06772v1.pdf","comment":"6 pages, 5 figures, 3 tables, accepted in IEEE International\n Conference on Intelligent Transportation Systems (ITSC) 2024"},{"id":"http://arxiv.org/abs/2406.16464v4","updated":"2024-08-13T09:52:57Z","published":"2024-06-24T09:13:42Z","title":"InterCLIP-MEP: Interactive CLIP and Memory-Enhanced Predictor for\n Multi-modal Sarcasm Detection","summary":" The prevalence of sarcasm in social media, conveyed through text-image\ncombinations, presents significant challenges for sentiment analysis and\nintention mining. Existing multi-modal sarcasm detection methods have been\nproven to overestimate performance, as they struggle to effectively capture the\nintricate sarcastic cues that arise from the interaction between an image and\ntext. To address these issues, we propose InterCLIP-MEP, a novel framework for\nmulti-modal sarcasm detection. Specifically, we introduce an Interactive CLIP\n(InterCLIP) as the backbone to extract text-image representations, enhancing\nthem by embedding cross-modality information directly within each encoder,\nthereby improving the representations to capture text-image interactions\nbetter. Furthermore, an efficient training strategy is designed to adapt\nInterCLIP for our proposed Memory-Enhanced Predictor (MEP). MEP uses a dynamic,\nfixed-length dual-channel memory to store historical knowledge of valuable test\nsamples during inference. It then leverages this memory as a non-parametric\nclassifier to derive the final prediction, offering a more robust recognition\nof multi-modal sarcasm. Experiments demonstrate that InterCLIP-MEP achieves\nstate-of-the-art performance on the MMSD2.0 benchmark, with an accuracy\nimprovement of 1.08% and an F1 score improvement of 1.51% over the previous\nbest method.\n","authors":["Junjie Chen","Hang Yu","Weidong Liu","Subin Huang","Sanmin Liu"],"pdf_url":"https://arxiv.org/pdf/2406.16464v4.pdf","comment":"9 pages, 6 figures, 3 tables; Code and data are available at\n https://github.com/CoderChen01/InterCLIP-MEP"},{"id":"http://arxiv.org/abs/2408.06761v1","updated":"2024-08-13T09:37:26Z","published":"2024-08-13T09:37:26Z","title":"Cross-View Geolocalization and Disaster Mapping with Street-View and VHR\n Satellite Imagery: A Case Study of Hurricane IAN","summary":" Natural disasters play a key role in shaping human-urban infrastructure\ninteractions. Effective and efficient response to natural disasters is\nessential for building resilience and a sustainable urban environment. Two\ntypes of information are usually the most necessary and difficult to gather in\ndisaster response. The first is disaster damage perception,\nwhich shows how badly people think that urban infrastructure has been damaged.\nThe second is geolocation awareness, i.e., how people's\nwhereabouts are made available. In this paper, we propose a novel disaster\nmapping framework, namely CVDisaster, aiming at simultaneously addressing\ngeolocalization and damage perception estimation using cross-view Street-View\nImagery (SVI) and Very High-Resolution satellite imagery. 
CVDisaster consists\nof two cross-view models, where CVDisaster-Geoloc refers to a cross-view\ngeolocalization model based on a contrastive learning objective with a Siamese\nConvNeXt image encoder, and CVDisaster-Est is a cross-view classification model\nbased on a Couple Global Context Vision Transformer (CGCViT). Taking Hurricane\nIAN as a case study, we evaluate the CVDisaster framework by creating a novel\ncross-view dataset (CVIAN) and conducting extensive experiments. As a result,\nwe show that CVDisaster can achieve highly competitive performance (over 80%\nfor geolocalization and 75% for damage perception estimation) with even limited\nfine-tuning efforts, which largely motivates future cross-view models and\napplications within a broader GeoAI research community. The data and code are\npublicly available at: https://github.com/tum-bgd/CVDisaster.\n","authors":["Hao Li","Fabian Deuser","Wenping Yina","Xuanshu Luo","Paul Walther","Gengchen Mai","Wei Huang","Martin Werner"],"pdf_url":"https://arxiv.org/pdf/2408.06761v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.06755v1","updated":"2024-08-13T09:26:41Z","published":"2024-08-13T09:26:41Z","title":"Sumotosima: A Framework and Dataset for Classifying and Summarizing\n Otoscopic Images","summary":" Otoscopy is a diagnostic procedure to examine the ear canal and eardrum using\nan otoscope. It identifies conditions like infections, foreign bodies, ear drum\nperforations and ear abnormalities. We propose a novel resource efficient deep\nlearning and transformer based framework, Sumotosima (Summarizer for otoscopic\nimages), an end-to-end pipeline for classification followed by summarization.\nOur framework works on combination of triplet and cross-entropy losses.\nAdditionally, we use Knowledge Enhanced Multimodal BART whose input is fused\ntextual and image embedding. The objective is to provide summaries that are\nwell-suited for patients, ensuring clarity and efficiency in understanding\notoscopic images. Given the lack of existing datasets, we have curated our own\nOCASD (Otoscopic Classification And Summary Dataset), which includes 500 images\nwith 5 unique categories annotated with their class and summaries by\nOtolaryngologists. Sumotosima achieved a result of 98.03%, which is 7.00%,\n3.10%, 3.01% higher than K-Nearest Neighbors, Random Forest and Support Vector\nMachines, respectively, in classification tasks. For summarization, Sumotosima\noutperformed GPT-4o and LLaVA by 88.53% and 107.57% in ROUGE scores,\nrespectively. We have made our code and dataset publicly available at\nhttps://github.com/anas2908/Sumotosima\n","authors":["Eram Anwarul Khan","Anas Anwarul Haq Khan"],"pdf_url":"https://arxiv.org/pdf/2408.06755v1.pdf","comment":"Work in Progress"},{"id":"http://arxiv.org/abs/2408.06753v1","updated":"2024-08-13T09:19:59Z","published":"2024-08-13T09:19:59Z","title":"Detecting Audio-Visual Deepfakes with Fine-Grained Inconsistencies","summary":" Existing methods on audio-visual deepfake detection mainly focus on\nhigh-level features for modeling inconsistencies between audio and visual data.\nAs a result, these approaches usually overlook finer audio-visual artifacts,\nwhich are inherent to deepfakes. Herein, we propose the introduction of\nfine-grained mechanisms for detecting subtle artifacts in both spatial and\ntemporal domains. 
First, we introduce a local audio-visual model capable of\ncapturing small spatial regions that are prone to inconsistencies with audio.\nFor that purpose, a fine-grained mechanism based on a spatially-local distance\ncoupled with an attention module is adopted. Second, we introduce a\ntemporally-local pseudo-fake augmentation to include samples incorporating\nsubtle temporal inconsistencies in our training set. Experiments on the DFDC\nand the FakeAVCeleb datasets demonstrate the superiority of the proposed method\nin terms of generalization as compared to the state-of-the-art under both\nin-dataset and cross-dataset settings.\n","authors":["Marcella Astrid","Enjie Ghorbel","Djamila Aouada"],"pdf_url":"https://arxiv.org/pdf/2408.06753v1.pdf","comment":"Accepted in BMVC 2024"},{"id":"http://arxiv.org/abs/2408.06747v1","updated":"2024-08-13T09:10:48Z","published":"2024-08-13T09:10:48Z","title":"ReCLIP++: Learn to Rectify the Bias of CLIP for Unsupervised Semantic\n Segmentation","summary":" Recent works utilize CLIP to perform the challenging unsupervised semantic\nsegmentation task where only images without annotations are available. However,\nwe observe that when adopting CLIP to such a pixel-level understanding task,\nunexpected bias (including class-preference bias and space-preference bias)\noccurs. Previous works don't explicitly model the bias, which largely\nconstrains the segmentation performance. In this paper, we propose to\nexplicitly model and rectify the bias existing in CLIP to facilitate the\nunsupervised semantic segmentation task. Specifically, we design a learnable\n''Reference'' prompt to encode class-preference bias and a projection of the\npositional embedding in vision transformer to encode space-preference bias\nrespectively. To avoid interference, two kinds of biases are firstly\nindependently encoded into the Reference feature and the positional feature.\nVia a matrix multiplication between two features, a bias logit map is generated\nto explicitly represent two kinds of biases. Then we rectify the logits of CLIP\nvia a simple element-wise subtraction. To make the rectified results smoother\nand more contextual, we design a mask decoder which takes the feature of CLIP\nand rectified logits as input and outputs a rectified segmentation mask with\nthe help of Gumbel-Softmax operation. To make the bias modeling and\nrectification process meaningful and effective, a contrastive loss based on\nmasked visual features and the text features of different classes is imposed.\nTo further improve the segmentation, we distill the knowledge from the\nrectified CLIP to the advanced segmentation architecture via minimizing our\ndesigned mask-guided, feature-guided and text-guided loss terms. Extensive\nexperiments on various benchmarks demonstrate that ReCLIP++ performs favorably\nagainst previous SOTAs. The implementation is available at:\nhttps://github.com/dogehhh/ReCLIP.\n","authors":["Jingyun Wang","Guoliang Kang"],"pdf_url":"https://arxiv.org/pdf/2408.06747v1.pdf","comment":"Extended version of our CVPR 24 paper"},{"id":"http://arxiv.org/abs/2408.06742v1","updated":"2024-08-13T09:03:00Z","published":"2024-08-13T09:03:00Z","title":"Long-Tailed Out-of-Distribution Detection: Prioritizing Attention to\n Tail","summary":" Current out-of-distribution (OOD) detection methods typically assume balanced\nin-distribution (ID) data, while most real-world data follow a long-tailed\ndistribution. 
Previous approaches to long-tailed OOD detection often involve\nbalancing the ID data by reducing the semantics of head classes. However, this\nreduction can severely affect the classification accuracy of ID data. The main\nchallenge of this task lies in the severe lack of features for tail classes,\nleading to confusion with OOD data. To tackle this issue, we introduce a novel\nPrioritizing Attention to Tail (PATT) method using augmentation instead of\nreduction. Our main intuition involves using a mixture of von Mises-Fisher\n(vMF) distributions to model the ID data and a temperature scaling module to\nboost the confidence of ID data. This enables us to generate infinite\ncontrastive pairs, implicitly enhancing the semantics of ID classes while\npromoting differentiation between ID and OOD data. To further strengthen the\ndetection of OOD data without compromising the classification performance of ID\ndata, we propose feature calibration during the inference phase. By extracting\nan attention weight from the training set that prioritizes the tail classes and\nreduces the confidence in OOD data, we improve the OOD detection capability.\nExtensive experiments verified that our method outperforms the current\nstate-of-the-art methods on various benchmarks.\n","authors":["Yina He","Lei Peng","Yongcun Zhang","Juanjuan Weng","Zhiming Luo","Shaozi Li"],"pdf_url":"https://arxiv.org/pdf/2408.06742v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.06741v1","updated":"2024-08-13T09:01:12Z","published":"2024-08-13T09:01:12Z","title":"Improving Synthetic Image Detection Towards Generalization: An Image\n Transformation Perspective","summary":" With recent generative models facilitating photo-realistic image synthesis,\nthe proliferation of synthetic images has also engendered certain negative\nimpacts on social platforms, thereby raising an urgent imperative to develop\neffective detectors. Current synthetic image detection (SID) pipelines are\nprimarily dedicated to crafting universal artifact features, accompanied by an\noversight about SID training paradigm. In this paper, we re-examine the SID\nproblem and identify two prevalent biases in current training paradigms, i.e.,\nweakened artifact features and overfitted artifact features. Meanwhile, we\ndiscover that the imaging mechanism of synthetic images contributes to\nheightened local correlations among pixels, suggesting that detectors should be\nequipped with local awareness. In this light, we propose SAFE, a lightweight\nand effective detector with three simple image transformations. Firstly, for\nweakened artifact features, we substitute the down-sampling operator with the\ncrop operator in image pre-processing to help circumvent artifact distortion.\nSecondly, for overfitted artifact features, we include ColorJitter and\nRandomRotation as additional data augmentations, to help alleviate irrelevant\nbiases from color discrepancies and semantic differences in limited training\nsamples. Thirdly, for local awareness, we propose a patch-based random masking\nstrategy tailored for SID, forcing the detector to focus on local regions at\ntraining. Comparative experiments are conducted on an open-world dataset,\ncomprising synthetic images generated by 26 distinct generative models. 
Our\npipeline achieves a new state-of-the-art performance, with remarkable\nimprovements of 4.5% in accuracy and 2.9% in average precision against existing\nmethods.\n","authors":["Ouxiang Li","Jiayin Cai","Yanbin Hao","Xiaolong Jiang","Yao Hu","Fuli Feng"],"pdf_url":"https://arxiv.org/pdf/2408.06741v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.02382v2","updated":"2024-08-13T09:00:42Z","published":"2024-08-05T11:14:23Z","title":"Cross Pseudo Supervision Framework for Sparsely Labelled Geospatial\n Images","summary":" Land Use Land Cover (LULC) mapping is a vital tool for urban and resource\nplanning, playing a key role in the development of innovative and sustainable\ncities. This study introduces a semi-supervised segmentation model for LULC\nprediction using high-resolution satellite images with a vast diversity of data\ndistributions in different areas of India. Our approach ensures a robust\ngeneralization across different types of buildings, roads, trees, and water\nbodies within these distinct areas. We propose a modified Cross Pseudo\nSupervision framework to train image segmentation models on sparsely labelled\ndata. The proposed framework addresses the limitations of the famous 'Cross\nPseudo Supervision' technique for semi-supervised learning, specifically\ntackling the challenges of training segmentation models on noisy satellite\nimage data with sparse and inaccurate labels. This comprehensive approach\nsignificantly enhances the accuracy and utility of LULC mapping, providing\nvaluable insights for urban and resource planning applications.\n","authors":["Yash Dixit","Naman Srivastava","Joel D Joy","Rohan Olikara","Swarup E","Rakshit Ramesh"],"pdf_url":"https://arxiv.org/pdf/2408.02382v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.06740v1","updated":"2024-08-13T09:00:35Z","published":"2024-08-13T09:00:35Z","title":"DiffLoRA: Generating Personalized Low-Rank Adaptation Weights with\n Diffusion","summary":" Personalized text-to-image generation has gained significant attention for\nits capability to generate high-fidelity portraits of specific identities\nconditioned on user-defined prompts. Existing methods typically involve\ntest-time fine-tuning or instead incorporating an additional pre-trained\nbranch. However, these approaches struggle to simultaneously address the\ndemands of efficiency, identity fidelity, and preserving the model's original\ngenerative capabilities. In this paper, we propose DiffLoRA, a novel approach\nthat leverages diffusion models as a hypernetwork to predict personalized\nlow-rank adaptation (LoRA) weights based on the reference images. By\nintegrating these LoRA weights into the text-to-image model, DiffLoRA achieves\npersonalization during inference without further training. Additionally, we\npropose an identity-oriented LoRA weight construction pipeline to facilitate\nthe training of DiffLoRA. 
By utilizing the dataset produced by this pipeline,\nour DiffLoRA consistently generates high-performance and accurate LoRA weights.\nExtensive evaluations demonstrate the effectiveness of our method, achieving\ntime efficiency while maintaining identity fidelity throughout the\npersonalization process.\n","authors":["Yujia Wu","Yiming Shi","Jiwei Wei","Chengwei Sun","Yuyang Zhou","Yang Yang","Heng Tao Shen"],"pdf_url":"https://arxiv.org/pdf/2408.06740v1.pdf","comment":"9 pages,8 figures"},{"id":"http://arxiv.org/abs/2408.05661v2","updated":"2024-08-13T08:41:26Z","published":"2024-08-11T00:33:45Z","title":"Performance Evaluation of YOLOv8 Model Configurations, for Instance\n Segmentation of Strawberry Fruit Development Stages in an Open Field\n Environment","summary":" Accurate identification of strawberries during their maturing stages is\ncrucial for optimizing yield management and pest control, and for making informed\ndecisions related to harvest and post-harvest logistics. This study evaluates\nthe performance of YOLOv8 model configurations for instance segmentation of\nstrawberries into ripe and unripe stages in an open field environment. The\nYOLOv8n model demonstrated superior segmentation accuracy with a mean Average\nPrecision (mAP) of 80.9\\%, outperforming other YOLOv8 configurations. In terms\nof inference speed, YOLOv8n processed images at 12.9 milliseconds, while\nYOLOv8s, the least-performing model, processed at 22.2 milliseconds. Over 86\ntest images with 348 ground truth labels, YOLOv8n detected 235 ripe fruit\nclasses and 51 unripe fruit classes out of 251 ground truth ripe fruits and 97\nunripe ground truth labels, respectively. In comparison, YOLOv8s detected 204\nripe fruits and 37 unripe fruits. Overall, YOLOv8n achieved the fastest\ninference speed of 24.2 milliseconds, outperforming YOLOv8s, YOLOv8m, YOLOv8l,\nand YOLOv8x, which processed images at 33.0 milliseconds, 44.3 milliseconds,\n53.6 milliseconds, and 62.5 milliseconds, respectively. These results\nunderscore the potential of advanced object segmentation algorithms to address\ncomplex visual recognition tasks in open-field agriculture effectively.\n","authors":["Abdul-Razak Alhassan Gamani","Ibrahim Arhin","Adrena Kyeremateng Asamoah"],"pdf_url":"https://arxiv.org/pdf/2408.05661v2.pdf","comment":"15 page, 18 figures"},{"id":"http://arxiv.org/abs/2408.06725v1","updated":"2024-08-13T08:36:15Z","published":"2024-08-13T08:36:15Z","title":"Enhancing Visual Dialog State Tracking through Iterative Object-Entity\n Alignment in Multi-Round Conversations","summary":" Visual Dialog (VD) is a task where an agent answers a series of image-related\nquestions based on a multi-round dialog history. However, previous VD methods\noften treat the entire dialog history as a simple text input, disregarding the\ninherent conversational information flows at the round level. In this paper, we\nintroduce Multi-round Dialogue State Tracking model (MDST), a framework that\naddresses this limitation by leveraging the dialogue state learned from dialog\nhistory to answer questions. MDST captures each round of dialog history,\nconstructing internal dialogue state representations defined as 2-tuples of\nvision-language representations. These representations effectively ground the\ncurrent question, enabling the generation of accurate answers. 
Experimental\nresults on the VisDial v1.0 dataset demonstrate that MDST achieves a new\nstate-of-the-art performance in generative setting. Furthermore, through a\nseries of human studies, we validate the effectiveness of MDST in generating\nlong, consistent, and human-like answers while consistently answering a series\nof questions correctly.\n","authors":["Wei Pang","Ruixue Duan","Jinfu Yang","Ning Li"],"pdf_url":"https://arxiv.org/pdf/2408.06725v1.pdf","comment":"This article has been accepted in CAAI Transactions on Intelligence\n Technology! Article ID: CIT2_12370, Article DOI: 10.1049/cit2.12370"},{"id":"http://arxiv.org/abs/2407.06979v2","updated":"2024-08-13T08:31:42Z","published":"2024-07-09T15:54:06Z","title":"Can virtual staining for high-throughput screening generalize?","summary":" The large volume and variety of imaging data from high-throughput screening\n(HTS) in the pharmaceutical industry present an excellent resource for training\nvirtual staining models. However, the potential of models trained under one set\nof experimental conditions to generalize to other conditions remains\nunderexplored. This study systematically investigates whether data from three\ncell types (lung, ovarian, and breast) and two phenotypes (toxic and non-toxic\nconditions) commonly found in HTS can effectively train virtual staining models\nto generalize across three typical HTS distribution shifts: unseen phenotypes,\nunseen cell types, and the combination of both. Utilizing a dataset of 772,416\npaired bright-field, cytoplasm, nuclei, and DNA-damage stain images, we\nevaluate the generalization capabilities of models across pixel-based,\ninstance-wise, and biological-feature-based levels. Our findings indicate that\ntraining virtual nuclei and cytoplasm models on non-toxic condition samples not\nonly generalizes to toxic condition samples but leads to improved performance\nacross all evaluation levels compared to training on toxic condition samples.\nGeneralization to unseen cell types shows variability depending on the cell\ntype; models trained on ovarian or lung cell samples often perform well under\nother conditions, while those trained on breast cell samples consistently show\npoor generalization. Generalization to unseen cell types and phenotypes shows\ngood generalization across all levels of evaluation compared to addressing\nunseen cell types alone. This study represents the first large-scale,\ndata-centric analysis of the generalization capability of virtual staining\nmodels trained on diverse HTS datasets, providing valuable strategies for\nexperimental training data generation.\n","authors":["Samuel Tonks","Cuong Nguyen","Steve Hood","Ryan Musso","Ceridwen Hopely","Steve Titus","Minh Doan","Iain Styles","Alexander Krull"],"pdf_url":"https://arxiv.org/pdf/2407.06979v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.10943v3","updated":"2024-08-13T08:28:19Z","published":"2024-06-16T13:47:40Z","title":"Rectified Iterative Disparity for Stereo Matching","summary":" Both uncertainty-assisted and iteration-based methods have achieved great\nsuccess in stereo matching. However, existing uncertainty estimation methods\ntake a single image and the corresponding disparity as input, which imposes\nhigher demands on the estimation network. In this paper, we propose Cost\nvolume-based disparity Uncertainty Estimation (UEC). 
Based on the rich\nsimilarity information in the cost volume coming from the image pairs, the\nproposed UEC can achieve competitive performance with low computational cost.\nSecondly, we propose two methods of uncertainty-assisted disparity estimation,\nUncertainty-based Disparity Rectification (UDR) and Uncertainty-based Disparity\nupdate Conditioning (UDC). These two methods optimise the disparity update\nprocess of the iterative-based approach without adding extra parameters. In\naddition, we propose a Disparity Rectification loss that significantly improves\nthe accuracy of small disparity updates. We present a\nhigh-performance stereo architecture, DR-Stereo, which is a combination of the\nproposed methods. Experimental results from SceneFlow, KITTI, Middlebury 2014,\nand ETH3D show that DR-Stereo achieves very competitive disparity estimation\nperformance.\n","authors":["Weiqing Xiao","Wei Zhao"],"pdf_url":"https://arxiv.org/pdf/2406.10943v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.06721v1","updated":"2024-08-13T08:26:32Z","published":"2024-08-13T08:26:32Z","title":"Response Wide Shut: Surprising Observations in Basic Vision Language\n Model Capabilities","summary":" Vision-Language Models (VLMs) have emerged as general purpose tools for\naddressing a variety of complex computer vision problems. Such models have been\nshown to be highly capable but, at the same time, to lack some basic\nvisual understanding skills. In this paper, we set out to understand the\nlimitations of SoTA VLMs on fundamental visual tasks: object classification,\nunderstanding spatial arrangement, and the ability to delineate individual object\ninstances (through counting), by constructing a series of tests that probe\nwhich components of the design, specifically, may be lacking. Importantly, we go\nsignificantly beyond the current benchmarks, which simply measure the final\nperformance of VLMs, by also comparing and contrasting it to the performance of\nprobes trained directly on features obtained from the visual encoder (image\nembeddings), as well as the intermediate vision-language projection used to bridge\nthe image encoder and LLM-decoder output in many SoTA models (e.g., LLaVA, BLIP,\nInstructBLIP). In doing so, we uncover nascent shortcomings in VLMs' responses\nand make a number of important observations which could help train and develop\nmore effective VLM models in the future.\n","authors":["Shivam Chandhok","Wan-Cyuan Fan","Leonid Sigal"],"pdf_url":"https://arxiv.org/pdf/2408.06721v1.pdf","comment":"Under Submission"},{"id":"http://arxiv.org/abs/2408.06720v1","updated":"2024-08-13T08:24:52Z","published":"2024-08-13T08:24:52Z","title":"Multimodal Analysis of White Blood Cell Differentiation in Acute Myeloid\n Leukemia Patients using a β-Variational Autoencoder","summary":" Biomedical imaging and RNA sequencing with single-cell resolution improve\nour understanding of white blood cell diseases like leukemia. By combining\nmorphological and transcriptomic data, we can gain insights into cellular\nfunctions and trajectories involved in blood cell differentiation. However,\nexisting methodologies struggle with integrating morphological and\ntranscriptomic data, leaving a significant research gap in comprehensively\nunderstanding the dynamics of cell differentiation. 
Here, we introduce an\nunsupervised method that explores and reconstructs these two modalities and\nuncovers the relationship between different subtypes of white blood cells from\nhuman peripheral blood smears in terms of morphology and their corresponding\ntranscriptome. Our method is based on a beta-variational autoencoder\n(\\beta-VAE) with a customized loss function, incorporating a R-CNN architecture\nto distinguish single-cell from background and to minimize any interference\nfrom artifacts. This implementation of \\beta-VAE shows good reconstruction\ncapability along with continuous latent embeddings, while maintaining clear\ndifferentiation between single-cell classes. Our novel approach is especially\nhelpful to uncover the correlation of two latent features in complex biological\nprocesses such as formation of granules in the cell (granulopoiesis) with gene\nexpression patterns. It thus provides a unique tool to improve the\nunderstanding of white blood cell maturation for biomedicine and diagnostics.\n","authors":["Gizem Mert","Ario Sadafi","Raheleh Salehi","Nassir Navab","Carsten Marr"],"pdf_url":"https://arxiv.org/pdf/2408.06720v1.pdf","comment":"Accepted for publication at MICCAI 2024 workshop on AI for Imaging\n Genomics Learning (AIIG)"},{"id":"http://arxiv.org/abs/2408.05159v2","updated":"2024-08-13T08:23:06Z","published":"2024-08-09T16:31:02Z","title":"EasyInv: Toward Fast and Better DDIM Inversion","summary":" This paper introduces EasyInv, an easy yet novel approach that significantly\nadvances the field of DDIM Inversion by addressing the inherent inefficiencies\nand performance limitations of traditional iterative optimization methods. At\nthe core of our EasyInv is a refined strategy for approximating inversion\nnoise, which is pivotal for enhancing the accuracy and reliability of the\ninversion process. By prioritizing the initial latent state, which encapsulates\nrich information about the original images, EasyInv steers clear of the\niterative refinement of noise items. Instead, we introduce a methodical\naggregation of the latent state from the preceding time step with the current\nstate, effectively increasing the influence of the initial latent state and\nmitigating the impact of noise. We illustrate that EasyInv is capable of\ndelivering results that are either on par with or exceed those of the\nconventional DDIM Inversion approach, especially under conditions where the\nmodel's precision is limited or computational resources are scarce.\nConcurrently, our EasyInv offers an approximate threefold enhancement regarding\ninference efficiency over off-the-shelf iterative optimization techniques.\n","authors":["Ziyue Zhang","Mingbao Lin","Shuicheng Yan","Rongrong Ji"],"pdf_url":"https://arxiv.org/pdf/2408.05159v2.pdf","comment":"9 pages not including reference"},{"id":"http://arxiv.org/abs/2408.06716v1","updated":"2024-08-13T08:20:47Z","published":"2024-08-13T08:20:47Z","title":"Towards Cross-Domain Single Blood Cell Image Classification via\n Large-Scale LoRA-based Segment Anything Model","summary":" Accurate classification of blood cells plays a vital role in hematological\nanalysis as it aids physicians in diagnosing various medical conditions. In\nthis study, we present a novel approach for classifying blood cell images known\nas BC-SAM. BC-SAM leverages the large-scale foundation model of Segment\nAnything Model (SAM) and incorporates a fine-tuning technique using LoRA,\nallowing it to extract general image embeddings from blood cell images. 
To\nenhance the applicability of BC-SAM across different blood cell image datasets,\nwe introduce an unsupervised cross-domain autoencoder that focuses on learning\nintrinsic features while suppressing artifacts in the images. To assess the\nperformance of BC-SAM, we employ four widely used machine learning classifiers\n(Random Forest, Support Vector Machine, Artificial Neural Network, and XGBoost)\nto construct blood cell classification models and compare them against existing\nstate-of-the-art methods. Experimental results conducted on two publicly\navailable blood cell datasets (Matek-19 and Acevedo-20) demonstrate that our\nproposed BC-SAM achieves a new state-of-the-art result, surpassing the baseline\nmethods with a significant improvement. The source code of this paper is\navailable at https://github.com/AnoK3111/BC-SAM.\n","authors":["Yongcheng Li","Lingcong Cai","Ying Lu","Yupeng Zhang","Jingyan Jiang","Genan Dai","Bowen Zhang","Jingzhou Cao","Xiangzhong Zhang","Xiaomao Fan"],"pdf_url":"https://arxiv.org/pdf/2408.06716v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.10981v2","updated":"2024-08-13T08:16:08Z","published":"2024-03-16T17:24:46Z","title":"Automatic Spatial Calibration of Near-Field MIMO Radar With Respect to\n Optical Depth Sensors","summary":" Despite an emerging interest in MIMO radar, the utilization of its\ncomplementary strengths in combination with optical depth sensors has so far\nbeen limited to far-field applications, due to the challenges that arise from\nmutual sensor calibration in the near field. In fact, most related approaches\nin the autonomous industry propose target-based calibration methods using\ncorner reflectors that have proven to be unsuitable for the near field. In\ncontrast, we propose a novel, joint calibration approach for optical RGB-D\nsensors and MIMO radars that is designed to operate in the radar's near-field\nrange, within decimeters from the sensors. Our pipeline consists of a bespoke\ncalibration target, allowing for automatic target detection and localization,\nfollowed by the spatial calibration of the two sensor coordinate systems\nthrough target registration. We validate our approach using two different depth\nsensing technologies from the optical domain. The experiments show the\nefficiency and accuracy of our calibration for various target displacements, as\nwell as its robustness of our localization in terms of signal ambiguities.\n","authors":["Vanessa Wirth","Johanna Bräunig","Danti Khouri","Florian Gutsche","Martin Vossiek","Tim Weyrich","Marc Stamminger"],"pdf_url":"https://arxiv.org/pdf/2403.10981v2.pdf","comment":"8 pages, 9 figures, accepted to IROS 2024"},{"id":"http://arxiv.org/abs/2408.04840v2","updated":"2024-08-13T08:10:32Z","published":"2024-08-09T03:25:42Z","title":"mPLUG-Owl3: Towards Long Image-Sequence Understanding in Multi-Modal\n Large Language Models","summary":" Multi-modal Large Language Models (MLLMs) have demonstrated remarkable\ncapabilities in executing instructions for a variety of single-image tasks.\nDespite this progress, significant challenges remain in modeling long image\nsequences. In this work, we introduce the versatile multi-modal large language\nmodel, mPLUG-Owl3, which enhances the capability for long image-sequence\nunderstanding in scenarios that incorporate retrieved image-text knowledge,\ninterleaved image-text, and lengthy videos. 
Specifically, we propose novel\nhyper attention blocks to efficiently integrate vision and language into a\ncommon language-guided semantic space, thereby facilitating the processing of\nextended multi-image scenarios. Extensive experimental results suggest that\nmPLUG-Owl3 achieves state-of-the-art performance among models with a similar\nsize on single-image, multi-image, and video benchmarks. Moreover, we propose a\nchallenging long visual sequence evaluation named Distractor Resistance to\nassess the ability of models to maintain focus amidst distractions. Finally,\nwith the proposed architecture, mPLUG-Owl3 demonstrates outstanding performance\non ultra-long visual sequence inputs. We hope that mPLUG-Owl3 can contribute to\nthe development of more efficient and powerful multimodal large language\nmodels.\n","authors":["Jiabo Ye","Haiyang Xu","Haowei Liu","Anwen Hu","Ming Yan","Qi Qian","Ji Zhang","Fei Huang","Jingren Zhou"],"pdf_url":"https://arxiv.org/pdf/2408.04840v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.06709v1","updated":"2024-08-13T08:08:45Z","published":"2024-08-13T08:08:45Z","title":"Review Learning: Advancing All-in-One Ultra-High-Definition Image\n Restoration Training Method","summary":" All-in-one image restoration tasks are becoming increasingly important,\nespecially for ultra-high-definition (UHD) images. Existing all-in-one UHD\nimage restoration methods usually boost the model's performance by introducing\nprompt or customized dynamized networks for different degradation types. For\nthe inference stage, it might be friendly, but in the training stage, since the\nmodel encounters multiple degraded images of different quality in an epoch,\nthese cluttered learning objectives might be information pollution for the\nmodel. To address this problem, we propose a new training paradigm for general\nimage restoration models, which we name \\textbf{Review Learning}, which enables\nimage restoration models to be capable enough to handle multiple types of\ndegradation without prior knowledge and prompts. This approach begins with\nsequential training of an image restoration model on several degraded datasets,\ncombined with a review mechanism that enhances the image restoration model's\nmemory for several previous classes of degraded datasets. In addition, we\ndesign a lightweight all-purpose image restoration network that can efficiently\nreason about degraded images with 4K ($3840 \\times 2160$) resolution on a\nsingle consumer-grade GPU.\n","authors":["Xin Su","Zhuoran Zheng","Chen Wu"],"pdf_url":"https://arxiv.org/pdf/2408.06709v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.06707v1","updated":"2024-08-13T08:04:23Z","published":"2024-08-13T08:04:23Z","title":"MAIR++: Improving Multi-view Attention Inverse Rendering with Implicit\n Lighting Representation","summary":" In this paper, we propose a scene-level inverse rendering framework that uses\nmulti-view images to decompose the scene into geometry, SVBRDF, and 3D\nspatially-varying lighting. While multi-view images have been widely used for\nobject-level inverse rendering, scene-level inverse rendering has primarily\nbeen studied using single-view images due to the lack of a dataset containing\nhigh dynamic range multi-view images with ground-truth geometry, material, and\nspatially-varying lighting. To improve the quality of scene-level inverse\nrendering, a novel framework called Multi-view Attention Inverse Rendering\n(MAIR) was recently introduced. 
MAIR performs scene-level multi-view inverse\nrendering by expanding the OpenRooms dataset, designing efficient pipelines to\nhandle multi-view images, and splitting spatially-varying lighting. Although\nMAIR showed impressive results, its lighting representation is fixed to\nspherical Gaussians, which limits its ability to render images realistically.\nConsequently, MAIR cannot be directly used in applications such as material\nediting. Moreover, its multi-view aggregation networks have difficulties\nextracting rich features because they only focus on the mean and variance\nbetween multi-view features. In this paper, we propose its extended version,\ncalled MAIR++. MAIR++ addresses the aforementioned limitations by introducing\nan implicit lighting representation that accurately captures the lighting\nconditions of an image while facilitating realistic rendering. Furthermore, we\ndesign a directional attention-based multi-view aggregation network to infer\nmore intricate relationships between views. Experimental results show that\nMAIR++ not only achieves better performance than MAIR and single-view-based\nmethods, but also displays robust performance on unseen real-world scenes.\n","authors":["JunYong Choi","SeokYeong Lee","Haesol Park","Seung-Won Jung","Ig-Jae Kim","Junghyun Cho"],"pdf_url":"https://arxiv.org/pdf/2408.06707v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.06697v1","updated":"2024-08-13T07:51:37Z","published":"2024-08-13T07:51:37Z","title":"SlotLifter: Slot-guided Feature Lifting for Learning Object-centric\n Radiance Fields","summary":" The ability to distill object-centric abstractions from intricate visual\nscenes underpins human-level generalization. Despite the significant progress\nin object-centric learning methods, learning object-centric representations in\nthe 3D physical world remains a crucial challenge. In this work, we propose\nSlotLifter, a novel object-centric radiance model addressing scene\nreconstruction and decomposition jointly via slot-guided feature lifting. Such\na design unites object-centric learning representations and image-based\nrendering methods, offering state-of-the-art performance in scene decomposition\nand novel-view synthesis on four challenging synthetic and four complex\nreal-world datasets, outperforming existing 3D object-centric learning methods\nby a large margin. Through extensive ablative studies, we showcase the efficacy\nof designs in SlotLifter, revealing key insights for potential future\ndirections.\n","authors":["Yu Liu","Baoxiong Jia","Yixin Chen","Siyuan Huang"],"pdf_url":"https://arxiv.org/pdf/2408.06697v1.pdf","comment":"Accepted by ECCV 2024. Project website: https://slotlifter.github.io"},{"id":"http://arxiv.org/abs/2403.02781v5","updated":"2024-08-13T07:50:02Z","published":"2024-03-05T08:53:30Z","title":"PromptKD: Unsupervised Prompt Distillation for Vision-Language Models","summary":" Prompt learning has emerged as a valuable technique in enhancing\nvision-language models (VLMs) such as CLIP for downstream tasks in specific\ndomains. Existing work mainly focuses on designing various learning forms of\nprompts, neglecting the potential of prompts as effective distillers for\nlearning from larger teacher models. In this paper, we introduce an\nunsupervised domain prompt distillation framework, which aims to transfer the\nknowledge of a larger teacher model to a lightweight target model through\nprompt-driven imitation using unlabeled domain images. 
Specifically, our\nframework consists of two distinct stages. In the initial stage, we pre-train a\nlarge CLIP teacher model using domain (few-shot) labels. After pre-training, we\nleverage the unique decoupled-modality characteristics of CLIP by pre-computing\nand storing the text features as class vectors only once through the teacher\ntext encoder. In the subsequent stage, the stored class vectors are shared\nacross teacher and student image encoders for calculating the predicted logits.\nFurther, we align the logits of both the teacher and student models via KL\ndivergence, encouraging the student image encoder to generate similar\nprobability distributions to the teacher through the learnable prompts. The\nproposed prompt distillation process eliminates the reliance on labeled data,\nenabling the algorithm to leverage a vast amount of unlabeled images within the\ndomain. Finally, the well-trained student image encoders and pre-stored text\nfeatures (class vectors) are utilized for inference. To the best of our knowledge, we\nare the first to (1) perform unsupervised domain-specific prompt-driven\nknowledge distillation for CLIP, and (2) establish a practical pre-storing\nmechanism of text features as shared class vectors between teacher and student.\nExtensive experiments on 11 datasets demonstrate the effectiveness of our\nmethod.\n","authors":["Zheng Li","Xiang Li","Xinyi Fu","Xin Zhang","Weiqiang Wang","Shuo Chen","Jian Yang"],"pdf_url":"https://arxiv.org/pdf/2403.02781v5.pdf","comment":"CVPR 2024. Project Page: https://zhengli97.github.io/PromptKD. Code:\n https://github.com/zhengli97/PromptKD"},{"id":"http://arxiv.org/abs/2408.06693v1","updated":"2024-08-13T07:35:56Z","published":"2024-08-13T07:35:56Z","title":"DC3DO: Diffusion Classifier for 3D Objects","summary":" Inspired by Geoffrey Hinton's emphasis on generative modeling, \"To recognize\nshapes, first learn to generate them,\" we explore the use of 3D diffusion models\nfor object classification. Leveraging the density estimates from these models,\nour approach, the Diffusion Classifier for 3D Objects (DC3DO), enables\nzero-shot classification of 3D shapes without additional training. On average,\nour method achieves a 12.5 percent improvement compared to its multiview\ncounterparts, demonstrating superior multimodal reasoning over discriminative\napproaches. DC3DO employs a class-conditional diffusion model trained on\nShapeNet, and we run inferences on point clouds of chairs and cars. This work\nhighlights the potential of generative models in 3D object classification.\n","authors":["Nursena Koprucu","Meher Shashwat Nigam","Shicheng Xu","Biruk Abere","Gabriele Dominici","Andrew Rodriguez","Sharvaree Vadgam","Berfin Inal","Alberto Tono"],"pdf_url":"https://arxiv.org/pdf/2408.06693v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2204.04730v2","updated":"2024-08-13T07:30:54Z","published":"2022-04-10T17:13:52Z","title":"Deep Non-rigid Structure-from-Motion: A Sequence-to-Sequence Translation\n Perspective","summary":" Directly regressing the non-rigid shape and camera pose from the individual\n2D frame is ill-suited to the Non-Rigid Structure-from-Motion (NRSfM) problem.\nThis frame-by-frame 3D reconstruction pipeline overlooks the inherent\nspatial-temporal nature of NRSfM, i.e., reconstructing the whole 3D sequence\nfrom the input 2D sequence. 
In this paper, we propose to model deep NRSfM from\na sequence-to-sequence translation perspective, where the input 2D frame\nsequence is taken as a whole to reconstruct the deforming 3D non-rigid shape\nsequence. First, we apply a shape-motion predictor to estimate the initial\nnon-rigid shape and camera motion from a single frame. Then we propose a\ncontext modeling module to model camera motions and complex non-rigid shapes.\nTo tackle the difficulty in enforcing the global structure constraint within\nthe deep framework, we propose to impose the union-of-subspace structure by\nreplacing the self-expressiveness layer with multi-head attention and delayed\nregularizers, which enables end-to-end batch-wise training. Experimental\nresults across different datasets such as Human3.6M, CMU Mocap and InterHand\nprove the superiority of our framework.\n","authors":["Hui Deng","Tong Zhang","Yuchao Dai","Jiawei Shi","Yiran Zhong","Hongdong Li"],"pdf_url":"https://arxiv.org/pdf/2204.04730v2.pdf","comment":"has been accepted by IEEE Transactions on Pattern Analysis and\n Machine Intelligence"},{"id":"http://arxiv.org/abs/2408.06687v1","updated":"2024-08-13T07:27:02Z","published":"2024-08-13T07:27:02Z","title":"Masked Image Modeling: A Survey","summary":" In this work, we survey recent studies on masked image modeling (MIM), an\napproach that emerged as a powerful self-supervised learning technique in\ncomputer vision. The MIM task involves masking some information, e.g. pixels,\npatches, or even latent representations, and training a model, usually an\nautoencoder, to predict the missing information by using the context\navailable in the visible part of the input. We identify and formalize two\ncategories of approaches on how to implement MIM as a pretext task, one based\non reconstruction and one based on contrastive learning. Then, we construct a\ntaxonomy and review the most prominent papers in recent years. We complement\nthe manually constructed taxonomy with a dendrogram obtained by applying a\nhierarchical clustering algorithm. We further identify relevant clusters by\nmanually inspecting the resulting dendrogram. Our review also includes datasets\nthat are commonly used in MIM research. We aggregate the performance results of\nvarious masked image modeling methods on the most popular datasets, to\nfacilitate the comparison of competing methods. Finally, we identify research\ngaps and propose several interesting directions of future work.\n","authors":["Vlad Hondru","Florinel Alin Croitoru","Shervin Minaee","Radu Tudor Ionescu","Nicu Sebe"],"pdf_url":"https://arxiv.org/pdf/2408.06687v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.06684v1","updated":"2024-08-13T07:23:53Z","published":"2024-08-13T07:23:53Z","title":"How to Best Combine Demosaicing and Denoising?","summary":" Image demosaicing and denoising play a critical role in the raw imaging\npipeline. These processes have often been treated as independent, without\nconsidering their interactions. Indeed, most classic denoising methods handle\nnoisy RGB images, not raw images. Conversely, most demosaicing methods address\nthe demosaicing of noise-free images. The real problem is to jointly denoise\nand demosaic noisy raw images. But the question of how to proceed has still not\nbeen clarified. 
In this paper, we carry-out extensive experiments and a\nmathematical analysis to tackle this problem by low complexity algorithms.\nIndeed, both problems have been only addressed jointly by end-to-end heavy\nweight convolutional neural networks (CNNs), which are currently incompatible\nwith low power portable imaging devices and remain by nature domain (or device)\ndependent. Our study leads us to conclude that, with moderate noise,\ndemosaicing should be applied first, followed by denoising. This requires a\nsimple adaptation of classic denoising algorithms to demosaiced noise, which we\njustify and specify. Although our main conclusion is ``demosaic first, then\ndenoise'', we also discover that for high noise, there is a moderate PSNR gain\nby a more complex strategy: partial CFA denoising followed by demosaicing, and\nby a second denoising on the RGB image. These surprising results are obtained\nby a black-box optimization of the pipeline, which could be applied to any\nother pipeline. We validate our results on simulated and real noisy CFA images\nobtained from several benchmarks.\n","authors":["Yu Guo","Qiyu Jin","Jean-Michel Morel","Gabriele Facciolo"],"pdf_url":"https://arxiv.org/pdf/2408.06684v1.pdf","comment":"This paper was accepted by Inverse Problems and Imaging on October,\n 2023"},{"id":"http://arxiv.org/abs/2408.06681v1","updated":"2024-08-13T07:19:40Z","published":"2024-08-13T07:19:40Z","title":"Coherence Awareness in Diffractive Neural Networks","summary":" Diffractive neural networks hold great promise for applications requiring\nintensive computational processing. Considerable attention has focused on\ndiffractive networks for either spatially coherent or spatially incoherent\nillumination. Here we illustrate that, as opposed to imaging systems, in\ndiffractive networks the degree of spatial coherence has a dramatic effect. In\nparticular, we show that when the spatial coherence length on the object is\ncomparable to the minimal feature size preserved by the optical system, neither\nthe incoherent nor the coherent extremes serve as acceptable approximations.\nImportantly, this situation is inherent to many settings involving active\nillumination, including reflected light microscopy, autonomous vehicles and\nsmartphones. Following this observation, we propose a general framework for\ntraining diffractive networks for any specified degree of spatial and temporal\ncoherence, supporting all types of linear and nonlinear layers. Using our\nmethod, we numerically optimize networks for image classification, and\nthoroughly investigate their performance dependence on the illumination\ncoherence properties. We further introduce the concept of coherence-blind\nnetworks, which have enhanced resilience to changes in illumination conditions.\nOur findings serve as a steppingstone toward adopting all-optical neural\nnetworks in real-world applications, leveraging nothing but natural light.\n","authors":["Matan Kleiner","Lior Michaeli","Tomer Michaeli"],"pdf_url":"https://arxiv.org/pdf/2408.06681v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.16694v4","updated":"2024-08-13T07:12:16Z","published":"2024-01-30T02:41:05Z","title":"etuner: Redundancy-Aware Efficient Continual Learning on Edge Devices","summary":" Many emerging applications, such as robot-assisted eldercare and object\nrecognition, generally employ deep learning neural networks (DNNs) and require\nthe deployment of DNN models on edge devices. 
These applications naturally\nrequire i) handling streaming-in inference requests and ii) fine-tuning the\ndeployed models to adapt to possible deployment scenario changes. Continual\nlearning (CL) is widely adopted to satisfy these needs. CL is a popular deep\nlearning paradigm that handles both continuous model fine-tuning and overtime\ninference requests. However, an inappropriate model fine-tuning scheme could\ninvolve significant redundancy and consume considerable time and energy, making\nit challenging to apply CL on edge devices. In this paper, we propose ETuner,\nan efficient edge continual learning framework that optimizes inference\naccuracy, fine-tuning execution time, and energy efficiency through both\ninter-tuning and intra-tuning optimizations. Experimental results show that, on\naverage, ETuner reduces overall fine-tuning execution time by 64%, energy\nconsumption by 56%, and improves average inference accuracy by 1.75% over the\nimmediate model fine-tuning approach.\n","authors":["Sheng Li","Geng Yuan","Yawen Wu","Yue Dai","Tianyu Wang","Chao Wu","Alex K. Jones","Jingtong Hu","Yanzhi Wang","Xulong Tang"],"pdf_url":"https://arxiv.org/pdf/2401.16694v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.11895v2","updated":"2024-08-13T06:48:37Z","published":"2024-04-18T04:47:28Z","title":"FreeDiff: Progressive Frequency Truncation for Image Editing with\n Diffusion Models","summary":" Precise image editing with text-to-image models has attracted increasing\ninterest due to their remarkable generative capabilities and user-friendly\nnature. However, such attempts face the pivotal challenge of misalignment\nbetween the intended precise editing target regions and the broader area\nimpacted by the guidance in practice. Despite excellent methods leveraging\nattention mechanisms that have been developed to refine the editing guidance,\nthese approaches necessitate modifications through complex network architecture\nand are limited to specific editing tasks. In this work, we re-examine the\ndiffusion process and misalignment problem from a frequency perspective,\nrevealing that, due to the power law of natural images and the decaying noise\nschedule, the denoising network primarily recovers low-frequency image\ncomponents during the earlier timesteps and thus brings excessive low-frequency\nsignals for editing. Leveraging this insight, we introduce a novel fine-tuning\nfree approach that employs progressive $\\textbf{Fre}$qu$\\textbf{e}$ncy\ntruncation to refine the guidance of $\\textbf{Diff}$usion models for universal\nediting tasks ($\\textbf{FreeDiff}$). Our method achieves comparable results\nwith state-of-the-art methods across a variety of editing tasks and on a\ndiverse set of images, highlighting its potential as a versatile tool in image\nediting applications.\n","authors":["Wei Wu","Qingnan Fan","Shuai Qin","Hong Gu","Ruoyu Zhao","Antoni B. Chan"],"pdf_url":"https://arxiv.org/pdf/2404.11895v2.pdf","comment":"Accepted by ECCV-2024"},{"id":"http://arxiv.org/abs/2408.06662v1","updated":"2024-08-13T06:25:54Z","published":"2024-08-13T06:25:54Z","title":"Bi-directional Contextual Attention for 3D Dense Captioning","summary":" 3D dense captioning is a task involving the localization of objects and the\ngeneration of descriptions for each object in a 3D scene. 
Recent approaches\nhave attempted to incorporate contextual information by modeling relationships\nwith object pairs or aggregating the nearest neighbor features of an object.\nHowever, the contextual information constructed in these scenarios is limited\nin two aspects: first, objects have multiple positional relationships that\nexist across the entire global scene, not only near the object itself. Second,\nit faces contradicting objectives--where localization and attribute\ndescriptions are generated better with tight localization, while descriptions\ninvolving global positional relations are generated better with contextualized\nfeatures of the global scene. To overcome this challenge, we introduce BiCA, a\ntransformer encoder-decoder pipeline that engages in 3D dense captioning for\neach object with Bi-directional Contextual Attention. Leveraging parallelly\ndecoded instance queries for objects and context queries for non-object\ncontexts, BiCA generates object-aware contexts, where the contexts relevant to\neach object are summarized, and context-aware objects, where the objects\nrelevant to the summarized object-aware contexts are aggregated. This extension\nrelieves previous methods from the contradicting objectives, enhancing\nlocalization performance and enabling the aggregation of contextual features\nthroughout the global scene, thus improving caption generation performance\nsimultaneously. Extensive experiments on two of the most widely-used 3D dense\ncaptioning datasets demonstrate that our proposed method achieves a significant\nimprovement over prior methods.\n","authors":["Minjung Kim","Hyung Suk Lim","Soonyoung Lee","Bumsoo Kim","Gunhee Kim"],"pdf_url":"https://arxiv.org/pdf/2408.06662v1.pdf","comment":"Accepted to ECCV 2024 (Oral)"},{"id":"http://arxiv.org/abs/2306.09626v2","updated":"2024-08-13T06:17:23Z","published":"2023-06-16T04:51:18Z","title":"PAtt-Lite: Lightweight Patch and Attention MobileNet for Challenging\n Facial Expression Recognition","summary":" Facial Expression Recognition (FER) is a machine learning problem that deals\nwith recognizing human facial expressions. While existing work has achieved\nperformance improvements in recent years, FER in the wild and under challenging\nconditions remains a challenge. In this paper, a lightweight patch and\nattention network based on MobileNetV1, referred to as PAtt-Lite, is proposed\nto improve FER performance under challenging conditions. A truncated\nImageNet-pre-trained MobileNetV1 is utilized as the backbone feature extractor\nof the proposed method. In place of the truncated layers is a patch extraction\nblock that is proposed for extracting significant local facial features to\nenhance the representation from MobileNetV1, especially under challenging\nconditions. An attention classifier is also proposed to improve the learning of\nthese patched feature maps from the extremely lightweight feature extractor.\nThe experimental results on public benchmark databases proved the effectiveness\nof the proposed method. PAtt-Lite achieved state-of-the-art results on CK+,\nRAF-DB, FER2013, FERPlus, and the challenging conditions subsets for RAF-DB and\nFERPlus.\n","authors":["Jia Le Ngwe","Kian Ming Lim","Chin Poo Lee","Thian Song Ong"],"pdf_url":"https://arxiv.org/pdf/2306.09626v2.pdf","comment":"Copyright 2024 IEEE. Personal use of this material is permitted. 
IEEE\n Access 2024"},{"id":"http://arxiv.org/abs/2310.05338v2","updated":"2024-08-13T05:48:31Z","published":"2023-10-09T01:52:27Z","title":"Negative Object Presence Evaluation (NOPE) to Measure Object\n Hallucination in Vision-Language Models","summary":" Object hallucination poses a significant challenge in vision-language (VL)\nmodels, often leading to the generation of nonsensical or unfaithful responses\nwith non-existent objects. However, the absence of a general measurement for\nevaluating object hallucination in VL models has hindered our understanding and\nability to mitigate this issue. In this work, we present NOPE (Negative Object\nPresence Evaluation), a novel benchmark designed to assess object hallucination\nin VL models through visual question answering (VQA). We propose a\ncost-effective and scalable approach utilizing large language models to\ngenerate 29.5k synthetic negative pronoun (NegP) data of high quality for NOPE.\nWe extensively investigate the performance of 10 state-of-the-art VL models in\ndiscerning the non-existence of objects in visual questions, where the ground\ntruth answers are denoted as NegP (e.g., \"none\"). Additionally, we evaluate\ntheir standard performance on visual questions on 9 other VQA datasets. Through\nour experiments, we demonstrate that no VL model is immune to the vulnerability\nof object hallucination, as all models achieve accuracy below 10\\% on NegP.\nFurthermore, we uncover that lexically diverse visual questions, question types\nwith large scopes, and scene-relevant objects capitalize the risk of object\nhallucination in VL models.\n","authors":["Holy Lovenia","Wenliang Dai","Samuel Cahyawijaya","Ziwei Ji","Pascale Fung"],"pdf_url":"https://arxiv.org/pdf/2310.05338v2.pdf","comment":"Published in ALVR Workshop at ACL 2024"},{"id":"http://arxiv.org/abs/2408.06646v1","updated":"2024-08-13T05:30:41Z","published":"2024-08-13T05:30:41Z","title":"Hybrid SD: Edge-Cloud Collaborative Inference for Stable Diffusion\n Models","summary":" Stable Diffusion Models (SDMs) have shown remarkable proficiency in image\nsynthesis. However, their broad application is impeded by their large model\nsizes and intensive computational requirements, which typically require\nexpensive cloud servers for deployment. On the flip side, while there are many\ncompact models tailored for edge devices that can reduce these demands, they\noften compromise on semantic integrity and visual quality when compared to\nfull-sized SDMs. To bridge this gap, we introduce Hybrid SD, an innovative,\ntraining-free SDMs inference framework designed for edge-cloud collaborative\ninference. Hybrid SD distributes the early steps of the diffusion process to\nthe large models deployed on cloud servers, enhancing semantic planning.\nFurthermore, small efficient models deployed on edge devices can be integrated\nfor refining visual details in the later stages. Acknowledging the diversity of\nedge devices with differing computational and storage capacities, we employ\nstructural pruning to the SDMs U-Net and train a lightweight VAE. 
Empirical\nevaluations demonstrate that our compressed models achieve state-of-the-art\nparameter efficiency (225.8M) on edge devices with competitive image quality.\nAdditionally, Hybrid SD reduces the cloud cost by 66% with edge-cloud\ncollaborative inference.\n","authors":["Chenqian Yan","Songwei Liu","Hongjian Liu","Xurui Peng","Xiaojian Wang","Fangming Chen","Lean Fu","Xing Mei"],"pdf_url":"https://arxiv.org/pdf/2408.06646v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.06644v1","updated":"2024-08-13T05:27:23Z","published":"2024-08-13T05:27:23Z","title":"Specialized Change Detection using Segment Anything","summary":" Change detection (CD) is a fundamental task in Earth observation. While most\nchange detection methods detect all changes, there is a growing need for\nspecialized methods targeting specific changes relevant to particular\napplications while discarding the other changes. For instance, urban management\nmight prioritize detecting the disappearance of buildings due to natural\ndisasters or other reasons. Furthermore, while most supervised change detection\nmethods require large-scale training datasets, in many applications only one or\ntwo training examples might be available instead of large datasets. Addressing\nsuch needs, we propose a focused CD approach using the Segment Anything Model\n(SAM), a versatile vision foundation model. Our method leverages a binary mask\nof the object of interest in pre-change images to detect their disappearance in\npost-change images. By using SAM's robust segmentation capabilities, we create\nprompts from the pre-change mask, use those prompts to segment the post-change\nimage, and identify missing objects. This unsupervised approach demonstrated\nfor building disappearance detection, is adaptable to various domains requiring\nspecialized CD. Our contributions include defining a novel CD problem,\nproposing a method using SAM, and demonstrating its effectiveness. The proposed\nmethod also has benefits related to privacy preservation.\n","authors":["Tahir Ahmad","Sudipan Saha"],"pdf_url":"https://arxiv.org/pdf/2408.06644v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.06640v1","updated":"2024-08-13T05:21:03Z","published":"2024-08-13T05:21:03Z","title":"Attention Based Feature Fusion Network for Monkeypox Skin Lesion\n Detection","summary":" The recent monkeypox outbreak has raised significant public health concerns\ndue to its rapid spread across multiple countries. Monkeypox can be difficult\nto distinguish from chickenpox and measles in the early stages because the\nsymptoms of all three diseases are similar. Modern deep learning algorithms can\nbe used to identify diseases, including COVID-19, by analyzing images of the\naffected areas. In this study, we introduce a lightweight model that merges two\npre-trained architectures, EfficientNetV2B3 and ResNet151V2, to classify human\nmonkeypox disease. We have also incorporated the squeeze-and-excitation\nattention network module to focus on the important parts of the feature maps\nfor classifying the monkeypox images. This attention module provides channels\nand spatial attention to highlight significant areas within feature maps. We\nevaluated the effectiveness of our model by extensively testing it on a\npublicly available Monkeypox Skin Lesions Dataset using a four-fold\ncross-validation approach. The evaluation metrics of our model were compared\nwith the existing others. 
Our model achieves a mean validation accuracy of\n96.52%, with precision, recall, and F1-score values of 96.58%, 96.52%, and\n96.51%, respectively.\n","authors":["Niloy Kumar Kundu","Mainul Karim","Sarah Kobir","Dewan Md. Farid"],"pdf_url":"https://arxiv.org/pdf/2408.06640v1.pdf","comment":"6 pages with 6 figures"},{"id":"http://arxiv.org/abs/2408.05940v2","updated":"2024-08-13T05:18:42Z","published":"2024-08-12T06:33:38Z","title":"Spb3DTracker: A Robust LiDAR-Based Person Tracker for Noisy Environment","summary":" Person detection and tracking (PDT) has seen significant advancements with 2D\ncamera-based systems in the autonomous vehicle field, leading to widespread\nadoption of these algorithms. However, growing privacy concerns have recently\nemerged as a major issue, prompting a shift towards LiDAR-based PDT as a viable\nalternative. Within this domain, \"Tracking-by-Detection\" (TBD) has become a\nprominent methodology. Despite its effectiveness, LiDAR-based PDT has not yet\nachieved the same level of performance as camera-based PDT. This paper examines\nkey components of the LiDAR-based PDT framework, including detection\npost-processing, data association, motion modeling, and lifecycle management.\nBuilding upon these insights, we introduce SpbTrack, a robust person tracker\ndesigned for diverse environments. Our method achieves superior performance on\nnoisy datasets and state-of-the-art results on KITTI Dataset benchmarks and\ncustom office indoor dataset among LiDAR-based trackers.\n","authors":["Eunsoo Im","Changhyun Jee","Jung Kwon Lee"],"pdf_url":"https://arxiv.org/pdf/2408.05940v2.pdf","comment":"17 pages, 5 figures"},{"id":"http://arxiv.org/abs/2408.05518v2","updated":"2024-08-13T05:16:07Z","published":"2024-08-10T11:02:03Z","title":"Long working distance portable smartphone microscopy for metallic mesh\n defect detection","summary":" Metallic mesh is a transparent electromagnetic shielding film with a fine\nmetal line structure. However, it can develop defects that affect the\noptoelectronic performance whether in the production preparation or in actual\nuse. The development of in-situ non-destructive testing (NDT) devices for\nmetallic mesh requires long working distances, reflective optical path design,\nand miniaturization. To address the limitations of existing smartphone\nmicroscopes, which feature short working distances and inadequate transmission\nimaging for industrial in-situ inspection, we propose a novel long-working\ndistance reflective smartphone microscopy system (LD-RSM). LD-RSM builds a 4f\noptical imaging system with external optical components and a smartphone,\nutilizing a beam splitter to achieve reflective imaging with the illumination\nsystem and imaging system on the same side of the sample. It achieves an\noptical resolution of 4.92$\\mu$m and a working distance of up to 22.23 mm.\nAdditionally, we introduce a dual prior weighted Robust Principal Component\nAnalysis (DW-RPCA) for defect detection. This approach leverages spectral\nfilter fusion and Hough transform to model different defect types, enhancing\nthe accuracy and efficiency of defect identification. Coupled with an optimized\nthreshold segmentation algorithm, DW-RPCA method achieves a pixel-level\naccuracy of 84.8%. 
Our work showcases strong potential for growth in the field\nof in-situ on-line inspection of industrial products.\n","authors":["Zhengang Lu","Hongsheng Qin","Jing Li","Ming Sun","Jiubin Tan"],"pdf_url":"https://arxiv.org/pdf/2408.05518v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.06638v1","updated":"2024-08-13T05:08:13Z","published":"2024-08-13T05:08:13Z","title":"COD: Learning Conditional Invariant Representation for Domain Adaptation\n Regression","summary":" Aiming to generalize the label knowledge from a source domain with continuous\noutputs to an unlabeled target domain, Domain Adaptation Regression (DAR) is\ndeveloped for complex practical learning problems. However, due to the\ncontinuity problem in regression, existing conditional distribution alignment\ntheory and methods with discrete prior, which are proven to be effective in\nclassification settings, are no longer applicable. In this work, focusing on\nthe feasibility problems in DAR, we establish the sufficiency theory for the\nregression model, which shows the generalization error can be sufficiently\ndominated by the cross-domain conditional discrepancy. Further, to characterize\nconditional discrepancy with continuous conditioning variable, a novel\nConditional Operator Discrepancy (COD) is proposed, which admits the metric\nproperty on conditional distributions via the kernel embedding theory. Finally,\nto minimize the discrepancy, a COD-based conditional invariant representation\nlearning model is proposed, and the reformulation is derived to show that\nreasonable modifications on moment statistics can further improve the\ndiscriminability of the adaptation model. Extensive experiments on standard DAR\ndatasets verify the validity of theoretical results and the superiority over\nSOTA DAR methods.\n","authors":["Hao-Ran Yang","Chuan-Xian Ren","You-Wei Luo"],"pdf_url":"https://arxiv.org/pdf/2408.06638v1.pdf","comment":"Accepted to ECCV 2024 (oral)"},{"id":"http://arxiv.org/abs/2309.07640v3","updated":"2024-08-13T05:01:28Z","published":"2023-09-14T12:05:29Z","title":"Indoor Scene Reconstruction with Fine-Grained Details Using Hybrid\n Representation and Normal Prior Enhancement","summary":" The reconstruction of indoor scenes from multi-view RGB images is challenging\ndue to the coexistence of flat and texture-less regions alongside delicate and\nfine-grained regions. Recent methods leverage neural radiance fields aided by\npredicted surface normal priors to recover the scene geometry. These methods\nexcel in producing complete and smooth results for floor and wall areas.\nHowever, they struggle to capture complex surfaces with high-frequency\nstructures due to the inadequate neural representation and the inaccurately\npredicted normal priors. This work aims to reconstruct high-fidelity surfaces\nwith fine-grained details by addressing the above limitations. To improve the\ncapacity of the implicit representation, we propose a hybrid architecture to\nrepresent low-frequency and high-frequency regions separately. To enhance the\nnormal priors, we introduce a simple yet effective image sharpening and\ndenoising technique, coupled with a network that estimates the pixel-wise\nuncertainty of the predicted surface normal vectors. Identifying such\nuncertainty can prevent our model from being misled by unreliable surface\nnormal supervisions that hinder the accurate reconstruction of intricate\ngeometries. 
Experiments on the benchmark datasets show that our method\noutperforms existing methods in terms of reconstruction quality. Furthermore,\nthe proposed method also generalizes well to real-world indoor scenarios\ncaptured by our hand-held mobile phones. Our code is publicly available at:\nhttps://github.com/yec22/Fine-Grained-Indoor-Recon.\n","authors":["Sheng Ye","Yubin Hu","Matthieu Lin","Yu-Hui Wen","Wang Zhao","Yong-Jin Liu","Wenping Wang"],"pdf_url":"https://arxiv.org/pdf/2309.07640v3.pdf","comment":"accepted by TVCG"},{"id":"http://arxiv.org/abs/2408.06636v1","updated":"2024-08-13T04:56:45Z","published":"2024-08-13T04:56:45Z","title":"Unified-IoU: For High-Quality Object Detection","summary":" Object detection is an important part in the field of computer vision, and\nthe effect of object detection is directly determined by the regression\naccuracy of the prediction box. As the key to model training, IoU (Intersection\nover Union) greatly shows the difference between the current prediction box and\nthe Ground Truth box. Subsequent researchers have continuously added more\nconsiderations to IoU, such as center distance, aspect ratio, and so on.\nHowever, there is an upper limit to just refining the geometric differences;\nAnd there is a potential connection between the new consideration index and the\nIoU itself, and the direct addition or subtraction between the two may lead to\nthe problem of \"over-consideration\". Based on this, we propose a new IoU loss\nfunction, called Unified-IoU (UIoU), which is more concerned with the weight\nassignment between different quality prediction boxes. Specifically, the loss\nfunction dynamically shifts the model's attention from low-quality prediction\nboxes to high-quality prediction boxes in a novel way to enhance the model's\ndetection performance on high-precision or intensive datasets and achieve a\nbalance in training speed. Our proposed method achieves better performance on\nmultiple datasets, especially at a high IoU threshold, UIoU has a more\nsignificant improvement effect compared with other improved IoU losses. Our\ncode is publicly available at: https://github.com/lxj-drifter/UIOU_files.\n","authors":["Xiangjie Luo","Zhihao Cai","Bo Shao","Yingxun Wang"],"pdf_url":"https://arxiv.org/pdf/2408.06636v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.06635v1","updated":"2024-08-13T04:53:48Z","published":"2024-08-13T04:53:48Z","title":"IDRetracor: Towards Visual Forensics Against Malicious Face Swapping","summary":" The face swapping technique based on deepfake methods poses significant\nsocial risks to personal identity security. While numerous deepfake detection\nmethods have been proposed as countermeasures against malicious face swapping,\nthey can only output binary labels (Fake/Real) for distinguishing fake content\nwithout reliable and traceable evidence. To achieve visual forensics and target\nface attribution, we propose a novel task named face retracing, which considers\nretracing the original target face from the given fake one via inverse mapping.\nToward this goal, we propose an IDRetracor that can retrace arbitrary original\ntarget identities from fake faces generated by multiple face swapping methods.\nSpecifically, we first adopt a mapping resolver to perceive the possible\nsolution space of the original target face for the inverse mappings. Then, we\npropose mapping-aware convolutions to retrace the original target face from the\nfake one. 
Such convolutions contain multiple kernels that can be combined under\nthe control of the mapping resolver to tackle different face swapping mappings\ndynamically. Extensive experiments demonstrate that the IDRetracor exhibits\npromising retracing performance from both quantitative and qualitative\nperspectives.\n","authors":["Jikang Cheng","Jiaxin Ai","Zhen Han","Chao Liang","Qin Zou","Zhongyuan Wang","Qian Wang"],"pdf_url":"https://arxiv.org/pdf/2408.06635v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.06633v1","updated":"2024-08-13T04:42:02Z","published":"2024-08-13T04:42:02Z","title":"A lightweight YOLOv5-FFM model for occlusion pedestrian detection","summary":" The development of autonomous driving technology must be inseparable from\npedestrian detection. Because of the fast speed of the vehicle, the accuracy\nand real-time performance of the pedestrian detection algorithm are very\nimportant. YOLO, as an efficient and simple one-stage target detection method,\nis often used for pedestrian detection in various environments. However, this\nseries of detectors face some challenges, such as excessive computation and\nundesirable detection rate when facing occluded pedestrians. In this paper, we\npropose an improved lightweight YOLOv5 model to deal with these problems. This\nmodel can achieve better pedestrian detection accuracy with fewer\nfloating-point operations (FLOPs), especially for occluded targets. In order to\nachieve the above goals, we made improvements based on the YOLOv5 model\nframework and introduced Ghost module and SE block. Furthermore, we designed a\nlocal feature fusion module (FFM) to deal with occlusion in pedestrian\ndetection. To verify the validity of our method, two datasets, Citypersons and\nCUHK Occlusion, were selected for the experiment. The experimental results show\nthat, compared with the original yolov5s model, the average precision (AP) of\nour method is significantly improved, while the number of parameters is reduced\nby 27.9% and FLOPs are reduced by 19.0%.\n","authors":["Xiangjie Luo","Bo Shao","Zhihao Cai","Yingxun Wang"],"pdf_url":"https://arxiv.org/pdf/2408.06633v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.11459v3","updated":"2024-08-13T04:35:02Z","published":"2023-12-18T18:59:05Z","title":"VolumeDiffusion: Flexible Text-to-3D Generation with Efficient\n Volumetric Encoder","summary":" This paper introduces a pioneering 3D volumetric encoder designed for\ntext-to-3D generation. To scale up the training data for the diffusion model, a\nlightweight network is developed to efficiently acquire feature volumes from\nmulti-view images. The 3D volumes are then trained on a diffusion model for\ntext-to-3D generation using a 3D U-Net. This research further addresses the\nchallenges of inaccurate object captions and high-dimensional feature volumes.\nThe proposed model, trained on the public Objaverse dataset, demonstrates\npromising outcomes in producing diverse and recognizable samples from text\nprompts. Notably, it empowers finer control over object part characteristics\nthrough textual cues, fostering model creativity by seamlessly combining\nmultiple concepts within a single object. 
This research significantly\ncontributes to the progress of 3D generation by introducing an efficient,\nflexible, and scalable representation methodology.\n","authors":["Zhicong Tang","Shuyang Gu","Chunyu Wang","Ting Zhang","Jianmin Bao","Dong Chen","Baining Guo"],"pdf_url":"https://arxiv.org/pdf/2312.11459v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.14623v4","updated":"2024-08-13T04:34:58Z","published":"2024-03-21T17:59:41Z","title":"Simplified Diffusion Schrödinger Bridge","summary":" This paper introduces a novel theoretical simplification of the Diffusion\nSchr\\\"odinger Bridge (DSB) that facilitates its unification with Score-based\nGenerative Models (SGMs), addressing the limitations of DSB in complex data\ngeneration and enabling faster convergence and enhanced performance. By\nemploying SGMs as an initial solution for DSB, our approach capitalizes on the\nstrengths of both frameworks, ensuring a more efficient training process and\nimproving the performance of SGM. We also propose a reparameterization\ntechnique that, despite theoretical approximations, practically improves the\nnetwork's fitting capabilities. Our extensive experimental evaluations confirm\nthe effectiveness of the simplified DSB, demonstrating its significant\nimprovements. We believe the contributions of this work pave the way for\nadvanced generative modeling.\n","authors":["Zhicong Tang","Tiankai Hang","Shuyang Gu","Dong Chen","Baining Guo"],"pdf_url":"https://arxiv.org/pdf/2403.14623v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.06629v1","updated":"2024-08-13T04:33:23Z","published":"2024-08-13T04:33:23Z","title":"Fast Information Streaming Handler (FisH): A Unified Seismic Neural\n Network for Single Station Real-Time Earthquake Early Warning","summary":" Existing EEW approaches often treat phase picking, location estimation, and\nmagnitude estimation as separate tasks, lacking a unified framework.\nAdditionally, most deep learning models in seismology rely on full\nthree-component waveforms and are not suitable for real-time streaming data. To\naddress these limitations, we propose a novel unified seismic neural network\ncalled Fast Information Streaming Handler (FisH). FisH is designed to process\nreal-time streaming seismic data and generate simultaneous results for phase\npicking, location estimation, and magnitude estimation in an end-to-end\nfashion. By integrating these tasks within a single model, FisH simplifies the\noverall process and leverages the nonlinear relationships between tasks for\nimproved performance. The FisH model utilizes RetNet as its backbone, enabling\nparallel processing during training and recurrent handling during inference.\nThis capability makes FisH suitable for real-time applications, reducing\nlatency in EEW systems. Extensive experiments conducted on the STEAD benchmark\ndataset provide strong validation for the effectiveness of our proposed FisH\nmodel. The results demonstrate that FisH achieves impressive performance across\nmultiple seismic event detection and characterization tasks. Specifically, it\nachieves an F1 score of 0.99/0.96. Also, FisH demonstrates precise earthquake\nlocation estimation, with location error of only 6.0km, a distance error of\n2.6km, and a back-azimuth error of 19{\\deg}. 
The model also exhibits accurate\nearthquake magnitude estimation, with a magnitude error of just 0.14.\nAdditionally, FisH is capable of generating real-time estimations, providing\nlocation and magnitude estimations with a location error of 8.06km and a\nmagnitude error of 0.18 within a mere 3 seconds after the P-wave arrives.\n","authors":["Tianning Zhang","Feng Liu","Yuming Yuan","Rui Su","Wanli Ouyang","Lei Bai"],"pdf_url":"https://arxiv.org/pdf/2408.06629v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.06625v1","updated":"2024-08-13T04:25:13Z","published":"2024-08-13T04:25:13Z","title":"DePatch: Towards Robust Adversarial Patch for Evading Person Detectors\n in the Real World","summary":" Recent years have seen an increasing interest in physical adversarial\nattacks, which aim to craft deployable patterns for deceiving deep neural\nnetworks, especially for person detectors. However, the adversarial patterns of\nexisting patch-based attacks heavily suffer from the self-coupling issue, where\na degradation, caused by physical transformations, in any small patch segment\ncan result in a complete adversarial dysfunction, leading to poor robustness in\nthe complex real world. Upon this observation, we introduce the Decoupled\nadversarial Patch (DePatch) attack to address the self-coupling issue of\nadversarial patches. Specifically, we divide the adversarial patch into\nblock-wise segments, and reduce the inter-dependency among these segments\nthrough randomly erasing out some segments during the optimization. We further\nintroduce a border shifting operation and a progressive decoupling strategy to\nimprove the overall attack capabilities. Extensive experiments demonstrate the\nsuperior performance of our method over other physical adversarial attacks,\nespecially in the real world.\n","authors":["Jikang Cheng","Ying Zhang","Zhongyuan Wang","Zou Qin","Chen Li"],"pdf_url":"https://arxiv.org/pdf/2408.06625v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.06622v1","updated":"2024-08-13T04:18:32Z","published":"2024-08-13T04:18:32Z","title":"ActPrompt: In-Domain Feature Adaptation via Action Cues for Video\n Temporal Grounding","summary":" Video temporal grounding is an emerging topic aiming to identify specific\nclips within videos. In addition to pre-trained video models, contemporary\nmethods utilize pre-trained vision-language models (VLM) to capture detailed\ncharacteristics of diverse scenes and objects from video frames. However, as\npre-trained on images, VLM may struggle to distinguish action-sensitive\npatterns from static objects, making it necessary to adapt them to specific\ndata domains for effective feature representation over temporal grounding. We\naddress two primary challenges to achieve this goal. Specifically, to mitigate\nhigh adaptation costs, we propose an efficient preliminary in-domain\nfine-tuning paradigm for feature adaptation, where downstream-adaptive features\nare learned through several pretext tasks. Furthermore, to integrate\naction-sensitive information into VLM, we introduce Action-Cue-Injected\nTemporal Prompt Learning (ActPrompt), which injects action cues into the image\nencoder of VLM for better discovering action-sensitive patterns. Extensive\nexperiments demonstrate that ActPrompt is an off-the-shelf training framework\nthat can be effectively applied to various SOTA methods, resulting in notable\nimprovements. 
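The DePatch abstract above divides an adversarial patch into block-wise segments and randomly erases some of them during optimization so that no segment's adversarial effect depends on the others. A small, hedged PyTorch sketch of that erasing step; the grid size, erase probability, and the helper names in the usage comment are assumptions rather than the paper's values.

```python
import torch

def randomly_erase_segments(patch: torch.Tensor, grid: int = 4, p_erase: float = 0.3) -> torch.Tensor:
    """Zero out a random subset of grid x grid blocks of a (C, H, W) patch.

    Optimizing the detector loss through such partially erased patches discourages
    inter-segment dependencies (the "self-coupling" issue). Assumes H and W are
    divisible by `grid`.
    """
    c, h, w = patch.shape
    keep = (torch.rand(grid, grid, device=patch.device) > p_erase).float()
    # Upsample the block-level keep mask to pixel resolution.
    mask = keep.repeat_interleave(h // grid, dim=0).repeat_interleave(w // grid, dim=1)
    return patch * mask.unsqueeze(0)

# patch = torch.rand(3, 128, 128, requires_grad=True)
# loss = detector_loss(apply_patch(image, randomly_erase_segments(patch)))  # hypothetical helpers
```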
The complete code used in this study is provided in the\nsupplementary materials.\n","authors":["Yubin Wang","Xinyang Jiang","De Cheng","Dongsheng Li","Cairong Zhao"],"pdf_url":"https://arxiv.org/pdf/2408.06622v1.pdf","comment":"9 pages, 5 figures"},{"id":"http://arxiv.org/abs/2403.16425v2","updated":"2024-08-13T04:16:48Z","published":"2024-03-25T05:10:34Z","title":"Enhancing Visual Place Recognition via Fast and Slow Adaptive Biasing in\n Event Cameras","summary":" Event cameras are increasingly popular in robotics due to beneficial features\nsuch as low latency, energy efficiency, and high dynamic range. Nevertheless,\ntheir downstream task performance is greatly influenced by the optimization of\nbias parameters. These parameters, for instance, regulate the necessary change\nin light intensity to trigger an event, which in turn depends on factors such\nas the environment lighting and camera motion. This paper introduces feedback\ncontrol algorithms that automatically tune the bias parameters through two\ninteracting methods: 1) An immediate, on-the-fly \\textit{fast} adaptation of\nthe refractory period, which sets the minimum interval between consecutive\nevents, and 2) if the event rate exceeds the specified bounds even after\nchanging the refractory period repeatedly, the controller adapts the pixel\nbandwidth and event thresholds, which stabilizes after a short period of noise\nevents across all pixels (\\textit{slow} adaptation). Our evaluation focuses on\nthe visual place recognition task, where incoming query images are compared to\na given reference database. We conducted comprehensive evaluations of our\nalgorithms' adaptive feedback control in real-time. To do so, we collected the\nQCR-Fast-and-Slow dataset that contains DAVIS346 event camera streams from 366\nrepeated traversals of a Scout Mini robot navigating through a 100 meter long\nindoor lab setting (totaling over 35km distance traveled) in varying brightness\nconditions with ground truth location information. Our proposed feedback\ncontrollers result in superior performance when compared to the standard bias\nsettings and prior feedback control methods. Our findings also detail the\nimpact of bias adjustments on task performance and feature ablation studies on\nthe fast and slow adaptation mechanisms.\n","authors":["Gokul B. Nair","Michael Milford","Tobias Fischer"],"pdf_url":"https://arxiv.org/pdf/2403.16425v2.pdf","comment":"8 pages, 9 figures, paper accepted to the 2024 IEEE/RSJ International\n Conference on Intelligent Robots and Systems (IROS 2024)"},{"id":"http://arxiv.org/abs/2407.01537v2","updated":"2024-08-13T04:15:32Z","published":"2024-03-12T14:06:27Z","title":"WaveShot: A Compact Portable Unmanned Surface Vessel for Dynamic Water\n Surface Videography and Media Production","summary":" This paper presents WaveShot, an innovative portable unmanned surface vessel\nthat aims to transform water surface videography by offering a highly\nmaneuverable, cost-effective, and safe alternative to traditional filming\nmethods. WaveShot is designed for the modern demands of film production,\nadvertising, documentaries, and visual arts, equipped with professional-grade\nwaterproof cameras and advanced technology to capture static and dynamic scenes\non waterways. We discuss the development and advantages of WaveShot,\nhighlighting its portability, ease of transport, and rapid deployment\ncapabilities. 
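The fast-and-slow adaptive biasing abstract above describes a two-level feedback controller for event cameras: a fast loop adjusts the refractory period from the instantaneous event rate, and a slow loop adjusts pixel bandwidth and event thresholds when the rate stays out of bounds. The schematic sketch below illustrates that control structure only; all gains, bounds, and bias names are assumptions and do not correspond to the paper's values or to the DAVIS346 driver API.

```python
class BiasController:
    """Keep an event camera's event rate inside [low, high] via fast and slow feedback loops."""

    def __init__(self, low=1e5, high=1e6, patience=5):
        self.low, self.high = low, high      # events/s bounds (illustrative)
        self.refractory_us = 100.0           # fast knob: refractory period
        self.threshold = 0.25                # slow knob: contrast threshold
        self.bandwidth = 1.0                 # slow knob: pixel bandwidth scale
        self.patience = patience
        self._out_of_bounds = 0

    def update(self, event_rate: float) -> dict:
        if event_rate > self.high:
            self.refractory_us *= 1.5        # fast adaptation: throttle the event stream
            self._out_of_bounds += 1
        elif event_rate < self.low:
            self.refractory_us = max(1.0, self.refractory_us / 1.5)
            self._out_of_bounds += 1
        else:
            self._out_of_bounds = 0
        # Slow adaptation: if repeated refractory changes did not help, move the other biases.
        if self._out_of_bounds > self.patience:
            if event_rate > self.high:
                self.threshold *= 1.1
                self.bandwidth *= 0.9
            else:
                self.threshold *= 0.9
                self.bandwidth *= 1.1
            self._out_of_bounds = 0
        return {"refractory_us": self.refractory_us,
                "threshold": self.threshold,
                "bandwidth": self.bandwidth}
```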
Experimental validation showcasing WaveShot's stability and\nhigh-quality video capture in various water conditions, and the integration of\nmonocular depth estimation algorithms to enhance the operator's spatial\nperception. The paper concludes by exploring WaveShot's real-world\napplications, its user-friendly remote operation, and future enhancements such\nas gimbal integration and advanced computer vision for optimized videography on\nwater surfaces.\n","authors":["Shijian Ma","Shicong Ma","Jianhao Jiao"],"pdf_url":"https://arxiv.org/pdf/2407.01537v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.02586v2","updated":"2024-08-13T04:04:26Z","published":"2024-05-04T06:53:18Z","title":"Enhancing Vision-Language Models Generalization via Diversity-Driven\n Novel Feature Synthesis","summary":" Vision-language foundation models like CLIP have shown impressive zero-shot\ngeneralization, but finetuning on downstream datasets can cause overfitting and\nloss of its generalization ability on unseen domains. Although collecting\nadditional data from new domains of interest is possible, this method is often\nimpractical due to the challenges in obtaining annotated data. To address this,\nwe propose a plug-and-play feature synthesis method called LDFS\n(Language-Guided Diverse Feature Synthesis) to synthesize new domain features\nand improve existing CLIP fine-tuning strategies. LDFS has three main\ncontributions: 1) To synthesize novel domain features and promote diversity, we\npropose an instance-conditional feature augmentation strategy based on a\ntext-guided feature augmentation loss. 2) To maintain feature quality after\naugmenting, we introduce a pairwise regularizer to preserve augmented feature\ncoherence within the CLIP feature space. 3) We propose to use stochastic text\nfeature augmentation to reduce the modality gap and further facilitate the\nprocess of text-guided feature synthesis. Extensive experiments show LDFS\nsuperiority in improving CLIP generalization ability on unseen domains without\ncollecting data from those domains. The code will be made publicly available.\n","authors":["Siyuan Yan","Cheng Luo","Zhen Yu","Zongyuan Ge"],"pdf_url":"https://arxiv.org/pdf/2405.02586v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.06614v1","updated":"2024-08-13T03:57:35Z","published":"2024-08-13T03:57:35Z","title":"ViMo: Generating Motions from Casual Videos","summary":" Although humans have the innate ability to imagine multiple possible actions\nfrom videos, it remains an extraordinary challenge for computers due to the\nintricate camera movements and montages. Most existing motion generation\nmethods predominantly rely on manually collected motion datasets, usually\ntediously sourced from motion capture (Mocap) systems or Multi-View cameras,\nunavoidably resulting in a limited size that severely undermines their\ngeneralizability. Inspired by recent advance of diffusion models, we probe a\nsimple and effective way to capture motions from videos and propose a novel\nVideo-to-Motion-Generation framework (ViMo) which could leverage the immense\ntrove of untapped video content to produce abundant and diverse 3D human\nmotions. Distinct from prior work, our videos could be more causal, including\ncomplicated camera movements and occlusions. Striking experimental results\ndemonstrate the proposed model could generate natural motions even for videos\nwhere rapid movements, varying perspectives, or frequent occlusions might\nexist. 
We also show this work could enable three important downstream\napplications, such as generating dancing motions according to arbitrary music\nand source video style. Extensive experimental results prove that our model\noffers an effective and scalable way to generate diversity and realistic\nmotions. Code and demos will be public soon.\n","authors":["Liangdong Qiu","Chengxing Yu","Yanran Li","Zhao Wang","Haibin Huang","Chongyang Ma","Di Zhang","Pengfei Wan","Xiaoguang Han"],"pdf_url":"https://arxiv.org/pdf/2408.06614v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.06610v1","updated":"2024-08-13T03:45:11Z","published":"2024-08-13T03:45:11Z","title":"CROME: Cross-Modal Adapters for Efficient Multimodal LLM","summary":" Multimodal Large Language Models (MLLMs) demonstrate remarkable\nimage-language capabilities, but their widespread use faces challenges in\ncost-effective training and adaptation. Existing approaches often necessitate\nexpensive language model retraining and limited adaptability. Additionally, the\ncurrent focus on zero-shot performance improvements offers insufficient\nguidance for task-specific tuning. We propose CROME, an efficient\nvision-language instruction tuning framework. It features a novel gated\ncross-modal adapter that effectively combines visual and textual\nrepresentations prior to input into a frozen LLM. This lightweight adapter,\ntrained with minimal parameters, enables efficient cross-modal understanding.\nNotably, CROME demonstrates superior zero-shot performance on standard visual\nquestion answering and instruction-following benchmarks. Moreover, it yields\nfine-tuning with exceptional parameter efficiency, competing with task-specific\nspecialist state-of-the-art methods. CROME demonstrates the potential of pre-LM\nalignment for building scalable, adaptable, and parameter-efficient multimodal\nmodels.\n","authors":["Sayna Ebrahimi","Sercan O. Arik","Tejas Nama","Tomas Pfister"],"pdf_url":"https://arxiv.org/pdf/2408.06610v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05220v2","updated":"2024-08-13T03:43:30Z","published":"2024-04-08T06:32:11Z","title":"StylizedGS: Controllable Stylization for 3D Gaussian Splatting","summary":" As XR technology continues to advance rapidly, 3D generation and editing are\nincreasingly crucial. Among these, stylization plays a key role in enhancing\nthe appearance of 3D models. By utilizing stylization, users can achieve\nconsistent artistic effects in 3D editing using a single reference style image,\nmaking it a user-friendly editing method. However, recent NeRF-based 3D\nstylization methods encounter efficiency issues that impact the user\nexperience, and their implicit nature limits their ability to accurately\ntransfer geometric pattern styles. Additionally, the ability for artists to\napply flexible control over stylized scenes is considered highly desirable to\nfoster an environment conducive to creative exploration. To address the above\nissues, we introduce StylizedGS, an efficient 3D neural style transfer\nframework with adaptable control over perceptual factors based on 3D Gaussian\nSplatting (3DGS) representation. We propose a filter-based refinement to\neliminate floaters that affect the stylization effects in the scene\nreconstruction process. The nearest neighbor-based style loss is introduced to\nachieve stylization by fine-tuning the geometry and color parameters of 3DGS,\nwhile a depth preservation loss with other regularizations is proposed to\nprevent the tampering of geometry content. 
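The CROME abstract above centers on a lightweight gated cross-modal adapter that fuses visual and textual representations before they enter a frozen LLM. The PyTorch module below is one minimal interpretation of such a gated fusion layer; the dimensions, gating form, and projection layout are assumptions, not the published architecture.

```python
import torch
import torch.nn as nn

class GatedCrossModalAdapter(nn.Module):
    """Fuse visual and text token embeddings with a learned per-dimension gate."""
    def __init__(self, vis_dim: int, txt_dim: int, llm_dim: int):
        super().__init__()
        self.vis_proj = nn.Linear(vis_dim, llm_dim)
        self.txt_proj = nn.Linear(txt_dim, llm_dim)
        self.gate = nn.Sequential(nn.Linear(2 * llm_dim, llm_dim), nn.Sigmoid())

    def forward(self, vis_tokens, txt_tokens):
        v = self.vis_proj(vis_tokens)                    # (B, Nv, D)
        t = self.txt_proj(txt_tokens)                    # (B, Nt, D)
        # Pool the text side so each visual token can be gated by the prompt context.
        t_ctx = t.mean(dim=1, keepdim=True).expand_as(v)
        g = self.gate(torch.cat([v, t_ctx], dim=-1))
        fused_visual = g * v + (1 - g) * t_ctx           # gated mixture, fed to a frozen LLM
        return torch.cat([fused_visual, t], dim=1)

# adapter = GatedCrossModalAdapter(1024, 4096, 4096)  # only the adapter parameters are trained
```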
Moreover, facilitated by specially\ndesigned losses, StylizedGS enables users to control color, stylized scale, and\nregions during the stylization to possess customization capabilities. Our\nmethod achieves high-quality stylization results characterized by faithful\nbrushstrokes and geometric consistency with flexible controls. Extensive\nexperiments across various scenes and styles demonstrate the effectiveness and\nefficiency of our method concerning both stylization quality and inference\nspeed.\n","authors":["Dingxi Zhang","Yu-Jie Yuan","Zhuoxun Chen","Fang-Lue Zhang","Zhenliang He","Shiguang Shan","Lin Gao"],"pdf_url":"https://arxiv.org/pdf/2404.05220v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.16828v3","updated":"2024-08-13T03:41:48Z","published":"2024-04-25T17:59:56Z","title":"Made to Order: Discovering monotonic temporal changes via\n self-supervised video ordering","summary":" Our objective is to discover and localize monotonic temporal changes in a\nsequence of images. To achieve this, we exploit a simple proxy task of ordering\na shuffled image sequence, with `time' serving as a supervisory signal, since\nonly changes that are monotonic with time can give rise to the correct\nordering. We also introduce a transformer-based model for ordering of image\nsequences of arbitrary length with built-in attribution maps. After training,\nthe model successfully discovers and localizes monotonic changes while ignoring\ncyclic and stochastic ones. We demonstrate applications of the model in\nmultiple domains covering different scene and object types, discovering both\nobject-level and environmental changes in unseen sequences. We also demonstrate\nthat the attention-based attribution maps function as effective prompts for\nsegmenting the changing regions, and that the learned representations can be\nused for downstream applications. Finally, we show that the model achieves the\nstate-of-the-art on standard benchmarks for image ordering.\n","authors":["Charig Yang","Weidi Xie","Andrew Zisserman"],"pdf_url":"https://arxiv.org/pdf/2404.16828v3.pdf","comment":"ECCV 2024 Oral. Project page: https://charigyang.github.io/order/"},{"id":"http://arxiv.org/abs/2408.06604v1","updated":"2024-08-13T03:37:13Z","published":"2024-08-13T03:37:13Z","title":"MV-DETR: Multi-modality indoor object detection by Multi-View DEtecton\n TRansformers","summary":" We introduce a novel MV-DETR pipeline which is effective while efficient\ntransformer based detection method. Given input RGBD data, we notice that there\nare super strong pretraining weights for RGB data while less effective works\nfor depth related data. First and foremost , we argue that geometry and texture\ncues are both of vital importance while could be encoded separately. Secondly,\nwe find that visual texture feature is relatively hard to extract compared with\ngeometry feature in 3d space. Unfortunately, single RGBD dataset with thousands\nof data is not enough for training an discriminating filter for visual texture\nfeature extraction. Last but certainly not the least, we designed a lightweight\nVG module consists of a visual textual encoder, a geometry encoder and a VG\nconnector. Compared with previous state of the art works like V-DETR, gains\nfrom pretrained visual encoder could be seen. Extensive experiments on\nScanNetV2 dataset shows the effectiveness of our method. 
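The "Made to Order" abstract above uses ordering a shuffled image sequence, with time as the supervisory signal, as a proxy task for discovering monotonic changes. One simple way to express that supervision is a pairwise ranking loss on per-frame scores that should increase with the true temporal index; the sketch below is a hedged illustration of this idea only (the paper's transformer-based ordering model and attribution maps are not reproduced).

```python
import torch
import torch.nn.functional as F

def ordering_loss(frame_scores: torch.Tensor, true_index: torch.Tensor, margin: float = 0.1):
    """Pairwise hinge loss: frames later in time should receive higher scores.

    frame_scores: (N,) scalar score per (shuffled) frame from any encoder.
    true_index:   (N,) each frame's original temporal position.
    """
    diff_score = frame_scores.unsqueeze(0) - frame_scores.unsqueeze(1)   # s_j - s_i
    diff_time = true_index.unsqueeze(0) - true_index.unsqueeze(1)        # t_j - t_i
    later = (diff_time > 0).float()
    # Penalize pairs where the later frame does not outscore the earlier one by `margin`.
    return (later * F.relu(margin - diff_score)).sum() / later.sum().clamp(min=1)

# scores = encoder(shuffled_frames).squeeze(-1)       # hypothetical per-frame encoder
# loss = ordering_loss(scores, original_positions)
```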
It is worth mentioned\nthat our method achieve 78\\% AP which create new state of the art on ScanNetv2\nbenchmark.\n","authors":["Zichao Dong","Yilin Zhang","Xufeng Huang","Hang Ji","Zhan Shi","Xin Zhan","Junbo Chen"],"pdf_url":"https://arxiv.org/pdf/2408.06604v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.06600v1","updated":"2024-08-13T03:32:59Z","published":"2024-08-13T03:32:59Z","title":"Deep Inertia $L_p$ Half-Quadratic Splitting Unrolling Network for Sparse\n View CT Reconstruction","summary":" Sparse view computed tomography (CT) reconstruction poses a challenging\nill-posed inverse problem, necessitating effective regularization techniques.\nIn this letter, we employ $L_p$-norm ($0\\cos^2(\\pi/8)$.\n States with stabilizer dimension $n - t$: We give a protocol that runs in\ntime $n^3\\cdot(2^t/\\tau)^{O(\\log(1/\\epsilon))}$, extending recent work on\nlearning quantum states prepared by circuits with few non-Clifford gates, which\nonly applied in the realizable setting where $\\tau = 1$ [30, 37, 46, 61].\n Discrete product states: If $C = K^{\\otimes n}$ for some $\\mu$-separated\ndiscrete set $K$ of single-qubit states, we give a protocol that runs in time\n$(n/\\mu)^{O((1 + \\log (1/\\tau))/\\mu)}/\\epsilon^2$. This strictly generalizes a\nprior guarantee which applied to stabilizer product states [39]. For stabilizer\nproduct states, we give a further improved protocol that runs in time\n$(n^2/\\epsilon^2)\\cdot (1/\\tau)^{O(\\log(1/\\tau))}$.\n As a corollary, we give the first protocol for estimating stabilizer\nfidelity, a standard measure of magic for quantum states, to error $\\epsilon$\nin $n^3 \\mathrm{quasipoly}(1/\\epsilon)$ time.\n","authors":["Sitan Chen","Weiyuan Gong","Qi Ye","Zhihan Zhang"],"pdf_url":"https://arxiv.org/pdf/2408.06967v1.pdf","comment":"68 pages"},{"id":"http://arxiv.org/abs/2408.06966v1","updated":"2024-08-13T15:21:46Z","published":"2024-08-13T15:21:46Z","title":"DyG-Mamba: Continuous State Space Modeling on Dynamic Graphs","summary":" Dynamic graph learning aims to uncover evolutionary laws in real-world\nsystems, enabling accurate social recommendation (link prediction) or early\ndetection of cancer cells (classification). Inspired by the success of state\nspace models, e.g., Mamba, for efficiently capturing long-term dependencies in\nlanguage modeling, we propose DyG-Mamba, a new continuous state space model\n(SSM) for dynamic graph learning. Specifically, we first found that using\ninputs as control signals for SSM is not suitable for continuous-time dynamic\nnetwork data with irregular sampling intervals, resulting in models being\ninsensitive to time information and lacking generalization properties. 
Drawing\ninspiration from the Ebbinghaus forgetting curve, which suggests that memory of\npast events is strongly correlated with time intervals rather than specific\ndetails of the events themselves, we directly utilize irregular time spans as\ncontrol signals for SSM to achieve significant robustness and generalization.\nThrough exhaustive experiments on 12 datasets for dynamic link prediction and\ndynamic node classification tasks, we found that DyG-Mamba achieves\nstate-of-the-art performance on most of the datasets, while also demonstrating\nsignificantly improved computation and memory efficiency.\n","authors":["Dongyuan Li","Shiyin Tan","Ying Zhang","Ming Jin","Shirui Pan","Manabu Okumura","Renhe Jiang"],"pdf_url":"https://arxiv.org/pdf/2408.06966v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.06960v1","updated":"2024-08-13T15:17:03Z","published":"2024-08-13T15:17:03Z","title":"Measuring User Understanding in Dialogue-based XAI Systems","summary":" The field of eXplainable Artificial Intelligence (XAI) is increasingly\nrecognizing the need to personalize and/or interactively adapt the explanation\nto better reflect users' explanation needs. While dialogue-based approaches to\nXAI have been proposed recently, the state-of-the-art in XAI is still\ncharacterized by what we call one-shot, non-personalized and one-way\nexplanations. In contrast, dialogue-based systems that can adapt explanations\nthrough interaction with a user promise to be superior to GUI-based or\ndashboard explanations as they offer a more intuitive way of requesting\ninformation. In general, while interactive XAI systems are often evaluated in\nterms of user satisfaction, there are limited studies that access user's\nobjective model understanding. This is in particular the case for\ndialogue-based XAI approaches. In this paper, we close this gap by carrying out\ncontrolled experiments within a dialogue framework in which we measure\nunderstanding of users in three phases by asking them to simulate the\npredictions of the model they are learning about. By this, we can quantify the\nlevel of (improved) understanding w.r.t. how the model works, comparing the\nstate prior, and after the interaction. We further analyze the data to reveal\npatterns of how the interaction between groups with high vs. low understanding\ngain differ. Overall, our work thus contributes to our understanding about the\neffectiveness of XAI approaches.\n","authors":["Dimitry Mindlin","Amelie Sophie Robrecht","Michael Morasch","Philipp Cimiano"],"pdf_url":"https://arxiv.org/pdf/2408.06960v1.pdf","comment":"Accepted at the ECAI 2024 main conference - final version and code\n coming soon. 8 pages, 5 figures"},{"id":"http://arxiv.org/abs/2408.06958v1","updated":"2024-08-13T15:15:37Z","published":"2024-08-13T15:15:37Z","title":"AuToMATo: A Parameter-Free Persistence-Based Clustering Algorithm","summary":" We present AuToMATo, a novel parameter-free clustering algorithm based on\npersistent homology. AuToMATo combines the existing ToMATo clustering algorithm\nwith a bootstrapping procedure in order to separate significant peaks of an\nestimated density function from non-significant ones. We perform a thorough\ncomparison of AuToMATo against many other state-of-the-art clustering\nalgorithms. We find that not only that AuToMATo compares favorably against\nother parameter-free clustering algorithms, but in many instances also\nsignificantly outperforms even the best selection of parameters for other\nalgorithms. 
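The DyG-Mamba abstract above argues that irregular time spans, rather than the event contents, should drive the state-space recurrence, echoing the Ebbinghaus forgetting curve. The toy recurrence below shows one minimal form such a time-controlled SSM can take: the hidden state decays exponentially with the elapsed interval before each new event is absorbed. This is an illustrative sketch, not the paper's parameterization.

```python
import torch
import torch.nn as nn

class TimeControlledSSM(nn.Module):
    """Diagonal continuous-time SSM whose forgetting is driven by irregular time gaps."""
    def __init__(self, in_dim: int, state_dim: int):
        super().__init__()
        self.log_decay = nn.Parameter(torch.zeros(state_dim))  # per-channel decay rate (log-space)
        self.B = nn.Linear(in_dim, state_dim, bias=False)
        self.C = nn.Linear(state_dim, in_dim, bias=False)

    def forward(self, x: torch.Tensor, dt: torch.Tensor):
        """x: (T, in_dim) event features; dt: (T,) time elapsed since the previous event."""
        decay_rate = torch.exp(self.log_decay)                  # positive decay rates
        h = torch.zeros(self.B.out_features, device=x.device)
        outs = []
        for t in range(x.shape[0]):
            h = torch.exp(-dt[t] * decay_rate) * h + self.B(x[t])   # older memory fades with dt
            outs.append(self.C(h))
        return torch.stack(outs)

# ssm = TimeControlledSSM(in_dim=16, state_dim=64)
# y = ssm(torch.randn(10, 16), torch.rand(10))    # irregular gaps act as the control signal
```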
AuToMATo is motivated by applications in topological data analysis,\nin particular the Mapper algorithm, where it is desirable to work with a\nparameter-free clustering algorithm. Indeed, we provide evidence that AuToMATo\nperforms well when used with Mapper. Finally, we provide an open-source\nimplementation of AuToMATo in Python that is fully compatible with the\nstandardscikit-learn architecture.\n","authors":["Marius Huber","Sara Kalisnik","Patrick Schnider"],"pdf_url":"https://arxiv.org/pdf/2408.06958v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.01049v2","updated":"2024-08-13T15:06:41Z","published":"2024-07-01T07:56:48Z","title":"SE(3)-Hyena Operator for Scalable Equivariant Learning","summary":" Modeling global geometric context while maintaining equivariance is crucial\nfor accurate predictions in many fields such as biology, chemistry, or vision.\nYet, this is challenging due to the computational demands of processing\nhigh-dimensional data at scale. Existing approaches such as equivariant\nself-attention or distance-based message passing, suffer from quadratic\ncomplexity with respect to sequence length, while localized methods sacrifice\nglobal information. Inspired by the recent success of state-space and\nlong-convolutional models, in this work, we introduce SE(3)-Hyena operator, an\nequivariant long-convolutional model based on the Hyena operator. The\nSE(3)-Hyena captures global geometric context at sub-quadratic complexity while\nmaintaining equivariance to rotations and translations. Evaluated on\nequivariant associative recall and n-body modeling, SE(3)-Hyena matches or\noutperforms equivariant self-attention while requiring significantly less\nmemory and computational resources for long sequences. Our model processes the\ngeometric context of 20k tokens x3.5 times faster than the equivariant\ntransformer and allows x175 longer a context within the same memory budget.\n","authors":["Artem Moskalev","Mangal Prakash","Rui Liao","Tommaso Mansi"],"pdf_url":"https://arxiv.org/pdf/2407.01049v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.17034v2","updated":"2024-08-13T15:04:18Z","published":"2024-05-27T10:40:21Z","title":"FUGNN: Harmonizing Fairness and Utility in Graph Neural Networks","summary":" Fairness-aware Graph Neural Networks (GNNs) often face a challenging\ntrade-off, where prioritizing fairness may require compromising utility. In\nthis work, we re-examine fairness through the lens of spectral graph theory,\naiming to reconcile fairness and utility within the framework of spectral graph\nlearning. We explore the correlation between sensitive features and spectrum in\nGNNs, using theoretical analysis to delineate the similarity between original\nsensitive features and those after convolution under different spectra. Our\nanalysis reveals a reduction in the impact of similarity when the eigenvectors\nassociated with the largest magnitude eigenvalue exhibit directional\nsimilarity. Based on these theoretical insights, we propose FUGNN, a novel\nspectral graph learning approach that harmonizes the conflict between fairness\nand utility. FUGNN ensures algorithmic fairness and utility by truncating the\nspectrum and optimizing eigenvector distribution during the encoding process.\nThe fairness-aware eigenvector selection reduces the impact of convolution on\nsensitive features while concurrently minimizing the sacrifice of utility.\nFUGNN further optimizes the distribution of eigenvectors through a transformer\narchitecture. 
By incorporating the optimized spectrum into the graph\nconvolution network, FUGNN effectively learns node representations. Experiments\non six real-world datasets demonstrate the superiority of FUGNN over baseline\nmethods. The codes are available at https://github.com/yushuowiki/FUGNN.\n","authors":["Renqiang Luo","Huafei Huang","Shuo Yu","Zhuoyang Han","Estrid He","Xiuzhen Zhang","Feng Xia"],"pdf_url":"https://arxiv.org/pdf/2405.17034v2.pdf","comment":"Accepted in SIGKDD 2024"},{"id":"http://arxiv.org/abs/2408.06945v1","updated":"2024-08-13T15:03:46Z","published":"2024-08-13T15:03:46Z","title":"Heavy-Ball Momentum Accelerated Actor-Critic With Function Approximation","summary":" By using an parametric value function to replace the Monte-Carlo rollouts for\nvalue estimation, the actor-critic (AC) algorithms can reduce the variance of\nstochastic policy gradient so that to improve the convergence rate. While\nexisting works mainly focus on analyzing convergence rate of AC algorithms\nunder Markovian noise, the impacts of momentum on AC algorithms remain largely\nunexplored. In this work, we first propose a heavy-ball momentum based\nadvantage actor-critic (\\mbox{HB-A2C}) algorithm by integrating the heavy-ball\nmomentum into the critic recursion that is parameterized by a linear function.\nWhen the sample trajectory follows a Markov decision process, we quantitatively\ncertify the acceleration capability of the proposed HB-A2C algorithm. Our\ntheoretical results demonstrate that the proposed HB-A2C finds an\n$\\epsilon$-approximate stationary point with $\\oo{\\epsilon^{-2}}$ iterations\nfor reinforcement learning tasks with Markovian noise. Moreover, we also reveal\nthe dependence of learning rates on the length of the sample trajectory. By\ncarefully selecting the momentum factor of the critic recursion, the proposed\nHB-A2C can balance the errors introduced by the initialization and the\nstoschastic approximation.\n","authors":["Yanjie Dong","Haijun Zhang","Gang Wang","Shisheng Cui","Xiping Hu"],"pdf_url":"https://arxiv.org/pdf/2408.06945v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.06943v1","updated":"2024-08-13T15:01:33Z","published":"2024-08-13T15:01:33Z","title":"Towards Holistic Disease Risk Prediction using Small Language Models","summary":" Data in the healthcare domain arise from a variety of sources and modalities,\nsuch as x-ray images, continuous measurements, and clinical notes. Medical\npractitioners integrate these diverse data types daily to make informed and\naccurate decisions. With recent advancements in language models capable of\nhandling multimodal data, it is a logical progression to apply these models to\nthe healthcare sector. In this work, we introduce a framework that connects\nsmall language models to multiple data sources, aiming to predict the risk of\nvarious diseases simultaneously. Our experiments encompass 12 different tasks\nwithin a multitask learning setup. 
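The HB-A2C abstract above integrates heavy-ball momentum into a critic recursion with linear function approximation. Below is a minimal NumPy sketch of a TD(0) critic update with a heavy-ball term; the step size, momentum factor, and feature map are placeholders rather than the paper's choices, and the actor update is omitted.

```python
import numpy as np

def hb_td0_update(w, w_prev, phi_s, phi_s_next, reward, gamma=0.99, alpha=0.05, beta=0.5):
    """One heavy-ball TD(0) step for a linear critic V(s) = w^T phi(s).

    w, w_prev        : current and previous critic weights, shape (d,)
    phi_s, phi_s_next: feature vectors of the current and next state, shape (d,)
    Returns the new weights; the caller then shifts (w_prev, w) <- (w, w_new).
    """
    td_error = reward + gamma * phi_s_next @ w - phi_s @ w
    momentum = beta * (w - w_prev)            # heavy-ball term reuses the last displacement
    return w + alpha * td_error * phi_s + momentum

# d = 8
# w = w_prev = np.zeros(d)
# for phi_s, phi_s_next, r in trajectory:     # hypothetical sampled transitions
#     w, w_prev = hb_td0_update(w, w_prev, phi_s, phi_s_next, r), w
```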
Although our approach does not surpass\nstate-of-the-art methods specialized for single tasks, it demonstrates\ncompetitive performance and underscores the potential of small language models\nfor multimodal reasoning in healthcare.\n","authors":["Liv Björkdahl","Oskar Pauli","Johan Östman","Chiara Ceccobello","Sara Lundell","Magnus Kjellberg"],"pdf_url":"https://arxiv.org/pdf/2408.06943v1.pdf","comment":"6 pages, submitted to ICMLA"},{"id":"http://arxiv.org/abs/2404.04662v3","updated":"2024-08-13T14:56:35Z","published":"2024-04-06T15:31:20Z","title":"Learning Minimal Neural Specifications","summary":" Formal verification is only as good as the specification of a system, which\nis also true for neural network verification. Existing specifications follow\nthe paradigm of data as specification, where the local neighborhood around a\nreference data point is considered correct or robust. While these\nspecifications provide a fair testbed for assessing model robustness, they are\ntoo restrictive for verifying unseen test data-a challenging task with\nsignificant real-world implications. Recent work shows great promise through a\nnew paradigm, neural representation as specification, which uses neural\nactivation patterns (NAPs) for this purpose. However, it computes the most\nrefined NAPs, which include many redundant neurons. In this paper, we study the\nfollowing problem: Given a neural network, find a minimal (general) NAP\nspecification that is sufficient for formal verification of the network's\nrobustness. Finding the minimal NAP specification not only expands verifiable\nbounds but also provides insights into which neurons contribute to the model's\nrobustness. To address this problem, we propose several exact and approximate\napproaches. Our exact approaches leverage the verification tool to find minimal\nNAP specifications in either a deterministic or statistical manner. Whereas the\napproximate methods efficiently estimate minimal NAPs using adversarial\nexamples and local gradients, without making calls to the verification tool.\nThis allows us to inspect potential causal links between neurons and the\nrobustness of state-of-the art neural networks, a task for which existing\nverification frameworks fail to scale. Our experimental results suggest that\nminimal NAP specifications require much smaller fractions of neurons compared\nto the most refined NAP specifications computed by previous work, yet they can\nsignificantly expand the verifiable boundaries to several orders of magnitude\nlarger.\n","authors":["Chuqin Geng","Zhaoyue Wang","Haolin Ye","Saifei Liao","Xujie Si"],"pdf_url":"https://arxiv.org/pdf/2404.04662v3.pdf","comment":"31 pages,9 figures"},{"id":"http://arxiv.org/abs/2309.00983v2","updated":"2024-08-13T14:48:39Z","published":"2023-09-02T16:48:02Z","title":"An Ensemble Score Filter for Tracking High-Dimensional Nonlinear\n Dynamical Systems","summary":" We propose an ensemble score filter (EnSF) for solving high-dimensional\nnonlinear filtering problems with superior accuracy. A major drawback of\nexisting filtering methods, e.g., particle filters or ensemble Kalman filters,\nis the low accuracy in handling high-dimensional and highly nonlinear problems.\nEnSF attacks this challenge by exploiting the score-based diffusion model,\ndefined in a pseudo-temporal domain, to characterizing the evolution of the\nfiltering density. 
EnSF stores the information of the recursively updated\nfiltering density function in the score function, instead of storing the\ninformation in a set of finite Monte Carlo samples (used in particle filters\nand ensemble Kalman filters). Unlike existing diffusion models that train\nneural networks to approximate the score function, we develop a training-free\nscore estimation that uses a mini-batch-based Monte Carlo estimator to directly\napproximate the score function at any pseudo-spatial-temporal location, which\nprovides sufficient accuracy in solving high-dimensional nonlinear problems as\nwell as saves a tremendous amount of time spent on training neural networks.\nHigh-dimensional Lorenz-96 systems are used to demonstrate the performance of\nour method. EnSF provides surprising performance, compared with the\nstate-of-the-art Local Ensemble Transform Kalman Filter method, in reliably and\nefficiently tracking extremely high-dimensional Lorenz systems (up to 1,000,000\ndimensions) with highly nonlinear observation processes.\n","authors":["Feng Bao","Zezhong Zhang","Guannan Zhang"],"pdf_url":"https://arxiv.org/pdf/2309.00983v2.pdf","comment":"arXiv admin note: text overlap with arXiv:2306.09282"},{"id":"http://arxiv.org/abs/2408.03360v2","updated":"2024-08-13T14:39:50Z","published":"2024-08-06T17:07:28Z","title":"Prioritize Alignment in Dataset Distillation","summary":" Dataset Distillation aims to compress a large dataset into a significantly\nmore compact, synthetic one without compromising the performance of the trained\nmodels. To achieve this, existing methods use the agent model to extract\ninformation from the target dataset and embed it into the distilled dataset.\nConsequently, the quality of extracted and embedded information determines the\nquality of the distilled dataset. In this work, we find that existing methods\nintroduce misaligned information in both information extraction and embedding\nstages. To alleviate this, we propose Prioritize Alignment in Dataset\nDistillation (PAD), which aligns information from the following two\nperspectives. 1) We prune the target dataset according to the compressing ratio\nto filter the information that can be extracted by the agent model. 2) We use\nonly deep layers of the agent model to perform the distillation to avoid\nexcessively introducing low-level information. This simple strategy effectively\nfilters out misaligned information and brings non-trivial improvement for\nmainstream matching-based distillation algorithms. Furthermore, built on\ntrajectory matching, \\textbf{PAD} achieves remarkable improvements on various\nbenchmarks, achieving state-of-the-art performance.\n","authors":["Zekai Li","Ziyao Guo","Wangbo Zhao","Tianle Zhang","Zhi-Qi Cheng","Samir Khaki","Kaipeng Zhang","Ahmad Sajedi","Konstantinos N Plataniotis","Kai Wang","Yang You"],"pdf_url":"https://arxiv.org/pdf/2408.03360v2.pdf","comment":"18 pages, 9 figures"},{"id":"http://arxiv.org/abs/2405.11877v4","updated":"2024-08-13T14:38:59Z","published":"2024-05-20T08:41:15Z","title":"A Novel Cartography-Based Curriculum Learning Method Applied on RoNLI:\n The First Romanian Natural Language Inference Corpus","summary":" Natural language inference (NLI), the task of recognizing the entailment\nrelationship in sentence pairs, is an actively studied topic serving as a proxy\nfor natural language understanding. 
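The EnSF abstract above replaces a learned score network with a training-free, mini-batch Monte Carlo estimate of the score of the ensemble's filtering density. Under a standard Gaussian perturbation kernel, p_t(x) ≈ (1/N) Σ_i N(x; a_t x_i, s_t² I), the score has a closed softmax-weighted form; the sketch below implements that estimator as one plausible reading of the approach, not the paper's exact formulas or schedule.

```python
import numpy as np

def ensemble_score(x, ensemble, a_t, s_t, batch=256, rng=np.random.default_rng(0)):
    """Monte Carlo estimate of grad_x log p_t(x) for a Gaussian-smoothed ensemble density.

    x        : (d,) query location in the pseudo-temporal diffusion
    ensemble : (N, d) filtering-ensemble members x_i
    a_t, s_t : scalar drift/noise schedule values at pseudo-time t (assumed schedule)
    """
    idx = rng.choice(len(ensemble), size=min(batch, len(ensemble)), replace=False)
    mu = a_t * ensemble[idx]                          # (B, d) perturbed-member means
    diff = x - mu                                     # (B, d)
    logw = -0.5 * np.sum(diff**2, axis=1) / s_t**2    # Gaussian log-kernels (up to a constant)
    w = np.exp(logw - logw.max())
    w /= w.sum()                                      # softmax responsibilities
    return -(w[:, None] * diff).sum(axis=0) / s_t**2  # score of the Gaussian mixture

# score = ensemble_score(x_query, ens, a_t=0.9, s_t=0.3)
```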
Despite the relevance of the task in\nbuilding conversational agents and improving text classification, machine\ntranslation and other NLP tasks, to the best of our knowledge, there is no\npublicly available NLI corpus for the Romanian language. To this end, we\nintroduce the first Romanian NLI corpus (RoNLI) comprising 58K training\nsentence pairs, which are obtained via distant supervision, and 6K validation\nand test sentence pairs, which are manually annotated with the correct labels.\nWe conduct experiments with multiple machine learning methods based on distant\nlearning, ranging from shallow models based on word embeddings to\ntransformer-based neural networks, to establish a set of competitive baselines.\nFurthermore, we improve on the best model by employing a new curriculum\nlearning strategy based on data cartography. Our dataset and code to reproduce\nthe baselines are available at https://github.com/Eduard6421/RONLI.\n","authors":["Eduard Poesina","Cornelia Caragea","Radu Tudor Ionescu"],"pdf_url":"https://arxiv.org/pdf/2405.11877v4.pdf","comment":"Accepted at ACL 2024 (Main)"},{"id":"http://arxiv.org/abs/2408.06927v1","updated":"2024-08-13T14:29:00Z","published":"2024-08-13T14:29:00Z","title":"Breaking Class Barriers: Efficient Dataset Distillation via Inter-Class\n Feature Compensator","summary":" Dataset distillation has emerged as a technique aiming to condense\ninformative features from large, natural datasets into a compact and synthetic\nform. While recent advancements have refined this technique, its performance is\nbottlenecked by the prevailing class-specific synthesis paradigm. Under this\nparadigm, synthetic data is optimized exclusively for a pre-assigned one-hot\nlabel, creating an implicit class barrier in feature condensation. This leads\nto inefficient utilization of the distillation budget and oversight of\ninter-class feature distributions, which ultimately limits the effectiveness\nand efficiency, as demonstrated in our analysis.\n To overcome these constraints, this paper presents the Inter-class Feature\nCompensator (INFER), an innovative distillation approach that transcends the\nclass-specific data-label framework widely utilized in current dataset\ndistillation methods. Specifically, INFER leverages a Universal Feature\nCompensator (UFC) to enhance feature integration across classes, enabling the\ngeneration of multiple additional synthetic instances from a single UFC input.\nThis significantly improves the efficiency of the distillation budget.\n Moreover, INFER enriches inter-class interactions during the distillation,\nthereby enhancing the effectiveness and generalizability of the distilled data.\nBy allowing for the linear interpolation of labels similar to those in the\noriginal dataset, INFER meticulously optimizes the synthetic data and\ndramatically reduces the size of soft labels in the synthetic dataset to almost\nzero, establishing a new benchmark for efficiency and effectiveness in dataset\ndistillation.\n","authors":["Xin Zhang","Jiawei Du","Ping Liu","Joey Tianyi Zhou"],"pdf_url":"https://arxiv.org/pdf/2408.06927v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.14209v4","updated":"2024-08-13T14:27:38Z","published":"2023-09-25T15:14:54Z","title":"Continual Driving Policy Optimization with Closed-Loop Individualized\n Curricula","summary":" The safety of autonomous vehicles (AV) has been a long-standing top concern,\nstemming from the absence of rare and safety-critical scenarios in the\nlong-tail naturalistic driving distribution. 
To tackle this challenge, a surge\nof research in scenario-based autonomous driving has emerged, with a focus on\ngenerating high-risk driving scenarios and applying them to conduct\nsafety-critical testing of AV models. However, limited work has been explored\non the reuse of these extensive scenarios to iteratively improve AV models.\nMoreover, it remains intractable and challenging to filter through gigantic\nscenario libraries collected from other AV models with distinct behaviors,\nattempting to extract transferable information for current AV improvement.\nTherefore, we develop a continual driving policy optimization framework\nfeaturing Closed-Loop Individualized Curricula (CLIC), which we factorize into\na set of standardized sub-modules for flexible implementation choices: AV\nEvaluation, Scenario Selection, and AV Training. CLIC frames AV Evaluation as a\ncollision prediction task, where it estimates the chance of AV failures in\nthese scenarios at each iteration. Subsequently, by re-sampling from historical\nscenarios based on these failure probabilities, CLIC tailors individualized\ncurricula for downstream training, aligning them with the evaluated capability\nof AV. Accordingly, CLIC not only maximizes the utilization of the vast\npre-collected scenario library for closed-loop driving policy optimization but\nalso facilitates AV improvement by individualizing its training with more\nchallenging cases out of those poorly organized scenarios. Experimental results\nclearly indicate that CLIC surpasses other curriculum-based training\nstrategies, showing substantial improvement in managing risky scenarios, while\nstill maintaining proficiency in handling simpler cases.\n","authors":["Haoyi Niu","Yizhou Xu","Xingjian Jiang","Jianming Hu"],"pdf_url":"https://arxiv.org/pdf/2309.14209v4.pdf","comment":"ICRA 2024"},{"id":"http://arxiv.org/abs/2201.08712v4","updated":"2024-08-13T14:22:10Z","published":"2022-01-21T14:16:56Z","title":"Improved Random Features for Dot Product Kernels","summary":" Dot product kernels, such as polynomial and exponential (softmax) kernels,\nare among the most widely used kernels in machine learning, as they enable\nmodeling the interactions between input features, which is crucial in\napplications like computer vision, natural language processing, and recommender\nsystems. We make several novel contributions for improving the efficiency of\nrandom feature approximations for dot product kernels, to make these kernels\nmore useful in large scale learning. First, we present a generalization of\nexisting random feature approximations for polynomial kernels, such as\nRademacher and Gaussian sketches and TensorSRHT, using complex-valued random\nfeatures. We show empirically that the use of complex features can\nsignificantly reduce the variances of these approximations. Second, we provide\na theoretical analysis for understanding the factors affecting the efficiency\nof various random feature approximations, by deriving closed-form expressions\nfor their variances. These variance formulas elucidate conditions under which\ncertain approximations (e.g., TensorSRHT) achieve lower variances than others\n(e.g., Rademacher sketches), and conditions under which the use of complex\nfeatures leads to lower variances than real features. 
Third, by using these\nvariance formulas, which can be evaluated in practice, we develop a data-driven\noptimization approach to improve random feature approximations for general dot\nproduct kernels, which is also applicable to the Gaussian kernel. We describe\nthe improvements brought by these contributions with extensive experiments on a\nvariety of tasks and datasets.\n","authors":["Jonas Wacker","Motonobu Kanagawa","Maurizio Filippone"],"pdf_url":"https://arxiv.org/pdf/2201.08712v4.pdf","comment":"To appear in Journal of Machine Learning Research (JMLR)"},{"id":"http://arxiv.org/abs/2408.06903v1","updated":"2024-08-13T13:56:17Z","published":"2024-08-13T13:56:17Z","title":"Heterogeneity: An Open Challenge for Federated On-board Machine Learning","summary":" The design of satellite missions is currently undergoing a paradigm shift\nfrom the historical approach of individualised monolithic satellites towards\ndistributed mission configurations, consisting of multiple small satellites.\nWith a rapidly growing number of such satellites now deployed in orbit, each\ncollecting large amounts of data, interest in on-board orbital edge computing\nis rising. Federated Learning is a promising distributed computing approach in\nthis context, allowing multiple satellites to collaborate efficiently in\ntraining on-board machine learning models. Though recent works on the use of\nFederated Learning in orbital edge computing have focused largely on\nhomogeneous satellite constellations, Federated Learning could also be employed\nto allow heterogeneous satellites to form ad-hoc collaborations, e.g. in the\ncase of communications satellites operated by different providers. Such an\napplication presents additional challenges to the Federated Learning paradigm,\narising largely from the heterogeneity of such a system. In this position\npaper, we offer a systematic review of these challenges in the context of the\ncross-provider use case, giving a brief overview of the state-of-the-art for\neach, and providing an entry point for deeper exploration of each issue.\n","authors":["Maria Hartmann","Grégoire Danoy","Pascal Bouvry"],"pdf_url":"https://arxiv.org/pdf/2408.06903v1.pdf","comment":"Accepted to the ESA SPAICE conference 2024"},{"id":"http://arxiv.org/abs/2009.07525v2","updated":"2024-08-13T13:49:43Z","published":"2020-09-16T07:44:27Z","title":"Detectability of hierarchical communities in networks","summary":" We study the problem of recovering a planted hierarchy of partitions in a\nnetwork. The detectability of a single planted partition has previously been\nanalysed in detail and a phase transition has been identified below which the\npartition cannot be detected. Here we show that, in the hierarchical setting,\nthere exist additional phases in which the presence of multiple consistent\npartitions can either help or hinder detection. Accordingly, the detectability\nlimit for non-hierarchical partitions typically provides insufficient\ninformation about the detectability of the complete hierarchical structure, as\nwe highlight with several constructive examples.\n","authors":["Leto Peel","Michael T. 
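The dot-product-kernel abstract above builds on random feature approximations such as Rademacher sketches and TensorSRHT and proposes lower-variance complex-valued variants. The sketch below shows only the basic real-valued Rademacher random features for a degree-p polynomial kernel k(x, y) = (xᵀy)^p, i.e. the baseline those results improve upon; the complex-valued extensions are not reproduced here.

```python
import numpy as np

def rademacher_features(X, degree=3, n_features=2048, seed=0):
    """Random features whose inner products approximate (x^T y)^degree in expectation."""
    rng = np.random.default_rng(seed)
    n, d = X.shape
    # One independent Rademacher (+/-1) vector per feature and per factor of the product.
    R = rng.choice([-1.0, 1.0], size=(n_features, degree, d))
    Z = np.ones((n, n_features))
    for k in range(degree):
        Z *= X @ R[:, k, :].T            # multiply the `degree` independent projections
    return Z / np.sqrt(n_features)

X = np.random.default_rng(1).normal(size=(5, 10)) / np.sqrt(10)
Z = rademacher_features(X, degree=3)
exact = (X @ X.T) ** 3
approx = Z @ Z.T
print(np.abs(exact - approx).max())      # approximation error shrinks as n_features grows
```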
Schaub"],"pdf_url":"https://arxiv.org/pdf/2009.07525v2.pdf","comment":"7 pages, 4 figures"},{"id":"http://arxiv.org/abs/2407.16205v3","updated":"2024-08-13T13:46:18Z","published":"2024-07-23T06:14:41Z","title":"Figure it Out: Analyzing-based Jailbreak Attack on Large Language Models","summary":" The rapid development of Large Language Models (LLMs) has brought remarkable\ngenerative capabilities across diverse tasks. However, despite the impressive\nachievements, these LLMs still have numerous inherent vulnerabilities,\nparticularly when faced with jailbreak attacks. By investigating jailbreak\nattacks, we can uncover hidden weaknesses in LLMs and inform the development of\nmore robust defense mechanisms to fortify their security. In this paper, we\nfurther explore the boundary of jailbreak attacks on LLMs and propose\nAnalyzing-based Jailbreak (ABJ). This effective jailbreak attack method takes\nadvantage of LLMs' growing analyzing and reasoning capability and reveals their\nunderlying vulnerabilities when facing analyzing-based tasks. We conduct a\ndetailed evaluation of ABJ across various open-source and closed-source LLMs,\nwhich achieves 94.8% attack success rate (ASR) and 1.06 attack efficiency (AE)\non GPT-4-turbo-0409, demonstrating state-of-the-art attack effectiveness and\nefficiency. Our research highlights the importance of prioritizing and\nenhancing the safety of LLMs to mitigate the risks of misuse. The code is\npublicly available at hhttps://github.com/theshi-1128/ABJ-Attack. Warning: This\npaper contains examples of LLMs that might be offensive or harmful.\n","authors":["Shi Lin","Rongchang Li","Xun Wang","Changting Lin","Wenpeng Xing","Meng Han"],"pdf_url":"https://arxiv.org/pdf/2407.16205v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.06891v1","updated":"2024-08-13T13:38:32Z","published":"2024-08-13T13:38:32Z","title":"Automatic Feature Recognition and Dimensional Attributes Extraction From\n CAD Models for Hybrid Additive-Subtractive Manufacturing","summary":" The integration of Computer-Aided Design (CAD), Computer-Aided Process\nPlanning (CAPP), and Computer-Aided Manufacturing (CAM) plays a crucial role in\nmodern manufacturing, facilitating seamless transitions from digital designs to\nphysical products. However, a significant challenge within this integration is\nthe Automatic Feature Recognition (AFR) of CAD models, especially in the\ncontext of hybrid manufacturing that combines subtractive and additive\nmanufacturing processes. Traditional AFR methods, focused mainly on the\nidentification of subtractive (machined) features including holes, fillets,\nchamfers, pockets, and slots, fail to recognize features pertinent to additive\nmanufacturing. Furthermore, the traditional methods fall short in accurately\nextracting geometric dimensions and orientations, which are also key factors\nfor effective manufacturing process planning. This paper presents a novel\napproach for creating a synthetic CAD dataset that encompasses features\nrelevant to both additive and subtractive machining through Python Open\nCascade. The Hierarchical Graph Convolutional Neural Network (HGCNN) model is\nimplemented to accurately identify the composite additive-subtractive features\nwithin the synthetic CAD dataset. The key novelty and contribution of the\nproposed methodology lie in its ability to recognize a wide range of\nmanufacturing features, and precisely extracting their dimensions,\norientations, and stock sizes. 
The proposed model demonstrates remarkable\nfeature recognition accuracy exceeding 97% and a dimension extraction accuracy\nof 100% for identified features. Therefore, the proposed methodology enhances\nthe integration of CAD, CAPP, and CAM within hybrid manufacturing by providing\nprecise feature recognition and dimension extraction. It facilitates improved\nmanufacturing process planning, by enabling more informed decision-making.\n","authors":["Muhammad Tayyab Khan","Wenhe Feng","Lequn Chen","Ye Han Ng","Nicholas Yew Jin Tan","Seung Ki Moon"],"pdf_url":"https://arxiv.org/pdf/2408.06891v1.pdf","comment":"10 pages, 12 figures. This paper has been accepted for presentation\n at the ASME IDETC-CIE 2024 conference"},{"id":"http://arxiv.org/abs/2408.06890v1","updated":"2024-08-13T13:36:48Z","published":"2024-08-13T13:36:48Z","title":"BMFT: Achieving Fairness via Bias-based Weight Masking Fine-tuning","summary":" Developing models with robust group fairness properties is paramount,\nparticularly in ethically sensitive domains such as medical diagnosis. Recent\napproaches to achieving fairness in machine learning require a substantial\namount of training data and depend on model retraining, which may not be\npractical in real-world scenarios. To mitigate these challenges, we propose\nBias-based Weight Masking Fine-Tuning (BMFT), a novel post-processing method\nthat enhances the fairness of a trained model in significantly fewer epochs\nwithout requiring access to the original training data. BMFT produces a mask\nover model parameters, which efficiently identifies the weights contributing\nthe most towards biased predictions. Furthermore, we propose a two-step\ndebiasing strategy, wherein the feature extractor undergoes initial fine-tuning\non the identified bias-influenced weights, succeeded by a fine-tuning phase on\na reinitialised classification layer to uphold discriminative performance.\nExtensive experiments across four dermatological datasets and two sensitive\nattributes demonstrate that BMFT outperforms existing state-of-the-art (SOTA)\ntechniques in both diagnostic accuracy and fairness metrics. Our findings\nunderscore the efficacy and robustness of BMFT in advancing fairness across\nvarious out-of-distribution (OOD) settings. Our code is available at:\nhttps://github.com/vios-s/BMFT\n","authors":["Yuyang Xue","Junyu Yan","Raman Dutt","Fasih Haider","Jingshuai Liu","Steven McDonagh","Sotirios A. Tsaftaris"],"pdf_url":"https://arxiv.org/pdf/2408.06890v1.pdf","comment":"Accepted by MICCAI 2024 FAIMI Workshop Oral"},{"id":"http://arxiv.org/abs/2407.02461v4","updated":"2024-08-13T13:34:45Z","published":"2024-07-02T17:40:06Z","title":"Decentralized Intelligence Network (DIN)","summary":" Decentralized Intelligence Network (DIN) is a theoretical framework\naddressing data fragmentation and siloing challenges, enabling scalable AI\nthrough data sovereignty. 
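The BMFT abstract above builds a mask over model parameters that identifies the weights contributing most to biased predictions and then fine-tunes only those weights. The sketch below shows one hedged way to build such a mask from gradients of a bias/fairness loss; the scoring rule, the masked fraction, and the `bias_loss` input are assumptions, not the published procedure.

```python
import torch

def build_bias_mask(model, bias_loss: torch.Tensor, top_fraction: float = 0.05):
    """Mark the top `top_fraction` of weights by |gradient of the bias loss| for fine-tuning."""
    params = [p for p in model.parameters() if p.requires_grad]
    grads = torch.autograd.grad(bias_loss, params)
    scores = torch.cat([g.abs().flatten() for g in grads])
    k = max(1, int(top_fraction * scores.numel()))
    threshold = torch.topk(scores, k).values[-1]
    # 1 = weight is allowed to change during the debiasing fine-tuning phase.
    return [(g.abs() >= threshold).float() for g in grads]

# During debiasing fine-tuning, gate each update with the mask:
# for p, m in zip(model.parameters(), masks):
#     p.grad *= m     # only bias-influential weights are updated
```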
It facilitates effective AI utilization within\nsovereign networks by overcoming barriers to accessing diverse data sources,\nleveraging: 1) personal data stores to ensure data sovereignty, where data\nremains securely within Participants' control; 2) a scalable federated learning\nprotocol implemented on a public blockchain for decentralized AI training,\nwhere only model parameter updates are shared, keeping data within the personal\ndata stores; and 3) a scalable, trustless cryptographic rewards mechanism on a\npublic blockchain to incentivize participation and ensure fair reward\ndistribution through a decentralized auditing protocol. This approach\nguarantees that no entity can prevent or control access to training data or\ninfluence financial benefits, as coordination and reward distribution are\nmanaged on the public blockchain with an immutable record. The framework\nsupports effective AI training by allowing Participants to maintain control\nover their data, benefit financially, and contribute to a decentralized,\nscalable ecosystem that leverages collective AI to develop beneficial\nalgorithms.\n","authors":["Abraham Nash"],"pdf_url":"https://arxiv.org/pdf/2407.02461v4.pdf","comment":"14 pages, 1 figure. DIN was presented by the author as a speaker at\n the Summit on Responsible Decentralized Intelligence - Future of\n Decentralization and AI, hosted by Berkeley RDI on August 6, 2024, at the\n Verizon Center, Cornell Tech Campus, Roosevelt Island, NYC"},{"id":"http://arxiv.org/abs/2404.09447v3","updated":"2024-08-13T13:24:33Z","published":"2024-04-15T04:20:01Z","title":"kNN-CLIP: Retrieval Enables Training-Free Segmentation on Continually\n Expanding Large Vocabularies","summary":" Continual segmentation has not yet tackled the challenge of improving\nopen-vocabulary segmentation models with training data for accurate\nsegmentation across large, continually expanding vocabularies. We discover that\ntraditional continual training results in severe catastrophic forgetting,\nfailing to outperform a zero-shot segmentation baseline. We introduce a novel\ntraining-free strategy, kNN-CLIP, which augments the model with a database of\ninstance embeddings for semantic and panoptic segmentation that achieves zero\nforgetting. We demonstrate that kNN-CLIP can adapt to continually growing\nvocabularies without the need for retraining or large memory costs. kNN-CLIP\nenables open-vocabulary segmentation methods to expand their vocabularies on\nany domain with a single pass through the data, while only storing compact\nembeddings. This approach minimizes both compute and memory costs. kNN-CLIP\nachieves state-of-the-art performance across large-vocabulary semantic and\npanoptic segmentation datasets. 
We hope kNN-CLIP represents a significant step\nforward in enabling more efficient and adaptable continual segmentation, paving\nthe way for advances in real-world large-vocabulary continual segmentation\nmethods.\n","authors":["Zhongrui Gui","Shuyang Sun","Runjia Li","Jianhao Yuan","Zhaochong An","Karsten Roth","Ameya Prabhu","Philip Torr"],"pdf_url":"https://arxiv.org/pdf/2404.09447v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.18066v2","updated":"2024-08-13T13:08:39Z","published":"2024-06-26T04:51:14Z","title":"Learning Optimal Filters Using Variational Inference","summary":" Filtering - the task of estimating the conditional distribution of states of\na dynamical system given partial, noisy, observations - is important in many\nareas of science and engineering, including weather and climate prediction.\nHowever, the filtering distribution is generally intractable to obtain for\nhigh-dimensional, nonlinear systems. Filters used in practice, such as the\nensemble Kalman filter (EnKF), are biased for nonlinear systems and have\nnumerous tuning parameters. Here, we present a framework for learning a\nparameterized analysis map - the map that takes a forecast distribution and\nobservations to the filtering distribution - using variational inference. We\nshow that this methodology can be used to learn gain matrices for filtering\nlinear and nonlinear dynamical systems, as well as inflation and localization\nparameters for an EnKF. Future work will apply this framework to learn new\nfiltering algorithms.\n","authors":["Enoch Luk","Eviatar Bach","Ricardo Baptista","Andrew Stuart"],"pdf_url":"https://arxiv.org/pdf/2406.18066v2.pdf","comment":"Workshop on Machine Learning for Earth System Modeling, International\n Conference on Machine Learning (ICML) 2024"},{"id":"http://arxiv.org/abs/2408.06867v1","updated":"2024-08-13T13:05:36Z","published":"2024-08-13T13:05:36Z","title":"Optimal Bound for PCA with Outliers using Higher-Degree Voronoi Diagrams","summary":" In this paper, we introduce new algorithms for Principal Component Analysis\n(PCA) with outliers. Utilizing techniques from computational geometry,\nspecifically higher-degree Voronoi diagrams, we navigate to the optimal\nsubspace for PCA even in the presence of outliers. This approach achieves an\noptimal solution with a time complexity of\n$n^{d+\\mathcal{O}(1)}\\text{poly}(n,d)$. Additionally, we present a randomized\nalgorithm with a complexity of $2^{\\mathcal{O}(r(d-r))} \\times \\text{poly}(n,\nd)$. This algorithm samples subspaces characterized in terms of a Grassmannian\nmanifold. By employing such sampling method, we ensure a high likelihood of\ncapturing the optimal subspace, with the success probability $(1 - \\delta)^T$.\nWhere $\\delta$ represents the probability that a sampled subspace does not\ncontain the optimal solution, and $T$ is the number of subspaces sampled,\nproportional to $2^{r(d-r)}$. 
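The kNN-CLIP abstract above augments an open-vocabulary segmentation model with a database of instance embeddings so that new vocabulary can be added by retrieval alone, without retraining. The sketch below shows the generic retrieval step only — cosine-similarity kNN over stored embeddings with a similarity-weighted label vote — as an illustration of the idea rather than the authors' pipeline; `clip_embed` in the usage comment is a hypothetical helper.

```python
import numpy as np

class EmbeddingDatabase:
    """Append-only store of (embedding, label) pairs queried by cosine kNN."""

    def __init__(self):
        self.vecs, self.labels = [], []

    def add(self, embedding: np.ndarray, label: str):
        self.vecs.append(embedding / np.linalg.norm(embedding))
        self.labels.append(label)

    def query(self, embedding: np.ndarray, k: int = 5):
        q = embedding / np.linalg.norm(embedding)
        sims = np.stack(self.vecs) @ q                  # cosine similarity to every stored instance
        top = np.argsort(-sims)[:k]
        votes = {}
        for i in top:                                   # similarity-weighted label vote
            votes[self.labels[i]] = votes.get(self.labels[i], 0.0) + float(sims[i])
        return max(votes, key=votes.get), votes

# db = EmbeddingDatabase()
# db.add(clip_embed(crop), "excavator")                # one pass over new-domain data
# label, scores = db.query(clip_embed(test_mask_crop))
```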
Our use of higher-degree Voronoi diagrams and\nGrassmannian based sampling offers a clearer conceptual pathway and practical\nadvantages, particularly in handling large datasets or higher-dimensional\nsettings.\n","authors":["Sajjad Hashemian","Mohammad Saeed Arvenaghi","Ebrahim Ardeshir-Larijani"],"pdf_url":"https://arxiv.org/pdf/2408.06867v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08945v3","updated":"2024-08-13T12:44:09Z","published":"2023-08-17T12:35:02Z","title":"Interpretable Graph Neural Networks for Tabular Data","summary":" Data in tabular format is frequently occurring in real-world applications.\nGraph Neural Networks (GNNs) have recently been extended to effectively handle\nsuch data, allowing feature interactions to be captured through representation\nlearning. However, these approaches essentially produce black-box models, in\nthe form of deep neural networks, precluding users from following the logic\nbehind the model predictions. We propose an approach, called IGNNet\n(Interpretable Graph Neural Network for tabular data), which constrains the\nlearning algorithm to produce an interpretable model, where the model shows how\nthe predictions are exactly computed from the original input features. A\nlarge-scale empirical investigation is presented, showing that IGNNet is\nperforming on par with state-of-the-art machine-learning algorithms that target\ntabular data, including XGBoost, Random Forests, and TabNet. At the same time,\nthe results show that the explanations obtained from IGNNet are aligned with\nthe true Shapley values of the features without incurring any additional\ncomputational overhead.\n","authors":["Amr Alkhatib","Sofiane Ennadir","Henrik Boström","Michalis Vazirgiannis"],"pdf_url":"https://arxiv.org/pdf/2308.08945v3.pdf","comment":"Accepted at ECAI 2024"},{"id":"http://arxiv.org/abs/2408.05640v2","updated":"2024-08-13T11:52:42Z","published":"2024-08-10T21:50:19Z","title":"Federated Smoothing Proximal Gradient for Quantile Regression with\n Non-Convex Penalties","summary":" Distributed sensors in the internet-of-things (IoT) generate vast amounts of\nsparse data. Analyzing this high-dimensional data and identifying relevant\npredictors pose substantial challenges, especially when data is preferred to\nremain on the device where it was collected for reasons such as data integrity,\ncommunication bandwidth, and privacy. This paper introduces a federated\nquantile regression algorithm to address these challenges. Quantile regression\nprovides a more comprehensive view of the relationship between variables than\nmean regression models. However, traditional approaches face difficulties when\ndealing with nonconvex sparse penalties and the inherent non-smoothness of the\nloss function. For this purpose, we propose a federated smoothing proximal\ngradient (FSPG) algorithm that integrates a smoothing mechanism with the\nproximal gradient framework, thereby enhancing both precision and computational\nspeed. This integration adeptly handles optimization over a network of devices,\neach holding local data samples, making it particularly effective in federated\nlearning scenarios. The FSPG algorithm ensures steady progress and reliable\nconvergence in each iteration by maintaining or reducing the value of the\nobjective function. 
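The two ingredients named above, a smoothed quantile loss and a proximal gradient step, can be sketched as follows. The Huber-style smoothing of the pinball loss and the l1 soft-thresholding prox (standing in for the nonconvex penalties, such as MCP and SCAD, that the method actually uses) are assumptions for illustration, as are the step size and toy data.

```python
# Minimal sketch (assumptions: Huber-style smoothing and an l1 proximal step as a
# stand-in for the nonconvex MCP/SCAD penalties): one proximal gradient iteration
# for smoothed quantile (pinball) regression.
import numpy as np

def smoothed_pinball_grad(r, tau, mu):
    # Gradient of a smoothed pinball loss: tau for r > mu, tau - 1 for r < -mu,
    # and a linear interpolation in between so the gradient is continuous.
    return np.where(r > mu, tau,
           np.where(r < -mu, tau - 1.0, (tau - 0.5) + r / (2.0 * mu)))

def soft_threshold(z, t):
    # l1 proximal operator (used here in place of the MCP/SCAD prox).
    return np.sign(z) * np.maximum(np.abs(z) - t, 0.0)

def smoothed_prox_step(beta, X, y, tau=0.5, mu=0.1, step=1e-2, lam=0.1):
    # One gradient step on the smoothed loss, then a proximal (shrinkage) step.
    r = y - X @ beta
    grad = -X.T @ smoothed_pinball_grad(r, tau, mu) / len(y)
    return soft_threshold(beta - step * grad, step * lam)

if __name__ == "__main__":
    rng = np.random.default_rng(0)
    X = rng.normal(size=(200, 20))
    beta_true = np.zeros(20); beta_true[:3] = [2.0, -1.5, 1.0]
    y = X @ beta_true + rng.normal(scale=0.5, size=200)
    beta = np.zeros(20)
    for _ in range(500):
        beta = smoothed_prox_step(beta, X, y)
    print(np.round(beta, 2))   # sparse estimate concentrated on the first 3 entries
```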
By leveraging nonconvex penalties, such as the minimax\nconcave penalty (MCP) and smoothly clipped absolute deviation (SCAD), the\nproposed method can identify and preserve key predictors within sparse models.\nComprehensive simulations validate the robust theoretical foundations of the\nproposed algorithm and demonstrate improved estimation precision and reliable\nconvergence.\n","authors":["Reza Mirzaeifard","Diyako Ghaderyan","Stefan Werner"],"pdf_url":"https://arxiv.org/pdf/2408.05640v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.06827v1","updated":"2024-08-13T11:39:07Z","published":"2024-08-13T11:39:07Z","title":"PRESENT: Zero-Shot Text-to-Prosody Control","summary":" Current strategies for achieving fine-grained prosody control in speech\nsynthesis entail extracting additional style embeddings or adopting more\ncomplex architectures. To enable zero-shot application of pretrained\ntext-to-speech (TTS) models, we present PRESENT (PRosody Editing without Style\nEmbeddings or New Training), which exploits explicit prosody prediction in\nFastSpeech2-based models by modifying the inference process directly. We apply\nour text-to-prosody framework to zero-shot language transfer using a JETS model\nexclusively trained on English LJSpeech data. We obtain character error rates\n(CER) of 12.8%, 18.7% and 5.9% for German, Hungarian and Spanish respectively,\nbeating the previous state-of-the-art CER by over 2x for all three languages.\nFurthermore, we allow subphoneme-level control, a first in this field. To\nevaluate its effectiveness, we show that PRESENT can improve the prosody of\nquestions, and use it to generate Mandarin, a tonal language where vowel pitch\nvaries at subphoneme level. We attain 25.3% hanzi CER and 13.0% pinyin CER with\nthe JETS model. All our code and audio samples are available online.\n","authors":["Perry Lam","Huayun Zhang","Nancy F. Chen","Berrak Sisman","Dorien Herremans"],"pdf_url":"https://arxiv.org/pdf/2408.06827v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.12399v2","updated":"2024-08-13T11:28:13Z","published":"2024-07-17T08:25:32Z","title":"A Practical Solver for Scalar Data Topological Simplification","summary":" This paper presents a practical approach for the optimization of topological\nsimplification, a central pre-processing step for the analysis and\nvisualization of scalar data. Given an input scalar field f and a set of\n\"signal\" persistence pairs to maintain, our approach produces an output field g\nthat is close to f and which optimizes (i) the cancellation of \"non-signal\"\npairs, while (ii) preserving the \"signal\" pairs. In contrast to pre-existing\nsimplification algorithms, our approach is not restricted to persistence pairs\ninvolving extrema and can thus address a larger class of topological features,\nin particular saddle pairs in three-dimensional scalar data. Our approach\nleverages recent generic persistence optimization frameworks and extends them\nwith tailored accelerations specific to the problem of topological\nsimplification. Extensive experiments report substantial accelerations over\nthese frameworks, thereby making topological simplification optimization\npractical for real-life datasets. Our approach enables a direct visualization\nand analysis of the topologically simplified data, e.g., via isosurfaces of\nsimplified topology (fewer components and handles). 
We apply our approach to\nthe extraction of prominent filament structures in three-dimensional data.\nSpecifically, we show that our pre-simplification of the data leads to\npractical improvements over standard topological techniques for removing\nfilament loops. We also show how our approach can be used to repair genus\ndefects in surface processing. Finally, we provide a C++ implementation for\nreproducibility purposes.\n","authors":["Mohamed Kissi","Mathieu Pont","Joshua A. Levine","Julien Tierny"],"pdf_url":"https://arxiv.org/pdf/2407.12399v2.pdf","comment":"13 pages, 10 figures, IEEE VIS 2024"},{"id":"http://arxiv.org/abs/2408.06820v1","updated":"2024-08-13T11:27:31Z","published":"2024-08-13T11:27:31Z","title":"Efficient Search for Customized Activation Functions with Gradient\n Descent","summary":" Different activation functions work best for different deep learning models.\nTo exploit this, we leverage recent advancements in gradient-based search\ntechniques for neural architectures to efficiently identify high-performing\nactivation functions for a given application. We propose a fine-grained search\ncell that combines basic mathematical operations to model activation functions,\nallowing for the exploration of novel activations. Our approach enables the\nidentification of specialized activations, leading to improved performance in\nevery model we tried, from image classification to language models. Moreover,\nthe identified activations exhibit strong transferability to larger models of\nthe same type, as well as new datasets. Importantly, our automated process for\ncreating customized activation functions is orders of magnitude more efficient\nthan previous approaches. It can easily be applied on top of arbitrary deep\nlearning pipelines and thus offers a promising practical avenue for enhancing\ndeep learning architectures.\n","authors":["Lukas Strack","Mahmoud Safari","Frank Hutter"],"pdf_url":"https://arxiv.org/pdf/2408.06820v1.pdf","comment":"10 pages, 1 figure, excluding references and appendix"},{"id":"http://arxiv.org/abs/2408.06819v1","updated":"2024-08-13T11:25:22Z","published":"2024-08-13T11:25:22Z","title":"Enhancing Multiview Synergy: Robust Learning by Exploiting the Wave Loss\n Function with Consensus and Complementarity Principles","summary":" Multiview learning (MvL) is an advancing domain in machine learning,\nleveraging multiple data perspectives to enhance model performance through\nview-consistency and view-discrepancy. Despite numerous successful\nmultiview-based SVM models, existing frameworks predominantly focus on the\nconsensus principle, often overlooking the complementarity principle.\nFurthermore, they exhibit limited robustness against noisy, error-prone, and\nview-inconsistent samples, prevalent in multiview datasets. To tackle the\naforementioned limitations, this paper introduces Wave-MvSVM, a novel multiview\nsupport vector machine framework leveraging the wave loss (W-loss) function,\nspecifically designed to harness both consensus and complementarity principles.\nUnlike traditional approaches that often overlook the complementary information\namong different views, the proposed Wave-MvSVM ensures a more comprehensive and\nresilient learning process by integrating both principles effectively. The\nW-loss function, characterized by its smoothness, asymmetry, and bounded\nnature, is particularly effective in mitigating the adverse effects of noisy\nand outlier data, thereby enhancing model stability. 
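Returning to the activation-search abstract above ("Efficient Search for Customized Activation Functions with Gradient Descent"): one way such a fine-grained search cell can be realized is as a softmax-weighted mixture of basic operations whose mixing weights are trained by gradient descent alongside the network. The candidate operation set and PyTorch module below are illustrative assumptions, not the paper's cell.

```python
# Minimal sketch (assumption: a softmax-weighted mixture of basic unary ops as the
# search cell; the paper's actual cell and search procedure may differ).
import torch
import torch.nn as nn

class SearchableActivation(nn.Module):
    """Learnable activation: softmax-weighted combination of candidate operations."""
    def __init__(self):
        super().__init__()
        self.ops = [torch.relu, torch.tanh, torch.sigmoid,
                    lambda x: x,                       # identity
                    lambda x: x * torch.sigmoid(x)]    # SiLU
        # One architecture weight per candidate operation, learned by gradient descent.
        self.alpha = nn.Parameter(torch.zeros(len(self.ops)))

    def forward(self, x):
        w = torch.softmax(self.alpha, dim=0)
        return sum(wi * op(x) for wi, op in zip(w, self.ops))

# Usage: drop the cell into any network, train alpha jointly with the model weights,
# then keep the operation(s) with the largest weights as the discovered activation.
model = nn.Sequential(nn.Linear(16, 32), SearchableActivation(), nn.Linear(32, 1))
```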
Theoretically, the W-loss\nfunction also exhibits a crucial classification-calibrated property, further\nboosting its effectiveness. Wave-MvSVM employs a between-view co-regularization\nterm to enforce view consistency and utilizes an adaptive combination weight\nstrategy to maximize the discriminative power of each view. The optimization\nproblem is efficiently solved using a combination of GD and the ADMM, ensuring\nreliable convergence to optimal solutions. Theoretical analyses, grounded in\nRademacher complexity, validate the generalization capabilities of the\nWave-MvSVM model. Extensive empirical evaluations across diverse datasets\ndemonstrate the superior performance of Wave-MvSVM in comparison to existing\nbenchmark models.\n","authors":["A. Quadir","Mushir Akhtar","M. Tanveer"],"pdf_url":"https://arxiv.org/pdf/2408.06819v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.07633v3","updated":"2024-08-13T10:43:06Z","published":"2023-11-13T13:19:34Z","title":"There is No Silver Bullet: Benchmarking Methods in Predictive\n Combinatorial Optimization","summary":" Predictive combinatorial optimization, where the parameters of combinatorial\noptimization (CO) are unknown at the decision-making time, is the precise\nmodeling of many real-world applications, including energy cost-aware\nscheduling and budget allocation on advertising. Tackling such a problem\nusually involves a prediction model and a CO solver. These two modules are\nintegrated into the predictive CO pipeline following two design principles:\n``Predict-then-Optimize (PtO)'', which learns predictions by supervised\ntraining and subsequently solves CO using predicted coefficients, while the\nother, named ``Predict-and-Optimize (PnO)'', directly optimizes towards the\nultimate decision quality and claims to yield better decisions than traditional\nPtO approaches. However, there lacks a systematic benchmark of both approaches,\nincluding the specific design choices at the module level, as well as an\nevaluation dataset that covers representative real-world scenarios. To this\nend, we develop a modular framework to benchmark 11 existing PtO/PnO methods on\n8 problems, including a new industrial dataset for combinatorial advertising\nthat will be released. Our study shows that PnO approaches are better than PtO\non 7 out of 8 benchmarks, but there is no silver bullet found for the specific\ndesign choices of PnO. A comprehensive categorization of current approaches and\nintegration of typical scenarios are provided under a unified benchmark.\nTherefore, this paper could serve as a comprehensive benchmark for future PnO\napproach development and also offer fast prototyping for application-focused\ndevelopment.\n","authors":["Haoyu Geng","Hang Ruan","Runzhong Wang","Yang Li","Yang Wang","Lei Chen","Junchi Yan"],"pdf_url":"https://arxiv.org/pdf/2311.07633v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.06799v1","updated":"2024-08-13T10:42:32Z","published":"2024-08-13T10:42:32Z","title":"On a Scale-Invariant Approach to Bundle Recommendations in Candy Crush\n Saga","summary":" A good understanding of player preferences is crucial for increasing content\nrelevancy, especially in mobile games. This paper illustrates the use of\nattentive models for producing item recommendations in a mobile game scenario.\nThe methodology comprises a combination of supervised and unsupervised\napproaches to create user-level recommendations while introducing a novel\nscale-invariant approach to the prediction. 
The methodology is subsequently\napplied to a bundle recommendation in Candy Crush Saga. The strategy of\ndeployment, maintenance, and monitoring of ML models that are scaled up to\nserve millions of users is presented, along with the best practices and design\npatterns adopted to minimize technical debt typical of ML systems. The\nrecommendation approach is evaluated both offline and online, with a focus on\nunderstanding the increase in engagement, click- and take rates, novelty\neffects, recommendation diversity, and the impact of degenerate feedback loops.\nWe have demonstrated that the recommendation enhances user engagement by 30%\nin click rate and by more than 40% in take rate. In addition,\nwe empirically quantify the diminishing effects of recommendation accuracy on\nuser engagement.\n","authors":["Styliani Katsarou","Francesca Carminati","Martin Dlask","Marta Braojos","Lavena Patra","Richard Perkins","Carlos Garcia Ling","Maria Paskevich"],"pdf_url":"https://arxiv.org/pdf/2408.06799v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.20775v2","updated":"2024-08-13T10:18:45Z","published":"2024-07-30T12:22:03Z","title":"Interpretable Pre-Trained Transformers for Heart Time-Series Data","summary":" Decoder-only transformers are the backbone of the popular generative\npre-trained transformer (GPT) series of large language models. In this work, we\napply this framework to the analysis of clinical heart time-series data, to\ncreate two pre-trained general purpose cardiac models, termed PPG-PT and\nECG-PT. We place a special emphasis on making both such pre-trained models\nfully interpretable. This is achieved firstly through aggregate attention maps\nwhich show that, in order to make predictions, the model focuses on similar\npoints in previous cardiac cycles and gradually broadens its attention in\ndeeper layers. Next, we show that tokens with the same value, which occur at\ndifferent distinct points in the electrocardiography (ECG) and\nphotoplethysmography (PPG) cycle, form separate clusters in high dimensional\nspace. The clusters form according to phase, as the tokens propagate through\nthe transformer blocks. Finally, we highlight that individual attention heads\nrespond to specific physiologically relevant features, such as the dicrotic\nnotch in PPG and the P-wave in ECG. It is also demonstrated that these\npre-trained models are straightforward to fine-tune for tasks such as\nclassification of atrial fibrillation (AF), and beat detection in\nphotoplethysmography. For the example of AF, the fine-tuning took 11 minutes of\ncomputer time, and achieved the respective leave-one-subject-out AUCs of 0.99\nand 0.93 for ECG and PPG within the MIMIC Perform AF dataset. In addition, the\nfine-tuned beat detector achieved a state-of-the-art F1 score of 98%, as well\nas uniquely providing a beat confidence level which acts as a signal quality\nestimator. Importantly, the fine-tuned models for AF screening are also fully\nexplainable, with attention shifting to regions in the context that are\nstrongly indicative of atrial fibrillation.\n","authors":["Harry J. Davies","James Monsen","Danilo P. 
Mandic"],"pdf_url":"https://arxiv.org/pdf/2407.20775v2.pdf","comment":"14 pages, 5 figures"},{"id":"http://arxiv.org/abs/2408.06784v1","updated":"2024-08-13T10:13:33Z","published":"2024-08-13T10:13:33Z","title":"Enhancing Diabetic Retinopathy Diagnosis: A Lightweight CNN Architecture\n for Efficient Exudate Detection in Retinal Fundus Images","summary":" Retinal fundus imaging plays an essential role in diagnosing various stages\nof diabetic retinopathy, where exudates are critical markers of early disease\nonset. Prompt detection of these exudates is pivotal for enabling optometrists\nto arrest or significantly decelerate the disease progression. This paper\nintroduces a novel, lightweight convolutional neural network architecture\ntailored for automated exudate detection, designed to identify these markers\nefficiently and accurately. To address the challenge of limited training data,\nwe have incorporated domain-specific data augmentations to enhance the model's\ngeneralizability. Furthermore, we applied a suite of regularization techniques\nwithin our custom architecture to boost diagnostic accuracy while optimizing\ncomputational efficiency. Remarkably, this streamlined model contains only 4.73\nmillion parameters a reduction of nearly 60% compared to the standard ResNet-18\nmodel, which has 11.69 million parameters. Despite its reduced complexity, our\nmodel achieves an impressive F1 score of 90%, demonstrating its efficacy in the\nearly detection of diabetic retinopathy through fundus imaging.\n","authors":["Mujadded Al Rabbani Alif"],"pdf_url":"https://arxiv.org/pdf/2408.06784v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.11140v3","updated":"2024-08-13T10:06:57Z","published":"2023-04-21T17:22:08Z","title":"Convergence of Message Passing Graph Neural Networks with Generic\n Aggregation On Large Random Graphs","summary":" We study the convergence of message passing graph neural networks on random\ngraph models to their continuous counterpart as the number of nodes tends to\ninfinity. Until now, this convergence was only known for architectures with\naggregation functions in the form of normalized means, or, equivalently, of an\napplication of classical operators like the adjacency matrix or the graph\nLaplacian. We extend such results to a large class of aggregation functions,\nthat encompasses all classically used message passing graph neural networks,\nsuch as attention-based message passing, max convolutional message passing,\n(degree-normalized) convolutional message passing, or moment-based aggregation\nmessage passing. Under mild assumptions, we give non-asymptotic bounds with\nhigh probability to quantify this convergence. Our main result is based on the\nMcDiarmid inequality. Interestingly, this result does not apply to the case\nwhere the aggregation is a coordinate-wise maximum. We treat this case\nseparately and obtain a different convergence rate.\n","authors":["Matthieu Cordonnier","Nicolas Keriven","Nicolas Tremblay","Samuel Vaiter"],"pdf_url":"https://arxiv.org/pdf/2304.11140v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.16422v2","updated":"2024-08-13T10:06:36Z","published":"2023-06-19T17:42:40Z","title":"Neural networks can detect model-free static arbitrage strategies","summary":" In this paper we demonstrate both theoretically as well as numerically that\nneural networks can detect model-free static arbitrage opportunities whenever\nthe market admits some. 
Due to the use of neural networks, our method can be\napplied to financial markets with a high number of traded securities and\nensures almost immediate execution of the corresponding trading strategies. To\ndemonstrate its tractability, effectiveness, and robustness we provide examples\nusing real financial data. From a technical point of view, we prove that a\nsingle neural network can approximately solve a class of convex semi-infinite\nprograms, which is the key result in order to derive our theoretical results\nthat neural networks can detect model-free static arbitrage strategies whenever\nthe financial market admits such opportunities.\n","authors":["Ariel Neufeld","Julian Sester"],"pdf_url":"https://arxiv.org/pdf/2306.16422v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2112.10510v7","updated":"2024-08-13T09:58:44Z","published":"2021-12-20T13:07:39Z","title":"Transformers Can Do Bayesian Inference","summary":" Currently, it is hard to reap the benefits of deep learning for Bayesian\nmethods, which allow the explicit specification of prior knowledge and\naccurately capture model uncertainty. We present Prior-Data Fitted Networks\n(PFNs). PFNs leverage in-context learning in large-scale machine learning\ntechniques to approximate a large set of posteriors. The only requirement for\nPFNs to work is the ability to sample from a prior distribution over supervised\nlearning tasks (or functions). Our method restates the objective of posterior\napproximation as a supervised classification problem with a set-valued input:\nit repeatedly draws a task (or function) from the prior, draws a set of data\npoints and their labels from it, masks one of the labels and learns to make\nprobabilistic predictions for it based on the set-valued input of the rest of\nthe data points. Presented with a set of samples from a new supervised learning\ntask as input, PFNs make probabilistic predictions for arbitrary other data\npoints in a single forward propagation, having learned to approximate Bayesian\ninference. We demonstrate that PFNs can near-perfectly mimic Gaussian processes\nand also enable efficient Bayesian inference for intractable problems, with\nover 200-fold speedups in multiple setups compared to current methods. We\nobtain strong results in very diverse areas such as Gaussian process\nregression, Bayesian neural networks, classification for small tabular data\nsets, and few-shot image classification, demonstrating the generality of PFNs.\nCode and trained PFNs are released at\nhttps://github.com/automl/TransformersCanDoBayesianInference.\n","authors":["Samuel Müller","Noah Hollmann","Sebastian Pineda Arango","Josif Grabocka","Frank Hutter"],"pdf_url":"https://arxiv.org/pdf/2112.10510v7.pdf","comment":"Published at ICLR 2022"},{"id":"http://arxiv.org/abs/2408.06772v1","updated":"2024-08-13T09:55:38Z","published":"2024-08-13T09:55:38Z","title":"Exploring Domain Shift on Radar-Based 3D Object Detection Amidst Diverse\n Environmental Conditions","summary":" The rapid evolution of deep learning and its integration with autonomous\ndriving systems have led to substantial advancements in 3D perception using\nmultimodal sensors. 
Notably, radar sensors show greater robustness compared to\ncameras and lidar under adverse weather and varying illumination conditions.\nThis study delves into the often-overlooked yet crucial issue of domain shift\nin 4D radar-based object detection, examining how varying environmental\nconditions, such as different weather patterns and road types, impact 3D object\ndetection performance. Our findings highlight distinct domain shifts across\nvarious weather scenarios, revealing unique dataset sensitivities that\nunderscore the critical role of radar point cloud generation. Additionally, we\ndemonstrate that transitioning between different road types, especially from\nhighways to urban settings, introduces notable domain shifts, emphasizing the\nnecessity for diverse data collection across varied road environments. To the\nbest of our knowledge, this is the first comprehensive analysis of domain shift\neffects on 4D radar-based object detection. We believe this empirical study\ncontributes to understanding the complex nature of domain shifts in radar data\nand suggests paths forward for data collection strategy in the face of\nenvironmental variability.\n","authors":["Miao Zhang","Sherif Abdulatif","Benedikt Loesch","Marco Altmann","Marius Schwarz","Bin Yang"],"pdf_url":"https://arxiv.org/pdf/2408.06772v1.pdf","comment":"6 pages, 5 figures, 3 tables, accepted in IEEE International\n Conference on Intelligent Transportation Systems (ITSC) 2024"},{"id":"http://arxiv.org/abs/2404.09848v2","updated":"2024-08-13T09:51:39Z","published":"2024-04-15T15:00:17Z","title":"HyperMono: A Monotonicity-aware Approach to Hyper-Relational Knowledge\n Representation","summary":" In a hyper-relational knowledge graph (HKG), each fact is composed of a main\ntriple associated with attribute-value qualifiers, which express additional\nfactual knowledge. The hyper-relational knowledge graph completion (HKGC) task\naims at inferring plausible missing links in a HKG. Most existing approaches to\nHKGC focus on enhancing the communication between qualifier pairs and main\ntriples, while overlooking two important properties that emerge from the\nmonotonicity of the hyper-relational graphs representation regime. Stage\nReasoning allows for a two-step reasoning process, facilitating the integration\nof coarse-grained inference results derived solely from main triples and\nfine-grained inference results obtained from hyper-relational facts with\nqualifiers. In the initial stage, coarse-grained results provide an upper bound\nfor correct predictions, which are subsequently refined in the fine-grained\nstep. More generally, Qualifier Monotonicity implies that by attaching more\nqualifier pairs to a main triple, we may only narrow down the answer set, but\nnever enlarge it. This paper proposes the HyperMono model for hyper-relational\nknowledge graph completion, which realizes stage reasoning and qualifier\nmonotonicity. To implement qualifier monotonicity HyperMono resorts to cone\nembeddings. Experiments on three real-world datasets with three different\nscenario conditions demonstrate the strong performance of HyperMono when\ncompared to the SoTA.\n","authors":["Zhiwei Hu","Víctor Gutiérrez-Basulto","Zhiliang Xiang","Ru Li","Jeff Z. 
Pan"],"pdf_url":"https://arxiv.org/pdf/2404.09848v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.06766v1","updated":"2024-08-13T09:42:57Z","published":"2024-08-13T09:42:57Z","title":"Robust Black-box Testing of Deep Neural Networks using Co-Domain\n Coverage","summary":" Rigorous testing of machine learning models is necessary for trustworthy\ndeployments. We present a novel black-box approach for generating test-suites\nfor robust testing of deep neural networks (DNNs). Most existing methods create\ntest inputs based on maximizing some \"coverage\" criterion/metric such as a\nfraction of neurons activated by the test inputs. Such approaches, however, can\nonly analyze each neuron's behavior or each layer's output in isolation and are\nunable to capture their collective effect on the DNN's output, resulting in\ntest suites that often do not capture the various failure modes of the DNN\nadequately. These approaches also require white-box access, i.e., access to the\nDNN's internals (node activations). We present a novel black-box coverage\ncriterion called Co-Domain Coverage (CDC), which is defined as a function of\nthe model's output and thus takes into account its end-to-end behavior.\nSubsequently, we develop a new fuzz testing procedure named CoDoFuzz, which\nuses CDC to guide the fuzzing process to generate a test suite for a DNN. We\nextensively compare the test suite generated by CoDoFuzz with those generated\nusing several state-of-the-art coverage-based fuzz testing methods for the DNNs\ntrained on six publicly available datasets. Experimental results establish the\nefficiency and efficacy of CoDoFuzz in generating the largest number of\nmisclassified inputs and the inputs for which the model lacks confidence in its\ndecision.\n","authors":["Aishwarya Gupta","Indranil Saha","Piyush Rai"],"pdf_url":"https://arxiv.org/pdf/2408.06766v1.pdf","comment":"20 pages (including references), 4 figures, 7 tables"},{"id":"http://arxiv.org/abs/2404.16551v2","updated":"2024-08-13T09:42:34Z","published":"2024-04-25T12:07:41Z","title":"Surprisingly Strong Performance Prediction with Neural Graph Features","summary":" Performance prediction has been a key part of the neural architecture search\n(NAS) process, allowing to speed up NAS algorithms by avoiding\nresource-consuming network training. Although many performance predictors\ncorrelate well with ground truth performance, they require training data in the\nform of trained networks. Recently, zero-cost proxies have been proposed as an\nefficient method to estimate network performance without any training. However,\nthey are still poorly understood, exhibit biases with network properties, and\ntheir performance is limited. Inspired by the drawbacks of zero-cost proxies,\nwe propose neural graph features (GRAF), simple to compute properties of\narchitectural graphs. GRAF offers fast and interpretable performance prediction\nwhile outperforming zero-cost proxies and other common encodings. In\ncombination with other zero-cost proxies, GRAF outperforms most existing\nperformance predictors at a fraction of the cost.\n","authors":["Gabriela Kadlecová","Jovita Lukasik","Martin Pilát","Petra Vidnerová","Mahmoud Safari","Roman Neruda","Frank Hutter"],"pdf_url":"https://arxiv.org/pdf/2404.16551v2.pdf","comment":"ICML 2024. 
Code at https://github.com/gabikadlecova/zc_combine , blog\n post: https://gabikadlecova.github.io/blog/2024/graf/"},{"id":"http://arxiv.org/abs/2407.07821v2","updated":"2024-08-13T09:06:08Z","published":"2024-07-10T16:45:52Z","title":"When to Accept Automated Predictions and When to Defer to Human\n Judgment?","summary":" Ensuring the reliability and safety of automated decision-making is crucial.\nIt is well-known that data distribution shifts in machine learning can produce\nunreliable outcomes. This paper proposes a new approach for measuring the\nreliability of predictions under distribution shifts. We analyze how the\noutputs of a trained neural network change using clustering to measure\ndistances between outputs and class centroids. We propose this distance as a\nmetric to evaluate the confidence of predictions under distribution shifts. We\nassign each prediction to a cluster with centroid representing the mean softmax\noutput for all correct predictions of a given class. We then define a safety\nthreshold for a class as the smallest distance from an incorrect prediction to\nthe given class centroid. We evaluate the approach on the MNIST and CIFAR-10\ndatasets using a Convolutional Neural Network and a Vision Transformer,\nrespectively. The results show that our approach is consistent across these\ndata sets and network models, and indicate that the proposed metric can offer\nan efficient way of determining when automated predictions are acceptable and\nwhen they should be deferred to human operators given a distribution shift.\n","authors":["Daniel Sikar","Artur Garcez","Tillman Weyde","Robin Bloomfield","Kaleem Peeroo"],"pdf_url":"https://arxiv.org/pdf/2407.07821v2.pdf","comment":"9 pages, 10 figures, 3 tables"},{"id":"http://arxiv.org/abs/2408.06743v1","updated":"2024-08-13T09:04:47Z","published":"2024-08-13T09:04:47Z","title":"Class-aware and Augmentation-free Contrastive Learning from Label\n Proportion","summary":" Learning from Label Proportion (LLP) is a weakly supervised learning scenario\nin which training data is organized into predefined bags of instances,\ndisclosing only the class label proportions per bag. This paradigm is essential\nfor user modeling and personalization, where user privacy is paramount,\noffering insights into user preferences without revealing individual data. LLP\nfaces a unique difficulty: the misalignment between bag-level supervision and\nthe objective of instance-level prediction, primarily due to the inherent\nambiguity in label proportion matching. Previous studies have demonstrated deep\nrepresentation learning can generate auxiliary signals to promote the\nsupervision level in the image domain. However, applying these techniques to\ntabular data presents significant challenges: 1) they rely heavily on\nlabel-invariant augmentation to establish multi-view, which is not feasible\nwith the heterogeneous nature of tabular datasets, and 2) tabular datasets\noften lack sufficient semantics for perfect class distinction, making them\nprone to suboptimality caused by the inherent ambiguity of label proportion\nmatching.\n To address these challenges, we propose an augmentation-free contrastive\nframework TabLLP-BDC that introduces class-aware supervision (explicitly aware\nof class differences) at the instance level. 
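The acceptance rule described in the abstract above ("When to Accept Automated Predictions and When to Defer to Human Judgment?") translates almost directly into code: centroids are mean softmax outputs over correct predictions of each class, the per-class safety threshold is the smallest distance from an incorrect prediction of that class to its centroid, and a new prediction is deferred when it falls outside that threshold. The Euclidean distance and the array interface below are assumptions for illustration.

```python
# Minimal sketch of a centroid-distance accept/defer rule (assumption: Euclidean
# distance on softmax outputs; arrays are softmax outputs, predictions, labels).
import numpy as np

def class_centroids(softmax_out, preds, labels, n_classes):
    # Mean softmax output over *correct* predictions of each class.
    return np.stack([softmax_out[(preds == c) & (preds == labels)].mean(axis=0)
                     for c in range(n_classes)])

def safety_thresholds(softmax_out, preds, labels, centroids):
    # Smallest distance from an incorrect prediction of class c to centroid c.
    thr = np.full(len(centroids), np.inf)
    for c in range(len(centroids)):
        wrong = softmax_out[(preds == c) & (preds != labels)]
        if len(wrong):
            thr[c] = np.linalg.norm(wrong - centroids[c], axis=1).min()
    return thr

def accept_or_defer(softmax_vec, centroids, thresholds):
    c = int(np.argmax(softmax_vec))
    dist = np.linalg.norm(softmax_vec - centroids[c])
    return ("accept", c) if dist < thresholds[c] else ("defer", c)
```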
Our solution features a two-stage\nBag Difference Contrastive (BDC) learning mechanism that establishes robust\nclass-aware instance-level supervision by disassembling the nuance between bag\nlabel proportions, without relying on augmentations. Concurrently, our model\npresents a pioneering multi-task pretraining pipeline tailored for\ntabular-based LLP, capturing intrinsic tabular feature correlations in\nalignment with label proportion distribution. Extensive experiments demonstrate\nthat TabLLP-BDC achieves state-of-the-art performance for LLP in the tabular\ndomain.\n","authors":["Jialiang Wang","Ning Zhang","Shimin Di","Ruidong Wang","Lei Chen"],"pdf_url":"https://arxiv.org/pdf/2408.06743v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.06979v2","updated":"2024-08-13T08:31:42Z","published":"2024-07-09T15:54:06Z","title":"Can virtual staining for high-throughput screening generalize?","summary":" The large volume and variety of imaging data from high-throughput screening\n(HTS) in the pharmaceutical industry present an excellent resource for training\nvirtual staining models. However, the potential of models trained under one set\nof experimental conditions to generalize to other conditions remains\nunderexplored. This study systematically investigates whether data from three\ncell types (lung, ovarian, and breast) and two phenotypes (toxic and non-toxic\nconditions) commonly found in HTS can effectively train virtual staining models\nto generalize across three typical HTS distribution shifts: unseen phenotypes,\nunseen cell types, and the combination of both. Utilizing a dataset of 772,416\npaired bright-field, cytoplasm, nuclei, and DNA-damage stain images, we\nevaluate the generalization capabilities of models across pixel-based,\ninstance-wise, and biological-feature-based levels. Our findings indicate that\ntraining virtual nuclei and cytoplasm models on non-toxic condition samples not\nonly generalizes to toxic condition samples but leads to improved performance\nacross all evaluation levels compared to training on toxic condition samples.\nGeneralization to unseen cell types shows variability depending on the cell\ntype; models trained on ovarian or lung cell samples often perform well under\nother conditions, while those trained on breast cell samples consistently show\npoor generalization. Generalization to unseen cell types and phenotypes shows\ngood generalization across all levels of evaluation compared to addressing\nunseen cell types alone. This study represents the first large-scale,\ndata-centric analysis of the generalization capability of virtual staining\nmodels trained on diverse HTS datasets, providing valuable strategies for\nexperimental training data generation.\n","authors":["Samuel Tonks","Cuong Nguyen","Steve Hood","Ryan Musso","Ceridwen Hopely","Steve Titus","Minh Doan","Iain Styles","Alexander Krull"],"pdf_url":"https://arxiv.org/pdf/2407.06979v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.06720v1","updated":"2024-08-13T08:24:52Z","published":"2024-08-13T08:24:52Z","title":"Multimodal Analysis of White Blood Cell Differentiation in Acute Myeloid\n Leukemia Patients using a β-Variational Autoencoder","summary":" Biomedical imaging and RNA sequencing with single-cell resolution improve\nour understanding of white blood cell diseases like leukemia. By combining\nmorphological and transcriptomic data, we can gain insights into cellular\nfunctions and trajectories involved in blood cell differentiation. 
However,\nexisting methodologies struggle with integrating morphological and\ntranscriptomic data, leaving a significant research gap in comprehensively\nunderstanding the dynamics of cell differentiation. Here, we introduce an\nunsupervised method that explores and reconstructs these two modalities and\nuncovers the relationship between different subtypes of white blood cells from\nhuman peripheral blood smears in terms of morphology and their corresponding\ntranscriptome. Our method is based on a beta-variational autoencoder\n(\\beta-VAE) with a customized loss function, incorporating a R-CNN architecture\nto distinguish single-cell from background and to minimize any interference\nfrom artifacts. This implementation of \\beta-VAE shows good reconstruction\ncapability along with continuous latent embeddings, while maintaining clear\ndifferentiation between single-cell classes. Our novel approach is especially\nhelpful to uncover the correlation of two latent features in complex biological\nprocesses such as formation of granules in the cell (granulopoiesis) with gene\nexpression patterns. It thus provides a unique tool to improve the\nunderstanding of white blood cell maturation for biomedicine and diagnostics.\n","authors":["Gizem Mert","Ario Sadafi","Raheleh Salehi","Nassir Navab","Carsten Marr"],"pdf_url":"https://arxiv.org/pdf/2408.06720v1.pdf","comment":"Accepted for publication at MICCAI 2024 workshop on AI for Imaging\n Genomics Learning (AIIG)"},{"id":"http://arxiv.org/abs/2408.06717v1","updated":"2024-08-13T08:22:01Z","published":"2024-08-13T08:22:01Z","title":"Computation-friendly Graph Neural Network Design by Accumulating\n Knowledge on Large Language Models","summary":" Graph Neural Networks (GNNs), like other neural networks, have shown\nremarkable success but are hampered by the complexity of their architecture\ndesigns, which heavily depend on specific data and tasks. Traditionally,\ndesigning proper architectures involves trial and error, which requires\nintensive manual effort to optimize various components. To reduce human\nworkload, researchers try to develop automated algorithms to design GNNs.\nHowever, both experts and automated algorithms suffer from two major issues in\ndesigning GNNs: 1) the substantial computational resources expended in\nrepeatedly trying candidate GNN architectures until a feasible design is\nachieved, and 2) the intricate and prolonged processes required for humans or\nalgorithms to accumulate knowledge of the interrelationship between graphs,\nGNNs, and performance.\n To further enhance the automation of GNN architecture design, we propose a\ncomputation-friendly way to empower Large Language Models (LLMs) with\nspecialized knowledge in designing GNNs, thereby drastically shortening the\ncomputational overhead and development cycle of designing GNN architectures.\nOur framework begins by establishing a knowledge retrieval pipeline that\ncomprehends the intercorrelations between graphs, GNNs, and performance. This\npipeline converts past model design experiences into structured knowledge for\nLLM reference, allowing it to quickly suggest initial model proposals.\nSubsequently, we introduce a knowledge-driven search strategy that emulates the\nexploration-exploitation process of human experts, enabling quick refinement of\ninitial proposals within a promising scope. 
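For reference, the standard β-VAE objective that the white-blood-cell method above builds on weighs the KL term against reconstruction with a factor β; a minimal PyTorch version is below. The paper's customized loss and R-CNN masking component are not reproduced here, and β = 4 is only a common illustrative choice.

```python
# Minimal sketch of the standard beta-VAE objective (not the paper's customized loss).
import torch
import torch.nn.functional as F

def beta_vae_loss(x, x_recon, mu, logvar, beta=4.0):
    # Reconstruction term plus beta-weighted KL divergence to the unit Gaussian prior.
    recon = F.mse_loss(x_recon, x, reduction="sum")
    kl = -0.5 * torch.sum(1 + logvar - mu.pow(2) - logvar.exp())
    return recon + beta * kl
```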
Extensive experiments demonstrate\nthat our framework can efficiently deliver promising (e.g., Top-5.77%) initial\nmodel proposals for unseen datasets within seconds and without any prior\ntraining and achieve outstanding search performance in a few iterations.\n","authors":["Jialiang Wang","Shimin Di","Hanmo Liu","Zhili Wang","Jiachuan Wang","Lei Chen","Xiaofang Zhou"],"pdf_url":"https://arxiv.org/pdf/2408.06717v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.01929v3","updated":"2024-08-13T08:11:49Z","published":"2023-10-03T10:13:36Z","title":"Navigating Cultural Chasms: Exploring and Unlocking the Cultural POV of\n Text-To-Image Models","summary":" Text-To-Image (TTI) models, such as DALL-E and StableDiffusion, have\ndemonstrated remarkable prompt-based image generation capabilities.\nMultilingual encoders may have a substantial impact on the cultural agency of\nthese models, as language is a conduit of culture. In this study, we explore\nthe cultural perception embedded in TTI models by characterizing culture across\nthree hierarchical tiers: cultural dimensions, cultural domains, and cultural\nconcepts. Based on this ontology, we derive prompt templates to unlock the\ncultural knowledge in TTI models, and propose a comprehensive suite of\nevaluation techniques, including intrinsic evaluations using the CLIP space,\nextrinsic evaluations with a Visual-Question-Answer (VQA) model and human\nassessments, to evaluate the cultural content of TTI-generated images. To\nbolster our research, we introduce the CulText2I dataset, derived from six\ndiverse TTI models and spanning ten languages. Our experiments provide insights\nregarding Do, What, Which and How research questions about the nature of\ncultural encoding in TTI models, paving the way for cross-cultural applications\nof these models.\n","authors":["Mor Ventura","Eyal Ben-David","Anna Korhonen","Roi Reichart"],"pdf_url":"https://arxiv.org/pdf/2310.01929v3.pdf","comment":"Project page: https://venturamor.github.io/CulText2IWeb/"},{"id":"http://arxiv.org/abs/2408.04840v2","updated":"2024-08-13T08:10:32Z","published":"2024-08-09T03:25:42Z","title":"mPLUG-Owl3: Towards Long Image-Sequence Understanding in Multi-Modal\n Large Language Models","summary":" Multi-modal Large Language Models (MLLMs) have demonstrated remarkable\ncapabilities in executing instructions for a variety of single-image tasks.\nDespite this progress, significant challenges remain in modeling long image\nsequences. In this work, we introduce the versatile multi-modal large language\nmodel, mPLUG-Owl3, which enhances the capability for long image-sequence\nunderstanding in scenarios that incorporate retrieved image-text knowledge,\ninterleaved image-text, and lengthy videos. Specifically, we propose novel\nhyper attention blocks to efficiently integrate vision and language into a\ncommon language-guided semantic space, thereby facilitating the processing of\nextended multi-image scenarios. Extensive experimental results suggest that\nmPLUG-Owl3 achieves state-of-the-art performance among models with a similar\nsize on single-image, multi-image, and video benchmarks. Moreover, we propose a\nchallenging long visual sequence evaluation named Distractor Resistance to\nassess the ability of models to maintain focus amidst distractions. Finally,\nwith the proposed architecture, mPLUG-Owl3 demonstrates outstanding performance\non ultra-long visual sequence inputs. 
We hope that mPLUG-Owl3 can contribute to\nthe development of more efficient and powerful multimodal large language\nmodels.\n","authors":["Jiabo Ye","Haiyang Xu","Haowei Liu","Anwen Hu","Ming Yan","Qi Qian","Ji Zhang","Fei Huang","Jingren Zhou"],"pdf_url":"https://arxiv.org/pdf/2408.04840v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.06710v1","updated":"2024-08-13T08:09:05Z","published":"2024-08-13T08:09:05Z","title":"Variational Learning of Gaussian Process Latent Variable Models through\n Stochastic Gradient Annealed Importance Sampling","summary":" Gaussian Process Latent Variable Models (GPLVMs) have become increasingly\npopular for unsupervised tasks such as dimensionality reduction and missing\ndata recovery due to their flexibility and non-linear nature. An\nimportance-weighted version of the Bayesian GPLVMs has been proposed to obtain\na tighter variational bound. However, this version of the approach is primarily\nlimited to analyzing simple data structures, as the generation of an effective\nproposal distribution can become quite challenging in high-dimensional spaces\nor with complex data sets. In this work, we propose an Annealed Importance\nSampling (AIS) approach to address these issues. By transforming the posterior\ninto a sequence of intermediate distributions using annealing, we combine the\nstrengths of Sequential Monte Carlo samplers and VI to explore a wider range of\nposterior distributions and gradually approach the target distribution. We\nfurther propose an efficient algorithm by reparameterizing all variables in the\nevidence lower bound (ELBO). Experimental results on both toy and image\ndatasets demonstrate that our method outperforms state-of-the-art methods in\nterms of tighter variational bounds, higher log-likelihoods, and more robust\nconvergence.\n","authors":["Jian Xu","Shian Du","Junmei Yang","Qianli Ma","Delu Zeng"],"pdf_url":"https://arxiv.org/pdf/2408.06710v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.06701v1","updated":"2024-08-13T07:56:21Z","published":"2024-08-13T07:56:21Z","title":"DiffSG: A Generative Solver for Network Optimization with Diffusion\n Model","summary":" Diffusion generative models, famous for their performance in image\ngeneration, are popular in various cross-domain applications. However, their\nuse in the communication community has been mostly limited to auxiliary tasks\nlike data modeling and feature extraction. These models hold greater promise\nfor fundamental problems in network optimization compared to traditional\nmachine learning methods. Discriminative deep learning often falls short due to\nits single-step input-output mapping and lack of global awareness of the\nsolution space, especially given the complexity of network optimization's\nobjective functions. In contrast, diffusion generative models can consider a\nbroader range of solutions and exhibit stronger generalization by learning\nparameters that describe the distribution of the underlying solution space,\nwith higher probabilities assigned to better solutions. We propose a new\nframework Diffusion Model-based Solution Generation (DiffSG), which leverages\nthe intrinsic distribution learning capabilities of diffusion generative models\nto learn high-quality solution distributions based on given inputs. The optimal\nsolution within this distribution is highly probable, allowing it to be\neffectively reached through repeated sampling. 
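Annealed importance sampling, the core tool adopted in the GPLVM abstract above, can be illustrated on a one-dimensional toy problem: anneal from a tractable prior to an unnormalized target along a geometric path, accumulate importance weights, and apply a Markov transition at each temperature. The Gaussian prior, mixture target, and random-walk Metropolis kernel below are assumptions for illustration, not the paper's GPLVM-specific sampler.

```python
# Minimal sketch of Annealed Importance Sampling on a 1-D toy target (assumptions:
# geometric annealing path and random-walk Metropolis transitions).
import numpy as np

def log_prior(x):            # N(0, 1) kernel, up to a constant
    return -0.5 * x ** 2

def log_target(x):           # unnormalised target: a two-component mixture
    return np.logaddexp(-0.5 * (x - 2.0) ** 2, -0.5 * (x + 2.0) ** 2)

def ais(n_chains=1000, n_steps=50, step=0.5, seed=0):
    rng = np.random.default_rng(seed)
    betas = np.linspace(0.0, 1.0, n_steps + 1)
    x = rng.normal(size=n_chains)              # exact samples from the prior
    log_w = np.zeros(n_chains)
    for b_prev, b in zip(betas[:-1], betas[1:]):
        # Importance weight increment for moving from temperature b_prev to b.
        log_w += (b - b_prev) * (log_target(x) - log_prior(x))
        # One Metropolis step targeting the intermediate distribution at temperature b.
        def log_pi(z): return (1 - b) * log_prior(z) + b * log_target(z)
        prop = x + step * rng.normal(size=n_chains)
        accept = np.log(rng.uniform(size=n_chains)) < log_pi(prop) - log_pi(x)
        x = np.where(accept, prop, x)
    # Log of the estimated ratio of normalising constants (target over prior kernel).
    return np.log(np.mean(np.exp(log_w - log_w.max()))) + log_w.max()

print(ais())
```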
We validate the performance of\nDiffSG on several typical network optimization problems, including\nmixed-integer non-linear programming, convex optimization, and hierarchical\nnon-convex optimization. Our results show that DiffSG outperforms existing\nbaselines. In summary, we demonstrate the potential of diffusion generative\nmodels in tackling complex network optimization problems and outline a\npromising path for their broader application in the communication community.\n","authors":["Ruihuai Liang","Bo Yang","Zhiwen Yu","Bin Guo","Xuelin Cao","Mérouane Debbah","H. Vincent Poor","Chau Yuen"],"pdf_url":"https://arxiv.org/pdf/2408.06701v1.pdf","comment":"8 pages, 5 figures"},{"id":"http://arxiv.org/abs/2408.06699v1","updated":"2024-08-13T07:53:39Z","published":"2024-08-13T07:53:39Z","title":"Information Geometry and Beta Link for Optimizing Sparse Variational\n Student-t Processes","summary":" Recently, a sparse version of Student-t Processes, termed sparse variational\nStudent-t Processes, has been proposed to enhance computational efficiency and\nflexibility for real-world datasets using stochastic gradient descent. However,\ntraditional gradient descent methods like Adam may not fully exploit the\nparameter space geometry, potentially leading to slower convergence and\nsuboptimal performance. To mitigate these issues, we adopt natural gradient\nmethods from information geometry for variational parameter optimization of\nStudent-t Processes. This approach leverages the curvature and structure of the\nparameter space, utilizing tools such as the Fisher information matrix which is\nlinked to the Beta function in our model. This method provides robust\nmathematical support for the natural gradient algorithm when using Student's\nt-distribution as the variational distribution. Additionally, we present a\nmini-batch algorithm for efficiently computing natural gradients. Experimental\nresults across four benchmark datasets demonstrate that our method consistently\naccelerates convergence speed.\n","authors":["Jian Xu","Delu Zeng","John Paisley"],"pdf_url":"https://arxiv.org/pdf/2408.06699v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.06697v1","updated":"2024-08-13T07:51:37Z","published":"2024-08-13T07:51:37Z","title":"SlotLifter: Slot-guided Feature Lifting for Learning Object-centric\n Radiance Fields","summary":" The ability to distill object-centric abstractions from intricate visual\nscenes underpins human-level generalization. Despite the significant progress\nin object-centric learning methods, learning object-centric representations in\nthe 3D physical world remains a crucial challenge. In this work, we propose\nSlotLifter, a novel object-centric radiance model addressing scene\nreconstruction and decomposition jointly via slot-guided feature lifting. Such\na design unites object-centric learning representations and image-based\nrendering methods, offering state-of-the-art performance in scene decomposition\nand novel-view synthesis on four challenging synthetic and four complex\nreal-world datasets, outperforming existing 3D object-centric learning methods\nby a large margin. Through extensive ablative studies, we showcase the efficacy\nof designs in SlotLifter, revealing key insights for potential future\ndirections.\n","authors":["Yu Liu","Baoxiong Jia","Yixin Chen","Siyuan Huang"],"pdf_url":"https://arxiv.org/pdf/2408.06697v1.pdf","comment":"Accepted by ECCV 2024. 
Project website: https://slotlifter.github.io"},{"id":"http://arxiv.org/abs/2403.13369v2","updated":"2024-08-13T07:35:31Z","published":"2024-03-20T08:01:33Z","title":"Clinical information extraction for Low-resource languages with Few-shot\n learning using Pre-trained language models and Prompting","summary":" Automatic extraction of medical information from clinical documents poses\nseveral challenges: high costs of required clinical expertise, limited\ninterpretability of model predictions, restricted computational resources and\nprivacy regulations. Recent advances in domain-adaptation and prompting methods\nshowed promising results with minimal training data using lightweight masked\nlanguage models, which are suited for well-established interpretability\nmethods. We are the first to present a systematic evaluation of these methods in a\nlow-resource setting, by performing multi-class section classification on\nGerman doctor's letters. We conduct extensive class-wise evaluations supported\nby Shapley values, to validate the quality of our small training data set and\nto ensure the interpretability of model predictions. We demonstrate that a\nlightweight, domain-adapted pretrained model, prompted with just 20 shots,\noutperforms a traditional classification model by 30.5% accuracy. Our results\nserve as a process-oriented guideline for clinical information extraction\nprojects working with low-resource languages.\n","authors":["Phillip Richter-Pechanski","Philipp Wiesenbach","Dominic M. Schwab","Christina Kiriakou","Nicolas Geis","Christoph Dieterich","Anette Frank"],"pdf_url":"https://arxiv.org/pdf/2403.13369v2.pdf","comment":"Paper accepted for publication in the journal: Natural Language\n Engineering (Cambridge Core)"},{"id":"http://arxiv.org/abs/2408.06687v1","updated":"2024-08-13T07:27:02Z","published":"2024-08-13T07:27:02Z","title":"Masked Image Modeling: A Survey","summary":" In this work, we survey recent studies on masked image modeling (MIM), an\napproach that emerged as a powerful self-supervised learning technique in\ncomputer vision. The MIM task involves masking some information, e.g. pixels,\npatches, or even latent representations, and training a model, usually an\nautoencoder, to predict the missing information by using the context\navailable in the visible part of the input. We identify and formalize two\ncategories of approaches on how to implement MIM as a pretext task, one based\non reconstruction and one based on contrastive learning. Then, we construct a\ntaxonomy and review the most prominent papers in recent years. We complement\nthe manually constructed taxonomy with a dendrogram obtained by applying a\nhierarchical clustering algorithm. We further identify relevant clusters via\nmanually inspecting the resulting dendrogram. Our review also includes datasets\nthat are commonly used in MIM research. We aggregate the performance results of\nvarious masked image modeling methods on the most popular datasets, to\nfacilitate the comparison of competing methods. 
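The reconstruction-based MIM pretext task described above reduces to a short recipe: split images into patches, hide a random subset, and train a model to reconstruct the hidden patches from the visible ones. The toy MLP autoencoder, patch size, and mask ratio below are illustrative assumptions; practical MIM methods use ViT-style encoders.

```python
# Minimal sketch of the reconstruction-based MIM pretext task (toy MLP autoencoder
# over flattened patches; real methods use ViT-style encoders and decoders).
import torch
import torch.nn as nn

def patchify(img, p=4):
    # (B, C, H, W) -> (B, N, C*p*p) non-overlapping patches.
    B, C, H, W = img.shape
    x = img.unfold(2, p, p).unfold(3, p, p)          # B, C, H/p, W/p, p, p
    return x.permute(0, 2, 3, 1, 4, 5).reshape(B, -1, C * p * p)

patch_dim, mask_ratio = 3 * 4 * 4, 0.6
model = nn.Sequential(nn.Linear(patch_dim, 128), nn.GELU(), nn.Linear(128, patch_dim))
opt = torch.optim.Adam(model.parameters(), lr=1e-3)

imgs = torch.rand(8, 3, 32, 32)                      # stand-in batch of images
patches = patchify(imgs)                             # (8, 64, 48)
mask = torch.rand(patches.shape[:2]) < mask_ratio    # True = hidden from the model
inp = patches.masked_fill(mask.unsqueeze(-1), 0.0)   # zero out masked patches
pred = model(inp)
# The reconstruction loss is computed only on the masked patches.
loss = ((pred - patches) ** 2)[mask].mean()
opt.zero_grad(); loss.backward(); opt.step()
```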
Finally, we identify research\ngaps and propose several interesting directions of future work.\n","authors":["Vlad Hondru","Florinel Alin Croitoru","Shervin Minaee","Radu Tudor Ionescu","Nicu Sebe"],"pdf_url":"https://arxiv.org/pdf/2408.06687v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2206.13353v2","updated":"2024-08-13T07:19:59Z","published":"2022-06-16T17:58:47Z","title":"Is Power-Seeking AI an Existential Risk?","summary":" This report examines what I see as the core argument for concern about\nexistential risk from misaligned artificial intelligence. I proceed in two\nstages. First, I lay out a backdrop picture that informs such concern. On this\npicture, intelligent agency is an extremely powerful force, and creating agents\nmuch more intelligent than us is playing with fire -- especially given that if\ntheir objectives are problematic, such agents would plausibly have instrumental\nincentives to seek power over humans. Second, I formulate and evaluate a more\nspecific six-premise argument that creating agents of this kind will lead to\nexistential catastrophe by 2070. On this argument, by 2070: (1) it will become\npossible and financially feasible to build relevantly powerful and agentic AI\nsystems; (2) there will be strong incentives to do so; (3) it will be much\nharder to build aligned (and relevantly powerful/agentic) AI systems than to\nbuild misaligned (and relevantly powerful/agentic) AI systems that are still\nsuperficially attractive to deploy; (4) some such misaligned systems will seek\npower over humans in high-impact ways; (5) this problem will scale to the full\ndisempowerment of humanity; and (6) such disempowerment will constitute an\nexistential catastrophe. I assign rough subjective credences to the premises in\nthis argument, and I end up with an overall estimate of ~5% that an existential\ncatastrophe of this kind will occur by 2070. (May 2022 update: since making\nthis report public in April 2021, my estimate here has gone up, and is now at\n>10%.)\n","authors":["Joseph Carlsmith"],"pdf_url":"https://arxiv.org/pdf/2206.13353v2.pdf","comment":"57 pages, 1 figure. Edited to fix link to audio version, add links to\n short version and reviews, and fix a typo in section 2.1.2"},{"id":"http://arxiv.org/abs/2408.06681v1","updated":"2024-08-13T07:19:40Z","published":"2024-08-13T07:19:40Z","title":"Coherence Awareness in Diffractive Neural Networks","summary":" Diffractive neural networks hold great promise for applications requiring\nintensive computational processing. Considerable attention has focused on\ndiffractive networks for either spatially coherent or spatially incoherent\nillumination. Here we illustrate that, as opposed to imaging systems, in\ndiffractive networks the degree of spatial coherence has a dramatic effect. In\nparticular, we show that when the spatial coherence length on the object is\ncomparable to the minimal feature size preserved by the optical system, neither\nthe incoherent nor the coherent extremes serve as acceptable approximations.\nImportantly, this situation is inherent to many settings involving active\nillumination, including reflected light microscopy, autonomous vehicles and\nsmartphones. Following this observation, we propose a general framework for\ntraining diffractive networks for any specified degree of spatial and temporal\ncoherence, supporting all types of linear and nonlinear layers. 
Using our\nmethod, we numerically optimize networks for image classification, and\nthoroughly investigate their performance dependence on the illumination\ncoherence properties. We further introduce the concept of coherence-blind\nnetworks, which have enhanced resilience to changes in illumination conditions.\nOur findings serve as a steppingstone toward adopting all-optical neural\nnetworks in real-world applications, leveraging nothing but natural light.\n","authors":["Matan Kleiner","Lior Michaeli","Tomer Michaeli"],"pdf_url":"https://arxiv.org/pdf/2408.06681v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.07818v3","updated":"2024-08-13T07:18:04Z","published":"2024-07-10T16:43:14Z","title":"The Misclassification Likelihood Matrix: Some Classes Are More Likely To\n Be Misclassified Than Others","summary":" This study introduces the Misclassification Likelihood Matrix (MLM) as a\nnovel tool for quantifying the reliability of neural network predictions under\ndistribution shifts. The MLM is obtained by leveraging softmax outputs and\nclustering techniques to measure the distances between the predictions of a\ntrained neural network and class centroids. By analyzing these distances, the\nMLM provides a comprehensive view of the model's misclassification tendencies,\nenabling decision-makers to identify the most common and critical sources of\nerrors. The MLM allows for the prioritization of model improvements and the\nestablishment of decision thresholds based on acceptable risk levels. The\napproach is evaluated on the MNIST dataset using a Convolutional Neural Network\n(CNN) and a perturbed version of the dataset to simulate distribution shifts.\nThe results demonstrate the effectiveness of the MLM in assessing the\nreliability of predictions and highlight its potential in enhancing the\ninterpretability and risk mitigation capabilities of neural networks. The\nimplications of this work extend beyond image classification, with ongoing\napplications in autonomous systems, such as self-driving cars, to improve the\nsafety and reliability of decision-making in complex, real-world environments.\n","authors":["Daniel Sikar","Artur Garcez","Robin Bloomfield","Tillman Weyde","Kaleem Peeroo","Naman Singh","Maeve Hutchinson","Dany Laksono","Mirela Reljan-Delaney"],"pdf_url":"https://arxiv.org/pdf/2407.07818v3.pdf","comment":"9 pages, 7 figures, 1 table"},{"id":"http://arxiv.org/abs/2406.19136v6","updated":"2024-08-13T07:12:37Z","published":"2024-06-27T12:40:29Z","title":"YZS-model: A Predictive Model for Organic Drug Solubility Based on Graph\n Convolutional Networks and Transformer-Attention","summary":" Accurate prediction of drug molecule solubility is crucial for therapeutic\neffectiveness and safety. Traditional methods often miss complex molecular\nstructures, leading to inaccuracies. We introduce the YZS-Model, a deep\nlearning framework integrating Graph Convolutional Networks (GCN), Transformer\narchitectures, and Long Short-Term Memory (LSTM) networks to enhance prediction\nprecision. GCNs excel at capturing intricate molecular topologies by modeling\nthe relationships between atoms and bonds. Transformers, with their\nself-attention mechanisms, effectively identify long-range dependencies within\nmolecules, capturing global interactions. LSTMs process sequential data,\npreserving long-term dependencies and integrating temporal information within\nmolecular sequences. 
This multifaceted approach leverages the strengths of each\ncomponent, resulting in a model that comprehensively understands and predicts\nmolecular properties. Trained on 9,943 compounds and tested on an anticancer\ndataset, the YZS-Model achieved an $R^2$ of 0.59 and an RMSE of 0.57,\noutperforming benchmark models ($R^2$ of 0.52 and RMSE of 0.61). In an\nindependent test, it demonstrated an RMSE of 1.05, improving accuracy by 45.9%.\nThe integration of these deep learning techniques allows the YZS-Model to learn\nvaluable features from complex data without predefined parameters, handle large\ndatasets efficiently, and adapt to various molecular types. This comprehensive\ncapability significantly improves predictive accuracy and model\ngeneralizability. Its precision in solubility predictions can expedite drug\ndevelopment by optimizing candidate selection, reducing costs, and enhancing\nefficiency. Our research underscores deep learning's transformative potential\nin pharmaceutical science, particularly for solubility prediction and drug\ndesign.\n","authors":["Chenxu Wang","Haowei Ming","Jian He","Yao Lu","Junhong Chen"],"pdf_url":"https://arxiv.org/pdf/2406.19136v6.pdf","comment":"23 pages, 16 figures, 6 tables"},{"id":"http://arxiv.org/abs/2401.16694v4","updated":"2024-08-13T07:12:16Z","published":"2024-01-30T02:41:05Z","title":"etuner: Redundancy-Aware Efficient Continual Learning on Edge Devices","summary":" Many emerging applications, such as robot-assisted eldercare and object\nrecognition, generally employ deep learning neural networks (DNNs) and require\nthe deployment of DNN models on edge devices. These applications naturally\nrequire i) handling streaming-in inference requests and ii) fine-tuning the\ndeployed models to adapt to possible deployment scenario changes. Continual\nlearning (CL) is widely adopted to satisfy these needs. CL is a popular deep\nlearning paradigm that handles both continuous model fine-tuning and overtime\ninference requests. However, an inappropriate model fine-tuning scheme could\ninvolve significant redundancy and consume considerable time and energy, making\nit challenging to apply CL on edge devices. In this paper, we propose ETuner,\nan efficient edge continual learning framework that optimizes inference\naccuracy, fine-tuning execution time, and energy efficiency through both\ninter-tuning and intra-tuning optimizations. Experimental results show that, on\naverage, ETuner reduces overall fine-tuning execution time by 64%, energy\nconsumption by 56%, and improves average inference accuracy by 1.75% over the\nimmediate model fine-tuning approach.\n","authors":["Sheng Li","Geng Yuan","Yawen Wu","Yue Dai","Tianyu Wang","Chao Wu","Alex K. Jones","Jingtong Hu","Yanzhi Wang","Xulong Tang"],"pdf_url":"https://arxiv.org/pdf/2401.16694v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.06679v1","updated":"2024-08-13T07:08:54Z","published":"2024-08-13T07:08:54Z","title":"Case-based Explainability for Random Forest: Prototypes, Critics,\n Counter-factuals and Semi-factuals","summary":" The explainability of black-box machine learning algorithms, commonly known\nas Explainable Artificial Intelligence (XAI), has become crucial for financial\nand other regulated industrial applications due to regulatory requirements and\nthe need for transparency in business practices. 
Among the various paradigms of\nXAI, Explainable Case-Based Reasoning (XCBR) stands out as a pragmatic approach\nthat elucidates the output of a model by referencing actual examples from the\ndata used to train or test the model. Despite its potential, XCBR has been\nrelatively underexplored for many algorithms such as tree-based models until\nrecently. We start by observing that most XCBR methods are defined based on the\ndistance metric learned by the algorithm. By utilizing a recently proposed\ntechnique to extract the distance metric learned by Random Forests (RFs), which\nis both geometry- and accuracy-preserving, we investigate various XCBR methods.\nThese methods amount to identifying special points from the training datasets,\nsuch as prototypes, critics, counter-factuals, and semi-factuals, to explain\nthe predictions for a given query of the RF. We evaluate these special points\nusing various evaluation metrics to assess their explanatory power and\neffectiveness.\n","authors":["Gregory Yampolsky","Dhruv Desai","Mingshu Li","Stefano Pasquali","Dhagash Mehta"],"pdf_url":"https://arxiv.org/pdf/2408.06679v1.pdf","comment":"8 pages, 2 figures, 5 tables"},{"id":"http://arxiv.org/abs/2407.11253v2","updated":"2024-08-13T07:08:49Z","published":"2024-07-15T21:43:41Z","title":"Separable Operator Networks","summary":" Operator learning has become a powerful tool in machine learning for modeling\ncomplex physical systems governed by partial differential equations (PDEs).\nAlthough Deep Operator Networks (DeepONet) show promise, they require extensive\ndata acquisition. Physics-informed DeepONets (PI-DeepONet) mitigate data\nscarcity but suffer from inefficient training processes. We introduce Separable\nOperator Networks (SepONet), a novel framework that significantly enhances the\nefficiency of physics-informed operator learning. SepONet uses independent\ntrunk networks to learn basis functions separately for different coordinate\naxes, enabling faster and more memory-efficient training via forward-mode\nautomatic differentiation. We provide a universal approximation theorem for\nSepONet proving that it generalizes to arbitrary operator learning problems,\nand then validate its performance through comprehensive benchmarking against\nPI-DeepONet. Our results demonstrate SepONet's superior performance across\nvarious nonlinear and inseparable PDEs, with SepONet's advantages increasing\nwith problem complexity, dimension, and scale. For 1D time-dependent PDEs,\nSepONet achieves up to $112\\times$ faster training and $82\\times$ reduction in\nGPU memory usage compared to PI-DeepONet, while maintaining comparable\naccuracy. For the 2D time-dependent nonlinear diffusion equation, SepONet\nefficiently handles the complexity, achieving a 6.44\\% mean relative $\\ell_{2}$\ntest error, while PI-DeepONet fails due to memory constraints. This work paves\nthe way for extreme-scale learning of continuous mappings between\ninfinite-dimensional function spaces. Open source code is available at\n\\url{https://github.com/HewlettPackard/separable-operator-networks}.\n","authors":["Xinling Yu","Sean Hooten","Ziyue Liu","Yequan Zhao","Marco Fiorentino","Thomas Van Vaerenbergh","Zheng Zhang"],"pdf_url":"https://arxiv.org/pdf/2407.11253v2.pdf","comment":"SepONet version 2. This revised version polishes writing and open\n sources code. 
The initial version was submitted to arXiv on July 15, 2024"},{"id":"http://arxiv.org/abs/2402.17232v2","updated":"2024-08-13T06:53:06Z","published":"2024-02-27T05:57:45Z","title":"Two-scale Neural Networks for Partial Differential Equations with Small\n Parameters","summary":" We propose a two-scale neural network method for solving partial differential\nequations (PDEs) with small parameters using physics-informed neural networks\n(PINNs). We directly incorporate the small parameters into the architecture of\nneural networks. The proposed method enables solving PDEs with small parameters\nin a simple fashion, without adding Fourier features or other computationally\ntaxing searches of truncation parameters. Various numerical examples\ndemonstrate reasonable accuracy in capturing features of large derivatives in\nthe solutions caused by small parameters.\n","authors":["Qiao Zhuang","Chris Ziyi Yao","Zhongqiang Zhang","George Em Karniadakis"],"pdf_url":"https://arxiv.org/pdf/2402.17232v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.06672v1","updated":"2024-08-13T06:47:59Z","published":"2024-08-13T06:47:59Z","title":"Leveraging Priors via Diffusion Bridge for Time Series Generation","summary":" Time series generation is widely used in real-world applications such as\nsimulation, data augmentation, and hypothesis test techniques. Recently,\ndiffusion models have emerged as the de facto approach for time series\ngeneration, emphasizing diverse synthesis scenarios based on historical or\ncorrelated time series data streams. Since time series have unique\ncharacteristics, such as fixed time order and data scaling, standard Gaussian\nprior might be ill-suited for general time series generation. In this paper, we\nexploit the usage of diverse prior distributions for synthesis. Then, we\npropose TimeBridge, a framework that enables flexible synthesis by leveraging\ndiffusion bridges to learn the transport between chosen prior and data\ndistributions. Our model covers a wide range of scenarios in time series\ndiffusion models, which leverages (i) data- and time-dependent priors for\nunconditional synthesis, and (ii) data-scale preserving synthesis with a\nconstraint as a prior for conditional generation. Experimentally, our model\nachieves state-of-the-art performance in both unconditional and conditional\ntime series generation tasks.\n","authors":["Jinseong Park","Seungyun Lee","Woojin Jeong","Yujin Choi","Jaewook Lee"],"pdf_url":"https://arxiv.org/pdf/2408.06672v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.06665v1","updated":"2024-08-13T06:34:56Z","published":"2024-08-13T06:34:56Z","title":"RW-NSGCN: A Robust Approach to Structural Attacks via Negative Sampling","summary":" Node classification using Graph Neural Networks (GNNs) has been widely\napplied in various practical scenarios, such as predicting user interests and\ndetecting communities in social networks. However, recent studies have shown\nthat graph-structured networks often contain potential noise and attacks, in\nthe form of topological perturbations and weight disturbances, which can lead\nto decreased classification performance in GNNs. To improve the robustness of\nthe model, we propose a novel method: Random Walk Negative Sampling Graph\nConvolutional Network (RW-NSGCN). Specifically, RW-NSGCN integrates the Random\nWalk with Restart (RWR) and PageRank (PGR) algorithms for negative sampling and\nemploys a Determinantal Point Process (DPP)-based GCN for convolution\noperations. 
RWR leverages both global and local information to manage noise and\nlocal variations, while PGR assesses node importance to stabilize the\ntopological structure. The DPP-based GCN ensures diversity among negative\nsamples and aggregates their features to produce robust node embeddings,\nthereby improving classification performance. Experimental results demonstrate\nthat the RW-NSGCN model effectively addresses network topology attacks and\nweight instability, increasing the accuracy of anomaly detection and overall\nstability. In terms of classification accuracy, RW-NSGCN significantly\noutperforms existing methods, showing greater resilience across various\nscenarios and effectively mitigating the impact of such vulnerabilities.\n","authors":["Shuqi He","Jun Zhuang","Ding Wang","Jun Song"],"pdf_url":"https://arxiv.org/pdf/2408.06665v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.06027v2","updated":"2024-08-13T06:22:49Z","published":"2024-08-12T09:29:26Z","title":"A Comprehensive Survey on EEG-Based Emotion Recognition: A Graph-Based\n Perspective","summary":" Compared to other modalities, electroencephalogram (EEG) based emotion\nrecognition can intuitively respond to emotional patterns in the human brain\nand, therefore, has become one of the most focused tasks in affective\ncomputing. The nature of emotions is a physiological and psychological state\nchange in response to brain region connectivity, making emotion recognition\nfocus more on the dependency between brain regions instead of specific brain\nregions. A significant trend is the application of graphs to encapsulate such\ndependency as dynamic functional connections between nodes across temporal and\nspatial dimensions. Concurrently, the neuroscientific underpinnings behind this\ndependency endow the application of graphs in this field with a distinctive\nsignificance. However, there is neither a comprehensive review nor a tutorial\nfor constructing emotion-relevant graphs in EEG-based emotion recognition. In\nthis paper, we present a comprehensive survey of these studies, delivering a\nsystematic review of graph-related methods in this field from a methodological\nperspective. We propose a unified framework for graph applications in this\nfield and categorize these methods on this basis. Finally, based on previous\nstudies, we also present several open challenges and future directions in this\nfield.\n","authors":["Chenyu Liu","Xinliang Zhou","Yihao Wu","Yi Ding","Liming Zhai","Kun Wang","Ziyu Jia","Yang Liu"],"pdf_url":"https://arxiv.org/pdf/2408.06027v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.01077v2","updated":"2024-08-13T05:35:57Z","published":"2024-02-02T00:31:01Z","title":"Recent Advances in Predictive Modeling with Electronic Health Records","summary":" The development of electronic health records (EHR) systems has enabled the\ncollection of a vast amount of digitized patient data. However, utilizing EHR\ndata for predictive modeling presents several challenges due to its unique\ncharacteristics. With the advancements in machine learning techniques, deep\nlearning has demonstrated its superiority in various applications, including\nhealthcare. This survey systematically reviews recent advances in deep\nlearning-based predictive models using EHR data. Specifically, we begin by\nintroducing the background of EHR data and providing a mathematical definition\nof the predictive modeling task. We then categorize and summarize predictive\ndeep models from multiple perspectives. 
Furthermore, we present benchmarks and\ntoolkits relevant to predictive modeling in healthcare. Finally, we conclude\nthis survey by discussing open challenges and suggesting promising directions\nfor future research.\n","authors":["Jiaqi Wang","Junyu Luo","Muchao Ye","Xiaochen Wang","Yuan Zhong","Aofei Chang","Guanjie Huang","Ziyi Yin","Cao Xiao","Jimeng Sun","Fenglong Ma"],"pdf_url":"https://arxiv.org/pdf/2402.01077v2.pdf","comment":"This paper has been accepted by IJCAI 24 Survey Track"},{"id":"http://arxiv.org/abs/2401.17548v6","updated":"2024-08-13T05:31:22Z","published":"2024-01-31T02:26:09Z","title":"Rethinking Channel Dependence for Multivariate Time Series Forecasting:\n Learning from Leading Indicators","summary":" Recently, channel-independent methods have achieved state-of-the-art\nperformance in multivariate time series (MTS) forecasting. Despite reducing\noverfitting risks, these methods miss potential opportunities in utilizing\nchannel dependence for accurate predictions. We argue that there exist locally\nstationary lead-lag relationships between variates, i.e., some lagged variates\nmay follow the leading indicators within a short time period. Exploiting such\nchannel dependence is beneficial since leading indicators offer advance\ninformation that can be used to reduce the forecasting difficulty of the lagged\nvariates. In this paper, we propose a new method named LIFT that first\nefficiently estimates leading indicators and their leading steps at each time\nstep and then judiciously allows the lagged variates to utilize the advance\ninformation from leading indicators. LIFT plays as a plugin that can be\nseamlessly collaborated with arbitrary time series forecasting methods.\nExtensive experiments on six real-world datasets demonstrate that LIFT improves\nthe state-of-the-art methods by 5.5% in average forecasting performance. Our\ncode is available at https://github.com/SJTU-Quant/LIFT.\n","authors":["Lifan Zhao","Yanyan Shen"],"pdf_url":"https://arxiv.org/pdf/2401.17548v6.pdf","comment":"Accepted to ICLR 2024. Code is at https://github.com/SJTU-DMTai/LIFT"},{"id":"http://arxiv.org/abs/2408.06638v1","updated":"2024-08-13T05:08:13Z","published":"2024-08-13T05:08:13Z","title":"COD: Learning Conditional Invariant Representation for Domain Adaptation\n Regression","summary":" Aiming to generalize the label knowledge from a source domain with continuous\noutputs to an unlabeled target domain, Domain Adaptation Regression (DAR) is\ndeveloped for complex practical learning problems. However, due to the\ncontinuity problem in regression, existing conditional distribution alignment\ntheory and methods with discrete prior, which are proven to be effective in\nclassification settings, are no longer applicable. In this work, focusing on\nthe feasibility problems in DAR, we establish the sufficiency theory for the\nregression model, which shows the generalization error can be sufficiently\ndominated by the cross-domain conditional discrepancy. Further, to characterize\nconditional discrepancy with continuous conditioning variable, a novel\nConditional Operator Discrepancy (COD) is proposed, which admits the metric\nproperty on conditional distributions via the kernel embedding theory. Finally,\nto minimize the discrepancy, a COD-based conditional invariant representation\nlearning model is proposed, and the reformulation is derived to show that\nreasonable modifications on moment statistics can further improve the\ndiscriminability of the adaptation model. 
Extensive experiments on standard DAR\ndatasets verify the validity of theoretical results and the superiority over\nSOTA DAR methods.\n","authors":["Hao-Ran Yang","Chuan-Xian Ren","You-Wei Luo"],"pdf_url":"https://arxiv.org/pdf/2408.06638v1.pdf","comment":"Accepted to ECCV 2024 (oral)"},{"id":"http://arxiv.org/abs/2408.06634v1","updated":"2024-08-13T04:53:31Z","published":"2024-08-13T04:53:31Z","title":"Harnessing Earnings Reports for Stock Predictions: A QLoRA-Enhanced LLM\n Approach","summary":" Accurate stock market predictions following earnings reports are crucial for\ninvestors. Traditional methods, particularly classical machine learning models,\nstruggle with these predictions because they cannot effectively process and\ninterpret extensive textual data contained in earnings reports and often\noverlook nuances that influence market movements. This paper introduces an\nadvanced approach by employing Large Language Models (LLMs) instruction\nfine-tuned with a novel combination of instruction-based techniques and\nquantized low-rank adaptation (QLoRA) compression. Our methodology integrates\n'base factors', such as financial metric growth and earnings transcripts, with\n'external factors', including recent market indices performances and analyst\ngrades, to create a rich, supervised dataset. This comprehensive dataset\nenables our models to achieve superior predictive performance in terms of\naccuracy, weighted F1, and Matthews correlation coefficient (MCC), especially\nevident in the comparison with benchmarks such as GPT-4. We specifically\nhighlight the efficacy of the llama-3-8b-Instruct-4bit model, which showcases\nsignificant improvements over baseline models. The paper also discusses the\npotential of expanding the output capabilities to include a 'Hold' option and\nextending the prediction horizon, aiming to accommodate various investment\nstyles and time frames. This study not only demonstrates the power of\nintegrating cutting-edge AI with fine-tuned financial data but also paves the\nway for future research in enhancing AI-driven financial analysis tools.\n","authors":["Haowei Ni","Shuchen Meng","Xupeng Chen","Ziqing Zhao","Andi Chen","Panfeng Li","Shiyao Zhang","Qifu Yin","Yuanqing Wang","Yuxi Chan"],"pdf_url":"https://arxiv.org/pdf/2408.06634v1.pdf","comment":"Accepted by 2024 6th International Conference on Data-driven\n Optimization of Complex Systems"},{"id":"http://arxiv.org/abs/2403.14623v4","updated":"2024-08-13T04:34:58Z","published":"2024-03-21T17:59:41Z","title":"Simplified Diffusion Schrödinger Bridge","summary":" This paper introduces a novel theoretical simplification of the Diffusion\nSchr\\\"odinger Bridge (DSB) that facilitates its unification with Score-based\nGenerative Models (SGMs), addressing the limitations of DSB in complex data\ngeneration and enabling faster convergence and enhanced performance. By\nemploying SGMs as an initial solution for DSB, our approach capitalizes on the\nstrengths of both frameworks, ensuring a more efficient training process and\nimproving the performance of SGM. We also propose a reparameterization\ntechnique that, despite theoretical approximations, practically improves the\nnetwork's fitting capabilities. Our extensive experimental evaluations confirm\nthe effectiveness of the simplified DSB, demonstrating its significant\nimprovements. 
We believe the contributions of this work pave the way for\nadvanced generative modeling.\n","authors":["Zhicong Tang","Tiankai Hang","Shuyang Gu","Dong Chen","Baining Guo"],"pdf_url":"https://arxiv.org/pdf/2403.14623v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.06621v1","updated":"2024-08-13T04:18:32Z","published":"2024-08-13T04:18:32Z","title":"Towards Robust and Cost-Efficient Knowledge Unlearning for Large\n Language Models","summary":" Large Language Models (LLMs) have demonstrated strong reasoning and\nmemorization capabilities via pretraining on massive textual corpora. However,\ntraining LLMs on human-written text entails significant risk of privacy and\ncopyright violations, which demands an efficient machine unlearning framework\nto remove knowledge of sensitive data without retraining the model from\nscratch. While Gradient Ascent (GA) is widely used for unlearning by reducing\nthe likelihood of generating unwanted information, the unboundedness of\nincreasing the cross-entropy loss causes not only unstable optimization, but\nalso catastrophic forgetting of knowledge that needs to be retained. We also\ndiscover its joint application under low-rank adaptation results in\nsignificantly suboptimal computational cost vs. generative performance\ntrade-offs. In light of this limitation, we propose two novel techniques for\nrobust and cost-efficient unlearning on LLMs. We first design an Inverted Hinge\nloss that suppresses unwanted tokens by increasing the probability of the next\nmost likely token, thereby retaining fluency and structure in language\ngeneration. We also propose to initialize low-rank adapter weights based on\nFisher-weighted low-rank approximation, which induces faster unlearning and\nbetter knowledge retention by allowing model updates to be focused on\nparameters that are important in generating textual data we wish to remove.\n","authors":["Sungmin Cha","Sungjun Cho","Dasol Hwang","Moontae Lee"],"pdf_url":"https://arxiv.org/pdf/2408.06621v1.pdf","comment":"Preprint"},{"id":"http://arxiv.org/abs/2408.06620v1","updated":"2024-08-13T04:08:17Z","published":"2024-08-13T04:08:17Z","title":"Unveiling the Flaws: A Critical Analysis of Initialization Effect on\n Time Series Anomaly Detection","summary":" Deep learning for time-series anomaly detection (TSAD) has gained significant\nattention over the past decade. Despite the reported improvements in several\npapers, the practical application of these models remains limited. Recent\nstudies have cast doubt on these models, attributing their results to flawed\nevaluation techniques. However, the impact of initialization has largely been\noverlooked. This paper provides a critical analysis of the initialization\neffects on TSAD model performance. Our extensive experiments reveal that TSAD\nmodels are highly sensitive to hyperparameters such as window size, seed\nnumber, and normalization. This sensitivity often leads to significant\nvariability in performance, which can be exploited to artificially inflate the\nreported efficacy of these models. We demonstrate that even minor changes in\ninitialization parameters can result in performance variations that overshadow\nthe claimed improvements from novel model architectures. Our findings highlight\nthe need for rigorous evaluation protocols and transparent reporting of\npreprocessing steps to ensure the reliability and fairness of anomaly detection\nmethods. 
This paper calls for a more cautious interpretation of TSAD\nadvancements and encourages the development of more robust and transparent\nevaluation practices to advance the field and its practical applications.\n","authors":["Alex Koran","Hadi Hojjati","Narges Armanfard"],"pdf_url":"https://arxiv.org/pdf/2408.06620v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.06618v1","updated":"2024-08-13T04:06:45Z","published":"2024-08-13T04:06:45Z","title":"Generalized knowledge-enhanced framework for biomedical entity and\n relation extraction","summary":" In recent years, there has been an increasing number of frameworks developed\nfor biomedical entity and relation extraction. This research effort aims to\naddress the accelerating growth in biomedical publications and the intricate\nnature of biomedical texts, which are written for mainly domain experts. To\nhandle these challenges, we develop a novel framework that utilizes external\nknowledge to construct a task-independent and reusable background knowledge\ngraph for biomedical entity and relation extraction. The design of our model is\ninspired by how humans learn domain-specific topics. In particular, humans\noften first acquire the most basic and common knowledge regarding a field to\nbuild the foundational knowledge and then use that as a basis for extending to\nvarious specialized topics. Our framework employs such common-knowledge-sharing\nmechanism to build a general neural-network knowledge graph that is learning\ntransferable to different domain-specific biomedical texts effectively.\nExperimental evaluations demonstrate that our model, equipped with this\ngeneralized and cross-transferable knowledge base, achieves competitive\nperformance benchmarks, including BioRelEx for binding interaction detection\nand ADE for Adverse Drug Effect identification.\n","authors":["Minh Nguyen","Phuong Le"],"pdf_url":"https://arxiv.org/pdf/2408.06618v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.05498v2","updated":"2024-08-13T04:04:20Z","published":"2024-08-10T09:13:42Z","title":"A Laplacian-based Quantum Graph Neural Network for Semi-Supervised\n Learning","summary":" Laplacian learning method is a well-established technique in classical\ngraph-based semi-supervised learning, but its potential in the quantum domain\nremains largely unexplored. This study investigates the performance of the\nLaplacian-based Quantum Semi-Supervised Learning (QSSL) method across four\nbenchmark datasets -- Iris, Wine, Breast Cancer Wisconsin, and Heart Disease.\nFurther analysis explores the impact of increasing Qubit counts, revealing that\nadding more Qubits to a quantum system doesn't always improve performance. The\neffectiveness of additional Qubits depends on the quantum algorithm and how\nwell it matches the dataset. Additionally, we examine the effects of varying\nentangling layers on entanglement entropy and test accuracy. The performance of\nLaplacian learning is highly dependent on the number of entangling layers, with\noptimal configurations varying across different datasets. Typically, moderate\nlevels of entanglement offer the best balance between model complexity and\ngeneralization capabilities. 
These observations highlight the crucial need for\nprecise hyperparameter tuning tailored to each dataset to achieve optimal\nperformance in Laplacian learning methods.\n","authors":["Hamed Gholipour","Farid Bozorgnia","Kailash Hambarde","Hamzeh Mohammadigheymasi","Javier Mancilla","Andre Sequeira","Joao Neves","Hugo Proença"],"pdf_url":"https://arxiv.org/pdf/2408.05498v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.15617v2","updated":"2024-08-13T03:47:38Z","published":"2024-04-24T03:11:12Z","title":"DPO: Differential reinforcement learning with application to optimal\n configuration search","summary":" Reinforcement learning (RL) with continuous state and action spaces remains\none of the most challenging problems within the field. Most current learning\nmethods focus on integral identities such as value functions to derive an\noptimal strategy for the learning agent. In this paper, we instead study the\ndual form of the original RL formulation to propose the first differential RL\nframework that can handle settings with limited training samples and\nshort-length episodes. Our approach introduces Differential Policy Optimization\n(DPO), a pointwise and stage-wise iteration method that optimizes policies\nencoded by local-movement operators. We prove a pointwise convergence estimate\nfor DPO and provide a regret bound comparable with the best current theoretical\nderivation. Such pointwise estimate ensures that the learned policy matches the\noptimal path uniformly across different steps. We then apply DPO to a class of\npractical RL problems with continuous state and action spaces, and which search\nfor optimal configurations with Lagrangian rewards. DPO is easy to implement,\nscalable, and shows competitive results on benchmarking experiments against\nseveral popular RL methods.\n","authors":["Chandrajit Bajaj","Minh Nguyen"],"pdf_url":"https://arxiv.org/pdf/2404.15617v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.07510v4","updated":"2024-08-13T03:47:24Z","published":"2024-05-13T07:10:53Z","title":"PeRFlow: Piecewise Rectified Flow as Universal Plug-and-Play Accelerator","summary":" We present Piecewise Rectified Flow (PeRFlow), a flow-based method for\naccelerating diffusion models. PeRFlow divides the sampling process of\ngenerative flows into several time windows and straightens the trajectories in\neach interval via the reflow operation, thereby approaching piecewise linear\nflows. PeRFlow achieves superior performance in a few-step generation.\nMoreover, through dedicated parameterizations, the PeRFlow models inherit\nknowledge from the pretrained diffusion models. Thus, the training converges\nfast and the obtained models show advantageous transfer ability, serving as\nuniversal plug-and-play accelerators that are compatible with various workflows\nbased on the pre-trained diffusion models. Codes for training and inference are\npublicly released. https://github.com/magic-research/piecewise-rectified-flow\n","authors":["Hanshu Yan","Xingchao Liu","Jiachun Pan","Jun Hao Liew","Qiang Liu","Jiashi Feng"],"pdf_url":"https://arxiv.org/pdf/2405.07510v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.06610v1","updated":"2024-08-13T03:45:11Z","published":"2024-08-13T03:45:11Z","title":"CROME: Cross-Modal Adapters for Efficient Multimodal LLM","summary":" Multimodal Large Language Models (MLLMs) demonstrate remarkable\nimage-language capabilities, but their widespread use faces challenges in\ncost-effective training and adaptation. 
Existing approaches often necessitate\nexpensive language model retraining and limited adaptability. Additionally, the\ncurrent focus on zero-shot performance improvements offers insufficient\nguidance for task-specific tuning. We propose CROME, an efficient\nvision-language instruction tuning framework. It features a novel gated\ncross-modal adapter that effectively combines visual and textual\nrepresentations prior to input into a frozen LLM. This lightweight adapter,\ntrained with minimal parameters, enables efficient cross-modal understanding.\nNotably, CROME demonstrates superior zero-shot performance on standard visual\nquestion answering and instruction-following benchmarks. Moreover, it yields\nfine-tuning with exceptional parameter efficiency, competing with task-specific\nspecialist state-of-the-art methods. CROME demonstrates the potential of pre-LM\nalignment for building scalable, adaptable, and parameter-efficient multimodal\nmodels.\n","authors":["Sayna Ebrahimi","Sercan O. Arik","Tejas Nama","Tomas Pfister"],"pdf_url":"https://arxiv.org/pdf/2408.06610v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.16828v3","updated":"2024-08-13T03:41:48Z","published":"2024-04-25T17:59:56Z","title":"Made to Order: Discovering monotonic temporal changes via\n self-supervised video ordering","summary":" Our objective is to discover and localize monotonic temporal changes in a\nsequence of images. To achieve this, we exploit a simple proxy task of ordering\na shuffled image sequence, with `time' serving as a supervisory signal, since\nonly changes that are monotonic with time can give rise to the correct\nordering. We also introduce a transformer-based model for ordering of image\nsequences of arbitrary length with built-in attribution maps. After training,\nthe model successfully discovers and localizes monotonic changes while ignoring\ncyclic and stochastic ones. We demonstrate applications of the model in\nmultiple domains covering different scene and object types, discovering both\nobject-level and environmental changes in unseen sequences. We also demonstrate\nthat the attention-based attribution maps function as effective prompts for\nsegmenting the changing regions, and that the learned representations can be\nused for downstream applications. Finally, we show that the model achieves the\nstate-of-the-art on standard benchmarks for image ordering.\n","authors":["Charig Yang","Weidi Xie","Andrew Zisserman"],"pdf_url":"https://arxiv.org/pdf/2404.16828v3.pdf","comment":"ECCV 2024 Oral. Project page: https://charigyang.github.io/order/"},{"id":"http://arxiv.org/abs/2402.07867v3","updated":"2024-08-13T01:55:06Z","published":"2024-02-12T18:28:36Z","title":"PoisonedRAG: Knowledge Corruption Attacks to Retrieval-Augmented\n Generation of Large Language Models","summary":" Large language models (LLMs) have achieved remarkable success due to their\nexceptional generative capabilities. Despite their success, they also have\ninherent limitations such as a lack of up-to-date knowledge and hallucination.\nRetrieval-Augmented Generation (RAG) is a state-of-the-art technique to\nmitigate these limitations. The key idea of RAG is to ground the answer\ngeneration of an LLM on external knowledge retrieved from a knowledge database.\nExisting studies mainly focus on improving the accuracy or efficiency of RAG,\nleaving its security largely unexplored. We aim to bridge the gap in this work.\nWe find that the knowledge database in a RAG system introduces a new and\npractical attack surface. 
Based on this attack surface, we propose PoisonedRAG,\nthe first knowledge corruption attack to RAG, where an attacker could inject a\nfew malicious texts into the knowledge database of a RAG system to induce an\nLLM to generate an attacker-chosen target answer for an attacker-chosen target\nquestion. We formulate knowledge corruption attacks as an optimization problem,\nwhose solution is a set of malicious texts. Depending on the background\nknowledge (e.g., black-box and white-box settings) of an attacker on a RAG\nsystem, we propose two solutions to solve the optimization problem,\nrespectively. Our results show PoisonedRAG could achieve a 90% attack success\nrate when injecting five malicious texts for each target question into a\nknowledge database with millions of texts. We also evaluate several defenses\nand our results show they are insufficient to defend against PoisonedRAG,\nhighlighting the need for new defenses.\n","authors":["Wei Zou","Runpeng Geng","Binghui Wang","Jinyuan Jia"],"pdf_url":"https://arxiv.org/pdf/2402.07867v3.pdf","comment":"To appear in USENIX Security Symposium 2025. The code is available at\n https://github.com/sleeepeer/PoisonedRAG"},{"id":"http://arxiv.org/abs/2311.06837v2","updated":"2024-08-13T01:16:35Z","published":"2023-11-12T13:30:31Z","title":"GraNNDis: Efficient Unified Distributed Training Framework for Deep GNNs\n on Large Clusters","summary":" Graph neural networks (GNNs) are one of the rapidly growing fields within\ndeep learning. While many distributed GNN training frameworks have been\nproposed to increase the training throughput, they face three limitations when\napplied to multi-server clusters. 1) They suffer from an inter-server\ncommunication bottleneck because they do not consider the inter-/intra-server\nbandwidth gap, a representative characteristic of multi-server clusters. 2)\nRedundant memory usage and computation hinder the scalability of the\ndistributed frameworks. 3) Sampling methods, de facto standard in mini-batch\ntraining, incur unnecessary errors in multi-server clusters. We found that\nthese limitations can be addressed by exploiting the characteristics of\nmulti-server clusters. Here, we propose GraNNDis, a fast distributed GNN\ntraining framework for multi-server clusters. Firstly, we present Flexible\nPreloading, which preloads the essential vertex dependencies server-wise to\nreduce the low-bandwidth inter-server communications. Secondly, we introduce\nCooperative Batching, which enables memory-efficient, less redundant mini-batch\ntraining by utilizing high-bandwidth intra-server communications. Thirdly, we\npropose Expansion-aware Sampling, a cluster-aware sampling method, which\nsamples the edges that affect the system speedup. As sampling the intra-server\ndependencies does not contribute much to the speedup as they are communicated\nthrough fast intra-server links, it only targets a server boundary to be\nsampled. Lastly, we introduce One-Hop Graph Masking, a computation and\ncommunication structure to realize the above methods in multi-server\nenvironments. We evaluated GraNNDis on multi-server clusters, and it provided\nsignificant speedup over the state-of-the-art distributed GNN training\nframeworks. 
GraNNDis is open-sourced at\nhttps://github.com/AIS-SNU/GraNNDis_Artifact to facilitate its use.\n","authors":["Jaeyong Song","Hongsun Jang","Jaewon Jung","Youngsok Kim","Jinho Lee"],"pdf_url":"https://arxiv.org/pdf/2311.06837v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.06549v1","updated":"2024-08-13T01:14:27Z","published":"2024-08-13T01:14:27Z","title":"Prioritizing Modalities: Flexible Importance Scheduling in Federated\n Multimodal Learning","summary":" Federated Learning (FL) is a distributed machine learning approach that\nenables devices to collaboratively train models without sharing their local\ndata, ensuring user privacy and scalability. However, applying FL to real-world\ndata presents challenges, particularly as most existing FL research focuses on\nunimodal data. Multimodal Federated Learning (MFL) has emerged to address these\nchallenges, leveraging modality-specific encoder models to process diverse\ndatasets. Current MFL methods often uniformly allocate computational\nfrequencies across all modalities, which is inefficient for IoT devices with\nlimited resources. In this paper, we propose FlexMod, a novel approach to\nenhance computational efficiency in MFL by adaptively allocating training\nresources for each modality encoder based on their importance and training\nrequirements. We employ prototype learning to assess the quality of modality\nencoders, use Shapley values to quantify the importance of each modality, and\nadopt the Deep Deterministic Policy Gradient (DDPG) method from deep\nreinforcement learning to optimize the allocation of training resources. Our\nmethod prioritizes critical modalities, optimizing model performance and\nresource utilization. Experimental results on three real-world datasets\ndemonstrate that our proposed method significantly improves the performance of\nMFL models.\n","authors":["Jieming Bian","Lei Wang","Jie Xu"],"pdf_url":"https://arxiv.org/pdf/2408.06549v1.pdf","comment":"Submitted to IEEE TMC, under review"},{"id":"http://arxiv.org/abs/2407.15245v3","updated":"2024-08-13T01:01:59Z","published":"2024-07-21T19:05:30Z","title":"Weyl Calculus and Exactly Solvable Schrödinger Bridges with\n Quadratic State Cost","summary":" Schr\\\"{o}dinger bridge--a stochastic dynamical generalization of optimal mass\ntransport--exhibits a learning-control duality. Viewed as a stochastic control\nproblem, the Schr\\\"{o}dinger bridge finds an optimal control policy that steers\na given joint state statistics to another while minimizing the total control\neffort subject to controlled diffusion and deadline constraints. Viewed as a\nstochastic learning problem, the Schr\\\"{o}dinger bridge finds the most-likely\ndistribution-valued trajectory connecting endpoint distributional observations,\ni.e., solves the two point boundary-constrained maximum likelihood problem over\nthe manifold of probability distributions. Recent works have shown that solving\nthe Schr\\\"{o}dinger bridge problem with state cost requires finding the Markov\nkernel associated with a reaction-diffusion PDE where the state cost appears as\na state-dependent reaction rate. We explain how ideas from Weyl calculus in\nquantum mechanics, specifically the Weyl operator and the Weyl symbol, can help\ndetermine such Markov kernels. We illustrate these ideas by explicitly finding\nthe Markov kernel for the case of quadratic state cost via Weyl calculus,\nrecovering our earlier results but avoiding tedious computation with Hermite\npolynomials.\n","authors":["Alexis M. H. 
Teter","Wenqing Wang","Abhishek Halder"],"pdf_url":"https://arxiv.org/pdf/2407.15245v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.11038v2","updated":"2024-08-13T00:55:09Z","published":"2024-07-06T01:40:31Z","title":"Fuzzy Recurrent Stochastic Configuration Networks for Industrial Data\n Analytics","summary":" This paper presents a novel neuro-fuzzy model, termed fuzzy recurrent\nstochastic configuration networks (F-RSCNs), for industrial data analytics.\nUnlike the original recurrent stochastic configuration network (RSCN), the\nproposed F-RSCN is constructed by multiple sub-reservoirs, and each\nsub-reservoir is associated with a Takagi-Sugeno-Kang (TSK) fuzzy rule. Through\nthis hybrid framework, first, the interpretability of the model is enhanced by\nincorporating fuzzy reasoning to embed the prior knowledge into the network.\nThen, the parameters of the neuro-fuzzy model are determined by the recurrent\nstochastic configuration (RSC) algorithm. This scheme not only ensures the\nuniversal approximation property and fast learning speed of the built model but\nalso overcomes uncertain problems, such as unknown dynamic orders, arbitrary\nstructure determination, and the sensitivity of learning parameters in\nmodelling nonlinear dynamics. Finally, an online update of the output weights\nis performed using the projection algorithm, and the convergence analysis of\nthe learning parameters is given. By integrating TSK fuzzy inference systems\ninto RSCNs, F-RSCNs have strong fuzzy inference capability and can achieve\nsound performance for both learning and generalization. Comprehensive\nexperiments show that the proposed F-RSCNs outperform other classical\nneuro-fuzzy and non-fuzzy models, demonstrating great potential for modelling\ncomplex industrial systems.\n","authors":["Dianhui Wang","Gang Dang"],"pdf_url":"https://arxiv.org/pdf/2407.11038v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.06544v1","updated":"2024-08-13T00:34:33Z","published":"2024-08-13T00:34:33Z","title":"Variance-Reduced Cascade Q-learning: Algorithms and Sample Complexity","summary":" We study the problem of estimating the optimal Q-function of\n$\\gamma$-discounted Markov decision processes (MDPs) under the synchronous\nsetting, where independent samples for all state-action pairs are drawn from a\ngenerative model at each iteration. We introduce and analyze a novel model-free\nalgorithm called Variance-Reduced Cascade Q-learning (VRCQ). VRCQ comprises two\nkey building blocks: (i) the established direct variance reduction technique\nand (ii) our proposed variance reduction scheme, Cascade Q-learning. By\nleveraging these techniques, VRCQ provides superior guarantees in the\n$\\ell_\\infty$-norm compared with the existing model-free stochastic\napproximation-type algorithms. Specifically, we demonstrate that VRCQ is\nminimax optimal. Additionally, when the action set is a singleton (so that the\nQ-learning problem reduces to policy evaluation), it achieves non-asymptotic\ninstance optimality while requiring the minimum number of samples theoretically\npossible. 
Our theoretical results and their practical implications are\nsupported by numerical experiments.\n","authors":["Mohammad Boveiri","Peyman Mohajerin Esfahani"],"pdf_url":"https://arxiv.org/pdf/2408.06544v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.06542v1","updated":"2024-08-13T00:32:05Z","published":"2024-08-13T00:32:05Z","title":"Value of Information and Reward Specification in Active Inference and\n POMDPs","summary":" Expected free energy (EFE) is a central quantity in active inference which\nhas recently gained popularity due to its intuitive decomposition of the\nexpected value of control into a pragmatic and an epistemic component. While\nnumerous conjectures have been made to justify EFE as a decision making\nobjective function, the most widely accepted is still its intuitiveness and\nresemblance to variational free energy in approximate Bayesian inference. In\nthis work, we take a bottom up approach and ask: taking EFE as given, what's\nthe resulting agent's optimality gap compared with a reward-driven\nreinforcement learning (RL) agent, which is well understood? By casting EFE\nunder a particular class of belief MDP and using analysis tools from RL theory,\nwe show that EFE approximates the Bayes optimal RL policy via information\nvalue. We discuss the implications for objective specification of active\ninference agents.\n","authors":["Ran Wei"],"pdf_url":"https://arxiv.org/pdf/2408.06542v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.06540v1","updated":"2024-08-13T00:20:39Z","published":"2024-08-13T00:20:39Z","title":"Dynamic Exclusion of Low-Fidelity Data in Bayesian Optimization for\n Autonomous Beamline Alignment","summary":" Aligning beamlines at synchrotron light sources is a high-dimensional,\nexpensive-to-sample optimization problem, as beams are focused using a series\nof dynamic optical components. Bayesian Optimization is an efficient machine\nlearning approach to finding global optima of beam quality, but the model can\neasily be impaired by faulty data points caused by the beam going off the edge\nof the sensor or by background noise. This study, conducted at the National\nSynchrotron Light Source II (NSLS-II) facility at Brookhaven National\nLaboratory (BNL), is an investigation of methods to identify untrustworthy\nreadings of beam quality and discourage the optimization model from seeking out\npoints likely to yield low-fidelity beams. The approaches explored include\ndynamic pruning using loss analysis of size and position models and a\nlengthscale-based genetic algorithm to determine which points to include in the\nmodel for optimal fit. Each method successfully classified high and low\nfidelity points. This research advances BNL's mission to tackle our nation's\nenergy challenges by providing scientists at all beamlines with access to\nhigher quality beams, and faster convergence to these optima for their\nexperiments.\n","authors":["Megha R. Narayanan","Thomas W. Morris"],"pdf_url":"https://arxiv.org/pdf/2408.06540v1.pdf","comment":"12 pages, 6 figure sets"},{"id":"http://arxiv.org/abs/2310.00105v3","updated":"2024-08-13T00:14:15Z","published":"2023-09-29T19:33:01Z","title":"Latent Space Symmetry Discovery","summary":" Equivariant neural networks require explicit knowledge of the symmetry group.\nAutomatic symmetry discovery methods aim to relax this constraint and learn\ninvariance and equivariance from data. 
However, existing symmetry discovery\nmethods are limited to simple linear symmetries and cannot handle the\ncomplexity of real-world data. We propose a novel generative model, Latent\nLieGAN (LaLiGAN), which can discover symmetries of nonlinear group actions. It\nlearns a mapping from the data space to a latent space where the symmetries\nbecome linear and simultaneously discovers symmetries in the latent space.\nTheoretically, we show that our model can express nonlinear symmetries under\nsome conditions about the group action. Experimentally, we demonstrate that our\nmethod can accurately discover the intrinsic symmetry in high-dimensional\ndynamical systems. LaLiGAN also results in a well-structured latent space that\nis useful for downstream tasks including equation discovery and long-term\nforecasting.\n","authors":["Jianke Yang","Nima Dehmamy","Robin Walters","Rose Yu"],"pdf_url":"https://arxiv.org/pdf/2310.00105v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.06536v1","updated":"2024-08-13T00:04:17Z","published":"2024-08-13T00:04:17Z","title":"A Comparison of Imitation Learning Algorithms for Bimanual Manipulation","summary":" Amidst the wide popularity of imitation learning algorithms in robotics,\ntheir properties regarding hyperparameter sensitivity, ease of training, data\nefficiency, and performance have not been well-studied in high-precision\nindustry-inspired environments. In this work, we demonstrate the limitations\nand benefits of prominent imitation learning approaches and analyze their\ncapabilities regarding these properties. We evaluate each algorithm on a\ncomplex bimanual manipulation task involving an over-constrained dynamics\nsystem in a setting involving multiple contacts between the manipulated object\nand the environment. While we find that imitation learning is well suited to\nsolve such complex tasks, not all algorithms are equal in terms of handling\nenvironmental and hyperparameter perturbations, training requirements,\nperformance, and ease of use. We investigate the empirical influence of these\nkey characteristics by employing a carefully designed experimental procedure\nand learning environment. Paper website: https://bimanual-imitation.github.io/\n","authors":["Michael Drolet","Simon Stepputtis","Siva Kailas","Ajinkya Jain","Jan Peters","Stefan Schaal","Heni Ben Amor"],"pdf_url":"https://arxiv.org/pdf/2408.06536v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.07238v1","updated":"2024-08-13T23:59:36Z","published":"2024-08-13T23:59:36Z","title":"Using Advanced LLMs to Enhance Smaller LLMs: An Interpretable Knowledge\n Distillation Approach","summary":" Advanced Large language models (LLMs) like GPT-4 or LlaMa 3 provide superior\nperformance in complex human-like interactions. But they are costly, or too\nlarge for edge devices such as smartphones and harder to self-host, leading to\nsecurity and privacy concerns. This paper introduces a novel interpretable\nknowledge distillation approach to enhance the performance of smaller, more\neconomical LLMs that firms can self-host. We study this problem in the context\nof building a customer service agent aimed at achieving high customer\nsatisfaction through goal-oriented dialogues. Unlike traditional knowledge\ndistillation, where the \"student\" model learns directly from the \"teacher\"\nmodel's responses via fine-tuning, our interpretable \"strategy\" teaching\napproach involves the teacher providing strategies to improve the student's\nperformance in various scenarios. 
This method alternates between a \"scenario\ngeneration\" step and a \"strategies for improvement\" step, creating a customized\nlibrary of scenarios and optimized strategies for automated prompting. The\nmethod requires only black-box access to both student and teacher models; hence\nit can be used without manipulating model parameters. In our customer service\napplication, the method improves performance, and the learned strategies are\ntransferable to other LLMs and scenarios beyond the training set. The method's\ninterpretabilty helps safeguard against potential harms through human audit.\n","authors":["Tong Wang","K. Sudhir","Dat Hong"],"pdf_url":"https://arxiv.org/pdf/2408.07238v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10988v3","updated":"2024-08-13T23:50:36Z","published":"2023-07-20T16:18:33Z","title":"On minimizing the training set fill distance in machine learning\n regression","summary":" For regression tasks one often leverages large datasets for training\npredictive machine learning models. However, using large datasets may not be\nfeasible due to computational limitations or high data labelling costs.\nTherefore, suitably selecting small training sets from large pools of\nunlabelled data points is essential to maximize model performance while\nmaintaining efficiency. In this work, we study Farthest Point Sampling (FPS), a\ndata selection approach that aims to minimize the fill distance of the selected\nset. We derive an upper bound for the maximum expected prediction error,\nconditional to the location of the unlabelled data points, that linearly\ndepends on the training set fill distance. For empirical validation, we perform\nexperiments using two regression models on three datasets. We empirically show\nthat selecting a training set by aiming to minimize the fill distance, thereby\nminimizing our derived bound, significantly reduces the maximum prediction\nerror of various regression models, outperforming alternative sampling\napproaches by a large margin. Furthermore, we show that selecting training sets\nwith the FPS can also increase model stability for the specific case of\nGaussian kernel regression approaches.\n","authors":["Paolo Climaco","Jochen Garcke"],"pdf_url":"https://arxiv.org/pdf/2307.10988v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.07233v1","updated":"2024-08-13T23:24:36Z","published":"2024-08-13T23:24:36Z","title":"Pan-cancer gene set discovery via scRNA-seq for optimal deep learning\n based downstream tasks","summary":" The application of machine learning to transcriptomics data has led to\nsignificant advances in cancer research. However, the high dimensionality and\ncomplexity of RNA sequencing (RNA-seq) data pose significant challenges in\npan-cancer studies. This study hypothesizes that gene sets derived from\nsingle-cell RNA sequencing (scRNA-seq) data will outperform those selected\nusing bulk RNA-seq in pan-cancer downstream tasks. We analyzed scRNA-seq data\nfrom 181 tumor biopsies across 13 cancer types. High-dimensional weighted gene\nco-expression network analysis (hdWGCNA) was performed to identify relevant\ngene sets, which were further refined using XGBoost for feature selection.\nThese gene sets were applied to downstream tasks using TCGA pan-cancer RNA-seq\ndata and compared to six reference gene sets and oncogenes from OncoKB\nevaluated with deep learning models, including multilayer perceptrons (MLPs)\nand graph neural networks (GNNs). 
The XGBoost-refined hdWGCNA gene set\ndemonstrated higher performance in most tasks, including tumor mutation burden\nassessment, microsatellite instability classification, mutation prediction,\ncancer subtyping, and grading. In particular, genes such as DPM1, BAD, and\nFKBP4 emerged as important pan-cancer biomarkers, with DPM1 consistently\nsignificant across tasks. This study presents a robust approach for feature\nselection in cancer genomics by integrating scRNA-seq data and advanced\nanalysis techniques, offering a promising avenue for improving predictive\naccuracy in cancer research.\n","authors":["Jong Hyun Kim","Jongseong Jang"],"pdf_url":"https://arxiv.org/pdf/2408.07233v1.pdf","comment":"16 pages, 3 figures, 1 tables, and 6 supplementary Table"},{"id":"http://arxiv.org/abs/2405.08298v2","updated":"2024-08-13T23:22:46Z","published":"2024-05-14T03:48:45Z","title":"Deep Reinforcement Learning for Real-Time Ground Delay Program Revision\n and Corresponding Flight Delay Assignments","summary":" This paper explores the optimization of Ground Delay Programs (GDP), a\nprevalent Traffic Management Initiative used in Air Traffic Management (ATM) to\nreconcile capacity and demand discrepancies at airports. Employing\nReinforcement Learning (RL) to manage the inherent uncertainties in the\nnational airspace system-such as weather variability, fluctuating flight\ndemands, and airport arrival rates-we developed two RL models: Behavioral\nCloning (BC) and Conservative Q-Learning (CQL). These models are designed to\nenhance GDP efficiency by utilizing a sophisticated reward function that\nintegrates ground and airborne delays and terminal area congestion. We\nconstructed a simulated single-airport environment, SAGDP_ENV, which\nincorporates real operational data along with predicted uncertainties to\nfacilitate realistic decision-making scenarios. Utilizing the whole year 2019\ndata from Newark Liberty International Airport (EWR), our models aimed to\npreemptively set airport program rates. Despite thorough modeling and\nsimulation, initial outcomes indicated that the models struggled to learn\neffectively, attributed potentially to oversimplified environmental\nassumptions. This paper discusses the challenges encountered, evaluates the\nmodels' performance against actual operational data, and outlines future\ndirections to refine RL applications in ATM.\n","authors":["Ke Liu","Fan Hu","Hui Lin","Xi Cheng","Jianan Chen","Jilin Song","Siyuan Feng","Gaofeng Su","Chen Zhu"],"pdf_url":"https://arxiv.org/pdf/2405.08298v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.15091v2","updated":"2024-08-13T23:17:57Z","published":"2023-12-22T22:18:13Z","title":"A Note on Stability in Asynchronous Stochastic Approximation without\n Communication Delays","summary":" In this paper, we study asynchronous stochastic approximation algorithms\nwithout communication delays. Our main contribution is a stability proof for\nthese algorithms that extends a method of Borkar and Meyn by accommodating more\ngeneral noise conditions. We also derive convergence results from this\nstability result and discuss their application in important average-reward\nreinforcement learning problems.\n","authors":["Huizhen Yu","Yi Wan","Richard S. 
Sutton"],"pdf_url":"https://arxiv.org/pdf/2312.15091v2.pdf","comment":"Corrected typos and a minor error; parts of this material will be\n included in a separate future arXiv preprint"},{"id":"http://arxiv.org/abs/2407.17672v2","updated":"2024-08-13T22:46:55Z","published":"2024-07-24T23:31:02Z","title":"Spiking Neural Networks in Vertical Federated Learning: Performance\n Trade-offs","summary":" Federated machine learning enables model training across multiple clients\nwhile maintaining data privacy. Vertical Federated Learning (VFL) specifically\ndeals with instances where the clients have different feature sets of the same\nsamples. As federated learning models aim to improve efficiency and\nadaptability, innovative neural network architectures like Spiking Neural\nNetworks (SNNs) are being leveraged to enable fast and accurate processing at\nthe edge. SNNs, known for their efficiency over Artificial Neural Networks\n(ANNs), have not been analyzed for their applicability in VFL, thus far. In\nthis paper, we investigate the benefits and trade-offs of using SNN models in a\nvertical federated learning setting. We implement two different federated\nlearning architectures -- with model splitting and without model splitting --\nthat have different privacy and performance implications. We evaluate the setup\nusing CIFAR-10 and CIFAR-100 benchmark datasets along with SNN implementations\nof VGG9 and ResNET classification models. Comparative evaluations demonstrate\nthat the accuracy of SNN models is comparable to that of traditional ANNs for\nVFL applications, albeit significantly more energy efficient.\n","authors":["Maryam Abbasihafshejani","Anindya Maiti","Murtuza Jadliwala"],"pdf_url":"https://arxiv.org/pdf/2407.17672v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.07221v1","updated":"2024-08-13T22:17:48Z","published":"2024-08-13T22:17:48Z","title":"A Review of Pseudo-Labeling for Computer Vision","summary":" Deep neural models have achieved state of the art performance on a wide range\nof problems in computer science, especially in computer vision. However, deep\nneural networks often require large datasets of labeled samples to generalize\neffectively, and an important area of active research is semi-supervised\nlearning, which attempts to instead utilize large quantities of (easily\nacquired) unlabeled samples. One family of methods in this space is\npseudo-labeling, a class of algorithms that use model outputs to assign labels\nto unlabeled samples which are then used as labeled samples during training.\nSuch assigned labels, called pseudo-labels, are most commonly associated with\nthe field of semi-supervised learning. In this work we explore a broader\ninterpretation of pseudo-labels within both self-supervised and unsupervised\nmethods. By drawing the connection between these areas we identify new\ndirections when advancements in one area would likely benefit others, such as\ncurriculum learning and self-supervised regularization.\n","authors":["Patrick Kage","Jay C. Rothenberger","Pavlos Andreadis","Dimitrios I. Diochnos"],"pdf_url":"https://arxiv.org/pdf/2408.07221v1.pdf","comment":"21 pages, 4 figures"},{"id":"http://arxiv.org/abs/2408.07219v1","updated":"2024-08-13T22:13:25Z","published":"2024-08-13T22:13:25Z","title":"Causal Effect Estimation using identifiable Variational AutoEncoder with\n Latent Confounders and Post-Treatment Variables","summary":" Estimating causal effects from observational data is challenging, especially\nin the presence of latent confounders. 
Much work has been done on addressing\nthis challenge, but most of the existing research ignores the bias introduced\nby the post-treatment variables. In this paper, we propose a novel method of\njoint Variational AutoEncoder (VAE) and identifiable Variational AutoEncoder\n(iVAE) for learning the representations of latent confounders and latent\npost-treatment variables from their proxy variables, termed CPTiVAE, to achieve\nunbiased causal effect estimation from observational data. We further prove the\nidentifiability in terms of the representation of latent post-treatment\nvariables. Extensive experiments on synthetic and semi-synthetic datasets\ndemonstrate that the CPTiVAE outperforms the state-of-the-art methods in the\npresence of latent confounders and post-treatment variables. We further apply\nCPTiVAE to a real-world dataset to show its potential application.\n","authors":["Yang Xie","Ziqi Xu","Debo Cheng","Jiuyong Li","Lin Liu","Yinghao Zhang","Zaiwen Feng"],"pdf_url":"https://arxiv.org/pdf/2408.07219v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.01863v2","updated":"2024-08-13T22:07:03Z","published":"2024-02-02T19:35:05Z","title":"DFML: Decentralized Federated Mutual Learning","summary":" In the realm of real-world devices, centralized servers in Federated Learning\n(FL) present challenges including communication bottlenecks and susceptibility\nto a single point of failure. Additionally, contemporary devices inherently\nexhibit model and data heterogeneity. Existing work lacks a Decentralized FL\n(DFL) framework capable of accommodating such heterogeneity without imposing\narchitectural restrictions or assuming the availability of public data. To\naddress these issues, we propose a Decentralized Federated Mutual Learning\n(DFML) framework that is serverless, supports nonrestrictive heterogeneous\nmodels, and avoids reliance on public data. DFML effectively handles model and\ndata heterogeneity through mutual learning, which distills knowledge between\nclients, and cyclically varying the amount of supervision and distillation\nsignals. Extensive experimental results demonstrate consistent effectiveness of\nDFML in both convergence speed and global accuracy, outperforming prevalent\nbaselines under various conditions. For example, with the CIFAR-100 dataset and\n50 clients, DFML achieves a substantial increase of +17.20% and +19.95% in\nglobal accuracy under Independent and Identically Distributed (IID) and non-IID\ndata shifts, respectively.\n","authors":["Yasser H. Khalil","Amir H. Estiri","Mahdi Beitollahi","Nader Asadi","Sobhan Hemati","Xu Li","Guojun Zhang","Xi Chen"],"pdf_url":"https://arxiv.org/pdf/2402.01863v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03626v2","updated":"2024-08-13T22:01:42Z","published":"2024-04-04T17:48:28Z","title":"Training LLMs over Neurally Compressed Text","summary":" In this paper, we explore the idea of training large language models (LLMs)\nover highly compressed text. While standard subword tokenizers compress text by\na small factor, neural text compressors can achieve much higher rates of\ncompression. If it were possible to train LLMs directly over neurally\ncompressed text, this would confer advantages in training and serving\nefficiency, as well as easier handling of long text spans. The main obstacle to\nthis goal is that strong compression tends to produce opaque outputs that are\nnot well-suited for learning. 
In particular, we find that text na\\\"ively\ncompressed via Arithmetic Coding is not readily learnable by LLMs. To overcome\nthis, we propose Equal-Info Windows, a novel compression technique whereby text\nis segmented into blocks that each compress to the same bit length. Using this\nmethod, we demonstrate effective learning over neurally compressed text that\nimproves with scale, and outperforms byte-level baselines by a wide margin on\nperplexity and inference speed benchmarks. While our method delivers worse\nperplexity than subword tokenizers for models trained with the same parameter\ncount, it has the benefit of shorter sequence lengths. Shorter sequence lengths\nrequire fewer autoregressive generation steps, and reduce latency. Finally, we\nprovide extensive analysis of the properties that contribute to learnability,\nand offer concrete suggestions for how to further improve the performance of\nhigh-compression tokenizers.\n","authors":["Brian Lester","Jaehoon Lee","Alex Alemi","Jeffrey Pennington","Adam Roberts","Jascha Sohl-Dickstein","Noah Constant"],"pdf_url":"https://arxiv.org/pdf/2404.03626v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.19858v4","updated":"2024-08-13T21:52:52Z","published":"2024-07-29T10:26:52Z","title":"AI-Powered Energy Algorithmic Trading: Integrating Hidden Markov Models\n with Neural Networks","summary":" In quantitative finance, machine learning methods are essential for alpha\ngeneration. This study introduces a new approach that combines Hidden Markov\nModels (HMM) and neural networks, integrated with Black-Litterman portfolio\noptimization. During the COVID period (2019-2022), this dual-model approach\nachieved a 83% return with a Sharpe ratio of 0.77. It incorporates two risk\nmodels to enhance risk management, showing efficiency during volatile periods.\nThe methodology was implemented on the QuantConnect platform, which was chosen\nfor its robust framework and experimental reproducibility. The system, which\npredicts future price movements, includes a three-year warm-up to ensure proper\nalgorithm function. It targets highly liquid, large-cap energy stocks to ensure\nstable and predictable performance while also considering broker payments. The\ndual-model alpha system utilizes log returns to select the optimal state based\non the historical performance. It combines state predictions with neural\nnetwork outputs, which are based on historical data, to generate trading\nsignals. This study examined the architecture of the trading system, data\npre-processing, training, and performance. The full code and backtesting data\nare available under the QuantConnect terms.\n","authors":["Tiago Monteiro"],"pdf_url":"https://arxiv.org/pdf/2407.19858v4.pdf","comment":"14 pages, 4 figures, 2 tables"},{"id":"http://arxiv.org/abs/2407.20192v2","updated":"2024-08-13T21:40:07Z","published":"2024-07-29T17:19:40Z","title":"Time series forecasting with high stakes: A field study of the air cargo\n industry","summary":" Time series forecasting in the air cargo industry presents unique challenges\ndue to volatile market dynamics and the significant impact of accurate\nforecasts on generated revenue. This paper explores a comprehensive approach to\ndemand forecasting at the origin-destination (O\\&D) level, focusing on the\ndevelopment and implementation of machine learning models in decision-making\nfor the air cargo industry. 
We leverage a mixture of experts framework,\ncombining statistical and advanced deep learning models to provide reliable\nforecasts for cargo demand over a six-month horizon. The results demonstrate\nthat our approach outperforms industry benchmarks, offering actionable insights\nfor cargo capacity allocation and strategic decision-making in the air cargo\nindustry. While this work is applied in the airline industry, the methodology\nis broadly applicable to any field where forecast-based decision-making in a\nvolatile environment is crucial.\n","authors":["Abhinav Garg","Naman Shukla","Maarten Wormer"],"pdf_url":"https://arxiv.org/pdf/2407.20192v2.pdf","comment":"The 10th Mining and Learning from Time Series Workshop: From\n Classical Methods to LLMs. SIGKDD, Barcelona, Spain, 6 page"},{"id":"http://arxiv.org/abs/2408.07205v1","updated":"2024-08-13T21:24:14Z","published":"2024-08-13T21:24:14Z","title":"Deep Index Policy for Multi-Resource Restless Matching Bandit and Its\n Application in Multi-Channel Scheduling","summary":" Scheduling in multi-channel wireless communication system presents formidable\nchallenges in effectively allocating resources. To address these challenges, we\ninvestigate the multi-resource restless matching bandit (MR-RMB) model for\nheterogeneous resource systems with an objective of maximizing long-term\ndiscounted total rewards while respecting resource constraints. We have also\ngeneralized to applications beyond multi-channel wireless. We discuss the\nMax-Weight Index Matching algorithm, which optimizes resource allocation based\non learned partial indexes. We have derived the policy gradient theorem for\nindex learning. Our main contribution is the introduction of a new Deep Index\nPolicy (DIP), an online learning algorithm tailored for MR-RMB. DIP learns the\npartial index by leveraging the policy gradient theorem for restless arms with\nconvoluted and unknown transition kernels of heterogeneous resources. We\ndemonstrate the utility of DIP by evaluating its performance for three\ndifferent MR-RMB problems. Our simulation results show that DIP indeed learns\nthe partial indexes efficiently.\n","authors":["Nida Zamir","I-Hong Hou"],"pdf_url":"https://arxiv.org/pdf/2408.07205v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.13481v2","updated":"2024-08-13T21:16:30Z","published":"2023-09-23T21:39:51Z","title":"Offline to Online Learning for Personalized Bandwidth Estimation","summary":" In this work, we tackle the problem of bandwidth estimation (BWE) for\nreal-time communication systems through expert personalization. While expert\nheuristic-based methods have been widely adopted, tailoring these methods for\neach and every end user environment is cumbersome due to the level of domain\nexpertise and manual effort required to adjust the carefully tuned heuristic\nparameters. Thus, we propose Merlin, a data-driven solution to BWE that\nharnesses expert demonstrations from prior heuristic-based methods to extract\nan expert BWE policy. The extracted policy can then be finetuned to end user\nnetwork conditions to improve user quality of experience (QoE). 
In real-world\nvideoconferencing calls, Merlin matches our expert's policy with no\nstatistically significant movements in terms of objective QoE metrics.\nAdditionally, we show that personalizing Merlin's control policy is possible\nthrough a small number of online data-driven parameter updates.\n","authors":["Aashish Gottipati","Sami Khairy","Gabriel Mittag","Vishak Gopal","Ross Cutler"],"pdf_url":"https://arxiv.org/pdf/2309.13481v2.pdf","comment":"7 pages, 6 figures, under review. Trimmed content to 6 pages, added\n finetuning evaluations, and updated writing to focus on IL + finetuning as\n opposed to IL only"},{"id":"http://arxiv.org/abs/2408.07201v1","updated":"2024-08-13T21:10:39Z","published":"2024-08-13T21:10:39Z","title":"Quantification of total uncertainty in the physics-informed\n reconstruction of CVSim-6 physiology","summary":" When predicting physical phenomena through simulation, quantification of the\ntotal uncertainty due to multiple sources is as crucial as making sure the\nunderlying numerical model is accurate. Possible sources include irreducible\naleatoric uncertainty due to noise in the data, epistemic uncertainty induced\nby insufficient data or inadequate parameterization, and model-form uncertainty\nrelated to the use of misspecified model equations. Physics-based\nregularization interacts in nontrivial ways with aleatoric, epistemic and\nmodel-form uncertainty and their combination, and a better understanding of\nthis interaction is needed to improve the predictive performance of\nphysics-informed digital twins that operate under real conditions. With a\nspecific focus on biological and physiological models, this study investigates\nthe decomposition of total uncertainty in the estimation of states and\nparameters of a differential system simulated with MC X-TFC, a new\nphysics-informed approach for uncertainty quantification based on random\nprojections and Monte-Carlo sampling. MC X-TFC is applied to a six-compartment\nstiff ODE system, the CVSim-6 model, developed in the context of human\nphysiology. The system is analyzed by progressively removing data while\nestimating an increasing number of parameters and by investigating total\nuncertainty under model-form misspecification of non-linear resistance in the\npulmonary compartment. In particular, we focus on the interaction between the\nformulation of the discrepancy term and quantification of model-form\nuncertainty, and show how additional physics can help in the estimation\nprocess. The method demonstrates robustness and efficiency in estimating\nunknown states and parameters, even with limited, sparse, and noisy data. It\nalso offers great flexibility in integrating data with physics for improved\nestimation, even in cases of model misspecification.\n","authors":["Mario De Florio","Zongren Zou","Daniele E. Schiavazzi","George Em Karniadakis"],"pdf_url":"https://arxiv.org/pdf/2408.07201v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.07199v1","updated":"2024-08-13T20:52:13Z","published":"2024-08-13T20:52:13Z","title":"Agent Q: Advanced Reasoning and Learning for Autonomous AI Agents","summary":" Large Language Models (LLMs) have shown remarkable capabilities in natural\nlanguage tasks requiring complex reasoning, yet their application in agentic,\nmulti-step reasoning within interactive environments remains a difficult\nchallenge. 
Traditional supervised pre-training on static datasets falls short\nin enabling autonomous agent capabilities needed to perform complex\ndecision-making in dynamic settings like web navigation. Previous attempts to\nbridge this gap-through supervised fine-tuning on curated expert\ndemonstrations-often suffer from compounding errors and limited exploration\ndata, resulting in sub-optimal policy outcomes. To overcome these challenges,\nwe propose a framework that combines guided Monte Carlo Tree Search (MCTS)\nsearch with a self-critique mechanism and iterative fine-tuning on agent\ninteractions using an off-policy variant of the Direct Preference Optimization\n(DPO) algorithm. Our method allows LLM agents to learn effectively from both\nsuccessful and unsuccessful trajectories, thereby improving their\ngeneralization in complex, multi-step reasoning tasks. We validate our approach\nin the WebShop environment-a simulated e-commerce platform-where it\nconsistently outperforms behavior cloning and reinforced fine-tuning baselines,\nand beats average human performance when equipped with the capability to do\nonline search. In real-world booking scenarios, our methodology boosts Llama-3\n70B model's zero-shot performance from 18.6% to 81.7% success rate (a 340%\nrelative increase) after a single day of data collection and further to 95.4%\nwith online search. We believe this represents a substantial leap forward in\nthe capabilities of autonomous agents, paving the way for more sophisticated\nand reliable decision-making in real-world settings.\n","authors":["Pranav Putta","Edmund Mills","Naman Garg","Sumeet Motwani","Chelsea Finn","Divyansh Garg","Rafael Rafailov"],"pdf_url":"https://arxiv.org/pdf/2408.07199v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.15332v2","updated":"2024-08-13T20:37:14Z","published":"2024-04-08T11:19:28Z","title":"Clinical translation of machine learning algorithms for seizure\n detection in scalp electroencephalography: systematic review","summary":" Machine learning algorithms for seizure detection have shown considerable\ndiagnostic potential, with recent reported accuracies reaching 100%. Yet, only\nfew published algorithms have fully addressed the requirements for successful\nclinical translation. This is, for example, because the properties of training\ndata may limit the generalisability of algorithms, algorithm performance may\nvary depending on which electroencephalogram (EEG) acquisition hardware was\nused, or run-time processing costs may be prohibitive to real-time clinical use\ncases. To address these issues in a critical manner, we systematically review\nmachine learning algorithms for seizure detection with a focus on clinical\ntranslatability, assessed by criteria including generalisability, run-time\ncosts, explainability, and clinically-relevant performance metrics. For\nnon-specialists, the domain-specific knowledge necessary to contextualise model\ndevelopment and evaluation is provided. 
It is our hope that such critical\nevaluation of machine learning algorithms with respect to their potential\nreal-world effectiveness can help accelerate clinical translation and identify\ngaps in the current seizure detection literature.\n","authors":["Nina Moutonnet","Steven White","Benjamin P Campbell","Saeid Sanei","Toshihisa Tanaka","Hong Ji","Danilo Mandic","Gregory Scott"],"pdf_url":"https://arxiv.org/pdf/2404.15332v2.pdf","comment":"60 pages, LaTeX; Addition of co-authors, keywords alphabetically\n sorted, text in figure 1 changed to black, references added ([9],[56] ),\n abbreviations defined (CNN, RNN), added section 6.4, corrected the\n referencing style, added a sentence about the existence of non-epileptic\n attacks, added an explanation about the drawback of the 10-20 system, removed\n bold from Figure/Table titles"},{"id":"http://arxiv.org/abs/2408.07194v1","updated":"2024-08-13T20:28:20Z","published":"2024-08-13T20:28:20Z","title":"Massive Dimensions Reduction and Hybridization with Meta-heuristics in\n Deep Learning","summary":" Deep learning is mainly based on utilizing gradient-based optimization for\ntraining Deep Neural Network (DNN) models. Although robust and widely used,\ngradient-based optimization algorithms are prone to getting stuck in local\nminima. In this modern deep learning era, the state-of-the-art DNN models have\nmillions and billions of parameters, including weights and biases, making them\nhuge-scale optimization problems in terms of search space. Tuning a huge number\nof parameters is a challenging task that causes vanishing/exploding gradients\nand overfitting; likewise, utilized loss functions do not exactly represent our\ntargeted performance metrics. A practical solution to exploring large and\ncomplex solution space is meta-heuristic algorithms. Since DNNs exceed\nthousands and millions of parameters, even robust meta-heuristic algorithms,\nsuch as Differential Evolution, struggle to efficiently explore and converge in\nsuch huge-dimensional search spaces, leading to very slow convergence and high\nmemory demand. To tackle the mentioned curse of dimensionality, the concept of\nblocking was recently proposed as a technique that reduces the search space\ndimensions by grouping them into blocks. In this study, we aim to introduce\nHistogram-based Blocking Differential Evolution (HBDE), a novel approach that\nhybridizes gradient-based and gradient-free algorithms to optimize parameters.\nExperimental results demonstrated that the HBDE could reduce the parameters in\nthe ResNet-18 model from 11M to 3K during the training/optimizing phase by\nmetaheuristics, namely, the proposed HBDE, which outperforms baseline\ngradient-based and parent gradient-free DE algorithms evaluated on CIFAR-10 and\nCIFAR-100 datasets showcasing its effectiveness with reduced computational\ndemands for the very first time.\n","authors":["Rasa Khosrowshahli","Shahryar Rahnamayan","Beatrice Ombuki-Berman"],"pdf_url":"https://arxiv.org/pdf/2408.07194v1.pdf","comment":"8 pages, 5 figures, 3 tables, accepted at IEEE CCECE 2024 (updated\n Fig. 
1 and conclusion remarks)"},{"id":"http://arxiv.org/abs/2408.07192v1","updated":"2024-08-13T20:20:58Z","published":"2024-08-13T20:20:58Z","title":"Solving Truly Massive Budgeted Monotonic POMDPs with Oracle-Guided\n Meta-Reinforcement Learning","summary":" Monotonic Partially Observable Markov Decision Processes (POMDPs), where the\nsystem state progressively decreases until a restorative action is performed,\ncan be used to model sequential repair problems effectively. This paper\nconsiders the problem of solving budget-constrained multi-component monotonic\nPOMDPs, where a finite budget limits the maximal number of restorative actions.\nFor a large number of components, solving such a POMDP using current methods is\ncomputationally intractable due to the exponential growth in the state space\nwith an increasing number of components. To address this challenge, we propose\na two-step approach. Since the individual components of a budget-constrained\nmulti-component monotonic POMDP are only connected via the shared budget, we\nfirst approximate the optimal budget allocation among these components using an\napproximation of each component POMDP's optimal value function which is\nobtained through a random forest model. Subsequently, we introduce an\noracle-guided meta-trained Proximal Policy Optimization (PPO) algorithm to\nsolve each of the independent budget-constrained single-component monotonic\nPOMDPs. The oracle policy is obtained by performing value iteration on the\ncorresponding monotonic Markov Decision Process (MDP). This two-step method\nprovides scalability in solving truly massive multi-component monotonic POMDPs.\nTo demonstrate the efficacy of our approach, we consider a real-world\nmaintenance scenario that involves inspection and repair of an administrative\nbuilding by a team of agents within a maintenance budget. Finally, we perform a\ncomputational complexity analysis for a varying number of components to show\nthe scalability of the proposed approach.\n","authors":["Manav Vora","Michael N Grussing","Melkior Ornik"],"pdf_url":"https://arxiv.org/pdf/2408.07192v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.07191v1","updated":"2024-08-13T20:16:11Z","published":"2024-08-13T20:16:11Z","title":"Joint Graph Rewiring and Feature Denoising via Spectral Resonance","summary":" Graph neural networks (GNNs) take as input the graph structure and the\nfeature vectors associated with the nodes. Both contain noisy information about\nthe labels. Here we propose joint denoising and rewiring (JDR)--an algorithm to\njointly denoise the graph structure and features, which can improve the\nperformance of any downstream algorithm. We do this by defining and maximizing\nthe alignment between the leading eigenspaces of graph and feature matrices. To\napproximately solve this computationally hard problem, we propose a heuristic\nthat efficiently handles real-world graph datasets with many classes and\ndifferent levels of homophily or heterophily. 
We experimentally verify the\neffectiveness of our approach on synthetic data and real-world graph datasets.\nThe results show that JDR consistently outperforms existing rewiring methods on\nnode classification tasks using GNNs as downstream models.\n","authors":["Jonas Linkerhägner","Cheng Shi","Ivan Dokmanić"],"pdf_url":"https://arxiv.org/pdf/2408.07191v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.06024v2","updated":"2024-08-13T20:01:59Z","published":"2024-08-12T09:24:48Z","title":"Layer-Specific Optimization: Sensitivity Based Convolution Layers Basis\n Search","summary":" Deep neural network models have a complex architecture and are\noverparameterized. The number of parameters is more than the whole dataset,\nwhich is highly resource-consuming. This complicates their application and\nlimits its usage on different devices. Reduction in the number of network\nparameters helps to reduce the size of the model, but at the same time,\nthoughtlessly applied, can lead to a deterioration in the quality of the\nnetwork. One way to reduce the number of model parameters is matrix\ndecomposition, where a matrix is represented as a product of smaller matrices.\nIn this paper, we propose a new way of applying the matrix decomposition with\nrespect to the weights of convolutional layers. The essence of the method is to\ntrain not all convolutions, but only the subset of convolutions (basis\nconvolutions), and represent the rest as linear combinations of the basis ones.\nExperiments on models from the ResNet family and the CIFAR-10 dataset\ndemonstrate that basis convolutions can not only reduce the size of the model\nbut also accelerate the forward and backward passes of the network. Another\ncontribution of this work is that we propose a fast method for selecting a\nsubset of network layers in which the use of matrix decomposition does not\ndegrade the quality of the final model.\n","authors":["Vasiliy Alekseev","Ilya Lukashevich","Ilia Zharikov","Ilya Vasiliev"],"pdf_url":"https://arxiv.org/pdf/2408.06024v2.pdf","comment":"Increase the size of matrix pictures for better UX in PDF view"},{"id":"http://arxiv.org/abs/2406.03361v2","updated":"2024-08-13T19:56:45Z","published":"2024-06-05T15:14:58Z","title":"What Matters in Hierarchical Search for Combinatorial Reasoning\n Problems?","summary":" Efficiently tackling combinatorial reasoning problems, particularly the\nnotorious NP-hard tasks, remains a significant challenge for AI research.\nRecent efforts have sought to enhance planning by incorporating hierarchical\nhigh-level search strategies, known as subgoal methods. While promising, their\nperformance against traditional low-level planners is inconsistent, raising\nquestions about their application contexts. In this study, we conduct an\nin-depth exploration of subgoal-planning methods for combinatorial reasoning.\nWe identify the attributes pivotal for leveraging the advantages of high-level\nsearch: hard-to-learn value functions, complex action spaces, presence of dead\nends in the environment, or using data collected from diverse experts. 
We\npropose a consistent evaluation methodology to achieve meaningful comparisons\nbetween methods and reevaluate the state-of-the-art algorithms.\n","authors":["Michał Zawalski","Gracjan Góral","Michał Tyrolski","Emilia Wiśnios","Franciszek Budrowski","Łukasz Kuciński","Piotr Miłoś"],"pdf_url":"https://arxiv.org/pdf/2406.03361v2.pdf","comment":"Accepted for Generative Models for Decision Making Workshop at ICLR\n 2024"},{"id":"http://arxiv.org/abs/2306.08158v5","updated":"2024-08-13T19:51:48Z","published":"2023-06-13T22:07:54Z","title":"Sociodemographic Bias in Language Models: A Survey and Forward Path","summary":" Sociodemographic bias in language models (LMs) has the potential for harm\nwhen deployed in real-world settings. This paper presents a comprehensive\nsurvey of the past decade of research on sociodemographic bias in LMs,\norganized into a typology that facilitates examining the different aims: types\nof bias, quantifying bias, and debiasing techniques. We track the evolution of\nthe latter two questions, then identify current trends and their limitations,\nas well as emerging techniques. To guide future research towards more effective\nand reliable solutions, and to help authors situate their work within this\nbroad landscape, we conclude with a checklist of open questions.\n","authors":["Vipul Gupta","Pranav Narayanan Venkit","Shomir Wilson","Rebecca J. Passonneau"],"pdf_url":"https://arxiv.org/pdf/2306.08158v5.pdf","comment":"23 pages, 3 figure"},{"id":"http://arxiv.org/abs/2408.07181v1","updated":"2024-08-13T19:46:50Z","published":"2024-08-13T19:46:50Z","title":"VulCatch: Enhancing Binary Vulnerability Detection through CodeT5\n Decompilation and KAN Advanced Feature Extraction","summary":" Binary program vulnerability detection is critical for software security, yet\nexisting deep learning approaches often rely on source code analysis, limiting\ntheir ability to detect unknown vulnerabilities. To address this, we propose\nVulCatch, a binary-level vulnerability detection framework. VulCatch introduces\na Synergy Decompilation Module (SDM) and Kolmogorov-Arnold Networks (KAN) to\ntransform raw binary code into pseudocode using CodeT5, preserving high-level\nsemantics for deep analysis with tools like Ghidra and IDA. KAN further\nenhances feature transformation, enabling the detection of complex\nvulnerabilities. VulCatch employs word2vec, Inception Blocks, BiLSTM Attention,\nand Residual connections to achieve high detection accuracy (98.88%) and\nprecision (97.92%), while minimizing false positives (1.56%) and false\nnegatives (2.71%) across seven CVE datasets.\n","authors":["Abdulrahman Hamman Adama Chukkol","Senlin Luo","Kashif Sharif","Yunusa Haruna","Muhammad Muhammad Abdullahi"],"pdf_url":"https://arxiv.org/pdf/2408.07181v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.19631v2","updated":"2024-08-13T19:34:13Z","published":"2024-03-28T17:47:19Z","title":"Retrieval-enhanced Knowledge Editing in Language Models for Multi-Hop\n Question Answering","summary":" Large Language Models (LLMs) have shown proficiency in question-answering\ntasks but often struggle to integrate real-time knowledge, leading to\npotentially outdated or inaccurate responses. This problem becomes even more\nchallenging when dealing with multi-hop questions, since they require LLMs to\nupdate and integrate multiple knowledge pieces relevant to the questions. To\ntackle the problem, we propose the Retrieval-Augmented model Editing (RAE)\nframework for multi-hop question answering. 
RAE first retrieves edited facts\nand then refines the language model through in-context learning. Specifically,\nour retrieval approach, based on mutual information maximization, leverages the\nreasoning abilities of LLMs to identify chain facts that traditional\nsimilarity-based searches might miss. In addition, our framework includes a\npruning strategy to eliminate redundant information from the retrieved facts,\nwhich enhances the editing accuracy and mitigates the hallucination problem.\nOur framework is supported by theoretical justification for its fact retrieval\nefficacy. Finally, comprehensive evaluation across various LLMs validates RAE's\nability in providing accurate answers with updated knowledge. Our code is\navailable at: https://github.com/sycny/RAE.\n","authors":["Yucheng Shi","Qiaoyu Tan","Xuansheng Wu","Shaochen Zhong","Kaixiong Zhou","Ninghao Liu"],"pdf_url":"https://arxiv.org/pdf/2403.19631v2.pdf","comment":"Accepted by CIKM 2024"}],"Multimedia":[{"id":"http://arxiv.org/abs/2111.12663v4","updated":"2024-08-13T16:59:44Z","published":"2021-11-24T17:51:16Z","title":"PointPCA: Point Cloud Objective Quality Assessment Using PCA-Based\n Descriptors","summary":" Point clouds denote a prominent solution for the representation of 3D\nphoto-realistic content in immersive applications. Similarly to other imaging\nmodalities, quality predictions for point cloud contents are vital for a wide\nrange of applications, enabling trade-off optimizations between data quality\nand data size in every processing step from acquisition to rendering. In this\nwork, we focus on use cases that consider human end-users consuming point cloud\ncontents and, hence, we concentrate on visual quality metrics. In particular,\nwe propose a set of perceptually relevant descriptors based on Principal\nComponent Analysis (PCA) decomposition, which is applied to both geometry and\ntexture data for full-reference point cloud quality assessment. Statistical\nfeatures are derived from these descriptors to characterize local shape and\nappearance properties for both a reference and a distorted point cloud. The\nextracted statistical features are subsequently compared to provide\ncorresponding predictions of visual quality for the distorted point cloud. As\npart of our method, a learning-based approach is proposed to fuse these\nindividual predictors to a unified perceptual score. We validate the accuracy\nof the individual predictors, as well as the unified quality scores obtained\nafter regression against subjectively annotated datasets, showing that our\nmetric outperforms state-of-the-art solutions. Insights regarding design\ndecisions are provided through exploratory studies, evaluating the performance\nof our metric under different parameter configurations, attribute domains,\ncolor spaces, and regression models. 
A software implementation of the proposed\nmetric is made available at the following link:\nhttps://github.com/cwi-dis/pointpca.\n","authors":["Evangelos Alexiou","Xuemei Zhou","Irene Viola","Pablo Cesar"],"pdf_url":"https://arxiv.org/pdf/2111.12663v4.pdf","comment":"32 pages, 6 figures, 6 tables"},{"id":"http://arxiv.org/abs/2404.01713v2","updated":"2024-08-13T12:58:13Z","published":"2024-04-02T07:57:05Z","title":"Generative AI for Immersive Communication: The Next Frontier in\n Internet-of-Senses Through 6G","summary":" Over the past two decades, the Internet-of-Things (IoT) has become a\ntransformative concept, and as we approach 2030, a new paradigm known as the\nInternet of Senses (IoS) is emerging. Unlike conventional Virtual Reality (VR),\nIoS seeks to provide multi-sensory experiences, acknowledging that in our\nphysical reality, our perception extends far beyond just sight and sound; it\nencompasses a range of senses. This article explores the existing technologies\ndriving immersive multi-sensory media, delving into their capabilities and\npotential applications. This exploration includes a comparative analysis\nbetween conventional immersive media streaming and a proposed use case that\nleverages semantic communication empowered by generative Artificial\nIntelligence (AI). The focal point of this analysis is the substantial\nreduction in bandwidth consumption by 99.93% in the proposed scheme. Through\nthis comparison, we aim to underscore the practical applications of generative\nAI for immersive media. Concurrently addressing major challenges in this field,\nsuch as temporal synchronization of multiple media, ensuring high throughput,\nminimizing the End-to-End (E2E) latency, and robustness to low bandwidth while\noutlining future trajectories.\n","authors":["Nassim Sehad","Lina Bariah","Wassim Hamidouche","Hamed Hellaoui","Riku Jäntti","Mérouane Debbah"],"pdf_url":"https://arxiv.org/pdf/2404.01713v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.04416v2","updated":"2024-08-13T12:40:46Z","published":"2024-07-05T11:07:13Z","title":"Improving Audio Generation with Visual Enhanced Captions","summary":" Generative models have shown significant achievements in audio generation\ntasks. However, existing models struggle with complex and detailed prompts,\nleading to potential performance degradation. We hypothesize that this problem\nstems from the simplicity and scarcity of the training data. This work aims to\ncreate a large-scale audio dataset with rich captions for improving audio\ngeneration models. We first develop an automated pipeline to generate detailed\ncaptions by transforming predicted visual captions, audio captions, and tagging\nlabels into comprehensive descriptions using a Large Language Model (LLM). The\nresulting dataset, Sound-VECaps, comprises 1.66M high-quality audio-caption\npairs with enriched details including audio event orders, occurred places and\nenvironment information. We then demonstrate that training the text-to-audio\ngeneration models with Sound-VECaps significantly improves the performance on\ncomplex prompts. Furthermore, we conduct ablation studies of the models on\nseveral downstream audio-language tasks, showing the potential of Sound-VECaps\nin advancing audio-text representation learning. Our dataset and models are\navailable online.\n","authors":["Yi Yuan","Dongya Jia","Xiaobin Zhuang","Yuanzhe Chen","Zhengxi Liu","Zhuo Chen","Yuping Wang","Yuxuan Wang","Xubo Liu","Xiyuan Kang","Mark D. 
Plumbley","Wenwu Wang"],"pdf_url":"https://arxiv.org/pdf/2407.04416v2.pdf","comment":"5 pages with 1 appendix"},{"id":"http://arxiv.org/abs/2408.06753v1","updated":"2024-08-13T09:19:59Z","published":"2024-08-13T09:19:59Z","title":"Detecting Audio-Visual Deepfakes with Fine-Grained Inconsistencies","summary":" Existing methods on audio-visual deepfake detection mainly focus on\nhigh-level features for modeling inconsistencies between audio and visual data.\nAs a result, these approaches usually overlook finer audio-visual artifacts,\nwhich are inherent to deepfakes. Herein, we propose the introduction of\nfine-grained mechanisms for detecting subtle artifacts in both spatial and\ntemporal domains. First, we introduce a local audio-visual model capable of\ncapturing small spatial regions that are prone to inconsistencies with audio.\nFor that purpose, a fine-grained mechanism based on a spatially-local distance\ncoupled with an attention module is adopted. Second, we introduce a\ntemporally-local pseudo-fake augmentation to include samples incorporating\nsubtle temporal inconsistencies in our training set. Experiments on the DFDC\nand the FakeAVCeleb datasets demonstrate the superiority of the proposed method\nin terms of generalization as compared to the state-of-the-art under both\nin-dataset and cross-dataset settings.\n","authors":["Marcella Astrid","Enjie Ghorbel","Djamila Aouada"],"pdf_url":"https://arxiv.org/pdf/2408.06753v1.pdf","comment":"Accepted in BMVC 2024"},{"id":"http://arxiv.org/abs/2408.06614v1","updated":"2024-08-13T03:57:35Z","published":"2024-08-13T03:57:35Z","title":"ViMo: Generating Motions from Casual Videos","summary":" Although humans have the innate ability to imagine multiple possible actions\nfrom videos, it remains an extraordinary challenge for computers due to the\nintricate camera movements and montages. Most existing motion generation\nmethods predominantly rely on manually collected motion datasets, usually\ntediously sourced from motion capture (Mocap) systems or Multi-View cameras,\nunavoidably resulting in a limited size that severely undermines their\ngeneralizability. Inspired by recent advance of diffusion models, we probe a\nsimple and effective way to capture motions from videos and propose a novel\nVideo-to-Motion-Generation framework (ViMo) which could leverage the immense\ntrove of untapped video content to produce abundant and diverse 3D human\nmotions. Distinct from prior work, our videos could be more causal, including\ncomplicated camera movements and occlusions. Striking experimental results\ndemonstrate the proposed model could generate natural motions even for videos\nwhere rapid movements, varying perspectives, or frequent occlusions might\nexist. We also show this work could enable three important downstream\napplications, such as generating dancing motions according to arbitrary music\nand source video style. Extensive experimental results prove that our model\noffers an effective and scalable way to generate diversity and realistic\nmotions. Code and demos will be public soon.\n","authors":["Liangdong Qiu","Chengxing Yu","Yanran Li","Zhao Wang","Haibin Huang","Chongyang Ma","Di Zhang","Pengfei Wan","Xiaoguang Han"],"pdf_url":"https://arxiv.org/pdf/2408.06614v1.pdf","comment":null}]},"2024-08-14T00:00:00Z":{"Computation and Language":[{"id":"http://arxiv.org/abs/2408.07702v1","updated":"2024-08-14T17:59:04Z","published":"2024-08-14T17:59:04Z","title":"The Death of Schema Linking? 
Text-to-SQL in the Age of Well-Reasoned\n Language Models","summary":" Schema linking is a crucial step in Text-to-SQL pipelines, which translate\nnatural language queries into SQL. The goal of schema linking is to retrieve\nrelevant tables and columns (signal) while disregarding irrelevant ones\n(noise). However, imperfect schema linking can often exclude essential columns\nneeded for accurate query generation. In this work, we revisit the need for\nschema linking when using the latest generation of large language models\n(LLMs). We find empirically that newer models are adept at identifying relevant\nschema elements during generation, without the need for explicit schema\nlinking. This allows Text-to-SQL pipelines to bypass schema linking entirely\nand instead pass the full database schema to the LLM, eliminating the risk of\nexcluding necessary information. Furthermore, as alternatives to schema\nlinking, we propose techniques that improve Text-to-SQL accuracy without\ncompromising on essential schema information. Our approach achieves 71.83\\%\nexecution accuracy on the BIRD benchmark, ranking first at the time of\nsubmission.\n","authors":["Karime Maamari","Fadhil Abubaker","Daniel Jaroslawicz","Amine Mhedhbi"],"pdf_url":"https://arxiv.org/pdf/2408.07702v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.07697v1","updated":"2024-08-14T17:53:13Z","published":"2024-08-14T17:53:13Z","title":"Quantifying over Optimum Answer Sets","summary":" Answer Set Programming with Quantifiers (ASP(Q)) has been introduced to\nprovide a natural extension of ASP modeling to problems in the polynomial\nhierarchy (PH). However, ASP(Q) lacks a method for encoding in an elegant and\ncompact way problems requiring a polynomial number of calls to an oracle in\n$\\Sigma_n^p$ (that is, problems in $\\Delta_{n+1}^p$). Such problems include, in\nparticular, optimization problems. In this paper we propose an extension of\nASP(Q), in which component programs may contain weak constraints. Weak\nconstraints can be used both for expressing local optimization within\nquantified component programs and for modeling global optimization criteria. We\nshowcase the modeling capabilities of the new formalism through various\napplication scenarios. Further, we study its computational properties obtaining\ncomplexity results and unveiling non-obvious characteristics of ASP(Q) programs\nwith weak constraints.\n","authors":["Giuseppe Mazzotta","Francesco Ricca","Mirek Truszczynski"],"pdf_url":"https://arxiv.org/pdf/2408.07697v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.03710v2","updated":"2024-08-14T17:39:59Z","published":"2023-10-05T17:36:16Z","title":"Agent Instructs Large Language Models to be General Zero-Shot Reasoners","summary":" We introduce a method to improve the zero-shot reasoning abilities of large\nlanguage models on general language understanding tasks. Specifically, we build\nan autonomous agent to instruct the reasoning process of large language models.\nWe show this approach further unleashes the zero-shot reasoning abilities of\nlarge language models to more tasks. We study the performance of our method on\na wide set of datasets spanning generation, classification, and reasoning. We\nshow that our method generalizes to most tasks and obtains state-of-the-art\nzero-shot performance on 20 of the 29 datasets that we evaluate. 
For instance,\nour method boosts the performance of state-of-the-art large language models by\na large margin, including Vicuna-13b (13.3%), Llama-2-70b-chat (23.2%), and\nGPT-3.5 Turbo (17.0%). Compared to zero-shot chain of thought, our improvement\nin reasoning is striking, with an average increase of 10.5%. With our method,\nLlama-2-70b-chat outperforms zero-shot GPT-3.5 Turbo by 10.2%.\n","authors":["Nicholas Crispino","Kyle Montgomery","Fankun Zeng","Dawn Song","Chenguang Wang"],"pdf_url":"https://arxiv.org/pdf/2310.03710v2.pdf","comment":"Accepted to ICML 2024"},{"id":"http://arxiv.org/abs/2408.07676v1","updated":"2024-08-14T17:23:12Z","published":"2024-08-14T17:23:12Z","title":"Enhanced Detection of Conversational Mental Manipulation Through\n Advanced Prompting Techniques","summary":" This study presents a comprehensive, long-term project to explore the\neffectiveness of various prompting techniques in detecting dialogical mental\nmanipulation. We implement Chain-of-Thought prompting with Zero-Shot and\nFew-Shot settings on a binary mental manipulation detection task, building upon\nexisting work conducted with Zero-Shot and Few- Shot prompting. Our primary\nobjective is to decipher why certain prompting techniques display superior\nperformance, so as to craft a novel framework tailored for detection of mental\nmanipulation. Preliminary findings suggest that advanced prompting techniques\nmay not be suitable for more complex models, if they are not trained through\nexample-based learning.\n","authors":["Ivory Yang","Xiaobo Guo","Sean Xie","Soroush Vosoughi"],"pdf_url":"https://arxiv.org/pdf/2408.07676v1.pdf","comment":"Accepted at WiNLP @ EMNLP 2024"},{"id":"http://arxiv.org/abs/2408.07666v1","updated":"2024-08-14T16:58:48Z","published":"2024-08-14T16:58:48Z","title":"Model Merging in LLMs, MLLMs, and Beyond: Methods, Theories,\n Applications and Opportunities","summary":" Model merging is an efficient empowerment technique in the machine learning\ncommunity that does not require the collection of raw training data and does\nnot require expensive computation. As model merging becomes increasingly\nprevalent across various fields, it is crucial to understand the available\nmodel merging techniques comprehensively. However, there is a significant gap\nin the literature regarding a systematic and thorough review of these\ntechniques. This survey provides a comprehensive overview of model merging\nmethods and theories, their applications in various domains and settings, and\nfuture research directions. Specifically, we first propose a new taxonomic\napproach that exhaustively discusses existing model merging methods. Secondly,\nwe discuss the application of model merging techniques in large language\nmodels, multimodal large language models, and 10+ machine learning subfields,\nincluding continual learning, multi-task learning, few-shot learning, etc.\nFinally, we highlight the remaining challenges of model merging and discuss\nfuture research directions. 
A comprehensive list of papers about model merging\nis available at\n\\url{https://github.com/EnnengYang/Awesome-Model-Merging-Methods-Theories-Applications}.\n","authors":["Enneng Yang","Li Shen","Guibing Guo","Xingwei Wang","Xiaochun Cao","Jie Zhang","Dacheng Tao"],"pdf_url":"https://arxiv.org/pdf/2408.07666v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.07665v1","updated":"2024-08-14T16:55:06Z","published":"2024-08-14T16:55:06Z","title":"Spoken Stereoset: On Evaluating Social Bias Toward Speaker in Speech\n Large Language Models","summary":" Warning: This paper may contain texts with uncomfortable content.\n Large Language Models (LLMs) have achieved remarkable performance in various\ntasks, including those involving multimodal data like speech. However, these\nmodels often exhibit biases due to the nature of their training data. Recently,\nmore Speech Large Language Models (SLLMs) have emerged, underscoring the urgent\nneed to address these biases. This study introduces Spoken Stereoset, a dataset\nspecifically designed to evaluate social biases in SLLMs. By examining how\ndifferent models respond to speech from diverse demographic groups, we aim to\nidentify these biases. Our experiments reveal significant insights into their\nperformance and bias levels. The findings indicate that while most models show\nminimal bias, some still exhibit slightly stereotypical or anti-stereotypical\ntendencies.\n","authors":["Yi-Cheng Lin","Wei-Chih Chen","Hung-yi Lee"],"pdf_url":"https://arxiv.org/pdf/2408.07665v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.07663v1","updated":"2024-08-14T16:51:21Z","published":"2024-08-14T16:51:21Z","title":"Alignment-Enhanced Decoding:Defending via Token-Level Adaptive Refining\n of Probability Distributions","summary":" Large language models are susceptible to jailbreak attacks, which can result\nin the generation of harmful content. While prior defenses mitigate these risks\nby perturbing or inspecting inputs, they ignore competing objectives, the\nunderlying cause of alignment failures. In this paper, we propose\nAlignment-Enhanced Decoding (AED), a novel defense that employs adaptive\ndecoding to address the root causes of jailbreak issues. We first define the\nCompetitive Index to quantify alignment failures and utilize feedback from\nself-evaluation to compute post-alignment logits. Then, AED adaptively combines\nAED and post-alignment logits with the original logits to obtain harmless and\nhelpful distributions. Consequently, our method enhances safety alignment while\nmaintaining helpfulness. We conduct experiments across five models and four\ncommon jailbreaks, with the results validating the effectiveness of our\napproach. Code is available at https://github.com/GIGABaozi/AED.git.\n","authors":["Quan Liu","Zhenhong Zhou","Longzhu He","Yi Liu","Wei Zhang","Sen Su"],"pdf_url":"https://arxiv.org/pdf/2408.07663v1.pdf","comment":"15 pages, 5 figures"},{"id":"http://arxiv.org/abs/2408.07648v1","updated":"2024-08-14T16:19:18Z","published":"2024-08-14T16:19:18Z","title":"See It All: Contextualized Late Aggregation for 3D Dense Captioning","summary":" 3D dense captioning is a task to localize objects in a 3D scene and generate\ndescriptive sentences for each object. Recent approaches in 3D dense captioning\nhave adopted transformer encoder-decoder frameworks from object detection to\nbuild an end-to-end pipeline without hand-crafted components. 
However, these\napproaches struggle with contradicting objectives where a single query\nattention has to simultaneously view both the tightly localized object regions\nand contextual environment. To overcome this challenge, we introduce SIA\n(See-It-All), a transformer pipeline that engages in 3D dense captioning with a\nnovel paradigm called late aggregation. SIA simultaneously decodes two sets of\nqueries-context query and instance query. The instance query focuses on\nlocalization and object attribute descriptions, while the context query\nversatilely captures the region-of-interest of relationships between multiple\nobjects or with the global scene, then aggregated afterwards (i.e., late\naggregation) via simple distance-based measures. To further enhance the quality\nof contextualized caption generation, we design a novel aggregator to generate\na fully informed caption based on the surrounding context, the global\nenvironment, and object instances. Extensive experiments on two of the most\nwidely-used 3D dense captioning datasets demonstrate that our proposed method\nachieves a significant improvement over prior methods.\n","authors":["Minjung Kim","Hyung Suk Lim","Seung Hwan Kim","Soonyoung Lee","Bumsoo Kim","Gunhee Kim"],"pdf_url":"https://arxiv.org/pdf/2408.07648v1.pdf","comment":"Accepted to ACL 2024 Findings"},{"id":"http://arxiv.org/abs/2408.07637v1","updated":"2024-08-14T16:03:47Z","published":"2024-08-14T16:03:47Z","title":"Hierarchical Working Memory and a New Magic Number","summary":" The extremely limited working memory span, typically around four items,\ncontrasts sharply with our everyday experience of processing much larger\nstreams of sensory information concurrently. This disparity suggests that\nworking memory can organize information into compact representations such as\nchunks, yet the underlying neural mechanisms remain largely unknown. Here, we\npropose a recurrent neural network model for chunking within the framework of\nthe synaptic theory of working memory. We showed that by selectively\nsuppressing groups of stimuli, the network can maintain and retrieve the\nstimuli in chunks, hence exceeding the basic capacity. Moreover, we show that\nour model can dynamically construct hierarchical representations within working\nmemory through hierarchical chunking. A consequence of this proposed mechanism\nis a new limit on the number of items that can be stored and subsequently\nretrieved from working memory, depending only on the basic working memory\ncapacity when chunking is not invoked. Predictions from our model were\nconfirmed by analyzing single-unit responses in epileptic patients and memory\nexperiments with verbal material. Our work provides a novel conceptual and\nanalytical framework for understanding the on-the-fly organization of\ninformation in the brain that is crucial for cognition.\n","authors":["Weishun Zhong","Mikhail Katkov","Misha Tsodyks"],"pdf_url":"https://arxiv.org/pdf/2408.07637v1.pdf","comment":"16 pages, 7 figures"},{"id":"http://arxiv.org/abs/2402.17762v2","updated":"2024-08-14T16:00:49Z","published":"2024-02-27T18:55:17Z","title":"Massive Activations in Large Language Models","summary":" We observe an empirical phenomenon in Large Language Models (LLMs) -- very\nfew activations exhibit significantly larger values than others (e.g., 100,000\ntimes larger). We call them massive activations. First, we demonstrate the\nwidespread existence of massive activations across various LLMs and\ncharacterize their locations. 
Second, we find their values largely stay\nconstant regardless of the input, and they function as indispensable bias terms\nin LLMs. Third, these massive activations lead to the concentration of\nattention probabilities to their corresponding tokens, and further, implicit\nbias terms in the self-attention output. Last, we also study massive\nactivations in Vision Transformers. Code is available at\nhttps://github.com/locuslab/massive-activations.\n","authors":["Mingjie Sun","Xinlei Chen","J. Zico Kolter","Zhuang Liu"],"pdf_url":"https://arxiv.org/pdf/2402.17762v2.pdf","comment":"First Conference on Language Modeling (COLM), 2024. Website at\n https://eric-mingjie.github.io/massive-activations/index.html"},{"id":"http://arxiv.org/abs/2408.06583v2","updated":"2024-08-14T15:44:07Z","published":"2024-08-13T02:43:19Z","title":"An Event Structure-aware Generative Model for Biomedical Event\n Extraction","summary":" Biomedical Event Extraction (BEE) is a challenging task that involves\nmodeling complex relationships between fine-grained entities in biomedical\ntext. Most existing BEE models rely on classification methods that ignore label\nsemantics and argument dependencies in the data. Although generative models\nthat use prompts are increasingly being used for event extraction, they face\ntwo main challenges: creating effective prompts for the biomedical domain and\ndealing with events with complex structures in the text. To address these\nlimitations, we propose GenBEE, a generative model enhanced with\nstructure-aware prefixes for biomedical event extraction. GenBEE constructs\nevent prompts that leverage knowledge distilled from large language models\n(LLMs), thereby incorporating both label semantics and argument dependency\nrelationships. Additionally, GenBEE introduces a structural prefix learning\nmodule that generates structure-aware prefixes with structural prompts,\nenriching the generation process with structural features. Extensive\nexperiments on three benchmark datasets demonstrate the effectiveness of GenBEE\nand it achieves state-of-the-art performance on the MLEE and GE11 datasets.\nMoreover, our analysis shows that the structural prefixes effectively bridge\nthe gap between structural prompts and the representation space of generative\nmodels, enabling better integration of event structural information.\n","authors":["Haohan Yuan","Siu Cheung Hui","Haopeng Zhang"],"pdf_url":"https://arxiv.org/pdf/2408.06583v2.pdf","comment":"8 pages, 4 figures, 6 tables"},{"id":"http://arxiv.org/abs/2406.15363v2","updated":"2024-08-14T15:32:25Z","published":"2024-04-01T15:17:39Z","title":"Exploring LLM Multi-Agents for ICD Coding","summary":" To address the limitations of Large Language Models (LLMs) in the\nInternational Classification of Diseases (ICD) coding task, where they often\nproduce inaccurate and incomplete prediction results due to the\nhigh-dimensional and skewed distribution of the ICD codes, and often lack\ninterpretability and reliability as well, we introduce an innovative\nmulti-agent approach for ICD coding which mimics the ICD coding assignment\nprocedure in real-world settings, comprising five distinct agents: the patient,\nphysician, coder, reviewer, and adjuster. Each agent utilizes an LLM-based\nmodel tailored to their specific role within the coding process. We also\nintegrate the system with the Electronic Health Record (EHR)'s SOAP (subjective,\nobjective, assessment and plan) structure to boost performance. 
We compare\nour method with a system of agents designed solely by LLMs and other strong\nbaselines and evaluate it using the Medical Information Mart for Intensive Care\nIII (MIMIC-III) dataset. Our multi-agent coding framework significantly\noutperforms Zero-shot Chain of Thought (CoT) prompting and self-consistency\nwith CoT (CoT-SC) in coding common and rare ICD codes. An ablation study\nvalidates the effectiveness of the designated agent roles. It also outperforms\nthe LLM-designed agent system. Moreover, our method achieves comparable results\nto state-of-the-art ICD coding methods that require extensive pre-training or\nfine-tuning, and outperforms them in rare code accuracy and explainability.\nAdditionally, we demonstrate the method's practical applicability by presenting\nits performance in scenarios not limited by the common or rare ICD code\nconstraints. The proposed multi-agent method for ICD coding effectively mimics\nthe real-world coding process and improves performance on both common and rare\ncodes.\n","authors":["Rumeng Li","Xun Wang","Hong Yu"],"pdf_url":"https://arxiv.org/pdf/2406.15363v2.pdf","comment":"12 pages"},{"id":"http://arxiv.org/abs/2408.06663v2","updated":"2024-08-14T15:23:38Z","published":"2024-08-13T06:28:43Z","title":"Amuro & Char: Analyzing the Relationship between Pre-Training and\n Fine-Tuning of Large Language Models","summary":" The development of large language models leads to the formation of a\npre-train-then-align paradigm, in which the model is typically pre-trained on a\nlarge text corpus and undergoes a tuning stage to align the model with human\npreference or downstream tasks. In this work, we investigate the relationship\nbetween pre-training and fine-tuning by fine-tuning multiple intermediate\npre-trained model checkpoints. Our results on 18 datasets suggest that i)\ncontinual pre-training improves the model in a latent way that unveils after\nfine-tuning; ii) with extra fine-tuning, the datasets on which the model does not\ndemonstrate capability gain much more than those on which the model performs well\nduring the pre-training stage; iii) although the model benefits significantly\nthrough supervised fine-tuning, it may forget previously known domain knowledge\nand the tasks that are not seen during fine-tuning; iv) the model exhibits\nhigh sensitivity to evaluation prompts after supervised fine-tuning, but this\nsensitivity can be alleviated by more pre-training.\n","authors":["Kaiser Sun","Mark Dredze"],"pdf_url":"https://arxiv.org/pdf/2408.06663v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.07611v1","updated":"2024-08-14T15:19:16Z","published":"2024-08-14T15:19:16Z","title":"WeKnow-RAG: An Adaptive Approach for Retrieval-Augmented Generation\n Integrating Web Search and Knowledge Graphs","summary":" Large Language Models (LLMs) have greatly contributed to the development of\nadaptive intelligent agents and are positioned as an important way to achieve\nArtificial General Intelligence (AGI). However, LLMs are prone to produce\nfactually incorrect information and often produce \"phantom\" content that\nundermines their reliability, which poses a serious challenge for their\ndeployment in real-world scenarios. Enhancing LLMs by combining external\ndatabases and information retrieval mechanisms is an effective path. To address\nthe above challenges, we propose a new approach called WeKnow-RAG, which\nintegrates Web search and Knowledge Graphs into a \"Retrieval-Augmented\nGeneration (RAG)\" system. 
First, the accuracy and reliability of LLM responses\nare improved by combining the structured representation of Knowledge Graphs\nwith the flexibility of dense vector retrieval. WeKnow-RAG then utilizes\ndomain-specific knowledge graphs to satisfy a variety of queries and domains,\nthereby improving performance on factual information and complex reasoning\ntasks by employing multi-stage web page retrieval techniques using both sparse\nand dense retrieval methods. Our approach effectively balances the efficiency\nand accuracy of information retrieval, thus improving the overall retrieval\nprocess. Finally, we also integrate a self-assessment mechanism for the LLM to\nevaluate the trustworthiness of the answers it generates. Our approach proves\nits outstanding effectiveness in a wide range of offline experiments and online\nsubmissions.\n","authors":["Weijian Xie","Xuefeng Liang","Yuhui Liu","Kaihua Ni","Hong Cheng","Zetian Hu"],"pdf_url":"https://arxiv.org/pdf/2408.07611v1.pdf","comment":"8 pages, 2 figures, technical report for 3rd place in Task 3 of Meta\n KDD Cup 2024 CRAG Challenge"},{"id":"http://arxiv.org/abs/2408.07599v1","updated":"2024-08-14T14:59:20Z","published":"2024-08-14T14:59:20Z","title":"Assessing the Role of Lexical Semantics in Cross-lingual Transfer\n through Controlled Manipulations","summary":" While cross-linguistic model transfer is effective in many settings, there is\nstill limited understanding of the conditions under which it works. In this\npaper, we focus on assessing the role of lexical semantics in cross-lingual\ntransfer, as we compare its impact to that of other language properties.\nExamining each language property individually, we systematically analyze how\ndifferences between English and a target language influence the capacity to\nalign the language with an English pretrained representation space. We do so by\nartificially manipulating the English sentences in ways that mimic specific\ncharacteristics of the target language, and reporting the effect of each\nmanipulation on the quality of alignment with the representation space. We show\nthat while properties such as the script or word order only have a limited\nimpact on alignment quality, the degree of lexical matching between the two\nlanguages, which we define using a measure of translation entropy, greatly\naffects it.\n","authors":["Roy Ilani","Taelin Karidi","Omri Abend"],"pdf_url":"https://arxiv.org/pdf/2408.07599v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.07583v1","updated":"2024-08-14T14:28:11Z","published":"2024-08-14T14:28:11Z","title":"Transformers and Large Language Models for Efficient Intrusion Detection\n Systems: A Comprehensive Survey","summary":" With significant advancements in Transformers LLMs, NLP has extended its\nreach into many research fields due to its enhanced capabilities in text\ngeneration and user interaction. One field benefiting greatly from these\nadvancements is cybersecurity. In cybersecurity, many parameters that need to\nbe protected and exchanged between senders and receivers are in the form of\ntext and tabular data, making NLP a valuable tool in enhancing the security\nmeasures of communication protocols. This survey paper provides a comprehensive\nanalysis of the utilization of Transformers and LLMs in cyber-threat detection\nsystems. 
The methodology of paper selection and bibliometric analysis is\noutlined to establish a rigorous framework for evaluating existing research.\nThe fundamentals of Transformers are discussed, including background\ninformation on various cyber-attacks and datasets commonly used in this field.\nThe survey explores the application of Transformers in IDSs, focusing on\ndifferent architectures such as Attention-based models, LLMs like BERT and GPT,\nCNN/LSTM-Transformer hybrids, emerging approaches like ViTs, among others.\nFurthermore, it explores the diverse environments and applications where\nTransformers and LLMs-based IDS have been implemented, including computer\nnetworks, IoT devices, critical infrastructure protection, cloud computing,\nSDN, as well as in autonomous vehicles. The paper also addresses research\nchallenges and future directions in this area, identifying key issues such as\ninterpretability, scalability, and adaptability to evolving threats, and more.\nFinally, the conclusion summarizes the findings and highlights the significance\nof Transformers and LLMs in enhancing cyber-threat detection capabilities,\nwhile also outlining potential avenues for further research and development.\n","authors":["Hamza Kheddar"],"pdf_url":"https://arxiv.org/pdf/2408.07583v1.pdf","comment":"arXiv admin note: text overlap with arXiv:2405.04760 by other authors"},{"id":"http://arxiv.org/abs/2402.19457v3","updated":"2024-08-14T14:06:10Z","published":"2024-02-29T18:51:23Z","title":"$\\texttt{COSMIC}$: Mutual Information for Task-Agnostic Summarization\n Evaluation","summary":" Assessing the quality of summarizers poses significant challenges. In\nresponse, we propose a novel task-oriented evaluation approach that assesses\nsummarizers based on their capacity to produce summaries that are useful for\ndownstream tasks, while preserving task outcomes. We theoretically establish a\ndirect relationship between the resulting error probability of these tasks and\nthe mutual information between source texts and generated summaries. We\nintroduce $\\texttt{COSMIC}$ as a practical implementation of this metric,\ndemonstrating its strong correlation with human judgment-based metrics and its\neffectiveness in predicting downstream task performance. Comparative analyses\nagainst established metrics like $\\texttt{BERTScore}$ and $\\texttt{ROUGE}$\nhighlight the competitive performance of $\\texttt{COSMIC}$.\n","authors":["Maxime Darrin","Philippe Formont","Jackie Chi Kit Cheung","Pablo Piantanida"],"pdf_url":"https://arxiv.org/pdf/2402.19457v3.pdf","comment":"ACL 2024"},{"id":"http://arxiv.org/abs/2402.01766v3","updated":"2024-08-14T13:41:02Z","published":"2024-01-31T14:52:02Z","title":"LLM Voting: Human Choices and AI Collective Decision Making","summary":" This paper investigates the voting behaviors of Large Language Models (LLMs),\nspecifically GPT-4 and LLaMA-2, their biases, and how they align with human\nvoting patterns. Our methodology involved using a dataset from a human voting\nexperiment to establish a baseline for human preferences and conducting a\ncorresponding experiment with LLM agents. We observed that the choice of voting\nmethods and the presentation order influenced LLM voting outcomes. We found\nthat varying the persona can reduce some of these biases and enhance alignment\nwith human choices. While the Chain-of-Thought approach did not improve\nprediction accuracy, it has potential for AI explainability in the voting\nprocess. 
We also identified a trade-off between preference diversity and\nalignment accuracy in LLMs, influenced by different temperature settings. Our\nfindings indicate that LLMs may lead to less diverse collective outcomes and\nbiased assumptions when used in voting scenarios, emphasizing the need for\ncautious integration of LLMs into democratic processes.\n","authors":["Joshua C. Yang","Damian Dailisan","Marcin Korecki","Carina I. Hausladen","Dirk Helbing"],"pdf_url":"https://arxiv.org/pdf/2402.01766v3.pdf","comment":"Accepted in AAAI Conference on AI, Ethics, and Society (AIES)"},{"id":"http://arxiv.org/abs/2408.07543v1","updated":"2024-08-14T13:23:43Z","published":"2024-08-14T13:23:43Z","title":"MathScape: Evaluating MLLMs in multimodal Math Scenarios through a\n Hierarchical Benchmark","summary":" With the development of Multimodal Large Language Models (MLLMs), the\nevaluation of multimodal models in the context of mathematical problems has\nbecome a valuable research field. Multimodal visual-textual mathematical\nreasoning serves as a critical indicator for evaluating the comprehension and\ncomplex multi-step quantitative reasoning abilities of MLLMs. However, previous\nmultimodal math benchmarks have not sufficiently integrated visual and textual\ninformation. To address this gap, we proposed MathScape, a new benchmark that\nemphasizes the understanding and application of combined visual and textual\ninformation. MathScape is designed to evaluate photo-based math problem\nscenarios, assessing the theoretical understanding and application ability of\nMLLMs through a categorical hierarchical approach. We conduct a\nmulti-dimensional evaluation on 11 advanced MLLMs, revealing that our benchmark\nis challenging even for the most sophisticated models. By analyzing the\nevaluation results, we identify the limitations of MLLMs, offering valuable\ninsights for enhancing model performance.\n","authors":["Minxuan Zhou","Hao Liang","Tianpeng Li","Zhiyu Wu","Mingan Lin","Linzhuang Sun","Yaqi Zhou","Yan Zhang","Xiaoqin Huang","Yicong Chen","Yujing Qiao","Weipeng Chen","Bin Cui","Wentao Zhang","Zenan Zhou"],"pdf_url":"https://arxiv.org/pdf/2408.07543v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.10020v2","updated":"2024-08-14T13:15:00Z","published":"2024-03-15T05:06:21Z","title":"Lost in Overlap: Exploring Watermark Collision in LLMs","summary":" The proliferation of large language models (LLMs) in generating content\nraises concerns about text copyright. Watermarking methods, particularly\nlogit-based approaches, embed imperceptible identifiers into text to address\nthese challenges. However, the widespread usage of watermarking across diverse\nLLMs has led to an inevitable issue known as watermark collision during common\ntasks, such as paraphrasing or translation. In this paper, we introduce\nwatermark collision as a novel and general philosophy for watermark attacks,\naimed at enhancing attack performance on top of any other attacking methods. 
We\nalso provide a comprehensive demonstration that watermark collision poses a\nthreat to all logit-based watermark algorithms, impacting not only specific\nattack scenarios but also downstream applications.\n","authors":["Yiyang Luo","Ke Lin","Chao Gu"],"pdf_url":"https://arxiv.org/pdf/2403.10020v2.pdf","comment":"Long Paper, 7 pages"},{"id":"http://arxiv.org/abs/2408.07531v1","updated":"2024-08-14T13:03:41Z","published":"2024-08-14T13:03:41Z","title":"Development of a Multi-Agent Clinical Decision Support System for Korean\n Triage and Acuity Scale (KTAS)-Based Triage and Treatment Planning in\n Emergency Departments","summary":" Emergency department (ED) overcrowding and the complexity of rapid\ndecision-making in critical care settings pose significant challenges to\nhealthcare systems worldwide. While clinical decision support systems (CDSS)\nhave shown promise, the integration of large language models (LLMs) offers new\npossibilities for enhancing triage accuracy and clinical decision-making. This\nstudy presents an LLM-driven CDSS designed to assist ED physicians and nurses\nin patient triage, treatment planning, and overall emergency care management.\n We developed a multi-agent CDSS utilizing Llama-3-70b as the base LLM,\norchestrated by CrewAI and Langchain. The system comprises four AI agents\nemulating key ED roles: Triage Nurse, Emergency Physician, Pharmacist, and ED\nCoordinator. It incorporates the Korean Triage and Acuity Scale (KTAS) for\ntriage assessment and integrates with the RxNorm API for medication management.\n The model was evaluated using the Asclepius dataset, with performance\nassessed by a clinical emergency medicine specialist. The CDSS demonstrated\nhigh accuracy in triage decision-making compared to the baseline of a\nsingle-agent system. Furthermore, the system exhibited strong performance in\ncritical areas, including primary diagnosis, critical findings identification,\ndisposition decision-making, treatment planning, and resource allocation.\n Our multi-agent CDSS demonstrates significant potential for supporting\ncomprehensive emergency care management. By leveraging state-of-the-art AI\ntechnologies, this system offers a scalable and adaptable tool that could\nenhance emergency medical care delivery, potentially alleviating ED\novercrowding and improving patient outcomes. This work contributes to the\ngrowing field of AI applications in emergency medicine and offers a promising\ndirection for future research and clinical implementation.\n","authors":["Seungjun Han","Wongyung Choi"],"pdf_url":"https://arxiv.org/pdf/2408.07531v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.04093v3","updated":"2024-08-14T12:47:31Z","published":"2024-08-07T21:16:55Z","title":"Tree Attention: Topology-aware Decoding for Long-Context Attention on\n GPU clusters","summary":" Self-attention is the core mathematical operation of modern transformer\narchitectures and is also a significant computational bottleneck due to its\nquadratic complexity in the sequence length. In this work, we derive the scalar\nenergy function whose gradient computes the self-attention block, thus\nelucidating the theoretical underpinnings of self-attention, providing a\nBayesian interpretation of the operation and linking it closely with\nenergy-based models such as Hopfield Networks. Our formulation reveals that the\nreduction across the sequence axis can be efficiently computed in parallel\nthrough a tree reduction. 
Our algorithm, for parallelizing attention\ncomputation across multiple GPUs enables cross-device decoding to be performed\nasymptotically faster (up to 8x faster in our experiments) than alternative\napproaches such as Ring Attention, while also requiring significantly less\ncommunication volume and incurring 2x less peak memory. Our code is publicly\navailable here: \\url{https://github.com/Zyphra/tree_attention}.\n","authors":["Vasudev Shyam","Jonathan Pilault","Emily Shepperd","Quentin Anthony","Beren Millidge"],"pdf_url":"https://arxiv.org/pdf/2408.04093v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.00656v2","updated":"2024-08-14T12:42:44Z","published":"2024-03-31T12:01:32Z","title":"WavLLM: Towards Robust and Adaptive Speech Large Language Model","summary":" The recent advancements in large language models (LLMs) have revolutionized\nthe field of natural language processing, progressively broadening their scope\nto multimodal perception and generation. However, effectively integrating\nlistening capabilities into LLMs poses significant challenges, particularly\nwith respect to generalizing across varied contexts and executing complex\nauditory tasks. In this work, we introduce WavLLM, a robust and adaptive speech\nlarge language model with dual encoders, and a prompt-aware LoRA weight\nadapter, optimized by a two-stage curriculum learning approach. Leveraging dual\nencoders, we decouple different types of speech information, utilizing a\nWhisper encoder to process the semantic content of speech, and a WavLM encoder\nto capture the unique characteristics of the speaker's identity. Within the\ncurriculum learning framework, WavLLM first builds its foundational\ncapabilities by optimizing on mixed elementary single tasks, followed by\nadvanced multi-task training on more complex tasks such as combinations of the\nelementary tasks. To enhance the flexibility and adherence to different tasks\nand instructions, a prompt-aware LoRA weight adapter is introduced in the\nsecond advanced multi-task training stage. We validate the proposed model on\nuniversal speech benchmarks including tasks such as ASR, ST, SV, ER, and also\napply it to specialized datasets like Gaokao English listening comprehension\nset for SQA, and speech Chain-of-Thought (CoT) evaluation set. Experiments\ndemonstrate that the proposed model achieves state-of-the-art performance\nacross a range of speech tasks on the same model size, exhibiting robust\ngeneralization capabilities in executing complex tasks using CoT approach.\nFurthermore, our model successfully completes Gaokao tasks without specialized\ntraining. The codes, models, audio, and Gaokao evaluation set can be accessed\nat \\url{aka.ms/wavllm}.\n","authors":["Shujie Hu","Long Zhou","Shujie Liu","Sanyuan Chen","Lingwei Meng","Hongkun Hao","Jing Pan","Xunying Liu","Jinyu Li","Sunit Sivasankaran","Linquan Liu","Furu Wei"],"pdf_url":"https://arxiv.org/pdf/2404.00656v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.07505v1","updated":"2024-08-14T12:32:41Z","published":"2024-08-14T12:32:41Z","title":"Large Language Models Know What Makes Exemplary Contexts","summary":" In-context learning (ICL) has proven to be a significant capability with the\nadvancement of Large Language models (LLMs). By instructing LLMs using few-shot\ndemonstrative examples, ICL enables them to perform a wide range of tasks\nwithout needing to update millions of parameters. 
This paper presents a unified\nframework for LLMs that allows them to self-select influential in-context\nexamples to compose their contexts; self-rank candidates with different\ndemonstration compositions; self-optimize the demonstration selection and\nordering through reinforcement learning. Specifically, our method designs a\nparameter-efficient retrieval head that generates the optimized demonstration\nafter training with rewards from the LLM's own preference. Experimental results\nvalidate the proposed method's effectiveness in enhancing ICL performance.\nAdditionally, our approach effectively identifies and selects the most\nrepresentative examples for the current task, and includes more diversity in\nretrieval.\n","authors":["Quanyu Long","Jianda Chen"],"pdf_url":"https://arxiv.org/pdf/2408.07505v1.pdf","comment":"12 pages, 3 figures"},{"id":"http://arxiv.org/abs/2408.07479v1","updated":"2024-08-14T11:49:24Z","published":"2024-08-14T11:49:24Z","title":"A Study on Bias Detection and Classification in Natural Language\n Processing","summary":" Human biases have been shown to influence the performance of models and\nalgorithms in various fields, including Natural Language Processing. While the\nstudy of this phenomenon is garnering focus in recent years, the available\nresources are still relatively scarce, often focusing on different forms or\nmanifestations of biases. The aim of our work is twofold: 1) gather\npublicly-available datasets and determine how to better combine them to\neffectively train models in the task of hate speech detection and\nclassification; 2) analyse the main issues with these datasets, such as\nscarcity, skewed resources, and reliance on non-persistent data. We discuss\nthese issues in tandem with the development of our experiments, in which we\nshow that the combinations of different datasets greatly impact the models'\nperformance.\n","authors":["Ana Sofia Evans","Helena Moniz","Luísa Coheur"],"pdf_url":"https://arxiv.org/pdf/2408.07479v1.pdf","comment":"31 pages, 15 Tables, 4 Figures"},{"id":"http://arxiv.org/abs/2402.11811v3","updated":"2024-08-14T11:47:39Z","published":"2024-02-19T03:56:44Z","title":"FIPO: Free-form Instruction-oriented Prompt Optimization with Preference\n Dataset and Modular Fine-tuning Schema","summary":" When the quality of naive prompts is carefully optimized by human experts,\nthe task performance of large language models (LLMs) can be significantly\nimproved. However, expert-based prompt optimizations are expensive. Herein,\nsome works have proposed Automatic Prompt Optimization (APO), to optimize naive\nprompts according to task outputs of given in-box testing models, with the help\nof advanced LLMs (e.g., GPT-4) in an ad-hoc way. Although effective, existing\nschemes suffer from poor generalization ability and privacy risk. To this end,\nwe collect the first large-scale Prompt Optimization Preference dataset (POP),\nfine-tune offline local LLM-based optimizers, then fairly test with various\ndownstream models. Our method allows accurate optimization of the core task\ninstruction part within the naive prompt in a model-agnostic manner, and thus\nis named Free-form Instruction-oriented Prompt Optimization (FIPO). Specifically,\nFIPO uses a modular APO template that dynamically integrates the naive\ntask instruction, optional instruction responses, and optional ground truth to\nproduce finely optimized prompts. 
The POP dataset is meticulously constructed\nusing advanced LLMs, undergoing rigorous cross-validation by human experts and\nanalytical models. Leveraging insights from the data with Tulu2 models and\ndiverse fine-tuning strategies, we validate the efficacy of FIPO framework\nacross five public benchmarks and six testing models. Check codes and data\nhere: https://github.com/LuJunru/FIPO_Project.\n","authors":["Junru Lu","Siyu An","Min Zhang","Yulan He","Di Yin","Xing Sun"],"pdf_url":"https://arxiv.org/pdf/2402.11811v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.19832v2","updated":"2024-08-14T11:42:02Z","published":"2024-07-29T09:38:15Z","title":"ML-Mamba: Efficient Multi-Modal Large Language Model Utilizing Mamba-2","summary":" Multimodal Large Language Models (MLLMs) have attracted much attention for\ntheir multifunctionality. However, traditional Transformer architectures incur\nsignificant overhead due to their secondary computational complexity. To\naddress this issue, we introduce ML-Mamba, a multimodal language model, which\nutilizes the latest and efficient Mamba-2 model for inference. Mamba-2 is known\nfor its linear scalability and fast processing of long sequences. We replace\nthe Transformer-based backbone with a pre-trained Mamba-2 model and explore\nmethods for integrating 2D visual selective scanning mechanisms into multimodal\nlearning while also trying various visual encoders and Mamba-2 model variants.\nOur extensive experiments in various multimodal benchmark tests demonstrate the\ncompetitive performance of ML-Mamba and highlight the potential of state space\nmodels in multimodal tasks. The experimental results show that: (1) we\nempirically explore how to effectively apply the 2D vision selective scan\nmechanism for multimodal learning. We propose a novel multimodal connector\ncalled the Mamba-2 Scan Connector (MSC), which enhances representational\ncapabilities. (2) ML-Mamba achieves performance comparable to state-of-the-art\nmethods such as TinyLaVA and MobileVLM v2 through its linear sequential\nmodeling while faster inference speed; (3) Compared to multimodal models\nutilizing Mamba-1, the Mamba-2-based ML-Mamba exhibits superior inference\nperformance and effectiveness.\n","authors":["Wenjun Huang","Jianguo Hu"],"pdf_url":"https://arxiv.org/pdf/2407.19832v2.pdf","comment":"arXiv admin note: text overlap with arXiv:2403.13600,\n arXiv:2406.07537 by other authors"},{"id":"http://arxiv.org/abs/2408.07471v1","updated":"2024-08-14T11:29:47Z","published":"2024-08-14T11:29:47Z","title":"Bridging and Modeling Correlations in Pairwise Data for Direct\n Preference Optimization","summary":" Direct preference optimization (DPO), a widely adopted offline preference\noptimization algorithm, aims to align large language models (LLMs) with\nhuman-desired behaviors using pairwise preference data. However, the winning\nresponse and the losing response within pairwise data are generated isolatedly,\nleading to weak correlations between them as well as suboptimal alignment\nperformance. To address this issue, we propose an effective framework named\nBMC, for bridging and modeling correlations in pairwise data. Firstly, we\nincrease the consistency and informativeness of the pairwise preference signals\nby targeted modifications, synthesizing a pseudo winning response through\nimproving the losing response based on the winning response. Secondly, we\nidentify that DPO alone is insufficient to model these correlations and capture\nnuanced variations. 
Therefore, we propose learning token-level correlations by\ndynamically leveraging the policy model's confidence during training.\nComprehensive experiments on QA, math, and instruction-following tasks\ndemonstrate the effectiveness of our approach, significantly surpassing\ncompetitive baselines, including DPO. Additionally, our in-depth quantitative\nanalysis reveals the reasons behind our method's superior performance over DPO\nand showcases its versatility to other DPO variants.\n","authors":["Yuxin Jiang","Bo Huang","Yufei Wang","Xingshan Zeng","Liangyou Li","Yasheng Wang","Xin Jiang","Lifeng Shang","Ruiming Tang","Wei Wang"],"pdf_url":"https://arxiv.org/pdf/2408.07471v1.pdf","comment":"18 pages, 8 figures, 8 tables, working in progress"},{"id":"http://arxiv.org/abs/2408.07465v1","updated":"2024-08-14T11:19:28Z","published":"2024-08-14T11:19:28Z","title":"Large Language Models Prompting With Episodic Memory","summary":" Prompt optimization is essential for enhancing the performance of Large\nLanguage Models (LLMs) in a range of Natural Language Processing (NLP) tasks,\nparticularly in scenarios of few-shot learning where training examples are\nincorporated directly into the prompt. Despite the growing interest in\noptimizing prompts with few-shot examples, existing methods for prompt\noptimization are often resource-intensive or perform inadequately. In this\nwork, we propose PrOmpting with Episodic Memory (POEM), a novel prompt\noptimization technique that is simple, efficient, and demonstrates strong\ngeneralization capabilities. We approach prompt optimization as a Reinforcement\nLearning (RL) challenge, using episodic memory to archive combinations of input\ndata, permutations of few-shot examples, and the rewards observed during\ntraining. In the testing phase, we optimize the sequence of examples for each\ntest query by selecting the sequence that yields the highest total rewards from\nthe top-k most similar training examples in the episodic memory. Our results\nshow that POEM outperforms recent techniques like TEMPERA and RLPrompt by over\n5.3% in various text classification tasks. Furthermore, our approach adapts\nwell to broader language understanding tasks, consistently outperforming\nconventional heuristic methods for ordering examples.\n","authors":["Dai Do","Quan Tran","Svetha Venkatesh","Hung Le"],"pdf_url":"https://arxiv.org/pdf/2408.07465v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.05840v2","updated":"2024-08-14T11:07:17Z","published":"2024-08-11T18:22:12Z","title":"Iterative Improvement of an Additively Regularized Topic Model","summary":" Topic modelling is fundamentally a soft clustering problem (of known objects\n-- documents, over unknown clusters -- topics). That is, the task is\nincorrectly posed. In particular, the topic models are unstable and incomplete.\nAll this leads to the fact that the process of finding a good topic model\n(repeated hyperparameter selection, model training, and topic quality\nassessment) can be particularly long and labor-intensive. We aim to simplify\nthe process, to make it more deterministic and provable. To this end, we\npresent a method for iterative training of a topic model. The essence of the\nmethod is that a series of related topic models are trained so that each\nsubsequent model is at least as good as the previous one, i.e., that it retains\nall the good topics found earlier. The connection between the models is\nachieved by additive regularization. 
The result of this iterative training is\nthe last topic model in the series, which we call the iteratively updated\nadditively regularized topic model (ITAR). Experiments conducted on several\ncollections of natural language texts show that the proposed ITAR model\nperforms better than other popular topic models (LDA, ARTM, BERTopic), its\ntopics are diverse, and its perplexity (ability to \"explain\" the underlying\ndata) is moderate.\n","authors":["Alex Gorbulev","Vasiliy Alekseev","Konstantin Vorontsov"],"pdf_url":"https://arxiv.org/pdf/2408.05840v2.pdf","comment":"Fix HTML view. That is, fix the heap (strikethrough) order of .tex\n files using the auxiliary Arxiv Readme XXX"},{"id":"http://arxiv.org/abs/2408.07457v1","updated":"2024-08-14T10:58:48Z","published":"2024-08-14T10:58:48Z","title":"From Brazilian Portuguese to European Portuguese","summary":" Brazilian Portuguese and European Portuguese are two varieties of the same\nlanguage and, despite their close similarities, they exhibit several\ndifferences. However, there is a significant disproportion in the availability\nof resources between the two variants, with Brazilian Portuguese having more\nabundant resources. This inequity can impact the quality of translation\nservices accessible to European Portuguese speakers. To address this issue, we\npropose the development of a Brazilian Portuguese to European Portuguese\ntranslation system, leveraging recent advancements in neural architectures and\nmodels. To evaluate the performance of such systems, we manually curated a gold\ntest set comprising 500 sentences across five different topics. Each sentence\nin the gold test set has two distinct references, facilitating a\nstraightforward evaluation of future translation models. We experimented with\nvarious models by fine-tuning existing Large Language Models using parallel\ndata extracted from movie subtitles and TED Talks transcripts in both Brazilian\nand European Portuguese. Our evaluation involved the use of conventional\nautomatic metrics as well as a human evaluation. In addition, all models were\ncompared against ChatGPT 3.5 Turbo, which currently yields the best results.\n","authors":["João Sanches","Rui Ribeiro","Luísa Coheur"],"pdf_url":"https://arxiv.org/pdf/2408.07457v1.pdf","comment":"12 pages, 8 tables"},{"id":"http://arxiv.org/abs/2408.07453v1","updated":"2024-08-14T10:46:15Z","published":"2024-08-14T10:46:15Z","title":"Fact or Fiction? Improving Fact Verification with Knowledge Graphs\n through Simplified Subgraph Retrievals","summary":" Despite recent success in natural language processing (NLP), fact\nverification still remains a difficult task. Due to misinformation spreading\nincreasingly fast, attention has been directed towards automatically verifying\nthe correctness of claims. In the domain of NLP, this is usually done by\ntraining supervised machine learning models to verify claims by utilizing\nevidence from trustworthy corpora. We present efficient methods for verifying\nclaims on a dataset where the evidence is in the form of structured knowledge\ngraphs. We use the FactKG dataset, which is constructed from the DBpedia\nknowledge graph extracted from Wikipedia. By simplifying the evidence retrieval\nprocess, from fine-tuned language models to simple logical retrievals, we are\nable to construct models that both require less computational resources and\nachieve better test-set accuracy.\n","authors":["Tobias A. 
Opsahl"],"pdf_url":"https://arxiv.org/pdf/2408.07453v1.pdf","comment":"10 pages, 3 figures, appendix"},{"id":"http://arxiv.org/abs/2408.07452v1","updated":"2024-08-14T10:44:51Z","published":"2024-08-14T10:44:51Z","title":"CMU's IWSLT 2024 Simultaneous Speech Translation System","summary":" This paper describes CMU's submission to the IWSLT 2024 Simultaneous Speech\nTranslation (SST) task for translating English speech to German text in a\nstreaming manner. Our end-to-end speech-to-text (ST) system integrates the\nWavLM speech encoder, a modality adapter, and the Llama2-7B-Base model as the\ndecoder. We employ a two-stage training approach: initially, we align the\nrepresentations of speech and text, followed by full fine-tuning. Both stages\nare trained on MuST-c v2 data with cross-entropy loss. We adapt our offline ST\nmodel for SST using a simple fixed hold-n policy. Experiments show that our\nmodel obtains an offline BLEU score of 31.1 and a BLEU score of 29.5 under 2\nseconds latency on the MuST-C-v2 tst-COMMON.\n","authors":["Xi Xu","Siqi Ouyang","Brian Yan","Patrick Fernandes","William Chen","Lei Li","Graham Neubig","Shinji Watanabe"],"pdf_url":"https://arxiv.org/pdf/2408.07452v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.07448v1","updated":"2024-08-14T10:36:17Z","published":"2024-08-14T10:36:17Z","title":"LiveFC: A System for Live Fact-Checking of Audio Streams","summary":" The advances in the digital era have led to rapid dissemination of\ninformation. This has also aggravated the spread of misinformation and\ndisinformation. This has potentially serious consequences, such as civil\nunrest. While fact-checking aims to combat this, manual fact-checking is\ncumbersome and not scalable. While automated fact-checking approaches exist,\nthey do not operate in real-time and do not always account for spread of\nmisinformation through different modalities. This is particularly important as\nproactive fact-checking on live streams in real-time can help people be\ninformed of false narratives and prevent catastrophic consequences that may\ncause civil unrest. This is particularly relevant with the rapid dissemination\nof information through video on social media platforms or other streams like\npolitical rallies and debates. Hence, in this work we develop a platform named\n\\name{}, that can aid in fact-checking live audio streams in real-time. \\name{}\nhas a user-friendly interface that displays the claims detected along with\ntheir veracity and evidence for live streams with associated speakers for\nclaims from respective segments. The app can be accessed at\nhttp://livefc.factiverse.ai and a screen recording of the demo can be found at\nhttps://bit.ly/3WVAoIw.\n","authors":["Venktesh V","Vinay Setty"],"pdf_url":"https://arxiv.org/pdf/2408.07448v1.pdf","comment":"Under Review, 11 pages"},{"id":"http://arxiv.org/abs/2408.07425v1","updated":"2024-08-14T10:03:28Z","published":"2024-08-14T10:03:28Z","title":"Exploring Retrieval Augmented Generation in Arabic","summary":" Recently, Retrieval Augmented Generation (RAG) has emerged as a powerful\ntechnique in natural language processing, combining the strengths of\nretrieval-based and generation-based models to enhance text generation tasks.\nHowever, the application of RAG in Arabic, a language with unique\ncharacteristics and resource constraints, remains underexplored. This paper\npresents a comprehensive case study on the implementation and evaluation of RAG\nfor Arabic text. 
The work focuses on exploring various semantic embedding\nmodels in the retrieval stage and several LLMs in the generation stage, in\norder to investigate what works and what doesn't in the context of Arabic. The\nwork also touches upon the issue of variations between document dialect and\nquery dialect in the retrieval stage. Results show that existing semantic\nembedding models and LLMs can be effectively employed to build Arabic RAG\npipelines.\n","authors":["Samhaa R. El-Beltagy","Mohamed A. Abdallah"],"pdf_url":"https://arxiv.org/pdf/2408.07425v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.01107v2","updated":"2024-08-14T09:54:24Z","published":"2024-08-02T08:37:03Z","title":"BioRAG: A RAG-LLM Framework for Biological Question Reasoning","summary":" The question-answering system for Life science research, which is\ncharacterized by the rapid pace of discovery, evolving insights, and complex\ninteractions among knowledge entities, presents unique challenges in\nmaintaining a comprehensive knowledge warehouse and accurate information\nretrieval. To address these issues, we introduce BioRAG, a novel\nRetrieval-Augmented Generation (RAG) with the Large Language Models (LLMs)\nframework. Our approach starts with parsing, indexing, and segmenting an\nextensive collection of 22 million scientific papers as the basic knowledge,\nfollowed by training a specialized embedding model tailored to this domain.\nAdditionally, we enhance the vector retrieval process by incorporating a\ndomain-specific knowledge hierarchy, which aids in modeling the intricate\ninterrelationships among each query and context. For queries requiring the most\ncurrent information, BioRAG deconstructs the question and employs an iterative\nretrieval process incorporated with the search engine for step-by-step\nreasoning. Rigorous experiments have demonstrated that our model outperforms\nfine-tuned LLM, LLM with search engines, and other scientific RAG frameworks\nacross multiple life science question-answering tasks.\n","authors":["Chengrui Wang","Qingqing Long","Meng Xiao","Xunxin Cai","Chengjun Wu","Zhen Meng","Xuezhi Wang","Yuanchun Zhou"],"pdf_url":"https://arxiv.org/pdf/2408.01107v2.pdf","comment":"12 pages, 7 figures"},{"id":"http://arxiv.org/abs/2408.07413v1","updated":"2024-08-14T09:43:32Z","published":"2024-08-14T09:43:32Z","title":"Knowledge in Superposition: Unveiling the Failures of Lifelong Knowledge\n Editing for Large Language Models","summary":" Knowledge editing aims to update outdated or incorrect knowledge in large\nlanguage models (LLMs). However, current knowledge editing methods have limited\nscalability for lifelong editing. This study explores the fundamental reason\nwhy knowledge editing fails in lifelong editing. We begin with the closed-form\nsolution derived from linear associative memory, which underpins\nstate-of-the-art knowledge editing methods. We extend the solution from single\nediting to lifelong editing, and through rigorous mathematical derivation,\nidentify an interference term in the final solution, suggesting that editing\nknowledge may impact irrelevant knowledge. Further analysis of the interference\nterm reveals a close relationship with superposition between knowledge\nrepresentations. When knowledge superposition does not exist in language\nmodels, the interference term vanishes, allowing for lossless knowledge\nediting. 
Experiments across numerous language models reveal that knowledge\nsuperposition is universal, exhibiting high kurtosis, zero mean, and\nheavy-tailed distributions with clear scaling laws. Ultimately, by combining\ntheory and experiments, we demonstrate that knowledge superposition is the\nfundamental reason for the failure of lifelong editing. Moreover, this is the\nfirst study to investigate knowledge editing from the perspective of\nsuperposition and provides a comprehensive observation of superposition across\nnumerous real-world language models. Code available at\nhttps://github.com/ChenhuiHu/knowledge_in_superposition.\n","authors":["Chenhui Hu","Pengfei Cao","Yubo Chen","Kang Liu","Jun Zhao"],"pdf_url":"https://arxiv.org/pdf/2408.07413v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.07410v1","updated":"2024-08-14T09:34:19Z","published":"2024-08-14T09:34:19Z","title":"Aquila2 Technical Report","summary":" This paper introduces the Aquila2 series, which comprises a wide range of\nbilingual models with parameter sizes of 7, 34, and 70 billion. These models\nare trained based on an innovative framework named HeuriMentor (HM), which\noffers real-time insights into model convergence and enhances the training\nprocess and data management. The HM System, comprising the Adaptive Training\nEngine (ATE), Training State Monitor (TSM), and Data Management Unit (DMU),\nallows for precise monitoring of the model's training progress and enables\nefficient optimization of data distribution, thereby enhancing training\neffectiveness. Extensive evaluations show that the Aquila2 model series\nperforms comparably well on both English and Chinese benchmarks. Specifically,\nAquila2-34B demonstrates only a slight decrease in performance when quantized\nto Int4. Furthermore, we have made our training code\n(https://github.com/FlagOpen/FlagScale) and model weights\n(https://github.com/FlagAI-Open/Aquila2) publicly available to support ongoing\nresearch and the development of applications.\n","authors":["Bo-Wen Zhang","Liangdong Wang","Jijie Li","Shuhao Gu","Xinya Wu","Zhengduo Zhang","Boyan Gao","Yulong Ao","Guang Liu"],"pdf_url":"https://arxiv.org/pdf/2408.07410v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.07402v1","updated":"2024-08-14T09:21:23Z","published":"2024-08-14T09:21:23Z","title":"A Quantum-Inspired Analysis of Human Disambiguation Processes","summary":" Formal languages are essential for computer programming and are constructed\nto be easily processed by computers. In contrast, natural languages are much\nmore challenging and instigated the field of Natural Language Processing (NLP).\nOne major obstacle is the ubiquity of ambiguities. Recent advances in NLP have\nled to the development of large language models, which can resolve ambiguities\nwith high accuracy. At the same time, quantum computers have gained much\nattention in recent years as they can solve some computational problems faster\nthan classical computers. This new computing paradigm has reached the fields of\nmachine learning and NLP, where hybrid classical-quantum learning algorithms\nhave emerged. However, more research is needed to identify which NLP tasks\ncould benefit from a genuine quantum advantage. In this thesis, we applied\nformalisms arising from foundational quantum mechanics, such as contextuality\nand causality, to study ambiguities arising from linguistics. By doing so, we\nalso reproduced psycholinguistic results relating to the human disambiguation\nprocess. 
These results were subsequently used to predict human behaviour and\noutperformed current NLP methods.\n","authors":["Daphne Wang"],"pdf_url":"https://arxiv.org/pdf/2408.07402v1.pdf","comment":"PhD thesis"},{"id":"http://arxiv.org/abs/2408.07401v1","updated":"2024-08-14T09:20:17Z","published":"2024-08-14T09:20:17Z","title":"DataVisT5: A Pre-trained Language Model for Jointly Understanding Text\n and Data Visualization","summary":" Data visualization (DV) is the fundamental and premise tool to improve the\nefficiency in conveying the insights behind the big data, which has been widely\naccepted in existing data-driven world. Task automation in DV, such as\nconverting natural language queries to visualizations (i.e., text-to-vis),\ngenerating explanations from visualizations (i.e., vis-to-text), answering\nDV-related questions in free form (i.e. FeVisQA), and explicating tabular data\n(i.e., table-to-text), is vital for advancing the field. Despite their\npotential, the application of pre-trained language models (PLMs) like T5 and\nBERT in DV has been limited by high costs and challenges in handling\ncross-modal information, leading to few studies on PLMs for DV. We introduce\n\\textbf{DataVisT5}, a novel PLM tailored for DV that enhances the T5\narchitecture through a hybrid objective pre-training and multi-task fine-tuning\nstrategy, integrating text and DV datasets to effectively interpret cross-modal\nsemantics. Extensive evaluations on public datasets show that DataVisT5\nconsistently outperforms current state-of-the-art models on various DV-related\ntasks. We anticipate that DataVisT5 will not only inspire further research on\nvertical PLMs but also expand the range of applications for PLMs.\n","authors":["Zhuoyue Wan","Yuanfeng Song","Shuaimin Li","Chen Jason Zhang","Raymond Chi-Wing Wong"],"pdf_url":"https://arxiv.org/pdf/2408.07401v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.07377v1","updated":"2024-08-14T08:53:00Z","published":"2024-08-14T08:53:00Z","title":"Do GPT Language Models Suffer From Split Personality Disorder? The\n Advent Of Substrate-Free Psychometrics","summary":" Previous research on emergence in large language models shows these display\napparent human-like abilities and psychological latent traits. However, results\nare partly contradicting in expression and magnitude of these latent traits,\nyet agree on the worrisome tendencies to score high on the Dark Triad of\nnarcissism, psychopathy, and Machiavellianism, which, together with a track\nrecord of derailments, demands more rigorous research on safety of these\nmodels. We provided a state of the art language model with the same personality\nquestionnaire in nine languages, and performed Bayesian analysis of Gaussian\nMixture Model, finding evidence for a deeper-rooted issue. Our results suggest\nboth interlingual and intralingual instabilities, which indicate that current\nlanguage models do not develop a consistent core personality. This can lead to\nunsafe behaviour of artificial intelligence systems that are based on these\nfoundation models, and are increasingly integrated in human life. 
We\nsubsequently discuss the shortcomings of modern psychometrics, abstract it, and\nprovide a framework for its species-neutral, substrate-free formulation.\n","authors":["Peter Romero","Stephen Fitz","Teruo Nakatsuma"],"pdf_url":"https://arxiv.org/pdf/2408.07377v1.pdf","comment":"37 pages, 7 figures, 3 tables, date v1: Mar 26 2023"},{"id":"http://arxiv.org/abs/2407.21264v2","updated":"2024-08-14T08:10:43Z","published":"2024-07-31T00:56:09Z","title":"Model Attribution in LLM-Generated Disinformation: A Domain\n Generalization Approach with Supervised Contrastive Learning","summary":" Model attribution for LLM-generated disinformation poses a significant\nchallenge in understanding its origins and mitigating its spread. This task is\nespecially challenging because modern large language models (LLMs) produce\ndisinformation with human-like quality. Additionally, the diversity in\nprompting methods used to generate disinformation complicates accurate source\nattribution. These methods introduce domain-specific features that can mask the\nfundamental characteristics of the models. In this paper, we introduce the\nconcept of model attribution as a domain generalization problem, where each\nprompting method represents a unique domain. We argue that an effective\nattribution model must be invariant to these domain-specific features. It\nshould also be proficient in identifying the originating models across all\nscenarios, reflecting real-world detection challenges. To address this, we\nintroduce a novel approach based on Supervised Contrastive Learning. This\nmethod is designed to enhance the model's robustness to variations in prompts\nand focuses on distinguishing between different source LLMs. We evaluate our\nmodel through rigorous experiments involving three common prompting methods:\n``open-ended'', ``rewriting'', and ``paraphrasing'', and three advanced LLMs:\n``llama 2'', ``chatgpt'', and ``vicuna''. Our results demonstrate the\neffectiveness of our approach in model attribution tasks, achieving\nstate-of-the-art performance across diverse and unseen datasets.\n","authors":["Alimohammad Beigi","Zhen Tan","Nivedh Mudiam","Canyu Chen","Kai Shu","Huan Liu"],"pdf_url":"https://arxiv.org/pdf/2407.21264v2.pdf","comment":"10 pages, 2 figures, accepted at DSAA 2024"},{"id":"http://arxiv.org/abs/2408.07353v1","updated":"2024-08-14T07:57:51Z","published":"2024-08-14T07:57:51Z","title":"Only One Relation Possible? Modeling the Ambiguity in Event Temporal\n Relation Extraction","summary":" Event Temporal Relation Extraction (ETRE) aims to identify the temporal\nrelationship between two events, which plays an important role in natural\nlanguage understanding. Most previous works follow a single-label\nclassification style, classifying an event pair into either a specific temporal\nrelation (e.g., \\textit{Before}, \\textit{After}), or a special label\n\\textit{Vague} when there may be multiple possible temporal relations between\nthe pair. In our work, instead of directly making predictions on\n\\textit{Vague}, we propose a multi-label classification solution for ETRE\n(METRE) to infer the possibility of each temporal relation independently, where\nwe treat \\textit{Vague} as the cases when there is more than one possible\nrelation between two events. We design a speculation mechanism to explore the\npossible relations hidden behind \\textit{Vague}, which enables the latent\ninformation to be used efficiently. 
Experiments on TB-Dense, MATRES and UDS-T\nshow that our method can effectively utilize the \\textit{Vague} instances to\nimprove the recognition for specific temporal relations and outperforms most\nstate-of-the-art methods.\n","authors":["Yutong Hu","Quzhe Huang","Yansong Feng"],"pdf_url":"https://arxiv.org/pdf/2408.07353v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.14483v2","updated":"2024-08-14T07:42:30Z","published":"2023-10-23T01:29:18Z","title":"Chain-of-Factors Paper-Reviewer Matching","summary":" With the rapid increase in paper submissions to academic conferences, the\nneed for automated and accurate paper-reviewer matching is more critical than\never. Previous efforts in this area have considered various factors to assess\nthe relevance of a reviewer's expertise to a paper, such as the semantic\nsimilarity, shared topics, and citation connections between the paper and the\nreviewer's previous works. However, most of these studies focus on only one\nfactor, resulting in an incomplete evaluation of the paper-reviewer relevance.\nTo address this issue, we propose a unified model for paper-reviewer matching\nthat jointly considers semantic, topic, and citation factors. To be specific,\nduring training, we instruction-tune a contextualized language model shared\nacross all factors to capture their commonalities and characteristics; during\ninference, we chain the three factors to enable step-by-step, coarse-to-fine\nsearch for qualified reviewers given a submission. Experiments on four datasets\n(one of which is newly contributed by us) spanning various fields such as\nmachine learning, computer vision, information retrieval, and data mining\nconsistently demonstrate the effectiveness of our proposed Chain-of-Factors\nmodel in comparison with state-of-the-art paper-reviewer matching methods and\nscientific pre-trained language models.\n","authors":["Yu Zhang","Yanzhen Shen","SeongKu Kang","Xiusi Chen","Bowen Jin","Jiawei Han"],"pdf_url":"https://arxiv.org/pdf/2310.14483v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.00655v5","updated":"2024-08-14T07:34:44Z","published":"2024-08-01T15:45:19Z","title":"SentenceVAE: Enable Next-sentence Prediction for Large Language Models\n with Faster Speed, Higher Accuracy and Longer Context","summary":" Current large language models (LLMs) primarily utilize next-token prediction\nmethod for inference, which significantly impedes their processing speed. In\nthis paper, we introduce a novel inference methodology termed next-sentence\nprediction, aiming at enhancing the inference efficiency of LLMs. We present\nSentence Variational Autoencoder (SentenceVAE), which includes a Sentence\nEncoder to compress multiple tokens in a sentence into a single token, and a\nSentence Decoder to reconstruct it. By integrating SentenceVAE into the input\nand output layers of LLMs, we develop Sentence-level LLMs (SLLMs) that employ a\nsentence-by-sentence inference method. In addition, the SentenceVAE module of\nSLLMs can maintain the integrity of the original semantic content by segmenting\nthe context into sentences, thereby improving accuracy while boosting inference\nspeed. 
Moreover, compared to previous LLMs, SLLMs process fewer tokens over\nequivalent context length, significantly reducing memory demands for\nself-attention computation and facilitating the handling of longer context.\nExtensive experiments on Wanjuan dataset have revealed that the proposed method\ncan accelerate inference speed by 204~365%, reduce perplexity (PPL) to 46~75%\nof its original metric, and decrease memory overhead by 86~91% for the\nequivalent context length, compared to previous token-by-token methods.\n","authors":["Hongjun An","Yifan Chen","Zhe Sun","Xuelong Li"],"pdf_url":"https://arxiv.org/pdf/2408.00655v5.pdf","comment":"update the article"},{"id":"http://arxiv.org/abs/2408.05545v2","updated":"2024-08-14T05:43:22Z","published":"2024-08-10T13:03:19Z","title":"Multi-layer Sequence Labeling-based Joint Biomedical Event Extraction","summary":" In recent years, biomedical event extraction has been dominated by\ncomplicated pipeline and joint methods, which need to be simplified. In\naddition, existing work has not effectively utilized trigger word information\nexplicitly. Hence, we propose MLSL, a method based on multi-layer sequence\nlabeling for joint biomedical event extraction. MLSL does not introduce prior\nknowledge and complex structures. Moreover, it explicitly incorporates the\ninformation of candidate trigger words into the sequence labeling to learn the\ninteraction relationships between trigger words and argument roles. Based on\nthis, MLSL can learn well with just a simple workflow. Extensive\nexperimentation demonstrates the superiority of MLSL in terms of extraction\nperformance compared to other state-of-the-art methods.\n","authors":["Gongchi Chen","Pengchao Wu","Jinghang Gu","Longhua Qian","Guodong Zhou"],"pdf_url":"https://arxiv.org/pdf/2408.05545v2.pdf","comment":"13 pages, 3 figures, accepted by NLPCC2024"},{"id":"http://arxiv.org/abs/2408.07303v1","updated":"2024-08-14T05:18:43Z","published":"2024-08-14T05:18:43Z","title":"Enhancing Visual Question Answering through Ranking-Based Hybrid\n Training and Multimodal Fusion","summary":" Visual Question Answering (VQA) is a challenging task that requires systems\nto provide accurate answers to questions based on image content. Current VQA\nmodels struggle with complex questions due to limitations in capturing and\nintegrating multimodal information effectively. To address these challenges, we\npropose the Rank VQA model, which leverages a ranking-inspired hybrid training\nstrategy to enhance VQA performance. The Rank VQA model integrates high-quality\nvisual features extracted using the Faster R-CNN model and rich semantic text\nfeatures obtained from a pre-trained BERT model. These features are fused\nthrough a sophisticated multimodal fusion technique employing multi-head\nself-attention mechanisms. Additionally, a ranking learning module is\nincorporated to optimize the relative ranking of answers, thus improving answer\naccuracy. The hybrid training strategy combines classification and ranking\nlosses, enhancing the model's generalization ability and robustness across\ndiverse datasets. Experimental results demonstrate the effectiveness of the\nRank VQA model. Our model significantly outperforms existing state-of-the-art\nmodels on standard VQA datasets, including VQA v2.0 and COCO-QA, in terms of\nboth accuracy and Mean Reciprocal Rank (MRR). 
The superior performance of Rank\nVQA is evident in its ability to handle complex questions that require\nunderstanding nuanced details and making sophisticated inferences from the\nimage and text. This work highlights the effectiveness of a ranking-based\nhybrid training strategy in improving VQA performance and lays the groundwork\nfor further research in multimodal learning methods.\n","authors":["Peiyuan Chen","Zecheng Zhang","Yiping Dong","Li Zhou","Han Wang"],"pdf_url":"https://arxiv.org/pdf/2408.07303v1.pdf","comment":"Visual Question Answering, Rank VQA, Faster R-CNN, BERT, Multimodal\n Fusion, Ranking Learning, Hybrid Training Strategy"},{"id":"http://arxiv.org/abs/2408.04975v3","updated":"2024-08-14T05:09:42Z","published":"2024-08-09T09:56:30Z","title":"\\textit{re}CSE: Portable Reshaping Features for Sentence Embedding in\n Self-supervised Contrastive Learning","summary":" We propose \\textit{re}CSE, a self supervised contrastive learning sentence\nrepresentation framework based on feature reshaping. This framework is\ndifferent from the current advanced models that use discrete data augmentation\nmethods, but instead reshapes the input features of the original sentence,\naggregates the global information of each token in the sentence, and alleviates\nthe common problems of representation polarity and GPU memory consumption\nlinear increase in current advanced models. In addition, our \\textit{re}CSE has\nachieved competitive performance in semantic similarity tasks. And the\nexperiment proves that our proposed feature reshaping method has strong\nuniversality, which can be transplanted to other self supervised contrastive\nlearning frameworks and enhance their representation ability, even achieving\nstate-of-the-art performance. Our code is available at\nhttps://github.com/heavenhellchen/reCSE.\n","authors":["Fufangchen Zhao","Jian Gao","Danfeng Yan"],"pdf_url":"https://arxiv.org/pdf/2408.04975v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.13764v2","updated":"2024-08-14T04:05:23Z","published":"2023-12-21T11:43:41Z","title":"A Semantic Space is Worth 256 Language Descriptions: Make Stronger\n Segmentation Models with Descriptive Properties","summary":" This paper introduces ProLab, a novel approach using property-level label\nspace for creating strong interpretable segmentation models. Instead of relying\nsolely on category-specific annotations, ProLab uses descriptive properties\ngrounded in common sense knowledge for supervising segmentation models. It is\nbased on two core designs. First, we employ Large Language Models (LLMs) and\ncarefully crafted prompts to generate descriptions of all involved categories\nthat carry meaningful common sense knowledge and follow a structured format.\nSecond, we introduce a description embedding model preserving semantic\ncorrelation across descriptions and then cluster them into a set of descriptive\nproperties (e.g., 256) using K-Means. These properties are based on\ninterpretable common sense knowledge consistent with theories of human\nrecognition. We empirically show that our approach makes segmentation models\nperform stronger on five classic benchmarks (e.g., ADE20K, COCO-Stuff, Pascal\nContext, Cityscapes, and BDD). Our method also shows better scalability with\nextended training steps than category-level supervision. Our interpretable\nsegmentation framework also emerges with the generalization ability to segment\nout-of-domain or unknown categories using only in-domain descriptive\nproperties. 
Code is available at https://github.com/lambert-x/ProLab.\n","authors":["Junfei Xiao","Ziqi Zhou","Wenxuan Li","Shiyi Lan","Jieru Mei","Zhiding Yu","Alan Yuille","Yuyin Zhou","Cihang Xie"],"pdf_url":"https://arxiv.org/pdf/2312.13764v2.pdf","comment":"Preprint. Code is available at https://github.com/lambert-x/ProLab"},{"id":"http://arxiv.org/abs/2305.07895v6","updated":"2024-08-14T03:30:14Z","published":"2023-05-13T11:28:37Z","title":"On the Hidden Mystery of OCR in Large Multimodal Models","summary":" Large models have recently played a dominant role in natural language\nprocessing and multimodal vision-language learning. However, their\neffectiveness in text-related visual tasks remains relatively unexplored. In\nthis paper, we conducted a comprehensive evaluation of Large Multimodal Models,\nsuch as GPT4V and Gemini, in various text-related visual tasks including Text\nRecognition, Scene Text-Centric Visual Question Answering (VQA),\nDocument-Oriented VQA, Key Information Extraction (KIE), and Handwritten\nMathematical Expression Recognition (HMER). To facilitate the assessment of\nOptical Character Recognition (OCR) capabilities in Large Multimodal Models, we\npropose OCRBench, a comprehensive evaluation benchmark. OCRBench contains 29\ndatasets, making it the most comprehensive OCR evaluation benchmark available.\nFurthermore, our study reveals both the strengths and weaknesses of these\nmodels, particularly in handling multilingual text, handwritten text,\nnon-semantic text, and mathematical expression recognition. Most importantly,\nthe baseline results presented in this study could provide a foundational\nframework for the conception and assessment of innovative strategies targeted\nat enhancing zero-shot multimodal techniques. The evaluation pipeline and\nbenchmark are available at https://github.com/Yuliang-Liu/MultimodalOCR.\n","authors":["Yuliang Liu","Zhang Li","Mingxin Huang","Biao Yang","Wenwen Yu","Chunyuan Li","Xucheng Yin","Cheng-lin Liu","Lianwen Jin","Xiang Bai"],"pdf_url":"https://arxiv.org/pdf/2305.07895v6.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.11173v4","updated":"2024-08-14T03:06:00Z","published":"2024-06-17T03:26:02Z","title":"BSRBF-KAN: A combination of B-splines and Radial Basis Functions in\n Kolmogorov-Arnold Networks","summary":" In this paper, we introduce BSRBF-KAN, a Kolmogorov Arnold Network (KAN) that\ncombines B-splines and radial basis functions (RBFs) to fit input vectors\nduring data training. We perform experiments with BSRBF-KAN, multi-layer\nperception (MLP), and other popular KANs, including EfficientKAN, FastKAN,\nFasterKAN, and GottliebKAN over the MNIST and Fashion-MNIST datasets. BSRBF-KAN\nshows stability in 5 training runs with a competitive average accuracy of\n97.55% on MNIST and 89.33% on Fashion-MNIST and obtains convergence better than\nother networks. We expect BSRBF-KAN to open many combinations of mathematical\nfunctions to design KANs. Our repo is publicly available at:\nhttps://github.com/hoangthangta/BSRBF_KAN.\n","authors":["Hoang-Thang Ta"],"pdf_url":"https://arxiv.org/pdf/2406.11173v4.pdf","comment":"8 pages, 1 figure, 3 tables"},{"id":"http://arxiv.org/abs/2402.06457v2","updated":"2024-08-14T02:41:48Z","published":"2024-02-09T15:02:56Z","title":"V-STaR: Training Verifiers for Self-Taught Reasoners","summary":" Common self-improvement approaches for large language models (LLMs), such as\nSTaR, iteratively fine-tune LLMs on self-generated solutions to improve their\nproblem-solving ability. 
However, these approaches discard the large amounts of\nincorrect solutions generated during this process, potentially neglecting\nvaluable information in such solutions. To address this shortcoming, we propose\nV-STaR that utilizes both the correct and incorrect solutions generated during\nthe self-improvement process to train a verifier using DPO that judges\ncorrectness of model-generated solutions. This verifier is used at inference\ntime to select one solution among many candidate solutions. Running V-STaR for\nmultiple iterations results in progressively better reasoners and verifiers,\ndelivering a 4% to 17% test accuracy improvement over existing self-improvement\nand verification approaches on common code generation and math reasoning\nbenchmarks with LLaMA2 models.\n","authors":["Arian Hosseini","Xingdi Yuan","Nikolay Malkin","Aaron Courville","Alessandro Sordoni","Rishabh Agarwal"],"pdf_url":"https://arxiv.org/pdf/2402.06457v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.06578v2","updated":"2024-08-14T01:37:39Z","published":"2024-08-13T02:35:54Z","title":"OpenEP: Open-Ended Future Event Prediction","summary":" Future event prediction (FEP) is a long-standing and crucial task in the\nworld, as understanding the evolution of events enables early risk\nidentification, informed decision-making, and strategic planning. Existing work\ntypically treats event prediction as classification tasks and confines the\noutcomes of future events to a fixed scope, such as yes/no questions, candidate\nset, and taxonomy, which is difficult to include all possible outcomes of\nfuture events. In this paper, we introduce OpenEP (an Open-Ended Future Event\nPrediction task), which generates flexible and diverse predictions aligned with\nreal-world scenarios. This is mainly reflected in two aspects: firstly, the\npredictive questions are diverse, covering different stages of event\ndevelopment and perspectives; secondly, the outcomes are flexible, without\nconstraints on scope or format. To facilitate the study of this task, we\nconstruct OpenEPBench, an open-ended future event prediction dataset. For\nquestion construction, we pose questions from seven perspectives, including\nlocation, time, event development, event outcome, event impact, event response,\nand other, to facilitate an in-depth analysis and understanding of the\ncomprehensive evolution of events. For outcome construction, we collect\nfree-form text containing the outcomes as ground truth to provide semantically\ncomplete and detail-enriched outcomes. Furthermore, we propose StkFEP, a\nstakeholder-enhanced future event prediction framework, that incorporates event\ncharacteristics for open-ended settings. Our method extracts stakeholders\ninvolved in events to extend questions to gather diverse information. We also\ncollect historically events that are relevant and similar to the question to\nreveal potential evolutionary patterns. 
Experiment results indicate that\naccurately predicting future events in open-ended settings is challenging for\nexisting LLMs.\n","authors":["Yong Guan","Hao Peng","Xiaozhi Wang","Lei Hou","Juanzi Li"],"pdf_url":"https://arxiv.org/pdf/2408.06578v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.07852v1","updated":"2024-08-14T23:34:28Z","published":"2024-08-14T23:34:28Z","title":"Training Language Models on the Knowledge Graph: Insights on\n Hallucinations and Their Detectability","summary":" While many capabilities of language models (LMs) improve with increased\ntraining budget, the influence of scale on hallucinations is not yet fully\nunderstood. Hallucinations come in many forms, and there is no universally\naccepted definition. We thus focus on studying only those hallucinations where\na correct answer appears verbatim in the training set. To fully control the\ntraining data content, we construct a knowledge graph (KG)-based dataset, and\nuse it to train a set of increasingly large LMs. We find that for a fixed\ndataset, larger and longer-trained LMs hallucinate less. However, hallucinating\non $\\leq5$% of the training data requires an order of magnitude larger model,\nand thus an order of magnitude more compute, than Hoffmann et al. (2022)\nreported was optimal. Given this costliness, we study how hallucination\ndetectors depend on scale. While we see detector size improves performance on\nfixed LM's outputs, we find an inverse relationship between the scale of the LM\nand the detectability of its hallucinations.\n","authors":["Jiri Hron","Laura Culp","Gamaleldin Elsayed","Rosanne Liu","Ben Adlam","Maxwell Bileschi","Bernd Bohnet","JD Co-Reyes","Noah Fiedel","C. Daniel Freeman","Izzeddin Gur","Kathleen Kenealy","Jaehoon Lee","Peter J. Liu","Gaurav Mishra","Igor Mordatch","Azade Nova","Roman Novak","Aaron Parisi","Jeffrey Pennington","Alex Rizkowsky","Isabelle Simpson","Hanie Sedghi","Jascha Sohl-dickstein","Kevin Swersky","Sharad Vikram","Tris Warkentin","Lechao Xiao","Kelvin Xu","Jasper Snoek","Simon Kornblith"],"pdf_url":"https://arxiv.org/pdf/2408.07852v1.pdf","comment":"Published at COLM 2024. 16 pages, 11 figures"},{"id":"http://arxiv.org/abs/2408.07851v1","updated":"2024-08-14T23:33:10Z","published":"2024-08-14T23:33:10Z","title":"SER Evals: In-domain and Out-of-domain Benchmarking for Speech Emotion\n Recognition","summary":" Speech emotion recognition (SER) has made significant strides with the advent\nof powerful self-supervised learning (SSL) models. However, the generalization\nof these models to diverse languages and emotional expressions remains a\nchallenge. We propose a large-scale benchmark to evaluate the robustness and\nadaptability of state-of-the-art SER models in both in-domain and out-of-domain\nsettings. Our benchmark includes a diverse set of multilingual datasets,\nfocusing on less commonly used corpora to assess generalization to new data. We\nemploy logit adjustment to account for varying class distributions and\nestablish a single dataset cluster for systematic evaluation. Surprisingly, we\nfind that the Whisper model, primarily designed for automatic speech\nrecognition, outperforms dedicated SSL models in cross-lingual SER. Our results\nhighlight the need for more robust and generalizable SER models, and our\nbenchmark serves as a valuable resource to drive future research in this\ndirection.\n","authors":["Mohamed Osman","Daniel Z. 
Kaplan","Tamer Nadeem"],"pdf_url":"https://arxiv.org/pdf/2408.07851v1.pdf","comment":"Accepted at INTERSPEECH 2024"},{"id":"http://arxiv.org/abs/2407.06172v2","updated":"2024-08-14T22:31:35Z","published":"2024-07-08T17:48:42Z","title":"On Speeding Up Language Model Evaluation","summary":" Developing prompt-based methods with Large Language Models (LLMs) requires\nmaking numerous decisions, which give rise to a combinatorial search problem.\nFor example, selecting the right pre-trained LLM, prompt, and hyperparameters\nto attain the best performance for a task typically necessitates evaluating an\nexpoential number of candidates on large validation sets. This exhaustive\nevaluation can be time-consuming and costly, as both inference and evaluation\nof LLM-based approaches are resource-intensive. Worse, a lot of computation is\nwasted: Many hyper-parameter settings are non-competitive, and many samples\nfrom the validation set are highly correlated - providing little or no new\ninformation. So, if the goal is to identify the best method, it can be done far\nmore efficiently if the validation samples and methods are selected adaptively.\nIn this paper, we propose a novel method to address this challenge. We lean on\nlow-rank matrix factorization to fill in missing evaluations and on multi-armed\nbandits to sequentially identify the next (method, validation sample)-pair to\nevaluate. We carefully assess the efficacy of our approach on several\ncompetitive benchmark problems and show that it can identify the top-performing\nmethod using only 5-15% of the typically needed resources -- resulting in a\nstaggering 85-95% LLM cost savings.\n","authors":["Jin Peng Zhou","Christian K. Belardi","Ruihan Wu","Travis Zhang","Carla P. Gomes","Wen Sun","Kilian Q. Weinberger"],"pdf_url":"https://arxiv.org/pdf/2407.06172v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.07840v1","updated":"2024-08-14T22:28:19Z","published":"2024-08-14T22:28:19Z","title":"ONSEP: A Novel Online Neural-Symbolic Framework for Event Prediction\n Based on Large Language Model","summary":" In the realm of event prediction, temporal knowledge graph forecasting (TKGF)\nstands as a pivotal technique. Previous approaches face the challenges of not\nutilizing experience during testing and relying on a single short-term history,\nwhich limits adaptation to evolving data. In this paper, we introduce the\nOnline Neural-Symbolic Event Prediction (ONSEP) framework, which innovates by\nintegrating dynamic causal rule mining (DCRM) and dual history augmented\ngeneration (DHAG). DCRM dynamically constructs causal rules from real-time\ndata, allowing for swift adaptation to new causal relationships. In parallel,\nDHAG merges short-term and long-term historical contexts, leveraging a\nbi-branch approach to enrich event prediction. 
Our framework demonstrates\nnotable performance enhancements across diverse datasets, with significant\nHit@k (k=1,3,10) improvements, showcasing its ability to augment large language\nmodels (LLMs) for event prediction without necessitating extensive retraining.\nThe ONSEP framework not only advances the field of TKGF but also underscores\nthe potential of neural-symbolic approaches in adapting to dynamic data\nenvironments.\n","authors":["Xuanqing Yu","Wangtao Sun","Jingwei Li","Kang Liu","Chengbao Liu","Jie Tan"],"pdf_url":"https://arxiv.org/pdf/2408.07840v1.pdf","comment":"16 pages, ACL 2024 Findings"},{"id":"http://arxiv.org/abs/2404.00699v2","updated":"2024-08-14T21:56:32Z","published":"2024-03-31T14:32:02Z","title":"How Much are Large Language Models Contaminated? A Comprehensive Survey\n and the LLMSanitize Library","summary":" With the rise of Large Language Models (LLMs) in recent years, abundant new\nopportunities are emerging, but also new challenges, among which contamination\nis quickly becoming critical. Business applications and fundraising in AI have\nreached a scale at which a few percentage points gained on popular\nquestion-answering benchmarks could translate into dozens of millions of\ndollars, placing high pressure on model integrity. At the same time, it is\nbecoming harder and harder to keep track of the data that LLMs have seen; if\nnot impossible with closed-source models like GPT-4 and Claude-3 not divulging\nany information on the training set. As a result, contamination becomes a major\nissue: LLMs' performance may not be reliable anymore, as the high performance\nmay be at least partly due to their previous exposure to the data. This\nlimitation jeopardizes the entire progress in the field of NLP, yet, there\nremains a lack of methods on how to efficiently detect contamination.In this\npaper, we survey all recent work on contamination detection with LLMs, and help\nthe community track contamination levels of LLMs by releasing an open-source\nPython library named LLMSanitize implementing major contamination detection\nalgorithms.\n","authors":["Mathieu Ravaut","Bosheng Ding","Fangkai Jiao","Hailin Chen","Xingxuan Li","Ruochen Zhao","Chengwei Qin","Caiming Xiong","Shafiq Joty"],"pdf_url":"https://arxiv.org/pdf/2404.00699v2.pdf","comment":"8 pages, 1 figure, 1 table"},{"id":"http://arxiv.org/abs/2310.10845v2","updated":"2024-08-14T20:41:56Z","published":"2023-10-16T21:37:34Z","title":"CoTFormer: A Chain-of-Thought Driven Architecture with Budget-Adaptive\n Computation Cost at Inference","summary":" Scaling language models to larger and deeper sizes has led to significant\nboosts in performance. Even though the size of these models limits their\napplication in compute-constrained environments, the race to continually\ndevelop ever larger and deeper foundational models is underway. At the same\ntime -- regardless of the model size -- task-specific techniques continue to\nplay a pivotal role in achieving optimal downstream performance. One of these\ntechniques, called Chain-of-Thought (CoT), is particularly interesting since,\nas we point out in this work, it resembles employing a deeper transformer\nthrough re-applying the model multiple times. However, a key subtlety in\ncomputing the attention of past tokens differentiates CoT from simply applying\nthe model several times. 
Based on this insight, we propose CoTFormer, a novel\narchitecture which closely mimics CoT at the token level, allowing us to obtain\nsignificantly improved accuracies close to much larger models. While applying\nCoT introduces additional computation costs, we compensate for it by leveraging\nCoTFormer's special compatibility with token-wise variable depth. Through a\ncompute adaptive model -- which automatically allocates the compute to tokens\nthat need it most -- we show that it is possible to reduce the computation cost\nsignificantly without any reduction in accuracy, and with further compute cost\nreductions possible while maintaining a competitive accuracy.\n","authors":["Amirkeivan Mohtashami","Matteo Pagliardini","Martin Jaggi"],"pdf_url":"https://arxiv.org/pdf/2310.10845v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.06062v2","updated":"2024-08-14T20:16:23Z","published":"2024-08-12T11:23:24Z","title":"On Tables with Numbers, with Numbers","summary":" This paper is a critical reflection on the epistemic culture of contemporary\ncomputational linguistics, framed in the context of its growing obsession with\ntables with numbers. We argue against tables with numbers on the basis of their\nepistemic irrelevance, their environmental impact, their role in enabling and\nexacerbating social inequalities, and their deep ties to commercial\napplications and profit-driven research. We substantiate our arguments with\nempirical evidence drawn from a meta-analysis of computational linguistics\nresearch over the last decade.\n","authors":["Konstantinos Kogkalidis","Stergios Chatzikyriakidis"],"pdf_url":"https://arxiv.org/pdf/2408.06062v2.pdf","comment":"v2: corrected Figure 2 scale and caption (thanks go to Ernest Davis)"},{"id":"http://arxiv.org/abs/2401.17435v4","updated":"2024-08-14T19:23:43Z","published":"2024-01-30T20:49:47Z","title":"Can LLMs Replace Economic Choice Prediction Labs? The Case of\n Language-based Persuasion Games","summary":" Human choice prediction in economic contexts is crucial for applications in\nmarketing, finance, public policy, and more. This task, however, is often\nconstrained by the difficulties in acquiring human choice data. With most\nexperimental economics studies focusing on simple choice settings, the AI\ncommunity has explored whether LLMs can substitute for humans in these\npredictions and examined more complex experimental economics settings. However,\na key question remains: can LLMs generate training data for human choice\nprediction? We explore this in language-based persuasion games, a complex\neconomic setting involving natural language in strategic interactions. Our\nexperiments show that models trained on LLM-generated data can effectively\npredict human behavior in these games and even outperform models trained on\nactual human data.\n","authors":["Eilam Shapira","Omer Madmon","Roi Reichart","Moshe Tennenholtz"],"pdf_url":"https://arxiv.org/pdf/2401.17435v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.06537v2","updated":"2024-08-14T18:38:11Z","published":"2024-08-13T00:06:56Z","title":"Introducing the NewsPaLM MBR and QE Dataset: LLM-Generated High-Quality\n Parallel Data Outperforms Traditional Web-Crawled Data","summary":" Recent research in neural machine translation (NMT) has shown that training\non high-quality machine-generated data can outperform training on\nhuman-generated data. 
This work accompanies the first-ever release of a\nLLM-generated, MBR-decoded and QE-reranked dataset with both sentence-level and\nmulti-sentence examples. We perform extensive experiments to demonstrate the\nquality of our dataset in terms of its downstream impact on NMT model\nperformance. We find that training from scratch on our (machine-generated)\ndataset outperforms training on the (web-crawled) WMT'23 training dataset\n(which is 300 times larger), and also outperforms training on the top-quality\nsubset of the WMT'23 training dataset. We also find that performing\nself-distillation by finetuning the LLM which generated this dataset\noutperforms the LLM's strong few-shot baseline. These findings corroborate the\nquality of our dataset, and demonstrate the value of high-quality\nmachine-generated data in improving performance of NMT models.\n","authors":["Mara Finkelstein","David Vilar","Markus Freitag"],"pdf_url":"https://arxiv.org/pdf/2408.06537v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.11852v2","updated":"2024-08-14T18:20:22Z","published":"2024-04-02T15:05:06Z","title":"Risks from Language Models for Automated Mental Healthcare: Ethics and\n Structure for Implementation","summary":" Amidst the growing interest in developing task-autonomous AI for automated\nmental health care, this paper addresses the ethical and practical challenges\nassociated with the issue and proposes a structured framework that delineates\nlevels of autonomy, outlines ethical requirements, and defines beneficial\ndefault behaviors for AI agents in the context of mental health support. We\nalso evaluate fourteen state-of-the-art language models (ten off-the-shelf,\nfour fine-tuned) using 16 mental health-related questionnaires designed to\nreflect various mental health conditions, such as psychosis, mania, depression,\nsuicidal thoughts, and homicidal tendencies. The questionnaire design and\nresponse evaluations were conducted by mental health clinicians (M.D.s). We\nfind that existing language models are insufficient to match the standard\nprovided by human professionals who can navigate nuances and appreciate\ncontext. This is due to a range of issues, including overly cautious or\nsycophantic responses and the absence of necessary safeguards. Alarmingly, we\nfind that most of the tested models could cause harm if accessed in mental\nhealth emergencies, failing to protect users and potentially exacerbating\nexisting symptoms. We explore solutions to enhance the safety of current\nmodels. Before the release of increasingly task-autonomous AI systems in mental\nhealth, it is crucial to ensure that these models can reliably detect and\nmanage symptoms of common psychiatric disorders to prevent harm to users. This\ninvolves aligning with the ethical framework and default behaviors outlined in\nour study. 
We contend that model developers are responsible for refining their\nsystems per these guidelines to safeguard against the risks posed by current AI\ntechnologies to user mental health and safety.\n Trigger warning: Contains and discusses examples of sensitive mental health\ntopics, including suicide and self-harm.\n","authors":["Declan Grabb","Max Lamparth","Nina Vasan"],"pdf_url":"https://arxiv.org/pdf/2406.11852v2.pdf","comment":"Updated with fine-tuned model results to match CoLM accepted\n camera-ready version"},{"id":"http://arxiv.org/abs/2405.03862v3","updated":"2024-08-14T18:01:13Z","published":"2024-05-06T21:20:35Z","title":"Persona Inconstancy in Multi-Agent LLM Collaboration: Conformity,\n Confabulation, and Impersonation","summary":" Multi-agent AI systems can be used for simulating collective decision-making\nin scientific and practical applications. They can also be used to introduce a\ndiverse group discussion step in chatbot pipelines, enhancing the cultural\nsensitivity of the chatbot's responses. These applications, however, are\npredicated on the ability of AI agents to reliably adopt assigned personas and\nmimic human interactions. To see whether LLM agents satisfy these requirements,\nwe examine AI agent ensembles engaged in cross-national collaboration and\ndebate by analyzing their private responses and chat transcripts. Our findings\nsuggest that multi-agent discussions can support collective AI decisions that\nmore often reflect diverse perspectives, yet this effect is tempered by the\nagents' susceptibility to conformity due to perceived peer pressure and\noccasional challenges in maintaining consistent personas and opinions.\nInstructions that encourage debate in support of one's opinions rather than\ncollaboration increase the rate of inconstancy. Without addressing the factors\nwe identify, the full potential of multi-agent frameworks for producing more\nculturally diverse AI outputs or more realistic simulations of group\ndecision-making may remain untapped.\n","authors":["Razan Baltaji","Babak Hemmatian","Lav R. Varshney"],"pdf_url":"https://arxiv.org/pdf/2405.03862v3.pdf","comment":"16 pages, 8 figures, 3 tables"},{"id":"http://arxiv.org/abs/2408.08333v1","updated":"2024-08-14T22:53:07Z","published":"2024-08-14T22:53:07Z","title":"CodeMirage: Hallucinations in Code Generated by Large Language Models","summary":" Large Language Models (LLMs) have shown promising potentials in program\ngeneration and no-code automation. However, LLMs are prone to generate\nhallucinations, i.e., they generate text which sounds plausible but is\nincorrect. Although there has been a recent surge in research on LLM\nhallucinations for text generation, similar hallucination phenomenon can happen\nin code generation. Sometimes the generated code can have syntactical or\nlogical errors as well as more advanced issues like security vulnerabilities,\nmemory leaks, etc. Given the wide adaptation of LLMs to enhance efficiency in\ncode generation and development in general, it becomes imperative to\ninvestigate hallucinations in code generation. To the best of our knowledge,\nthis is the first attempt at studying hallucinations in the code generated by\nLLMs. We start by introducing the code hallucination definition and a\ncomprehensive taxonomy of code hallucination types. We propose the first\nbenchmark CodeMirage dataset for code hallucinations. 
The benchmark contains\n1,137 GPT-3.5 generated hallucinated code snippets for Python programming\nproblems from two base datasets - HumanEval and MBPP. We then propose the\nmethodology for code hallucination detection and experiment with open source\nLLMs such as CodeLLaMA as well as OpenAI's GPT-3.5 and GPT-4 models using\none-shot prompt. We find that GPT-4 performs the best on HumanEval dataset and\ngives comparable results to the fine-tuned CodeBERT baseline on MBPP dataset.\nTowards the end, we discuss various mitigation strategies for code\nhallucinations and conclude our work.\n","authors":["Vibhor Agarwal","Yulong Pei","Salwa Alamir","Xiaomo Liu"],"pdf_url":"https://arxiv.org/pdf/2408.08333v1.pdf","comment":"Accepted at AutoMates @ IJCAI 2024"}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2408.07703v1","updated":"2024-08-14T17:59:32Z","published":"2024-08-14T17:59:32Z","title":"Knowledge Distillation with Refined Logits","summary":" Recent research on knowledge distillation has increasingly focused on logit\ndistillation because of its simplicity, effectiveness, and versatility in model\ncompression. In this paper, we introduce Refined Logit Distillation (RLD) to\naddress the limitations of current logit distillation methods. Our approach is\nmotivated by the observation that even high-performing teacher models can make\nincorrect predictions, creating a conflict between the standard distillation\nloss and the cross-entropy loss. This conflict can undermine the consistency of\nthe student model's learning objectives. Previous attempts to use labels to\nempirically correct teacher predictions may undermine the class correlation. In\ncontrast, our RLD employs labeling information to dynamically refine teacher\nlogits. In this way, our method can effectively eliminate misleading\ninformation from the teacher while preserving crucial class correlations, thus\nenhancing the value and efficiency of distilled knowledge. Experimental results\non CIFAR-100 and ImageNet demonstrate its superiority over existing methods.\nThe code is provided at \\text{https://github.com/zju-SWJ/RLD}.\n","authors":["Wujie Sun","Defang Chen","Siwei Lyu","Genlang Chen","Chun Chen","Can Wang"],"pdf_url":"https://arxiv.org/pdf/2408.07703v1.pdf","comment":"11 pages, 7 figures"},{"id":"http://arxiv.org/abs/2408.07694v1","updated":"2024-08-14T17:50:27Z","published":"2024-08-14T17:50:27Z","title":"End-to-end Semantic-centric Video-based Multimodal Affective Computing","summary":" In the pathway toward Artificial General Intelligence (AGI), understanding\nhuman's affection is essential to enhance machine's cognition abilities. For\nachieving more sensual human-AI interaction, Multimodal Affective Computing\n(MAC) in human-spoken videos has attracted increasing attention. However,\nprevious methods are mainly devoted to designing multimodal fusion algorithms,\nsuffering from two issues: semantic imbalance caused by diverse pre-processing\noperations and semantic mismatch raised by inconsistent affection content\ncontained in different modalities comparing with the multimodal ground truth.\nBesides, the usage of manual features extractors make they fail in building\nend-to-end pipeline for multiple MAC downstream tasks. To address above\nchallenges, we propose a novel end-to-end framework named SemanticMAC to\ncompute multimodal semantic-centric affection for human-spoken videos. 
We\nfirstly employ pre-trained Transformer model in multimodal data pre-processing\nand design Affective Perceiver module to capture unimodal affective\ninformation. Moreover, we present a semantic-centric approach to unify\nmultimodal representation learning in three ways, including gated feature\ninteraction, multi-task pseudo label generation, and intra-/inter-sample\ncontrastive learning. Finally, SemanticMAC effectively learn specific- and\nshared-semantic representations in the guidance of semantic-centric labels.\nExtensive experimental results demonstrate that our approach surpass the\nstate-of-the-art methods on 7 public datasets in four MAC downstream tasks.\n","authors":["Ronghao Lin","Ying Zeng","Sijie Mai","Haifeng Hu"],"pdf_url":"https://arxiv.org/pdf/2408.07694v1.pdf","comment":"Under Review"},{"id":"http://arxiv.org/abs/2408.07689v1","updated":"2024-08-14T17:45:13Z","published":"2024-08-14T17:45:13Z","title":"Detecting Near-Duplicate Face Images","summary":" Near-duplicate images are often generated when applying repeated photometric\nand geometric transformations that produce imperceptible variants of the\noriginal image. Consequently, a deluge of near-duplicates can be circulated\nonline posing copyright infringement concerns. The concerns are more severe\nwhen biometric data is altered through such nuanced transformations. In this\nwork, we address the challenge of near-duplicate detection in face images by,\nfirstly, identifying the original image from a set of near-duplicates and,\nsecondly, deducing the relationship between the original image and the\nnear-duplicates. We construct a tree-like structure, called an Image Phylogeny\nTree (IPT) using a graph-theoretic approach to estimate the relationship, i.e.,\ndetermine the sequence in which they have been generated. We further extend our\nmethod to create an ensemble of IPTs known as Image Phylogeny Forests (IPFs).\nWe rigorously evaluate our method to demonstrate robustness across other\nmodalities, unseen transformations by latest generative models and IPT\nconfigurations, thereby significantly advancing the state-of-the-art\nperformance by 42% on IPF reconstruction accuracy.\n","authors":["Sudipta Banerjee","Arun Ross"],"pdf_url":"https://arxiv.org/pdf/2408.07689v1.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2310.08541v2","updated":"2024-08-14T17:43:25Z","published":"2023-10-12T17:34:20Z","title":"Idea2Img: Iterative Self-Refinement with GPT-4V(ision) for Automatic\n Image Design and Generation","summary":" We introduce ``Idea to Image,'' a system that enables multimodal iterative\nself-refinement with GPT-4V(ision) for automatic image design and generation.\nHumans can quickly identify the characteristics of different text-to-image\n(T2I) models via iterative explorations. This enables them to efficiently\nconvert their high-level generation ideas into effective T2I prompts that can\nproduce good images. We investigate if systems based on large multimodal models\n(LMMs) can develop analogous multimodal self-refinement abilities that enable\nexploring unknown models or environments via self-refining tries. Idea2Img\ncyclically generates revised T2I prompts to synthesize draft images, and\nprovides directional feedback for prompt revision, both conditioned on its\nmemory of the probed T2I model's characteristics. The iterative self-refinement\nbrings Idea2Img various advantages over vanilla T2I models. 
Notably, Idea2Img\ncan process input ideas with interleaved image-text sequences, follow ideas\nwith design instructions, and generate images of better semantic and visual\nqualities. The user preference study validates the efficacy of multimodal\niterative self-refinement on automatic image design and generation.\n","authors":["Zhengyuan Yang","Jianfeng Wang","Linjie Li","Kevin Lin","Chung-Ching Lin","Zicheng Liu","Lijuan Wang"],"pdf_url":"https://arxiv.org/pdf/2310.08541v2.pdf","comment":"ECCV 2024; Project page at https://idea2img.github.io/"},{"id":"http://arxiv.org/abs/2408.07687v1","updated":"2024-08-14T17:41:45Z","published":"2024-08-14T17:41:45Z","title":"RSD-DOG : A New Image Descriptor based on Second Order Derivatives","summary":" This paper introduces the new and powerful image patch descriptor based on\nsecond order image statistics/derivatives. Here, the image patch is treated as\na 3D surface with intensity being the 3rd dimension. The considered 3D surface\nhas a rich set of second order features/statistics such as ridges, valleys,\ncliffs and so on, that can be easily captured by using the difference of\nrotating semi Gaussian filters. The originality of this method is based on\nsuccessfully combining the response of the directional filters with that of the\nDifference of Gaussian (DOG) approach. The obtained descriptor shows a good\ndiscriminative power when dealing with the variations in illumination, scale,\nrotation, blur, viewpoint and compression. The experiments on image matching,\ndemonstrates the advantage of the obtained descriptor when compared to its\nfirst order counterparts such as SIFT, DAISY, GLOH, GIST and LIDRIC.\n","authors":["Darshan Venkatrayappa","Philippe Montesinos","Daniel Diep","Baptiste Magnier"],"pdf_url":"https://arxiv.org/pdf/2408.07687v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.19376v2","updated":"2024-08-14T17:39:06Z","published":"2024-03-28T12:38:21Z","title":"NIGHT -- Non-Line-of-Sight Imaging from Indirect Time of Flight Data","summary":" The acquisition of objects outside the Line-of-Sight of cameras is a very\nintriguing but also extremely challenging research topic. Recent works showed\nthe feasibility of this idea exploiting transient imaging data produced by\ncustom direct Time of Flight sensors. In this paper, for the first time, we\ntackle this problem using only data from an off-the-shelf indirect Time of\nFlight sensor without any further hardware requirement. We introduced a Deep\nLearning model able to re-frame the surfaces where light bounces happen as a\nvirtual mirror. This modeling makes the task easier to handle and also\nfacilitates the construction of annotated training data. From the obtained data\nit is possible to retrieve the depth information of the hidden scene. We also\nprovide a first-in-its-kind synthetic dataset for the task and demonstrate the\nfeasibility of the proposed idea over it.\n","authors":["Matteo Caligiuri","Adriano Simonetto","Pietro Zanuttigh"],"pdf_url":"https://arxiv.org/pdf/2403.19376v2.pdf","comment":"Submitted to MELEX 24 (ECCV workshop), 17 pages, 6 figures, 2 tables"},{"id":"http://arxiv.org/abs/2408.07680v1","updated":"2024-08-14T17:28:58Z","published":"2024-08-14T17:28:58Z","title":"A Spitting Image: Modular Superpixel Tokenization in Vision Transformers","summary":" Vision Transformer (ViT) architectures traditionally employ a grid-based\napproach to tokenization independent of the semantic content of an image. 
We\npropose a modular superpixel tokenization strategy which decouples tokenization\nand feature extraction; a shift from contemporary approaches where these are\ntreated as an undifferentiated whole. Using on-line content-aware tokenization\nand scale- and shape-invariant positional embeddings, we perform experiments\nand ablations that contrast our approach with patch-based tokenization and\nrandomized partitions as baselines. We show that our method significantly\nimproves the faithfulness of attributions, gives pixel-level granularity on\nzero-shot unsupervised dense prediction tasks, while maintaining predictive\nperformance in classification tasks. Our approach provides a modular\ntokenization framework commensurable with standard architectures, extending the\nspace of ViTs to a larger class of semantically-rich models.\n","authors":["Marius Aasan","Odd Kolbjørnsen","Anne Schistad Solberg","Adín Ramirez Rivera"],"pdf_url":"https://arxiv.org/pdf/2408.07680v1.pdf","comment":"To appear in ECCV (MELEX) 2024 Workshop Proceedings"},{"id":"http://arxiv.org/abs/2408.01159v2","updated":"2024-08-14T17:24:25Z","published":"2024-08-02T10:21:10Z","title":"Robust Curve Detection in Volumetric Medical Imaging via Attraction\n Field","summary":" Understanding body part geometry is crucial for precise medical diagnostics.\nCurves effectively describe anatomical structures and are widely used in\nmedical imaging applications related to cardiovascular, respiratory, and\nskeletal diseases. Traditional curve detection methods are often task-specific,\nrelying heavily on domain-specific features, limiting their broader\napplicability. This paper introduces a novel approach for detecting\nnon-branching curves, which does not require prior knowledge of the object's\norientation, shape, or position. Our method uses neural networks to predict (1)\nan attraction field, which offers subpixel accuracy, and (2) a closeness map,\nwhich limits the region of interest and essentially eliminates outliers far\nfrom the desired curve. We tested our curve detector on several clinically\nrelevant tasks with diverse morphologies and achieved impressive subpixel-level\naccuracy results that surpass existing methods, highlighting its versatility\nand robustness. Additionally, to support further advancements in this field, we\nprovide our private annotations of aortic centerlines and masks, which can\nserve as a benchmark for future research. The dataset can be found at\nhttps://github.com/neuro-ml/curve-detection.\n","authors":["Farukh Yaushev","Daria Nogina","Valentin Samokhin","Mariya Dugova","Ekaterina Petrash","Dmitry Sevryukov","Mikhail Belyaev","Maxim Pisov"],"pdf_url":"https://arxiv.org/pdf/2408.01159v2.pdf","comment":"Accepted to ShapeMI MICCAI 2024"},{"id":"http://arxiv.org/abs/2408.07675v1","updated":"2024-08-14T17:22:41Z","published":"2024-08-14T17:22:41Z","title":"G$^2$V$^2$former: Graph Guided Video Vision Transformer for Face\n Anti-Spoofing","summary":" In videos containing spoofed faces, we may uncover the spoofing evidence\nbased on either photometric or dynamic abnormality, even a combination of both.\nPrevailing face anti-spoofing (FAS) approaches generally concentrate on the\nsingle-frame scenario, however, purely photometric-driven methods overlook the\ndynamic spoofing clues that may be exposed over time. This may lead FAS systems\nto conclude incorrect judgments, especially in cases where it is easily\ndistinguishable in terms of dynamics but challenging to discern in terms of\nphotometrics. 
To this end, we propose the Graph Guided Video Vision Transformer\n(G$^2$V$^2$former), which combines faces with facial landmarks for photometric\nand dynamic feature fusion. We factorize the attention into space and time, and\nfuse them via a spatiotemporal block. Specifically, we design a novel temporal\nattention called Kronecker temporal attention, which has a wider receptive\nfield, and is beneficial for capturing dynamic information. Moreover, we\nleverage the low-semantic motion of facial landmarks to guide the high-semantic\nchange of facial expressions based on the motivation that regions containing\nlandmarks may reveal more dynamic clues. Extensive experiments on nine\nbenchmark datasets demonstrate that our method achieves superior performance\nunder various scenarios. The codes will be released soon.\n","authors":["Jingyi Yang","Zitong Yu","Xiuming Ni","Jia He","Hui Li"],"pdf_url":"https://arxiv.org/pdf/2408.07675v1.pdf","comment":"11 pages, 5 figures"},{"id":"http://arxiv.org/abs/2404.03754v2","updated":"2024-08-14T17:14:33Z","published":"2024-04-04T18:50:58Z","title":"Data Science for Geographic Information Systems","summary":" The integration of data science into Geographic Information Systems (GIS) has\nfacilitated the evolution of these tools into complete spatial analysis\nplatforms. The adoption of machine learning and big data techniques has\nequipped these platforms with the capacity to handle larger amounts of\nincreasingly complex data, transcending the limitations of more traditional\napproaches. This work traces the historical and technical evolution of data\nscience and GIS as fields of study, highlighting the critical points of\nconvergence between domains, and underlining the many sectors that rely on this\nintegration. A GIS application is presented as a case study in the disaster\nmanagement sector where we utilize aerial data from Tr\\'oia, Portugal, to\nemphasize the process of insight extraction from raw data. We conclude by\noutlining prospects for future research in integration of these fields in\ngeneral, and the developed application in particular.\n","authors":["Afonso Oliveira","Nuno Fachada","João P. Matos-Carvalho"],"pdf_url":"https://arxiv.org/pdf/2404.03754v2.pdf","comment":"The peer-reviewed version of this paper is published in IEEE Xplore\n at https://doi.org/10.1109/YEF-ECE62614.2024.10624902. This version is\n typeset by the author and differs only in pagination and typographical detail"},{"id":"http://arxiv.org/abs/2408.07666v1","updated":"2024-08-14T16:58:48Z","published":"2024-08-14T16:58:48Z","title":"Model Merging in LLMs, MLLMs, and Beyond: Methods, Theories,\n Applications and Opportunities","summary":" Model merging is an efficient empowerment technique in the machine learning\ncommunity that does not require the collection of raw training data and does\nnot require expensive computation. As model merging becomes increasingly\nprevalent across various fields, it is crucial to understand the available\nmodel merging techniques comprehensively. However, there is a significant gap\nin the literature regarding a systematic and thorough review of these\ntechniques. This survey provides a comprehensive overview of model merging\nmethods and theories, their applications in various domains and settings, and\nfuture research directions. Specifically, we first propose a new taxonomic\napproach that exhaustively discusses existing model merging methods. 
Secondly,\nwe discuss the application of model merging techniques in large language\nmodels, multimodal large language models, and 10+ machine learning subfields,\nincluding continual learning, multi-task learning, few-shot learning, etc.\nFinally, we highlight the remaining challenges of model merging and discuss\nfuture research directions. A comprehensive list of papers about model merging\nis available at\n\\url{https://github.com/EnnengYang/Awesome-Model-Merging-Methods-Theories-Applications}.\n","authors":["Enneng Yang","Li Shen","Guibing Guo","Xingwei Wang","Xiaochun Cao","Jie Zhang","Dacheng Tao"],"pdf_url":"https://arxiv.org/pdf/2408.07666v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.07648v1","updated":"2024-08-14T16:19:18Z","published":"2024-08-14T16:19:18Z","title":"See It All: Contextualized Late Aggregation for 3D Dense Captioning","summary":" 3D dense captioning is a task to localize objects in a 3D scene and generate\ndescriptive sentences for each object. Recent approaches in 3D dense captioning\nhave adopted transformer encoder-decoder frameworks from object detection to\nbuild an end-to-end pipeline without hand-crafted components. However, these\napproaches struggle with contradicting objectives where a single query\nattention has to simultaneously view both the tightly localized object regions\nand contextual environment. To overcome this challenge, we introduce SIA\n(See-It-All), a transformer pipeline that engages in 3D dense captioning with a\nnovel paradigm called late aggregation. SIA simultaneously decodes two sets of\nqueries-context query and instance query. The instance query focuses on\nlocalization and object attribute descriptions, while the context query\nversatilely captures the region-of-interest of relationships between multiple\nobjects or with the global scene, then aggregated afterwards (i.e., late\naggregation) via simple distance-based measures. To further enhance the quality\nof contextualized caption generation, we design a novel aggregator to generate\na fully informed caption based on the surrounding context, the global\nenvironment, and object instances. Extensive experiments on two of the most\nwidely-used 3D dense captioning datasets demonstrate that our proposed method\nachieves a significant improvement over prior methods.\n","authors":["Minjung Kim","Hyung Suk Lim","Seung Hwan Kim","Soonyoung Lee","Bumsoo Kim","Gunhee Kim"],"pdf_url":"https://arxiv.org/pdf/2408.07648v1.pdf","comment":"Accepted to ACL 2024 Findings"},{"id":"http://arxiv.org/abs/2408.07642v1","updated":"2024-08-14T16:13:03Z","published":"2024-08-14T16:13:03Z","title":"Boosting Unconstrained Face Recognition with Targeted Style Adversary","summary":" While deep face recognition models have demonstrated remarkable performance,\nthey often struggle on the inputs from domains beyond their training data.\nRecent attempts aim to expand the training set by relying on computationally\nexpensive and inherently challenging image-space augmentation of image\ngeneration modules. In an orthogonal direction, we present a simple yet\neffective method to expand the training data by interpolating between\ninstance-level feature statistics across labeled and unlabeled sets. Our\nmethod, dubbed Targeted Style Adversary (TSA), is motivated by two\nobservations: (i) the input domain is reflected in feature statistics, and (ii)\nface recognition model performance is influenced by style information. 
Shifting\ntowards an unlabeled style implicitly synthesizes challenging training\ninstances. We devise a recognizability metric to constraint our framework to\npreserve the inherent identity-related information of labeled instances. The\nefficacy of our method is demonstrated through evaluations on unconstrained\nbenchmarks, outperforming or being on par with its competitors while offering\nnearly a 70\\% improvement in training speed and 40\\% less memory consumption.\n","authors":["Mohammad Saeed Ebrahimi Saadabadi","Sahar Rahimi Malakshan","Seyed Rasoul Hosseini","Nasser M. Nasrabadi"],"pdf_url":"https://arxiv.org/pdf/2408.07642v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.07854v2","updated":"2024-08-14T16:08:45Z","published":"2024-03-12T17:44:45Z","title":"Distilling the Knowledge in Data Pruning","summary":" With the increasing size of datasets used for training neural networks, data\npruning becomes an attractive field of research. However, most current data\npruning algorithms are limited in their ability to preserve accuracy compared\nto models trained on the full data, especially in high pruning regimes. In this\npaper we explore the application of data pruning while incorporating knowledge\ndistillation (KD) when training on a pruned subset. That is, rather than\nrelying solely on ground-truth labels, we also use the soft predictions from a\nteacher network pre-trained on the complete data. By integrating KD into\ntraining, we demonstrate significant improvement across datasets, pruning\nmethods, and on all pruning fractions. We first establish a theoretical\nmotivation for employing self-distillation to improve training on pruned data.\nThen, we empirically make a compelling and highly practical observation: using\nKD, simple random pruning is comparable or superior to sophisticated pruning\nmethods across all pruning regimes. On ImageNet for example, we achieve\nsuperior accuracy despite training on a random subset of only 50% of the data.\nAdditionally, we demonstrate a crucial connection between the pruning factor\nand the optimal knowledge distillation weight. This helps mitigate the impact\nof samples with noisy labels and low-quality images retained by typical pruning\nalgorithms. Finally, we make an intriguing observation: when using lower\npruning fractions, larger teachers lead to accuracy degradation, while\nsurprisingly, employing teachers with a smaller capacity than the student's may\nimprove results. Our code will be made available.\n","authors":["Emanuel Ben-Baruch","Adam Botach","Igor Kviatkovsky","Manoj Aggarwal","Gérard Medioni"],"pdf_url":"https://arxiv.org/pdf/2403.07854v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2110.06382v3","updated":"2024-08-14T15:58:57Z","published":"2021-10-12T22:05:11Z","title":"A Survey of Open Source User Activity Traces with Applications to User\n Mobility Characterization and Modeling","summary":" The current state-of-the-art in user mobility research has extensively relied\non open-source mobility traces captured from pedestrian and vehicular activity\nthrough a variety of communication technologies as users engage in a wide-range\nof applications, including connected healthcare, localization, social media,\ne-commerce, etc. Most of these traces are feature-rich and diverse, not only in\nthe information they provide, but also in how they can be used and leveraged.\nThis diversity poses two main challenges for researchers and practitioners who\nwish to make use of available mobility datasets. 
First, it is quite difficult\nto get a bird's eye view of the available traces without spending considerable\ntime looking them up. Second, once they have found the traces, they still need\nto figure out whether the traces are adequate to their needs.\n The purpose of this survey is three-fold. It proposes a taxonomy to classify\nopen-source mobility traces including their mobility mode, data source and\ncollection technology. It then uses the proposed taxonomy to classify existing\nopen-source mobility traces and finally, highlights three case studies using\npopular publicly available datasets to showcase how our taxonomy can tease out\nfeature sets in traces to help determine their applicability to specific\nuse-cases.\n","authors":["Sinjoni Mukhopadhyay King","Faisal Nawab","Katia Obraczka"],"pdf_url":"https://arxiv.org/pdf/2110.06382v3.pdf","comment":"23 pages, 6 pages references"},{"id":"http://arxiv.org/abs/2406.06965v3","updated":"2024-08-14T15:38:49Z","published":"2024-06-11T05:48:04Z","title":"Evolving from Single-modal to Multi-modal Facial Deepfake Detection: A\n Survey","summary":" This survey addresses the critical challenge of deepfake detection amidst the\nrapid advancements in artificial intelligence. As AI-generated media, including\nvideo, audio and text, become more realistic, the risk of misuse to spread\nmisinformation and commit identity fraud increases. Focused on face-centric\ndeepfakes, this work traces the evolution from traditional single-modality\nmethods to sophisticated multi-modal approaches that handle audio-visual and\ntext-visual scenarios. We provide comprehensive taxonomies of detection\ntechniques, discuss the evolution of generative methods from auto-encoders and\nGANs to diffusion models, and categorize these technologies by their unique\nattributes. To our knowledge, this is the first survey of its kind. We also\nexplore the challenges of adapting detection methods to new generative models\nand enhancing the reliability and robustness of deepfake detectors, proposing\ndirections for future research. This survey offers a detailed roadmap for\nresearchers, supporting the development of technologies to counter the\ndeceptive use of AI in media creation, particularly facial forgery. A curated\nlist of all related papers can be found at\n\\href{https://github.com/qiqitao77/Comprehensive-Advances-in-Deepfake-Detection-Spanning-Diverse-Modalities}{https://github.com/qiqitao77/Awesome-Comprehensive-Deepfake-Detection}.\n","authors":["Ping Liu","Qiqi Tao","Joey Tianyi Zhou"],"pdf_url":"https://arxiv.org/pdf/2406.06965v3.pdf","comment":"P. Liu is with the Department of Computer Science and Engineering,\n University of Nevada, Reno, NV, 89512. Q. Tao and J. Zhou are with Centre for\n Frontier AI Research (CFAR), and Institute of High Performance Computing\n (IHPC), A*STAR, Singapore. J. Zhou is also with Centre for Advanced\n Technologies in Online Safety (CATOS), A*STAR, Singapore. J. Zhou is the\n corresponding author"},{"id":"http://arxiv.org/abs/2204.05566v3","updated":"2024-08-14T15:31:26Z","published":"2022-04-12T06:53:25Z","title":"Compact Model Training by Low-Rank Projection with Energy Transfer","summary":" Low-rankness plays an important role in traditional machine learning, but is\nnot so popular in deep learning. Most previous low-rank network compression\nmethods compress networks by approximating pre-trained models and re-training.\nHowever, the optimal solution in the Euclidean space may be quite different\nfrom the one with low-rank constraint. 
A well-pre-trained model is not a good\ninitialization for the model with low-rank constraints. Thus, the performance\nof a low-rank compressed network degrades significantly. Compared with other\nnetwork compression methods such as pruning, low-rank methods attract less\nattention in recent years. In this paper, we devise a new training method,\nlow-rank projection with energy transfer (LRPET), that trains low-rank\ncompressed networks from scratch and achieves competitive performance. We\npropose to alternately perform stochastic gradient descent training and\nprojection of each weight matrix onto the corresponding low-rank manifold.\nCompared to re-training on the compact model, this enables full utilization of\nmodel capacity since solution space is relaxed back to Euclidean space after\nprojection. The matrix energy (the sum of squares of singular values) reduction\ncaused by projection is compensated by energy transfer. We uniformly transfer\nthe energy of the pruned singular values to the remaining ones. We\ntheoretically show that energy transfer eases the trend of gradient vanishing\ncaused by projection. In modern networks, a batch normalization (BN) layer can\nbe merged into the previous convolution layer for inference, thereby\ninfluencing the optimal low-rank approximation of the previous layer. We\npropose BN rectification to cut off its effect on the optimal low-rank\napproximation, which further improves the performance.\n","authors":["Kailing Guo","Zhenquan Lin","Canyang Chen","Xiaofen Xing","Fang Liu","Xiangmin Xu"],"pdf_url":"https://arxiv.org/pdf/2204.05566v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.07613v1","updated":"2024-08-14T15:26:10Z","published":"2024-08-14T15:26:10Z","title":"Rethinking the Key Factors for the Generalization of Remote Sensing\n Stereo Matching Networks","summary":" Stereo matching, a critical step of 3D reconstruction, has fully shifted\ntowards deep learning due to its strong feature representation of remote\nsensing images. However, ground truth for stereo matching task relies on\nexpensive airborne LiDAR data, thus making it difficult to obtain enough\nsamples for supervised learning. To improve the generalization ability of\nstereo matching networks on cross-domain data from different sensors and\nscenarios, in this paper, we dedicate to study key training factors from three\nperspectives. (1) For the selection of training dataset, it is important to\nselect data with similar regional target distribution as the test set instead\nof utilizing data from the same sensor. (2) For model structure, cascaded\nstructure that flexibly adapts to different sizes of features is preferred. (3)\nFor training manner, unsupervised methods generalize better than supervised\nmethods, and we design an unsupervised early-stop strategy to help retain the\nbest model with pre-trained weights as the basis. Extensive experiments are\nconducted to support the previous findings, on the basis of which we present an\nunsupervised stereo matching network with good generalization performance. 
We\nrelease the source code and the datasets at\nhttps://github.com/Elenairene/RKF_RSSM to reproduce the results and encourage\nfuture work.\n","authors":["Liting Jiang","Feng Wang","Wenyi Zhang","Peifeng Li","Hongjian You","Yuming Xiang"],"pdf_url":"https://arxiv.org/pdf/2408.07613v1.pdf","comment":"submitted to IEEE jstars"},{"id":"http://arxiv.org/abs/2407.15793v3","updated":"2024-08-14T15:12:07Z","published":"2024-07-22T16:51:28Z","title":"CLIP with Generative Latent Replay: a Strong Baseline for Incremental\n Learning","summary":" With the emergence of Transformers and Vision-Language Models (VLMs) such as\nCLIP, fine-tuning large pre-trained models has recently become a prevalent\nstrategy in Continual Learning. This has led to the development of numerous\nprompting strategies to adapt transformer-based models without incurring\ncatastrophic forgetting. However, these strategies often compromise the\noriginal zero-shot capabilities of the pre-trained CLIP model and struggle to\nadapt to domains that significantly deviate from the pre-training data. In this\nwork, we propose Continual Generative training for Incremental prompt-Learning,\na simple and novel approach to mitigate forgetting while adapting CLIP.\nBriefly, we employ Variational Autoencoders (VAEs) to learn class-conditioned\ndistributions within the embedding space of the visual encoder. We then exploit\nthese distributions to sample new synthetic visual embeddings and train the\ncorresponding class-specific textual prompts during subsequent tasks. Through\nextensive experiments on different domains, we show that such a generative\nreplay approach can adapt to new tasks while improving zero-shot capabilities,\nevaluated using a novel metric tailored for CL scenarios. Notably, further\nanalysis reveals that our approach can bridge the gap with joint prompt tuning.\nThe codebase is available at https://github.com/aimagelab/mammoth.\n","authors":["Emanuele Frascaroli","Aniello Panariello","Pietro Buzzega","Lorenzo Bonicelli","Angelo Porrello","Simone Calderara"],"pdf_url":"https://arxiv.org/pdf/2407.15793v3.pdf","comment":"15 pages, 1 figure. Accepted at the The 35th British Machine Vision\n Conference 2024 (BMVC 2024), Glasgow, UK"},{"id":"http://arxiv.org/abs/2408.07605v1","updated":"2024-08-14T15:10:13Z","published":"2024-08-14T15:10:13Z","title":"Panacea+: Panoramic and Controllable Video Generation for Autonomous\n Driving","summary":" The field of autonomous driving increasingly demands high-quality annotated\nvideo training data. In this paper, we propose Panacea+, a powerful and\nuniversally applicable framework for generating video data in driving scenes.\nBuilt upon the foundation of our previous work, Panacea, Panacea+ adopts a\nmulti-view appearance noise prior mechanism and a super-resolution module for\nenhanced consistency and increased resolution. Extensive experiments show that\nthe generated video samples from Panacea+ greatly benefit a wide range of tasks\non different datasets, including 3D object tracking, 3D object detection, and\nlane detection tasks on the nuScenes and Argoverse 2 dataset. These results\nstrongly prove Panacea+ to be a valuable data generation framework for\nautonomous driving.\n","authors":["Yuqing Wen","Yucheng Zhao","Yingfei Liu","Binyuan Huang","Fan Jia","Yanhui Wang","Chi Zhang","Tiancai Wang","Xiaoyan Sun","Xiangyu Zhang"],"pdf_url":"https://arxiv.org/pdf/2408.07605v1.pdf","comment":"Project page: https://panacea-ad.github.io/. 
arXiv admin note: text\n overlap with arXiv:2311.16813"},{"id":"http://arxiv.org/abs/2408.07600v1","updated":"2024-08-14T15:00:27Z","published":"2024-08-14T15:00:27Z","title":"Disentangle and denoise: Tackling context misalignment for video moment\n retrieval","summary":" Video Moment Retrieval, which aims to locate in-context video moments\naccording to a natural language query, is an essential task for cross-modal\ngrounding. Existing methods focus on enhancing the cross-modal interactions\nbetween all moments and the textual description for video understanding.\nHowever, constantly interacting with all locations is unreasonable because of\nuneven semantic distribution across the timeline and noisy visual backgrounds.\nThis paper proposes a cross-modal Context Denoising Network (CDNet) for\naccurate moment retrieval by disentangling complex correlations and denoising\nirrelevant dynamics.Specifically, we propose a query-guided semantic\ndisentanglement (QSD) to decouple video moments by estimating alignment levels\naccording to the global and fine-grained correlation. A Context-aware Dynamic\nDenoisement (CDD) is proposed to enhance understanding of aligned\nspatial-temporal details by learning a group of query-relevant offsets.\nExtensive experiments on public benchmarks demonstrate that the proposed CDNet\nachieves state-of-the-art performances.\n","authors":["Kaijing Ma","Han Fang","Xianghao Zang","Chao Ban","Lanxiang Zhou","Zhongjiang He","Yongxiang Li","Hao Sun","Zerun Feng","Xingsong Hou"],"pdf_url":"https://arxiv.org/pdf/2408.07600v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.07595v1","updated":"2024-08-14T14:50:08Z","published":"2024-08-14T14:50:08Z","title":"Progressive Radiance Distillation for Inverse Rendering with Gaussian\n Splatting","summary":" We propose progressive radiance distillation, an inverse rendering method\nthat combines physically-based rendering with Gaussian-based radiance field\nrendering using a distillation progress map. Taking multi-view images as input,\nour method starts from a pre-trained radiance field guidance, and distills\nphysically-based light and material parameters from the radiance field using an\nimage-fitting process. The distillation progress map is initialized to a small\nvalue, which favors radiance field rendering. During early iterations when\nfitted light and material parameters are far from convergence, the radiance\nfield fallback ensures the sanity of image loss gradients and avoids local\nminima that attracts under-fit states. As fitted parameters converge, the\nphysical model gradually takes over and the distillation progress increases\ncorrespondingly. In presence of light paths unmodeled by the physical model,\nthe distillation progress never finishes on affected pixels and the learned\nradiance field stays in the final rendering. With this designed tolerance for\nphysical model limitations, we prevent unmodeled color components from leaking\ninto light and material parameters, alleviating relighting artifacts.\nMeanwhile, the remaining radiance field compensates for the limitations of the\nphysical model, guaranteeing high-quality novel views synthesis. Experimental\nresults demonstrate that our method significantly outperforms state-of-the-art\ntechniques quality-wise in both novel view synthesis and relighting. The idea\nof progressive radiance distillation is not limited to Gaussian splatting. 
We\nshow that it also has positive effects for prominently specular scenes when\nadapted to a mesh-based inverse rendering method.\n","authors":["Keyang Ye","Qiming Hou","Kun Zhou"],"pdf_url":"https://arxiv.org/pdf/2408.07595v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.05523v2","updated":"2024-08-14T14:34:34Z","published":"2024-08-10T11:39:11Z","title":"DeepFace-Attention: Multimodal Face Biometrics for Attention Estimation\n with Application to e-Learning","summary":" This work introduces an innovative method for estimating attention levels\n(cognitive load) using an ensemble of facial analysis techniques applied to\nwebcam videos. Our method is particularly useful, among others, in e-learning\napplications, so we trained, evaluated, and compared our approach on the mEBAL2\ndatabase, a public multi-modal database acquired in an e-learning environment.\nmEBAL2 comprises data from 60 users who performed 8 different tasks. These\ntasks varied in difficulty, leading to changes in their cognitive loads. Our\napproach adapts state-of-the-art facial analysis technologies to quantify the\nusers' cognitive load in the form of high or low attention. Several behavioral\nsignals and physiological processes related to the cognitive load are used,\nsuch as eyeblink, heart rate, facial action units, and head pose, among others.\nFurthermore, we conduct a study to understand which individual features obtain\nbetter results, the most efficient combinations, explore local and global\nfeatures, and how temporary time intervals affect attention level estimation,\namong other aspects. We find that global facial features are more appropriate\nfor multimodal systems using score-level fusion, particularly as the temporal\nwindow increases. On the other hand, local features are more suitable for\nfusion through neural network training with score-level fusion approaches. Our\nmethod outperforms existing state-of-the-art accuracies using the public mEBAL2\nbenchmark.\n","authors":["Roberto Daza","Luis F. Gomez","Julian Fierrez","Aythami Morales","Ruben Tolosana","Javier Ortega-Garcia"],"pdf_url":"https://arxiv.org/pdf/2408.05523v2.pdf","comment":"Article accepted in the IEEE Access journal. Accessible at\n https://ieeexplore.ieee.org/document/10633208"},{"id":"http://arxiv.org/abs/2403.15377v4","updated":"2024-08-14T14:31:50Z","published":"2024-03-22T17:57:42Z","title":"InternVideo2: Scaling Foundation Models for Multimodal Video\n Understanding","summary":" We introduce InternVideo2, a new family of video foundation models (ViFM)\nthat achieve the state-of-the-art results in video recognition, video-text\ntasks, and video-centric dialogue. Our core design is a progressive training\napproach that unifies the masked video modeling, crossmodal contrastive\nlearning, and next token prediction, scaling up the video encoder size to 6B\nparameters. At the data level, we prioritize spatiotemporal consistency by\nsemantically segmenting videos and generating video-audio-speech captions. This\nimproves the alignment between video and text. Through extensive experiments,\nwe validate our designs and demonstrate superior performance on over 60 video\nand audio tasks. Notably, our model outperforms others on various video-related\ndialogue and long video understanding benchmarks, highlighting its ability to\nreason and comprehend longer contexts. 
Code and models are available at\nhttps://github.com/OpenGVLab/InternVideo/tree/main/InternVideo2/.\n","authors":["Yi Wang","Kunchang Li","Xinhao Li","Jiashuo Yu","Yinan He","Chenting Wang","Guo Chen","Baoqi Pei","Ziang Yan","Rongkun Zheng","Jilan Xu","Zun Wang","Yansong Shi","Tianxiang Jiang","Songze Li","Hongjie Zhang","Yifei Huang","Yu Qiao","Yali Wang","Limin Wang"],"pdf_url":"https://arxiv.org/pdf/2403.15377v4.pdf","comment":"a technical report about video understanding (accepted to ECCV2024)"},{"id":"http://arxiv.org/abs/2408.07583v1","updated":"2024-08-14T14:28:11Z","published":"2024-08-14T14:28:11Z","title":"Transformers and Large Language Models for Efficient Intrusion Detection\n Systems: A Comprehensive Survey","summary":" With significant advancements in Transformers LLMs, NLP has extended its\nreach into many research fields due to its enhanced capabilities in text\ngeneration and user interaction. One field benefiting greatly from these\nadvancements is cybersecurity. In cybersecurity, many parameters that need to\nbe protected and exchanged between senders and receivers are in the form of\ntext and tabular data, making NLP a valuable tool in enhancing the security\nmeasures of communication protocols. This survey paper provides a comprehensive\nanalysis of the utilization of Transformers and LLMs in cyber-threat detection\nsystems. The methodology of paper selection and bibliometric analysis is\noutlined to establish a rigorous framework for evaluating existing research.\nThe fundamentals of Transformers are discussed, including background\ninformation on various cyber-attacks and datasets commonly used in this field.\nThe survey explores the application of Transformers in IDSs, focusing on\ndifferent architectures such as Attention-based models, LLMs like BERT and GPT,\nCNN/LSTM-Transformer hybrids, emerging approaches like ViTs, among others.\nFurthermore, it explores the diverse environments and applications where\nTransformers and LLMs-based IDS have been implemented, including computer\nnetworks, IoT devices, critical infrastructure protection, cloud computing,\nSDN, as well as in autonomous vehicles. The paper also addresses research\nchallenges and future directions in this area, identifying key issues such as\ninterpretability, scalability, and adaptability to evolving threats, and more.\nFinally, the conclusion summarizes the findings and highlights the significance\nof Transformers and LLMs in enhancing cyber-threat detection capabilities,\nwhile also outlining potential avenues for further research and development.\n","authors":["Hamza Kheddar"],"pdf_url":"https://arxiv.org/pdf/2408.07583v1.pdf","comment":"arXiv admin note: text overlap with arXiv:2405.04760 by other authors"},{"id":"http://arxiv.org/abs/2408.07576v1","updated":"2024-08-14T14:16:52Z","published":"2024-08-14T14:16:52Z","title":"MetaSeg: MetaFormer-based Global Contexts-aware Network for Efficient\n Semantic Segmentation","summary":" Beyond the Transformer, it is important to explore how to exploit the\ncapacity of the MetaFormer, an architecture that is fundamental to the\nperformance improvements of the Transformer. Previous studies have exploited it\nonly for the backbone network. Unlike previous studies, we explore the capacity\nof the Metaformer architecture more extensively in the semantic segmentation\ntask. We propose a powerful semantic segmentation network, MetaSeg, which\nleverages the Metaformer architecture from the backbone to the decoder. 
Our\nMetaSeg shows that the MetaFormer architecture plays a significant role in\ncapturing the useful contexts for the decoder as well as for the backbone. In\naddition, recent segmentation methods have shown that using a CNN-based\nbackbone for extracting the spatial information and a decoder for extracting\nthe global information is more effective than using a transformer-based\nbackbone with a CNN-based decoder. This motivates us to adopt the CNN-based\nbackbone using the MetaFormer block and design our MetaFormer-based decoder,\nwhich consists of a novel self-attention module to capture the global contexts.\nTo consider both the global contexts extraction and the computational\nefficiency of the self-attention for semantic segmentation, we propose a\nChannel Reduction Attention (CRA) module that reduces the channel dimension of\nthe query and key into the one dimension. In this way, our proposed MetaSeg\noutperforms the previous state-of-the-art methods with more efficient\ncomputational costs on popular semantic segmentation and a medical image\nsegmentation benchmark, including ADE20K, Cityscapes, COCO-stuff, and Synapse.\nThe code is available at \\url{https://github.com/hyunwoo137/MetaSeg}.\n","authors":["Beoungwoo Kang","Seunghun Moon","Yubin Cho","Hyunwoo Yu","Suk-Ju Kang"],"pdf_url":"https://arxiv.org/pdf/2408.07576v1.pdf","comment":"Accepted by WACV 2024"},{"id":"http://arxiv.org/abs/2311.01686v2","updated":"2024-08-14T14:11:25Z","published":"2023-11-03T03:18:40Z","title":"Disentangled Representation Learning with Transmitted Information\n Bottleneck","summary":" Encoding only the task-related information from the raw data, \\ie,\ndisentangled representation learning, can greatly contribute to the robustness\nand generalizability of models. Although significant advances have been made by\nregularizing the information in representations with information theory, two\nmajor challenges remain: 1) the representation compression inevitably leads to\nperformance drop; 2) the disentanglement constraints on representations are in\ncomplicated optimization. To these issues, we introduce Bayesian networks with\ntransmitted information to formulate the interaction among input and\nrepresentations during disentanglement. Building upon this framework, we\npropose \\textbf{DisTIB} (\\textbf{T}ransmitted \\textbf{I}nformation\n\\textbf{B}ottleneck for \\textbf{Dis}entangled representation learning), a novel\nobjective that navigates the balance between information compression and\npreservation. We employ variational inference to derive a tractable estimation\nfor DisTIB. This estimation can be simply optimized via standard gradient\ndescent with a reparameterization trick. 
Moreover, we theoretically prove that\nDisTIB can achieve optimal disentanglement, underscoring its superior efficacy.\nTo solidify our claims, we conduct extensive experiments on various downstream\ntasks to demonstrate the appealing efficacy of DisTIB and validate our\ntheoretical analyses.\n","authors":["Zhuohang Dang","Minnan Luo","Chengyou Jia","Guang Dai","Jihong Wang","Xiaojun Chang","Jingdong Wang"],"pdf_url":"https://arxiv.org/pdf/2311.01686v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.07558v1","updated":"2024-08-14T13:43:59Z","published":"2024-08-14T13:43:59Z","title":"Sonic: Fast and Transferable Data Poisoning on Clustering Algorithms","summary":" Data poisoning attacks on clustering algorithms have received limited\nattention, with existing methods struggling to scale efficiently as dataset\nsizes and feature counts increase. These attacks typically require\nre-clustering the entire dataset multiple times to generate predictions and\nassess the attacker's objectives, significantly hindering their scalability.\nThis paper addresses these limitations by proposing Sonic, a novel genetic data\npoisoning attack that leverages incremental and scalable clustering algorithms,\ne.g., FISHDBC, as surrogates to accelerate poisoning attacks against\ngraph-based and density-based clustering methods, such as HDBSCAN. We\nempirically demonstrate the effectiveness and efficiency of Sonic in poisoning\nthe target clustering algorithms. We then conduct a comprehensive analysis of\nthe factors affecting the scalability and transferability of poisoning attacks\nagainst clustering algorithms, and we conclude by examining the robustness of\nhyperparameters in our attack strategy Sonic.\n","authors":["Francesco Villani","Dario Lazzaro","Antonio Emanuele Cinà","Matteo Dell'Amico","Battista Biggio","Fabio Roli"],"pdf_url":"https://arxiv.org/pdf/2408.07558v1.pdf","comment":"preprint paper"},{"id":"http://arxiv.org/abs/2403.10683v2","updated":"2024-08-14T13:43:28Z","published":"2024-03-15T21:06:14Z","title":"GS-Pose: Generalizable Segmentation-based 6D Object Pose Estimation with\n 3D Gaussian Splatting","summary":" This paper introduces GS-Pose, a unified framework for localizing and\nestimating the 6D pose of novel objects. GS-Pose begins with a set of posed RGB\nimages of a previously unseen object and builds three distinct representations\nstored in a database. At inference, GS-Pose operates sequentially by locating\nthe object in the input image, estimating its initial 6D pose using a retrieval\napproach, and refining the pose with a render-and-compare method. The key\ninsight is the application of the appropriate object representation at each\nstage of the process. In particular, for the refinement step, we leverage 3D\nGaussian splatting, a novel differentiable rendering technique that offers high\nrendering speed and relatively low optimization time. Off-the-shelf toolchains\nand commodity hardware, such as mobile phones, can be used to capture new\nobjects to be added to the database. Extensive evaluations on the LINEMOD and\nOnePose-LowTexture datasets demonstrate excellent performance, establishing the\nnew state-of-the-art. 
Project page: https://dingdingcai.github.io/gs-pose.\n","authors":["Dingding Cai","Janne Heikkilä","Esa Rahtu"],"pdf_url":"https://arxiv.org/pdf/2403.10683v2.pdf","comment":"Project Page: https://dingdingcai.github.io/gs-pose"},{"id":"http://arxiv.org/abs/2312.05826v4","updated":"2024-08-14T13:41:41Z","published":"2023-12-10T08:59:43Z","title":"R2Human: Real-Time 3D Human Appearance Rendering from a Single Image","summary":" Rendering 3D human appearance from a single image in real-time is crucial for\nachieving holographic communication and immersive VR/AR. Existing methods\neither rely on multi-camera setups or are constrained to offline operations. In\nthis paper, we propose R2Human, the first approach for real-time inference and\nrendering of photorealistic 3D human appearance from a single image. The core\nof our approach is to combine the strengths of implicit texture fields and\nexplicit neural rendering with our novel representation, namely Z-map. Based on\nthis, we present an end-to-end network that performs high-fidelity color\nreconstruction of visible areas and provides reliable color inference for\noccluded regions. To further enhance the 3D perception ability of our network,\nwe leverage the Fourier occupancy field as a prior for generating the texture\nfield and providing a sampling surface in the rendering stage. We also propose\na consistency loss and a spatial fusion strategy to ensure the multi-view\ncoherence. Experimental results show that our method outperforms the\nstate-of-the-art methods on both synthetic data and challenging real-world\nimages, in real-time. The project page can be found at\nhttp://cic.tju.edu.cn/faculty/likun/projects/R2Human.\n","authors":["Yuanwang Yang","Qiao Feng","Yu-Kun Lai","Kun Li"],"pdf_url":"https://arxiv.org/pdf/2312.05826v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.07543v1","updated":"2024-08-14T13:23:43Z","published":"2024-08-14T13:23:43Z","title":"MathScape: Evaluating MLLMs in multimodal Math Scenarios through a\n Hierarchical Benchmark","summary":" With the development of Multimodal Large Language Models (MLLMs), the\nevaluation of multimodal models in the context of mathematical problems has\nbecome a valuable research field. Multimodal visual-textual mathematical\nreasoning serves as a critical indicator for evaluating the comprehension and\ncomplex multi-step quantitative reasoning abilities of MLLMs. However, previous\nmultimodal math benchmarks have not sufficiently integrated visual and textual\ninformation. To address this gap, we proposed MathScape, a new benchmark that\nemphasizes the understanding and application of combined visual and textual\ninformation. MathScape is designed to evaluate photo-based math problem\nscenarios, assessing the theoretical understanding and application ability of\nMLLMs through a categorical hierarchical approach. We conduct a\nmulti-dimensional evaluation on 11 advanced MLLMs, revealing that our benchmark\nis challenging even for the most sophisticated models. 
By analyzing the\nevaluation results, we identify the limitations of MLLMs, offering valuable\ninsights for enhancing model performance.\n","authors":["Minxuan Zhou","Hao Liang","Tianpeng Li","Zhiyu Wu","Mingan Lin","Linzhuang Sun","Yaqi Zhou","Yan Zhang","Xiaoqin Huang","Yicong Chen","Yujing Qiao","Weipeng Chen","Bin Cui","Wentao Zhang","Zenan Zhou"],"pdf_url":"https://arxiv.org/pdf/2408.07543v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.09805v2","updated":"2024-08-14T13:22:50Z","published":"2024-03-14T18:52:34Z","title":"On the Utility of 3D Hand Poses for Action Recognition","summary":" 3D hand pose is an underexplored modality for action recognition. Poses are\ncompact yet informative and can greatly benefit applications with limited\ncompute budgets. However, poses alone offer an incomplete understanding of\nactions, as they cannot fully capture objects and environments with which\nhumans interact. We propose HandFormer, a novel multimodal transformer, to\nefficiently model hand-object interactions. HandFormer combines 3D hand poses\nat a high temporal resolution for fine-grained motion modeling with sparsely\nsampled RGB frames for encoding scene semantics. Observing the unique\ncharacteristics of hand poses, we temporally factorize hand modeling and\nrepresent each joint by its short-term trajectories. This factorized pose\nrepresentation combined with sparse RGB samples is remarkably efficient and\nhighly accurate. Unimodal HandFormer with only hand poses outperforms existing\nskeleton-based methods at 5x fewer FLOPs. With RGB, we achieve new\nstate-of-the-art performance on Assembly101 and H2O with significant\nimprovements in egocentric action recognition.\n","authors":["Md Salman Shamil","Dibyadip Chatterjee","Fadime Sener","Shugao Ma","Angela Yao"],"pdf_url":"https://arxiv.org/pdf/2403.09805v2.pdf","comment":"ECCV 2024; https://s-shamil.github.io/HandFormer/"},{"id":"http://arxiv.org/abs/2408.07541v1","updated":"2024-08-14T13:20:52Z","published":"2024-08-14T13:20:52Z","title":"DifuzCam: Replacing Camera Lens with a Mask and a Diffusion Model","summary":" The flat lensless camera design reduces the camera size and weight\nsignificantly. In this design, the camera lens is replaced by another optical\nelement that interferes with the incoming light. The image is recovered from\nthe raw sensor measurements using a reconstruction algorithm. Yet, the quality\nof the reconstructed images is not satisfactory. To mitigate this, we propose\nutilizing a pre-trained diffusion model with a control network and a learned\nseparable transformation for reconstruction. This allows us to build a\nprototype flat camera with high-quality imaging, presenting state-of-the-art\nresults in both terms of quality and perceptuality. We demonstrate its ability\nto leverage also textual descriptions of the captured scene to further enhance\nreconstruction. Our reconstruction method which leverages the strong\ncapabilities of a pre-trained diffusion model can be used in other imaging\nsystems for improved reconstruction results.\n","authors":["Erez Yosef","Raja Giryes"],"pdf_url":"https://arxiv.org/pdf/2408.07541v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.07540v1","updated":"2024-08-14T13:17:42Z","published":"2024-08-14T13:17:42Z","title":"3D Gaussian Editing with A Single Image","summary":" The modeling and manipulation of 3D scenes captured from the real world are\npivotal in various applications, attracting growing research interest. 
While\nprevious works on editing have achieved interesting results through\nmanipulating 3D meshes, they often require accurately reconstructed meshes to\nperform editing, which limits their application in 3D content generation. To\naddress this gap, we introduce a novel single-image-driven 3D scene editing\napproach based on 3D Gaussian Splatting, enabling intuitive manipulation via\ndirectly editing the content on a 2D image plane. Our method learns to optimize\nthe 3D Gaussians to align with an edited version of the image rendered from a\nuser-specified viewpoint of the original scene. To capture long-range object\ndeformation, we introduce positional loss into the optimization process of 3D\nGaussian Splatting and enable gradient propagation through reparameterization.\nTo handle occluded 3D Gaussians when rendering from the specified viewpoint, we\nbuild an anchor-based structure and employ a coarse-to-fine optimization\nstrategy capable of handling long-range deformation while maintaining\nstructural stability. Furthermore, we design a novel masking strategy to\nadaptively identify non-rigid deformation regions for fine-scale modeling.\nExtensive experiments show the effectiveness of our method in handling\ngeometric details, long-range, and non-rigid deformation, demonstrating\nsuperior editing flexibility and quality compared to previous approaches.\n","authors":["Guan Luo","Tian-Xing Xu","Ying-Tian Liu","Xiao-Xiong Fan","Fang-Lue Zhang","Song-Hai Zhang"],"pdf_url":"https://arxiv.org/pdf/2408.07540v1.pdf","comment":"10 pages, 12 figures"},{"id":"http://arxiv.org/abs/2408.07539v1","updated":"2024-08-14T13:17:41Z","published":"2024-08-14T13:17:41Z","title":"Cross-aware Early Fusion with Stage-divided Vision and Language\n Transformer Encoders for Referring Image Segmentation","summary":" Referring segmentation aims to segment a target object related to a natural\nlanguage expression. Key challenges of this task are understanding the meaning\nof complex and ambiguous language expressions and determining the relevant\nregions in the image with multiple objects by referring to the expression.\nRecent models have focused on the early fusion with the language features at\nthe intermediate stage of the vision encoder, but these approaches have a\nlimitation that the language features cannot refer to the visual information.\nTo address this issue, this paper proposes a novel architecture, Cross-aware\nearly fusion with stage-divided Vision and Language Transformer encoders\n(CrossVLT), which allows both language and vision encoders to perform the early\nfusion for improving the ability of the cross-modal context modeling. Unlike\nprevious methods, our method enables the vision and language features to refer\nto each other's information at each stage to mutually enhance the robustness of\nboth encoders. Furthermore, unlike the conventional scheme that relies solely\non the high-level features for the cross-modal alignment, we introduce a\nfeature-based alignment scheme that enables the low-level to high-level\nfeatures of the vision and language encoders to engage in the cross-modal\nalignment. By aligning the intermediate cross-modal features in all encoder\nstages, this scheme leads to effective cross-modal fusion. 
In this way, the\nproposed approach is simple but effective for referring image segmentation, and\nit outperforms the previous state-of-the-art methods on three public\nbenchmarks.\n","authors":["Yubin Cho","Hyunwoo Yu","Suk-ju Kang"],"pdf_url":"https://arxiv.org/pdf/2408.07539v1.pdf","comment":"Published in IEEE Transactions on Multimedia (TMM)"},{"id":"http://arxiv.org/abs/2309.08927v3","updated":"2024-08-14T13:09:13Z","published":"2023-09-16T08:46:59Z","title":"DynaMoN: Motion-Aware Fast and Robust Camera Localization for Dynamic\n Neural Radiance Fields","summary":" The accurate reconstruction of dynamic scenes with neural radiance fields is\nsignificantly dependent on the estimation of camera poses. Widely used\nstructure-from-motion pipelines encounter difficulties in accurately tracking\nthe camera trajectory when faced with separate dynamics of the scene content\nand the camera movement. To address this challenge, we propose Dynamic\nMotion-Aware Fast and Robust Camera Localization for Dynamic Neural Radiance\nFields (DynaMoN). DynaMoN utilizes semantic segmentation and generic motion\nmasks to handle dynamic content for initial camera pose estimation and\nstatics-focused ray sampling for fast and accurate novel-view synthesis. Our\nnovel iterative learning scheme switches between training the NeRF and updating\nthe pose parameters for an improved reconstruction and trajectory estimation\nquality. The proposed pipeline shows significant acceleration of the training\nprocess. We extensively evaluate our approach on two real-world dynamic\ndatasets, the TUM RGB-D dataset and the BONN RGB-D Dynamic dataset. DynaMoN\nimproves over the state-of-the-art both in terms of reconstruction quality and\ntrajectory accuracy. We plan to make our code public to enhance research in\nthis area.\n","authors":["Nicolas Schischka","Hannah Schieber","Mert Asim Karaoglu","Melih Görgülü","Florian Grötzner","Alexander Ladikos","Daniel Roth","Nassir Navab","Benjamin Busam"],"pdf_url":"https://arxiv.org/pdf/2309.08927v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2201.12577v5","updated":"2024-08-14T13:07:13Z","published":"2022-01-29T12:40:19Z","title":"Volley Revolver: A Novel Matrix-Encoding Method for Privacy-Preserving\n Neural Networks (Inference)","summary":" In this work, we present a novel matrix-encoding method that is particularly\nconvenient for neural networks to make predictions in a privacy-preserving\nmanner using homomorphic encryption. Based on this encoding method, we\nimplement a convolutional neural network for handwritten image classification\nover encryption. For two matrices $A$ and $B$ to perform homomorphic\nmultiplication, the main idea behind it, in a simple version, is to encrypt\nmatrix $A$ and the transpose of matrix $B$ into two ciphertexts respectively.\nWith additional operations, the homomorphic matrix multiplication can be\ncalculated over encrypted matrices efficiently. For the convolution operation,\nwe in advance span each convolution kernel to a matrix space of the same size\nas the input image so as to generate several ciphertexts, each of which is\nlater used together with the ciphertext encrypting input images for calculating\nsome of the final convolution results. 
We accumulate all these intermediate\nresults and thus complete the convolution operation.\n In a public cloud with 40 vCPUs, our convolutional neural network\nimplementation on the MNIST testing dataset takes $\\sim$ 287 seconds to compute\nten likelihoods of 32 encrypted images of size $28 \\times 28$ simultaneously.\nThe data owner only needs to upload one ciphertext ($\\sim 19.8$ MB) encrypting\nthese 32 images to the public cloud.\n","authors":["John Chiang"],"pdf_url":"https://arxiv.org/pdf/2201.12577v5.pdf","comment":"The encoding method we proposed in this work, $\\texttt{Volley\n Revolver}$, is particularly tailored for privacy-preserving neural networks.\n There is a great chance that it can be used to assist the private neural\n networks training, in which case for the backpropagation algorithm of the\n fully-connected layer the first matrix $A$ is revolved while the second\n matrix $B$ is settled to be still"},{"id":"http://arxiv.org/abs/2408.07532v1","updated":"2024-08-14T13:03:48Z","published":"2024-08-14T13:03:48Z","title":"Improved 3D Whole Heart Geometry from Sparse CMR Slices","summary":" Cardiac magnetic resonance (CMR) imaging and computed tomography (CT) are two\ncommon non-invasive imaging methods for assessing patients with cardiovascular\ndisease. CMR typically acquires multiple sparse 2D slices, with unavoidable\nrespiratory motion artefacts between slices, whereas CT acquires isotropic\ndense data but uses ionising radiation. In this study, we explore the\ncombination of Slice Shifting Algorithm (SSA), Spatial Transformer Network\n(STN), and Label Transformer Network (LTN) to: 1) correct respiratory motion\nbetween segmented slices, and 2) transform sparse segmentation data into dense\nsegmentation. All combinations were validated using synthetic motion-corrupted\nCMR slice segmentation generated from CT in 1699 cases, where the dense CT\nserves as the ground truth. In 199 testing cases, SSA-LTN achieved the best\nresults for Dice score and Huasdorff distance (94.0% and 4.7 mm respectively,\naverage over 5 labels) but gave topological errors in 8 cases. STN was\neffective as a plug-in tool for correcting all topological errors with minimal\nimpact on overall performance (93.5% and 5.0 mm respectively). SSA also proves\nto be a valuable plug-in tool, enhancing performance over both STN-based and\nLTN-based models. The code for these different combinations is available at\nhttps://github.com/XESchong/STACOM2024.\n","authors":["Yiyang Xu","Hao Xu","Matthew Sinclair","Esther Puyol-Antón","Steven A Niederer","Amedeo Chiribiri","Steven E Williams","Michelle C Williams","Alistair A Young"],"pdf_url":"https://arxiv.org/pdf/2408.07532v1.pdf","comment":"13 pages, STACOM2024"},{"id":"http://arxiv.org/abs/2408.07530v1","updated":"2024-08-14T13:03:31Z","published":"2024-08-14T13:03:31Z","title":"Towards Real-time Video Compressive Sensing on Mobile Devices","summary":" Video Snapshot Compressive Imaging (SCI) uses a low-speed 2D camera to\ncapture high-speed scenes as snapshot compressed measurements, followed by a\nreconstruction algorithm to retrieve the high-speed video frames. The fast\nevolving mobile devices and existing high-performance video SCI reconstruction\nalgorithms motivate us to develop mobile reconstruction methods for real-world\napplications. Yet, it is still challenging to deploy previous reconstruction\nalgorithms on mobile devices due to the complex inference process, let alone\nreal-time mobile reconstruction. 
To the best of our knowledge, there is no\nvideo SCI reconstruction model designed to run on the mobile devices. Towards\nthis end, in this paper, we present an effective approach for video SCI\nreconstruction, dubbed MobileSCI, which can run at real-time speed on the\nmobile devices for the first time. Specifically, we first build a U-shaped 2D\nconvolution-based architecture, which is much more efficient and\nmobile-friendly than previous state-of-the-art reconstruction methods. Besides,\nan efficient feature mixing block, based on the channel splitting and shuffling\nmechanisms, is introduced as a novel bottleneck block of our proposed MobileSCI\nto alleviate the computational burden. Finally, a customized knowledge\ndistillation strategy is utilized to further improve the reconstruction\nquality. Extensive results on both simulated and real data show that our\nproposed MobileSCI can achieve superior reconstruction quality with high\nefficiency on the mobile devices. Particularly, we can reconstruct a 256 X 256\nX 8 snapshot compressed measurement with real-time performance (about 35 FPS)\non an iPhone 15. Code is available at https://github.com/mcao92/MobileSCI.\n","authors":["Miao Cao","Lishun Wang","Huan Wang","Guoqing Wang","Xin Yuan"],"pdf_url":"https://arxiv.org/pdf/2408.07530v1.pdf","comment":"9 pages, Accepted by ACM MM 2024"},{"id":"http://arxiv.org/abs/2408.07527v1","updated":"2024-08-14T13:02:20Z","published":"2024-08-14T13:02:20Z","title":"Evidential Graph Contrastive Alignment for Source-Free Blending-Target\n Domain Adaptation","summary":" In this paper, we firstly tackle a more realistic Domain Adaptation (DA)\nsetting: Source-Free Blending-Target Domain Adaptation (SF-BTDA), where we can\nnot access to source domain data while facing mixed multiple target domains\nwithout any domain labels in prior. Compared to existing DA scenarios, SF-BTDA\ngenerally faces the co-existence of different label shifts in different\ntargets, along with noisy target pseudo labels generated from the source model.\nIn this paper, we propose a new method called Evidential Contrastive Alignment\n(ECA) to decouple the blending target domain and alleviate the effect from\nnoisy target pseudo labels. First, to improve the quality of pseudo target\nlabels, we propose a calibrated evidential learning module to iteratively\nimprove both the accuracy and certainty of the resulting model and adaptively\ngenerate high-quality pseudo target labels. Second, we design a graph\ncontrastive learning with the domain distance matrix and confidence-uncertainty\ncriterion, to minimize the distribution gap of samples of a same class in the\nblended target domains, which alleviates the co-existence of different label\nshifts in blended targets. We conduct a new benchmark based on three standard\nDA datasets and ECA outperforms other methods with considerable gains and\nachieves comparable results compared with those that have domain labels or\nsource data in prior.\n","authors":["Juepeng Zheng","Yibin Wen","Jinxiao Zhang","Runmin Dong","Haohuan Fu"],"pdf_url":"https://arxiv.org/pdf/2408.07527v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.07519v1","updated":"2024-08-14T12:52:13Z","published":"2024-08-14T12:52:13Z","title":"Whitening Consistently Improves Self-Supervised Learning","summary":" Self-supervised learning (SSL) has been shown to be a powerful approach for\nlearning visual representations. 
In this study, we propose incorporating ZCA\nwhitening as the final layer of the encoder in self-supervised learning to\nenhance the quality of learned features by normalizing and decorrelating them.\nAlthough whitening has been utilized in SSL in previous works, its potential to\nuniversally improve any SSL model has not been explored. We demonstrate that\nadding whitening as the last layer of SSL pretrained encoders is independent of\nthe self-supervised learning method and encoder architecture, thus it improves\nperformance for a wide range of SSL methods across multiple encoder\narchitectures and datasets. Our experiments show that whitening is capable of\nimproving linear and k-NN probing accuracy by 1-5%. Additionally, we propose\nmetrics that allow for a comprehensive analysis of the learned features,\nprovide insights into the quality of the representations and help identify\ncollapse patterns.\n","authors":["András Kalapos","Bálint Gyires-Tóth"],"pdf_url":"https://arxiv.org/pdf/2408.07519v1.pdf","comment":"Preprint"},{"id":"http://arxiv.org/abs/2408.07516v1","updated":"2024-08-14T12:49:50Z","published":"2024-08-14T12:49:50Z","title":"DIffSteISR: Harnessing Diffusion Prior for Superior Real-world Stereo\n Image Super-Resolution","summary":" We introduce DiffSteISR, a pioneering framework for reconstructing real-world\nstereo images. DiffSteISR utilizes the powerful prior knowledge embedded in\npre-trained text-to-image model to efficiently recover the lost texture details\nin low-resolution stereo images. Specifically, DiffSteISR implements a\ntime-aware stereo cross attention with temperature adapter (TASCATA) to guide\nthe diffusion process, ensuring that the generated left and right views exhibit\nhigh texture consistency thereby reducing disparity error between the\nsuper-resolved images and the ground truth (GT) images. Additionally, a stereo\nomni attention control network (SOA ControlNet) is proposed to enhance the\nconsistency of super-resolved images with GT images in the pixel, perceptual,\nand distribution space. Finally, DiffSteISR incorporates a stereo semantic\nextractor (SSE) to capture unique viewpoint soft semantic information and\nshared hard tag semantic information, thereby effectively improving the\nsemantic accuracy and consistency of the generated left and right images.\nExtensive experimental results demonstrate that DiffSteISR accurately\nreconstructs natural and precise textures from low-resolution stereo images\nwhile maintaining a high consistency of semantic and texture between the left\nand right views.\n","authors":["Yuanbo Zhou","Xinlin Zhang","Wei Deng","Tao Wang","Tao Tan","Qinquan Gao","Tong Tong"],"pdf_url":"https://arxiv.org/pdf/2408.07516v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.07514v1","updated":"2024-08-14T12:48:37Z","published":"2024-08-14T12:48:37Z","title":"CNN-JEPA: Self-Supervised Pretraining Convolutional Neural Networks\n Using Joint Embedding Predictive Architecture","summary":" Self-supervised learning (SSL) has become an important approach in\npretraining large neural networks, enabling unprecedented scaling of model and\ndataset sizes. While recent advances like I-JEPA have shown promising results\nfor Vision Transformers, adapting such methods to Convolutional Neural Networks\n(CNNs) presents unique challenges. In this paper, we introduce CNN-JEPA, a\nnovel SSL method that successfully applies the joint embedding predictive\narchitecture approach to CNNs. 
Our method incorporates a sparse CNN encoder to\nhandle masked inputs, a fully convolutional predictor using depthwise separable\nconvolutions, and an improved masking strategy. We demonstrate that CNN-JEPA\noutperforms I-JEPA with ViT architectures on ImageNet-100, achieving 73.3%\nlinear top-1 accuracy with a standard ResNet-50 encoder. Compared to other\nCNN-based SSL methods, CNN-JEPA requires 17-35% less training time for the same\nnumber of epochs and approaches the linear and k-NN top-1 accuracies of BYOL,\nSimCLR, and VICReg. Our approach offers a simpler, more efficient alternative\nto existing SSL methods for CNNs, requiring minimal augmentations and no\nseparate projector network.\n","authors":["András Kalapos","Bálint Gyires-Tóth"],"pdf_url":"https://arxiv.org/pdf/2408.07514v1.pdf","comment":"Preprint"},{"id":"http://arxiv.org/abs/2403.09477v2","updated":"2024-08-14T12:43:52Z","published":"2024-03-14T15:19:19Z","title":"VIRUS-NeRF -- Vision, InfraRed and UltraSonic based Neural Radiance\n Fields","summary":" Autonomous mobile robots are an increasingly integral part of modern factory\nand warehouse operations. Obstacle detection, avoidance and path planning are\ncritical safety-relevant tasks, which are often solved using expensive LiDAR\nsensors and depth cameras. We propose to use cost-effective low-resolution\nranging sensors, such as ultrasonic and infrared time-of-flight sensors by\ndeveloping VIRUS-NeRF - Vision, InfraRed, and UltraSonic based Neural Radiance\nFields. Building upon Instant Neural Graphics Primitives with a Multiresolution\nHash Encoding (Instant-NGP), VIRUS-NeRF incorporates depth measurements from\nultrasonic and infrared sensors and utilizes them to update the occupancy grid\nused for ray marching. Experimental evaluation in 2D demonstrates that\nVIRUS-NeRF achieves comparable mapping performance to LiDAR point clouds\nregarding coverage. Notably, in small environments, its accuracy aligns with\nthat of LiDAR measurements, while in larger ones, it is bounded by the utilized\nultrasonic sensors. An in-depth ablation study reveals that adding ultrasonic\nand infrared sensors is highly effective when dealing with sparse data and low\nview variation. Further, the proposed occupancy grid of VIRUS-NeRF improves the\nmapping capabilities and increases the training speed by 46% compared to\nInstant-NGP. Overall, VIRUS-NeRF presents a promising approach for\ncost-effective local mapping in mobile robotics, with potential applications in\nsafety and navigation tasks. The code can be found at\nhttps://github.com/ethz-asl/virus nerf.\n","authors":["Nicolaj Schmid","Cornelius von Einem","Cesar Cadena","Roland Siegwart","Lorenz Hruby","Florian Tschopp"],"pdf_url":"https://arxiv.org/pdf/2403.09477v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.07500v1","updated":"2024-08-14T12:29:49Z","published":"2024-08-14T12:29:49Z","title":"Cross-Platform Video Person ReID: A New Benchmark Dataset and Adaptation\n Approach","summary":" In this paper, we construct a large-scale benchmark dataset for\nGround-to-Aerial Video-based person Re-Identification, named G2A-VReID, which\ncomprises 185,907 images and 5,576 tracklets, featuring 2,788 distinct\nidentities. To our knowledge, this is the first dataset for video ReID under\nGround-to-Aerial scenarios. 
G2A-VReID dataset has the following\ncharacteristics: 1) Drastic view changes; 2) Large number of annotated\nidentities; 3) Rich outdoor scenarios; 4) Huge difference in resolution.\nAdditionally, we propose a new benchmark approach for cross-platform ReID by\ntransforming the cross-platform visual alignment problem into visual-semantic\nalignment through vision-language model (i.e., CLIP) and applying a\nparameter-efficient Video Set-Level-Adapter module to adapt image-based\nfoundation model to video ReID tasks, termed VSLA-CLIP. Besides, to further\nreduce the great discrepancy across the platforms, we also devise the\nplatform-bridge prompts for efficient visual feature alignment. Extensive\nexperiments demonstrate the superiority of the proposed method on all existing\nvideo ReID datasets and our proposed G2A-VReID dataset.\n","authors":["Shizhou Zhang","Wenlong Luo","De Cheng","Qingchun Yang","Lingyan Ran","Yinghui Xing","Yanning Zhang"],"pdf_url":"https://arxiv.org/pdf/2408.07500v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.12544v2","updated":"2024-08-14T12:25:53Z","published":"2024-06-18T12:23:00Z","title":"Integrating Representational Gestures into Automatically Generated\n Embodied Explanations and its Effects on Understanding and Interaction\n Quality","summary":" In human interaction, gestures serve various functions such as marking speech\nrhythm, highlighting key elements, and supplementing information. These\ngestures are also observed in explanatory contexts. However, the impact of\ngestures on explanations provided by virtual agents remains underexplored. A\nuser study was carried out to investigate how different types of gestures\ninfluence perceived interaction quality and listener understanding. This study\naddresses the effect of gestures in explanation by developing an embodied\nvirtual explainer integrating both beat gestures and iconic gestures to enhance\nits automatically generated verbal explanations. Our model combines beat\ngestures generated by a learned speech-driven synthesis module with manually\ncaptured iconic gestures, supporting the agent's verbal expressions about the\nboard game Quarto! as an explanation scenario. Findings indicate that neither\nthe use of iconic gestures alone nor their combination with beat gestures\noutperforms the baseline or beat-only conditions in terms of understanding.\nNonetheless, compared to prior research, the embodied agent significantly\nenhances understanding.\n","authors":["Amelie Sophie Robrecht","Hendric Voss","Lisa Gottschalk","Stefan Kopp"],"pdf_url":"https://arxiv.org/pdf/2406.12544v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.07490v1","updated":"2024-08-14T12:12:43Z","published":"2024-08-14T12:12:43Z","title":"Attention-Guided Perturbation for Unsupervised Image Anomaly Detection","summary":" Reconstruction-based methods have significantly advanced modern unsupervised\nanomaly detection. However, the strong capacity of neural networks often\nviolates the underlying assumptions by reconstructing abnormal samples well. To\nalleviate this issue, we present a simple yet effective reconstruction\nframework named Attention-Guided Pertuation Network (AGPNet), which learns to\nadd perturbation noise with an attention mask, for accurate unsupervised\nanomaly detection. Specifically, it consists of two branches, \\ie, a plain\nreconstruction branch and an auxiliary attention-based perturbation branch. 
The\nreconstruction branch is simply a plain reconstruction network that learns to\nreconstruct normal samples, while the auxiliary branch aims to produce\nattention masks to guide the noise perturbation process for normal samples from\neasy to hard. By doing so, we are expecting to synthesize hard yet more\ninformative anomalies for training, which enable the reconstruction branch to\nlearn important inherent normal patterns both comprehensively and efficiently.\nExtensive experiments are conducted on three popular benchmarks covering\nMVTec-AD, VisA, and MVTec-3D, and show that our framework obtains leading\nanomaly detection performance under various setups including few-shot,\none-class, and multi-class setups.\n","authors":["Tingfeng Huang","Yuxuan Cheng","Jingbo Xia","Rui Yu","Yuxuan Cai","Jinhai Xiang","Xinwei He","Xiang Bai"],"pdf_url":"https://arxiv.org/pdf/2408.07490v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.07486v1","updated":"2024-08-14T12:05:07Z","published":"2024-08-14T12:05:07Z","title":"OMR: Occlusion-Aware Memory-Based Refinement for Video Lane Detection","summary":" A novel algorithm for video lane detection is proposed in this paper. First,\nwe extract a feature map for a current frame and detect a latent mask for\nobstacles occluding lanes. Then, we enhance the feature map by developing an\nocclusion-aware memory-based refinement (OMR) module. It takes the obstacle\nmask and feature map from the current frame, previous output, and memory\ninformation as input, and processes them recursively in a video. Moreover, we\napply a novel data augmentation scheme for training the OMR module effectively.\nExperimental results show that the proposed algorithm outperforms existing\ntechniques on video lane datasets. Our codes are available at\nhttps://github.com/dongkwonjin/OMR.\n","authors":["Dongkwon Jin","Chang-Su Kim"],"pdf_url":"https://arxiv.org/pdf/2408.07486v1.pdf","comment":"Accepted to ECCV 2024"},{"id":"http://arxiv.org/abs/2408.06868v2","updated":"2024-08-14T11:58:36Z","published":"2024-08-13T13:06:50Z","title":"A Comprehensive Survey on Synthetic Infrared Image synthesis","summary":" Synthetic infrared (IR) scene and target generation is an important computer\nvision problem as it allows the generation of realistic IR images and targets\nfor training and testing of various applications, such as remote sensing,\nsurveillance, and target recognition. It also helps reduce the cost and risk\nassociated with collecting real-world IR data. This survey paper aims to\nprovide a comprehensive overview of the conventional mathematical\nmodelling-based methods and deep learning-based methods used for generating\nsynthetic IR scenes and targets. The paper discusses the importance of\nsynthetic IR scene and target generation and briefly covers the mathematics of\nblackbody and grey body radiations, as well as IR image-capturing methods. 
The\npotential use cases of synthetic IR scenes and target generation are also\ndescribed, highlighting the significance of these techniques in various fields.\nAdditionally, the paper explores possible new ways of developing techniques\nto enhance the efficiency and effectiveness of synthetic IR scenes and target\ngeneration while highlighting the need for further research to advance this\nfield.\n","authors":["Avinash Upadhyay","Manoj sharma","Prerana Mukherjee","Amit Singhal","Brejesh Lall"],"pdf_url":"https://arxiv.org/pdf/2408.06868v2.pdf","comment":"Submitted to Journal of Infrared Physics & Technology"},{"id":"http://arxiv.org/abs/2408.07484v1","updated":"2024-08-14T11:56:35Z","published":"2024-08-14T11:56:35Z","title":"GRFormer: Grouped Residual Self-Attention for Lightweight Single Image\n Super-Resolution","summary":" Previous works have shown that reducing parameter overhead and computations\nfor transformer-based single image super-resolution (SISR) models (e.g.,\nSwinIR) usually leads to a reduction of performance. In this paper, we present\nGRFormer, an efficient and lightweight method, which not only reduces the\nparameter overhead and computations, but also greatly improves performance. The\ncore of GRFormer is Grouped Residual Self-Attention (GRSA), which is\nspecifically oriented towards two fundamental components. Firstly, it\nintroduces a novel grouped residual layer (GRL) to replace the Query, Key,\nValue (QKV) linear layer in self-attention, aimed at efficiently reducing\nparameter overhead, computations, and performance loss at the same time.\nSecondly, it integrates a compact Exponential-Space Relative Position Bias\n(ES-RPB) as a substitute for the original relative position bias to improve the\nability to represent position information while further minimizing the\nparameter count. Extensive experimental results demonstrate that GRFormer\noutperforms state-of-the-art transformer-based methods for $\\times$2, $\\times$3\nand $\\times$4 SISR tasks, notably outperforming SOTA by a maximum PSNR of\n0.23dB when trained on the DIV2K dataset, while reducing the number of\nparameters and MACs by about \\textbf{60\\%} and \\textbf{49\\%} in the\nself-attention module alone, respectively. We hope that our simple and effective\nmethod, which can be easily applied to SR models based on window-division\nself-attention, can serve as a useful tool for further research in image\nsuper-resolution. The code is available at\n\\url{https://github.com/sisrformer/GRFormer}.\n","authors":["Yuzhen Li","Zehang Deng","Yuxin Cao","Lihua Liu"],"pdf_url":"https://arxiv.org/pdf/2408.07484v1.pdf","comment":"Accepted for ACM MM 2024"},{"id":"http://arxiv.org/abs/2301.01732v6","updated":"2024-08-14T11:56:02Z","published":"2023-01-04T18:02:59Z","title":"Explicit Abnormality Extraction for Unsupervised Motion Artifact\n Reduction in Magnetic Resonance Imaging","summary":" Motion artifacts compromise the quality of magnetic resonance imaging (MRI)\nand pose challenges to achieving diagnostic outcomes and image-guided\ntherapies. In recent years, supervised deep learning approaches have emerged as\nsuccessful solutions for motion artifact reduction (MAR). One disadvantage of\nthese methods is their dependency on acquiring paired sets of motion\nartifact-corrupted (MA-corrupted) and motion artifact-free (MA-free) MR images\nfor training purposes. Obtaining such image pairs is difficult and therefore\nlimits the application of supervised training. 
In this paper, we propose a\nnovel UNsupervised Abnormality Extraction Network (UNAEN) to alleviate this\nproblem. Our network is capable of working with unpaired MA-corrupted and\nMA-free images. It converts the MA-corrupted images to MA-reduced images by\nextracting abnormalities from the MA-corrupted images using a proposed artifact\nextractor, which intercepts the residual artifact maps from the MA-corrupted MR\nimages explicitly, and a reconstructor to restore the original input from the\nMA-reduced images. The performance of UNAEN was assessed by experimenting with\nvarious publicly available MRI datasets and comparing it with\nstate-of-the-art methods. The quantitative evaluation demonstrates the\nsuperiority of UNAEN over alternative MAR methods and visually exhibits fewer\nresidual artifacts. Our results substantiate the potential of UNAEN as a\npromising solution applicable in real-world clinical environments, with the\ncapability to enhance diagnostic accuracy and facilitate image-guided\ntherapies. Our codes are publicly available at\nhttps://github.com/YuSheng-Zhou/UNAEN.\n","authors":["Yusheng Zhou","Hao Li","Jianan Liu","Zhengmin Kong","Tao Huang","Euijoon Ahn","Zhihan Lv","Jinman Kim","David Dagan Feng"],"pdf_url":"https://arxiv.org/pdf/2301.01732v6.pdf","comment":"Accepted by IEEE Journal of Biomedical and Health Informatics"},{"id":"http://arxiv.org/abs/2308.10994v2","updated":"2024-08-14T11:54:05Z","published":"2023-08-21T19:20:20Z","title":"Switched auxiliary loss for robust training of transformer models for\n histopathological image segmentation","summary":" Functional Tissue Units (FTUs) are cell population neighborhoods local to a\nparticular organ performing its main function. The FTUs provide crucial\ninformation to the pathologist in understanding the disease affecting a\nparticular organ by providing information at the cellular level. In our\nresearch, we have developed a model to segment multi-organ FTUs across 5 organs,\nnamely the kidney, large intestine, lung, prostate and spleen, by utilizing the\n'HuBMAP + HPA - Hacking the Human Body' competition dataset. We propose adding a\nswitched auxiliary loss for training models like transformers to overcome\nthe diminishing gradient problem, which poses a challenge to the optimal\ntraining of deep models. Overall, our model achieved a dice score of 0.793 on\nthe public dataset and 0.778 on the private dataset. The results support the\nrobustness of the proposed training methodology. The findings also bolster the\nuse of transformer models for dense prediction tasks in the field of medical\nimage analysis. The study assists in understanding the relationships between\ncell and tissue organization, thereby providing a useful medium to look at the\nimpact of cellular functions on human health.\n","authors":["Mustaffa Hussain","Saharsh Barve"],"pdf_url":"https://arxiv.org/pdf/2308.10994v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.07481v1","updated":"2024-08-14T11:53:40Z","published":"2024-08-14T11:53:40Z","title":"DeCo: Decoupled Human-Centered Diffusion Video Editing with Motion\n Consistency","summary":" Diffusion models usher in a new era of video editing, flexibly manipulating the\nvideo contents with text prompts. Despite the widespread application demand in\nediting human-centered videos, these models face significant challenges in\nhandling complex objects like humans. 
In this paper, we introduce DeCo, a novel\nvideo editing framework specifically designed to treat humans and the\nbackground as separate editable targets, ensuring global spatial-temporal\nconsistency by maintaining the coherence of each individual component.\nSpecifically, we propose a decoupled dynamic human representation that utilizes\na parametric human body prior to generate tailored humans while preserving the\nconsistent motions as the original video. In addition, we consider the\nbackground as a layered atlas to apply text-guided image editing approaches on\nit. To further enhance the geometry and texture of humans during the\noptimization, we extend the calculation of score distillation sampling into\nnormal space and image space. Moreover, we tackle inconsistent lighting between\nthe edited targets by leveraging a lighting-aware video harmonizer, a problem\npreviously overlooked in decompose-edit-combine approaches. Extensive\nqualitative and numerical experiments demonstrate that DeCo outperforms prior\nvideo editing methods in human-centered videos, especially in longer videos.\n","authors":["Xiaojing Zhong","Xinyi Huang","Xiaofeng Yang","Guosheng Lin","Qingyao Wu"],"pdf_url":"https://arxiv.org/pdf/2408.07481v1.pdf","comment":"European Conference on Computer Vision"},{"id":"http://arxiv.org/abs/2408.07476v1","updated":"2024-08-14T11:47:22Z","published":"2024-08-14T11:47:22Z","title":"One Step Diffusion-based Super-Resolution with Time-Aware Distillation","summary":" Diffusion-based image super-resolution (SR) methods have shown promise in\nreconstructing high-resolution images with fine details from low-resolution\ncounterparts. However, these approaches typically require tens or even hundreds\nof iterative samplings, resulting in significant latency. Recently, techniques\nhave been devised to enhance the sampling efficiency of diffusion-based SR\nmodels via knowledge distillation. Nonetheless, when aligning the knowledge of\nstudent and teacher models, these solutions either solely rely on pixel-level\nloss constraints or neglect the fact that diffusion models prioritize varying\nlevels of information at different time steps. To accomplish effective and\nefficient image super-resolution, we propose a time-aware diffusion\ndistillation method, named TAD-SR. Specifically, we introduce a novel score\ndistillation strategy to align the data distribution between the outputs of the\nstudent and teacher models after minor noise perturbation. This distillation\nstrategy enables the student network to concentrate more on the high-frequency\ndetails. Furthermore, to mitigate performance limitations stemming from\ndistillation, we integrate a latent adversarial loss and devise a time-aware\ndiscriminator that leverages diffusion priors to effectively distinguish\nbetween real images and generated images. Extensive experiments conducted on\nsynthetic and real-world datasets demonstrate that the proposed method achieves\ncomparable or even superior performance compared to both previous\nstate-of-the-art (SOTA) methods and the teacher model in just one sampling\nstep. 
Codes are available at https://github.com/LearningHx/TAD-SR.\n","authors":["Xiao He","Huaao Tang","Zhijun Tu","Junchao Zhang","Kun Cheng","Hanting Chen","Yong Guo","Mingrui Zhu","Nannan Wang","Xinbo Gao","Jie Hu"],"pdf_url":"https://arxiv.org/pdf/2408.07476v1.pdf","comment":"18 pages"},{"id":"http://arxiv.org/abs/2407.19832v2","updated":"2024-08-14T11:42:02Z","published":"2024-07-29T09:38:15Z","title":"ML-Mamba: Efficient Multi-Modal Large Language Model Utilizing Mamba-2","summary":" Multimodal Large Language Models (MLLMs) have attracted much attention for\ntheir multifunctionality. However, traditional Transformer architectures incur\nsignificant overhead due to their quadratic computational complexity. To\naddress this issue, we introduce ML-Mamba, a multimodal language model, which\nutilizes the latest and efficient Mamba-2 model for inference. Mamba-2 is known\nfor its linear scalability and fast processing of long sequences. We replace\nthe Transformer-based backbone with a pre-trained Mamba-2 model and explore\nmethods for integrating 2D visual selective scanning mechanisms into multimodal\nlearning while also trying various visual encoders and Mamba-2 model variants.\nOur extensive experiments in various multimodal benchmark tests demonstrate the\ncompetitive performance of ML-Mamba and highlight the potential of state space\nmodels in multimodal tasks. The experimental results show that: (1) we\nempirically explore how to effectively apply the 2D vision selective scan\nmechanism for multimodal learning. We propose a novel multimodal connector\ncalled the Mamba-2 Scan Connector (MSC), which enhances representational\ncapabilities. (2) ML-Mamba achieves performance comparable to state-of-the-art\nmethods such as TinyLaVA and MobileVLM v2 through its linear sequential\nmodeling while offering faster inference speed; (3) Compared to multimodal models\nutilizing Mamba-1, the Mamba-2-based ML-Mamba exhibits superior inference\nperformance and effectiveness.\n","authors":["Wenjun Huang","Jianguo Hu"],"pdf_url":"https://arxiv.org/pdf/2407.19832v2.pdf","comment":"arXiv admin note: text overlap with arXiv:2403.13600,\n arXiv:2406.07537 by other authors"},{"id":"http://arxiv.org/abs/2408.07467v1","updated":"2024-08-14T11:24:13Z","published":"2024-08-14T11:24:13Z","title":"Domain-invariant Representation Learning via Segment Anything Model for\n Blood Cell Classification","summary":" Accurate classification of blood cells is of vital significance in the\ndiagnosis of hematological disorders. However, in real-world scenarios, domain\nshifts caused by the variability in laboratory procedures and settings result\nin a rapid deterioration of the model's generalization performance. To address\nthis issue, we propose a novel framework of domain-invariant representation\nlearning (DoRL) via the segment anything model (SAM) for blood cell classification.\nThe DoRL comprises two main components: a LoRA-based SAM (LoRA-SAM) and a\ncross-domain autoencoder (CAE). The advantage of DoRL is that it can extract\ndomain-invariant representations from various blood cell datasets in an\nunsupervised manner. Specifically, we first leverage the large-scale foundation\nmodel of SAM, fine-tuned with LoRA, to learn general image embeddings and\nsegment blood cells. Additionally, we introduce CAE to learn domain-invariant\nrepresentations across different-domain datasets while mitigating image\nartifacts. 
To validate the effectiveness of domain-invariant representations,\nwe employ five widely used machine learning classifiers to construct blood cell\nclassification models. Experimental results on two public blood cell datasets\nand a private real dataset demonstrate that our proposed DoRL achieves a new\nstate-of-the-art cross-domain performance, surpassing existing methods by a\nsignificant margin. The source code can be available at the URL\n(https://github.com/AnoK3111/DoRL).\n","authors":["Yongcheng Li","Lingcong Cai","Ying Lu","Cheng Lin","Yupeng Zhang","Jingyan Jiang","Genan Dai","Bowen Zhang","Jingzhou Cao","Xiangzhong Zhang","Xiaomao Fan"],"pdf_url":"https://arxiv.org/pdf/2408.07467v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.06432v2","updated":"2024-08-14T11:05:45Z","published":"2024-06-10T16:24:07Z","title":"SYM3D: Learning Symmetric Triplanes for Better 3D-Awareness of GANs","summary":" Despite the growing success of 3D-aware GANs, which can be trained on 2D\nimages to generate high-quality 3D assets, they still rely on multi-view images\nwith camera annotations to synthesize sufficient details from all viewing\ndirections. However, the scarce availability of calibrated multi-view image\ndatasets, especially in comparison to single-view images, has limited the\npotential of 3D GANs. Moreover, while bypassing camera pose annotations with a\ncamera distribution constraint reduces dependence on exact camera parameters,\nit still struggles to generate a consistent orientation of 3D assets. To this\nend, we propose SYM3D, a novel 3D-aware GAN designed to leverage the prevalent\nreflectional symmetry structure found in natural and man-made objects,\nalongside a proposed view-aware spatial attention mechanism in learning the 3D\nrepresentation. We evaluate SYM3D on both synthetic (ShapeNet Chairs, Cars, and\nAirplanes) and real-world datasets (ABO-Chair), demonstrating its superior\nperformance in capturing detailed geometry and texture, even when trained on\nonly single-view images. Finally, we demonstrate the effectiveness of\nincorporating symmetry regularization in helping reduce artifacts in the\nmodeling of 3D assets in the text-to-3D task. Project is at\n\\url{https://jingyang2017.github.io/sym3d.github.io/}\n","authors":["Jing Yang","Kyle Fogarty","Fangcheng Zhong","Cengiz Oztireli"],"pdf_url":"https://arxiv.org/pdf/2406.06432v2.pdf","comment":"11"},{"id":"http://arxiv.org/abs/2408.06753v2","updated":"2024-08-14T10:53:34Z","published":"2024-08-13T09:19:59Z","title":"Detecting Audio-Visual Deepfakes with Fine-Grained Inconsistencies","summary":" Existing methods on audio-visual deepfake detection mainly focus on\nhigh-level features for modeling inconsistencies between audio and visual data.\nAs a result, these approaches usually overlook finer audio-visual artifacts,\nwhich are inherent to deepfakes. Herein, we propose the introduction of\nfine-grained mechanisms for detecting subtle artifacts in both spatial and\ntemporal domains. First, we introduce a local audio-visual model capable of\ncapturing small spatial regions that are prone to inconsistencies with audio.\nFor that purpose, a fine-grained mechanism based on a spatially-local distance\ncoupled with an attention module is adopted. Second, we introduce a\ntemporally-local pseudo-fake augmentation to include samples incorporating\nsubtle temporal inconsistencies in our training set. 
Experiments on the DFDC\nand the FakeAVCeleb datasets demonstrate the superiority of the proposed method\nin terms of generalization as compared to the state-of-the-art under both\nin-dataset and cross-dataset settings.\n","authors":["Marcella Astrid","Enjie Ghorbel","Djamila Aouada"],"pdf_url":"https://arxiv.org/pdf/2408.06753v2.pdf","comment":"Accepted in BMVC 2024"},{"id":"http://arxiv.org/abs/2408.07455v1","updated":"2024-08-14T10:49:14Z","published":"2024-08-14T10:49:14Z","title":"Infra-YOLO: Efficient Neural Network Structure with Model Compression\n for Real-Time Infrared Small Object Detection","summary":" Although convolutional neural networks have made outstanding achievements in\nvisible light target detection, there are still many challenges in infrared\nsmall object detection because of the low signal-to-noise ratio, incomplete\nobject structure, and a lack of reliable infrared small object datasets. To\naddress the limitations of existing infrared small object datasets, a new dataset named\nInfraTiny was constructed, in which more than 85% of the bounding boxes are smaller than 32x32\npixels (3218 images and a total of 20,893 bounding boxes). A multi-scale\nattention mechanism module (MSAM) and a Feature Fusion Augmentation Pyramid\nModule (FFAFPM) were proposed and deployed onto embedded devices. The MSAM\nenables the network to obtain scale perception information by acquiring\ndifferent receptive fields, while the background noise information is\nsuppressed to enhance feature extraction ability. The proposed FFAFPM can\nenrich semantic information, and enhance the fusion of shallow and deep\nfeatures, thus significantly reducing false positive results. By\nintegrating the proposed methods into the YOLO model, which is named\nInfra-YOLO, infrared small object detection performance has been improved.\nCompared to yolov3, mAP@0.5 has been improved by 2.7%, and compared to yolov4,\nby 2.5% on the InfraTiny dataset. The proposed Infra-YOLO was also\ntransferred onto the embedded device in the unmanned aerial vehicle (UAV) for\nreal application scenarios, where the channel pruning method is adopted to\nreduce FLOPs and to achieve a tradeoff between speed and accuracy. Even when the\nparameters of Infra-YOLO are reduced by 88% with the pruning method, a gain of\n0.7% is still achieved on mAP@0.5 compared to yolov3, and a gain of 0.5%\ncompared to yolov4. Experimental results show that the proposed MSAM and FFAFPM\nmethods can improve infrared small object detection performance compared with\nthe previous benchmark method.\n","authors":["Zhonglin Chen","Anyu Geng","Jianan Jiang","Jiwu Lu","Di Wu"],"pdf_url":"https://arxiv.org/pdf/2408.07455v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.07445v1","updated":"2024-08-14T10:32:16Z","published":"2024-08-14T10:32:16Z","title":"Modality Invariant Multimodal Learning to Handle Missing Modalities: A\n Single-Branch Approach","summary":" Multimodal networks have demonstrated remarkable performance improvements\nover their unimodal counterparts. Existing multimodal networks are designed in\na multi-branch fashion and, due to the reliance on fusion strategies, exhibit\ndeteriorated performance if one or more modalities are missing. In this work,\nwe propose a modality invariant multimodal learning method, which is less\nsusceptible to the impact of missing modalities. 
It consists of a single-branch\nnetwork sharing weights across multiple modalities to learn inter-modality\nrepresentations to maximize performance as well as robustness to missing\nmodalities. Extensive experiments are performed on four challenging datasets\nincluding textual-visual (UPMC Food-101, Hateful Memes, Ferramenta) and\naudio-visual modalities (VoxCeleb1). Our proposed method achieves superior\nperformance when all modalities are present as well as in the case of missing\nmodalities during training or testing compared to the existing state-of-the-art\nmethods.\n","authors":["Muhammad Saad Saeed","Shah Nawaz","Muhammad Zaigham Zaheer","Muhammad Haris Khan","Karthik Nandakumar","Muhammad Haroon Yousaf","Hassan Sajjad","Tom De Schepper","Markus Schedl"],"pdf_url":"https://arxiv.org/pdf/2408.07445v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.07444v1","updated":"2024-08-14T10:31:19Z","published":"2024-08-14T10:31:19Z","title":"Costal Cartilage Segmentation with Topology Guided Deformable Mamba:\n Method and Benchmark","summary":" Costal cartilage segmentation is crucial to various medical applications,\nnecessitating precise and reliable techniques due to its complex anatomy and\nthe importance of accurate diagnosis and surgical planning. We propose a novel\ndeep learning-based approach called topology-guided deformable Mamba (TGDM) for\ncostal cartilage segmentation. The TGDM is tailored to capture the intricate\nlong-range costal cartilage relationships. Our method leverages a deformable\nmodel that integrates topological priors to enhance the adaptability and\naccuracy of the segmentation process. Furthermore, we developed a comprehensive\nbenchmark that contains 165 cases for costal cartilage segmentation. This\nbenchmark sets a new standard for evaluating costal cartilage segmentation\ntechniques and provides a valuable resource for future research. Extensive\nexperiments conducted on both in-domain benchmarks and out-of domain test sets\ndemonstrate the superiority of our approach over existing methods, showing\nsignificant improvements in segmentation precision and robustness.\n","authors":["Senmao Wang","Haifan Gong","Runmeng Cui","Boyao Wan","Yicheng Liu","Zhonglin Hu","Haiqing Yang","Jingyang Zhou","Bo Pan","Lin Lin","Haiyue Jiang"],"pdf_url":"https://arxiv.org/pdf/2408.07444v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.07440v1","updated":"2024-08-14T10:18:42Z","published":"2024-08-14T10:18:42Z","title":"BAPLe: Backdoor Attacks on Medical Foundational Models using Prompt\n Learning","summary":" Medical foundation models are gaining prominence in the medical community for\ntheir ability to derive general representations from extensive collections of\nmedical image-text pairs. Recent research indicates that these models are\nsusceptible to backdoor attacks, which allow them to classify clean images\naccurately but fail when specific triggers are introduced. However, traditional\nbackdoor attacks necessitate a considerable amount of additional data to\nmaliciously pre-train a model. This requirement is often impractical in medical\nimaging applications due to the usual scarcity of data. Inspired by the latest\ndevelopments in learnable prompts, this work introduces a method to embed a\nbackdoor into the medical foundation model during the prompt learning phase. 
By\nincorporating learnable prompts within the text encoder and introducing\nimperceptible learnable noise trigger to the input images, we exploit the full\ncapabilities of the medical foundation models (Med-FM). Our method, BAPLe,\nrequires only a minimal subset of data to adjust the noise trigger and the text\nprompts for downstream tasks, enabling the creation of an effective backdoor\nattack. Through extensive experiments with four medical foundation models, each\npre-trained on different modalities and evaluated across six downstream\ndatasets, we demonstrate the efficacy of our approach. BAPLe achieves a high\nbackdoor success rate across all models and datasets, outperforming the\nbaseline backdoor attack methods. Our work highlights the vulnerability of\nMed-FMs towards backdoor attacks and strives to promote the safe adoption of\nMed-FMs before their deployment in real-world applications. Code is available\nat https://asif-hanif.github.io/baple/.\n","authors":["Asif Hanif","Fahad Shamshad","Muhammad Awais","Muzammal Naseer","Fahad Shahbaz Khan","Karthik Nandakumar","Salman Khan","Rao Muhammad Anwer"],"pdf_url":"https://arxiv.org/pdf/2408.07440v1.pdf","comment":"MICCAI 2024"},{"id":"http://arxiv.org/abs/2408.07438v1","updated":"2024-08-14T10:15:34Z","published":"2024-08-14T10:15:34Z","title":"Achieving Data Efficient Neural Networks with Hybrid Concept-based\n Models","summary":" Most datasets used for supervised machine learning consist of a single label\nper data point. However, in cases where more information than just the class\nlabel is available, would it be possible to train models more efficiently? We\nintroduce two novel model architectures, which we call hybrid concept-based\nmodels, that train using both class labels and additional information in the\ndataset referred to as concepts. In order to thoroughly assess their\nperformance, we introduce ConceptShapes, an open and flexible class of datasets\nwith concept labels. We show that the hybrid concept-based models outperform\nstandard computer vision models and previously proposed concept-based models\nwith respect to accuracy, especially in sparse data settings. We also introduce\nan algorithm for performing adversarial concept attacks, where an image is\nperturbed in a way that does not change a concept-based model's concept\npredictions, but changes the class prediction. The existence of such\nadversarial examples raises questions about the interpretable qualities\npromised by concept-based models.\n","authors":["Tobias A. Opsahl","Vegard Antun"],"pdf_url":"https://arxiv.org/pdf/2408.07438v1.pdf","comment":"11 pages, 8 figures, appendix"},{"id":"http://arxiv.org/abs/2408.07433v1","updated":"2024-08-14T10:08:46Z","published":"2024-08-14T10:08:46Z","title":"MagicFace: Training-free Universal-Style Human Image Customized\n Synthesis","summary":" Existing human image personalized generation methods often require tedious\ntraining: either fine-tuning with a few images or retraining on large-scale\ndatasets. In such cases, these methods are prone to overfitting and encounter\ndifficulties when personalizing individuals of diverse styles. Moreover, these\ntraining-based approaches also struggle with multi-concept human image\ncustomizing. To this end, we propose MagicFace, the first method for\nuniversal-style human image personalized synthesis that enables\nsingle/multi-concept customization for humans of any style in a training-free\nmanner. 
MagicFace introduces a coarse-to-fine generation pipeline, involving\ntwo sequential stages: semantic scene construction and concept feature\ninjection. This is achieved by our Reference-aware Self-Attention (RSA) and\nRegion-grouped Blend Attention (RBA) mechanisms. Specifically, in the first\nstage, RSA enables the latent image to query features from reference concepts\nsimultaneously, extracting the coarse-grained overall semantic understanding to\nfacilitate the initial semantic layout establishment. In the second stage, we\nemploy an attention-based semantic segmentation method to pinpoint the\ngenerated regions of all concepts in the latent image at each step. Following\nthis, RBA divides the pixels of the latent image into semantic groups, with\neach group querying fine-grained features from its reference concept, which\nensures precise attribute alignment and feature injection. Throughout the\ntwo-stage process, a weight mask strategy is employed to ensure the model\nfocuses more on the reference concepts. Extensive experiments demonstrate our\nsuperiority in both human-centric subject-to-image synthesis and multi-concept\nhuman image customization. Our approach also can be applied to texture\ntransformation, further enhancing its versatility and applicability.\n","authors":["Yibin Wang","Weizhong Zhang","Cheng Jin"],"pdf_url":"https://arxiv.org/pdf/2408.07433v1.pdf","comment":"project page: https://codegoat24.github.io/MagicFace"},{"id":"http://arxiv.org/abs/2408.07430v1","updated":"2024-08-14T10:06:39Z","published":"2024-08-14T10:06:39Z","title":"UAHOI: Uncertainty-aware Robust Interaction Learning for HOI Detection","summary":" This paper focuses on Human-Object Interaction (HOI) detection, addressing\nthe challenge of identifying and understanding the interactions between humans\nand objects within a given image or video frame. Spearheaded by Detection\nTransformer (DETR), recent developments lead to significant improvements by\nreplacing traditional region proposals by a set of learnable queries. However,\ndespite the powerful representation capabilities provided by Transformers,\nexisting Human-Object Interaction (HOI) detection methods still yield low\nconfidence levels when dealing with complex interactions and are prone to\noverlooking interactive actions. To address these issues, we propose a novel\napproach \\textsc{UAHOI}, Uncertainty-aware Robust Human-Object Interaction\nLearning that explicitly estimates prediction uncertainty during the training\nprocess to refine both detection and interaction predictions. Our model not\nonly predicts the HOI triplets but also quantifies the uncertainty of these\npredictions. Specifically, we model this uncertainty through the variance of\npredictions and incorporate it into the optimization objective, allowing the\nmodel to adaptively adjust its confidence threshold based on prediction\nvariance. This integration helps in mitigating the adverse effects of incorrect\nor ambiguous predictions that are common in traditional methods without any\nhand-designed components, serving as an automatic confidence threshold. Our\nmethod is flexible to existing HOI detection methods and demonstrates improved\naccuracy. 
We evaluate \\textsc{UAHOI} on two standard benchmarks in the field:\nV-COCO and HICO-DET, which represent challenging scenarios for HOI detection.\nThrough extensive experiments, we demonstrate that \\textsc{UAHOI} achieves\nsignificant improvements over existing state-of-the-art methods, enhancing both\nthe accuracy and robustness of HOI detection.\n","authors":["Mu Chen","Minghan Chen","Yi Yang"],"pdf_url":"https://arxiv.org/pdf/2408.07430v1.pdf","comment":"Accepted by CVIU"},{"id":"http://arxiv.org/abs/2408.07422v1","updated":"2024-08-14T10:00:16Z","published":"2024-08-14T10:00:16Z","title":"LLMI3D: Empowering LLM with 3D Perception from a Single 2D Image","summary":" Recent advancements in autonomous driving, augmented reality, robotics, and\nembodied intelligence have necessitated 3D perception algorithms. However,\ncurrent 3D perception methods, particularly small models, struggle with\nprocessing logical reasoning, question-answering, and handling open scenario\ncategories. On the other hand, generative multimodal large language models\n(MLLMs) excel in general capacity but underperform in 3D tasks, due to weak\nspatial and local object perception, poor text-based geometric numerical\noutput, and inability to handle camera focal variations. To address these\nchallenges, we propose the following solutions: Spatial-Enhanced Local Feature\nMining for better spatial feature extraction, 3D Query Token-Derived Info\nDecoding for precise geometric regression, and Geometry Projection-Based 3D\nReasoning for handling camera focal length variations. We employ\nparameter-efficient fine-tuning for a pre-trained MLLM and develop LLMI3D, a\npowerful 3D perception MLLM. Additionally, we have constructed the IG3D\ndataset, which provides fine-grained descriptions and question-answer\nannotations. Extensive experiments demonstrate that our LLMI3D achieves\nstate-of-the-art performance, significantly outperforming existing methods.\n","authors":["Fan Yang","Sicheng Zhao","Yanhao Zhang","Haoxiang Chen","Hui Chen","Wenbo Tang","Haonan Lu","Pengfei Xu","Zhenyu Yang","Jungong Han","Guiguang Ding"],"pdf_url":"https://arxiv.org/pdf/2408.07422v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.07419v1","updated":"2024-08-14T09:59:04Z","published":"2024-08-14T09:59:04Z","title":"Unsupervised Stereo Matching Network For VHR Remote Sensing Images Based\n On Error Prediction","summary":" Stereo matching in remote sensing has recently garnered increased attention,\nprimarily focusing on supervised learning. However, datasets with ground truth\ngenerated by expensive airbone Lidar exhibit limited quantity and diversity,\nconstraining the effectiveness of supervised networks. In contrast,\nunsupervised learning methods can leverage the increasing availability of\nvery-high-resolution (VHR) remote sensing images, offering considerable\npotential in the realm of stereo matching. Motivated by this intuition, we\npropose a novel unsupervised stereo matching network for VHR remote sensing\nimages. A light-weight module to bridge confidence with predicted error is\nintroduced to refine the core model. Robust unsupervised losses are formulated\nto enhance network convergence. The experimental results on US3D and WHU-Stereo\ndatasets demonstrate that the proposed network achieves superior accuracy\ncompared to other unsupervised networks and exhibits better generalization\ncapabilities than supervised models. 
Our code will be available at\nhttps://github.com/Elenairene/CBEM.\n","authors":["Liting Jiang","Yuming Xiang","Feng Wang","Hongjian You"],"pdf_url":"https://arxiv.org/pdf/2408.07419v1.pdf","comment":"Accepted to International Geoscience and Remote Sensing Symposium\n (IGARSS), 2024"},{"id":"http://arxiv.org/abs/2408.07416v1","updated":"2024-08-14T09:50:02Z","published":"2024-08-14T09:50:02Z","title":"Rethinking Open-Vocabulary Segmentation of Radiance Fields in 3D Space","summary":" Understanding the 3D semantics of a scene is a fundamental problem for\nvarious scenarios such as embodied agents. While NeRFs and 3DGS excel at\nnovel-view synthesis, previous methods for understanding their semantics have\nbeen limited to incomplete 3D understanding: their segmentation results are 2D\nmasks and their supervision is anchored at 2D pixels. This paper revisits the\nproblem set to pursue a better 3D understanding of a scene modeled by NeRFs and\n3DGS as follows. 1) We directly supervise the 3D points to train the language\nembedding field. It achieves state-of-the-art accuracy without relying on\nmulti-scale language embeddings. 2) We transfer the pre-trained language field\nto 3DGS, achieving the first real-time rendering speed without sacrificing\ntraining time or accuracy. 3) We introduce a 3D querying and evaluation\nprotocol for assessing the reconstructed geometry and semantics together. Code,\ncheckpoints, and annotations will be available online. Project page:\nhttps://hyunji12.github.io/Open3DRF\n","authors":["Hyunjee Lee","Youngsik Yun","Jeongmin Bae","Seoha Kim","Youngjung Uh"],"pdf_url":"https://arxiv.org/pdf/2408.07416v1.pdf","comment":"Project page: https://hyunji12.github.io/Open3DRF"},{"id":"http://arxiv.org/abs/2304.02970v7","updated":"2024-08-14T09:21:44Z","published":"2023-04-06T09:54:06Z","title":"Unraveling Instance Associations: A Closer Look for Audio-Visual\n Segmentation","summary":" Audio-visual segmentation (AVS) is a challenging task that involves\naccurately segmenting sounding objects based on audio-visual cues. The\neffectiveness of audio-visual learning critically depends on achieving accurate\ncross-modal alignment between sound and visual objects. Successful audio-visual\nlearning requires two essential components: 1) a challenging dataset with\nhigh-quality pixel-level multi-class annotated images associated with audio\nfiles, and 2) a model that can establish strong links between audio information\nand its corresponding visual object. However, these requirements are only\npartially addressed by current methods, with training sets containing biased\naudio-visual data, and models that generalise poorly beyond this biased\ntraining set. In this work, we propose a new cost-effective strategy to build\nchallenging and relatively unbiased high-quality audio-visual segmentation\nbenchmarks. We also propose a new informative sample mining method for\naudio-visual supervised contrastive learning to leverage discriminative\ncontrastive samples to enforce cross-modal understanding. We show empirical\nresults that demonstrate the effectiveness of our benchmark. 
Furthermore,\nexperiments conducted on existing AVS datasets and on our new benchmark show\nthat our method achieves state-of-the-art (SOTA) segmentation accuracy.\n","authors":["Yuanhong Chen","Yuyuan Liu","Hu Wang","Fengbei Liu","Chong Wang","Helen Frazer","Gustavo Carneiro"],"pdf_url":"https://arxiv.org/pdf/2304.02970v7.pdf","comment":"Code is available at https://github.com/cyh-0/CAVP"},{"id":"http://arxiv.org/abs/2408.07393v1","updated":"2024-08-14T09:13:06Z","published":"2024-08-14T09:13:06Z","title":"Segment Using Just One Example","summary":" Semantic segmentation is an important topic in computer vision with many\nrelevant application in Earth observation. While supervised methods exist, the\nconstraints of limited annotated data has encouraged development of\nunsupervised approaches. However, existing unsupervised methods resemble\nclustering and cannot be directly mapped to explicit target classes. In this\npaper, we deal with single shot semantic segmentation, where one example for\nthe target class is provided, which is used to segment the target class from\nquery/test images. Our approach exploits recently popular Segment Anything\n(SAM), a promptable foundation model. We specifically design several techniques\nto automatically generate prompts from the only example/key image in such a way\nthat the segmentation is successfully achieved on a stitch or concatenation of\nthe example/key and query/test images. Proposed technique does not involve any\ntraining phase and just requires one example image to grasp the concept.\nFurthermore, no text-based prompt is required for the proposed method. We\nevaluated the proposed techniques on building and car classes.\n","authors":["Pratik Vora","Sudipan Saha"],"pdf_url":"https://arxiv.org/pdf/2408.07393v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.09828v2","updated":"2024-08-14T09:11:33Z","published":"2024-04-15T14:26:00Z","title":"Interaction as Explanation: A User Interaction-based Method for\n Explaining Image Classification Models","summary":" In computer vision, explainable AI (xAI) methods seek to mitigate the\n'black-box' problem by making the decision-making process of deep learning\nmodels more interpretable and transparent. Traditional xAI methods concentrate\non visualizing input features that influence model predictions, providing\ninsights primarily suited for experts. In this work, we present an\ninteraction-based xAI method that enhances user comprehension of image\nclassification models through their interaction. Thus, we developed a web-based\nprototype allowing users to modify images via painting and erasing, thereby\nobserving changes in classification results. Our approach enables users to\ndiscern critical features influencing the model's decision-making process,\naligning their mental models with the model's logic. Experiments conducted with\nfive images demonstrate the potential of the method to reveal feature\nimportance through user interaction. 
Our work contributes a novel perspective\nto xAI by centering on end-user engagement and understanding, paving the way\nfor more intuitive and accessible explainability in AI systems.\n","authors":["Hyeonggeun Yun"],"pdf_url":"https://arxiv.org/pdf/2404.09828v2.pdf","comment":"IJCAI 2024 (International Joint Conference on Artificial Intelligence\n 2024) Workshop on Explainable Artificial Intelligence (XAI)"},{"id":"http://arxiv.org/abs/2309.16414v3","updated":"2024-08-14T09:06:00Z","published":"2023-09-28T13:08:08Z","title":"AutoCLIP: Auto-tuning Zero-Shot Classifiers for Vision-Language Models","summary":" Classifiers built upon vision-language models such as CLIP have shown\nremarkable zero-shot performance across a broad range of image classification\ntasks. Prior work has studied different ways of automatically creating\ndescriptor sets for every class based on prompt templates, ranging from\nmanually engineered templates over templates obtained from a large language\nmodel to templates built from random words and characters. Up until now,\nderiving zero-shot classifiers from the respective encoded class descriptors\nhas remained nearly unchanged, i.e., classify to the class that maximizes\ncosine similarity between its averaged encoded class descriptors and the image\nencoding. However, weighing all class descriptors equally can be suboptimal\nwhen certain descriptors match visual clues on a given image better than\nothers. In this work, we propose AutoCLIP, a method for auto-tuning zero-shot\nclassifiers. AutoCLIP tunes per-image weights to each prompt template at\ninference time, based on statistics of class descriptor-image similarities.\nAutoCLIP is fully unsupervised, has only a minor additional computation\noverhead, and can be easily implemented in few lines of code. We show that\nAutoCLIP outperforms baselines across a broad range of vision-language models,\ndatasets, and prompt templates consistently and by up to 3 percent point\naccuracy.\n","authors":["Jan Hendrik Metzen","Piyapat Saranrittichai","Chaithanya Kumar Mummadi"],"pdf_url":"https://arxiv.org/pdf/2309.16414v3.pdf","comment":"accepted at TMLR, Camera Ready Version"},{"id":"http://arxiv.org/abs/2209.13232v4","updated":"2024-08-14T09:05:15Z","published":"2022-09-27T08:10:14Z","title":"A Survey on Graph Neural Networks and Graph Transformers in Computer\n Vision: A Task-Oriented Perspective","summary":" Graph Neural Networks (GNNs) have gained momentum in graph representation\nlearning and boosted the state of the art in a variety of areas, such as data\nmining (\\emph{e.g.,} social network analysis and recommender systems), computer\nvision (\\emph{e.g.,} object detection and point cloud learning), and natural\nlanguage processing (\\emph{e.g.,} relation extraction and sequence learning),\nto name a few. With the emergence of Transformers in natural language\nprocessing and computer vision, graph Transformers embed a graph structure into\nthe Transformer architecture to overcome the limitations of local neighborhood\naggregation while avoiding strict structural inductive biases. In this paper,\nwe present a comprehensive review of GNNs and graph Transformers in computer\nvision from a task-oriented perspective. Specifically, we divide their\napplications in computer vision into five categories according to the modality\nof input data, \\emph{i.e.,} 2D natural images, videos, 3D data, vision +\nlanguage, and medical images. In each category, we further divide the\napplications according to a set of vision tasks. 
Such a task-oriented taxonomy\nallows us to examine how each task is tackled by different GNN-based approaches\nand how well these approaches perform. Based on the necessary preliminaries, we\nprovide the definitions and challenges of the tasks, in-depth coverage of the\nrepresentative approaches, as well as discussions regarding insights,\nlimitations, and future directions.\n","authors":["Chaoqi Chen","Yushuang Wu","Qiyuan Dai","Hong-Yu Zhou","Mutian Xu","Sibei Yang","Xiaoguang Han","Yizhou Yu"],"pdf_url":"https://arxiv.org/pdf/2209.13232v4.pdf","comment":"Accepted by IEEE Transactions on Pattern Analysis and Machine\n Intelligence (T-PAMI)"},{"id":"http://arxiv.org/abs/2308.13273v2","updated":"2024-08-14T08:48:40Z","published":"2023-08-25T09:51:03Z","title":"Bridging the Gap: Sketch-Aware Interpolation Network for High-Quality\n Animation Sketch Inbetweening","summary":" Hand-drawn 2D animation workflow is typically initiated with the creation of\nsketch keyframes. Subsequent manual inbetweens are crafted for smoothness,\nwhich is a labor-intensive process and the prospect of automatic animation\nsketch interpolation has become highly appealing. Yet, common frame\ninterpolation methods are generally hindered by two key issues: 1) limited\ntexture and colour details in sketches, and 2) exaggerated alterations between\ntwo sketch keyframes. To overcome these issues, we propose a novel deep\nlearning method - Sketch-Aware Interpolation Network (SAIN). This approach\nincorporates multi-level guidance that formulates region-level correspondence,\nstroke-level correspondence and pixel-level dynamics. A multi-stream\nU-Transformer is then devised to characterize sketch inbetweening patterns\nusing these multi-level guides through the integration of self /\ncross-attention mechanisms. Additionally, to facilitate future research on\nanimation sketch inbetweening, we constructed a large-scale dataset - STD-12K,\ncomprising 30 sketch animation series in diverse artistic styles. Comprehensive\nexperiments on this dataset convincingly show that our proposed SAIN surpasses\nthe state-of-the-art interpolation methods.\n","authors":["Jiaming Shen","Kun Hu","Wei Bao","Chang Wen Chen","Zhiyong Wang"],"pdf_url":"https://arxiv.org/pdf/2308.13273v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03586v3","updated":"2024-08-14T08:29:55Z","published":"2023-08-07T13:44:44Z","title":"SSL-SoilNet: A Hybrid Transformer-based Framework with Self-Supervised\n Learning for Large-scale Soil Organic Carbon Prediction","summary":" Soil Organic Carbon (SOC) constitutes a fundamental component of terrestrial\necosystem functionality, playing a pivotal role in nutrient cycling,\nhydrological balance, and erosion mitigation. Precise mapping of SOC\ndistribution is imperative for the quantification of ecosystem services,\nnotably carbon sequestration and soil fertility enhancement. Digital soil\nmapping (DSM) leverages statistical models and advanced technologies, including\nmachine learning (ML), to accurately map soil properties, such as SOC,\nutilizing diverse data sources like satellite imagery, topography, remote\nsensing indices, and climate series. Within the domain of ML, self-supervised\nlearning (SSL), which exploits unlabeled data, has gained prominence in recent\nyears. 
This study introduces a novel approach that aims to learn the\ngeographical link between multimodal features via self-supervised contrastive\nlearning, employing pretrained Vision Transformers (ViT) for image inputs and\nTransformers for climate data, before fine-tuning the model with ground\nreference samples. The proposed approach has undergone rigorous testing on two\ndistinct large-scale datasets, with results indicating its superiority over\ntraditional supervised learning models, which depends solely on labeled data.\nFurthermore, through the utilization of various evaluation metrics (e.g., RMSE,\nMAE, CCC, etc.), the proposed model exhibits higher accuracy when compared to\nother conventional ML algorithms like random forest and gradient boosting. This\nmodel is a robust tool for predicting SOC and contributes to the advancement of\nDSM techniques, thereby facilitating land management and decision-making\nprocesses based on accurate information.\n","authors":["Nafiseh Kakhani","Moien Rangzan","Ali Jamali","Sara Attarchi","Seyed Kazem Alavipanah","Michael Mommert","Nikolaos Tziolas","Thomas Scholten"],"pdf_url":"https://arxiv.org/pdf/2308.03586v3.pdf","comment":"Accepted for publication in IEEE Transactions on Geoscience and\n Remote Sensing (TGRS)"},{"id":"http://arxiv.org/abs/2306.17466v4","updated":"2024-08-14T08:08:55Z","published":"2023-06-30T08:22:48Z","title":"MedAugment: Universal Automatic Data Augmentation Plug-in for Medical\n Image Analysis","summary":" Data augmentation (DA) has been widely leveraged in computer vision to\nalleviate the data shortage, whereas the DA in medical image analysis (MIA)\nfaces multiple challenges. The prevalent DA approaches in MIA encompass\nconventional DA, synthetic DA, and automatic DA. However, utilizing these\napproaches poses various challenges such as experience-driven design and\nintensive computation cost. Here, we propose an efficient and effective\nautomatic DA method termed MedAugment. We propose a pixel augmentation space\nand spatial augmentation space and exclude the operations that can break\nmedical details and features, such as severe color distortions or structural\nalterations that can compromise image diagnostic value. Besides, we propose a\nnovel sampling strategy by sampling a limited number of operations from the two\nspaces. Moreover, we present a hyperparameter mapping relationship to produce a\nrational augmentation level and make the MedAugment fully controllable using a\nsingle hyperparameter. These configurations settle the differences between\nnatural and medical images, such as high sensitivity to certain attributes such\nas brightness and posterize. Extensive experimental results on four\nclassification and four segmentation datasets demonstrate the superiority of\nMedAugment. Compared with existing approaches, the proposed MedAugment serves\nas a more suitable yet general processing pipeline for medical images without\nproducing color distortions or structural alterations and involving negligible\ncomputational overhead. We emphasize that our method can serve as a plugin for\narbitrary projects without any extra training stage, thereby holding the\npotential to make a valuable contribution to the medical field, particularly\nfor medical experts without a solid foundation in deep learning. 
Code is\navailable at https://github.com/NUS-Tim/MedAugment.\n","authors":["Zhaoshan Liu","Qiujie Lv","Yifan Li","Ziduo Yang","Lei Shen"],"pdf_url":"https://arxiv.org/pdf/2306.17466v4.pdf","comment":"29 pages, 8 figures"},{"id":"http://arxiv.org/abs/2407.07311v2","updated":"2024-08-14T08:02:39Z","published":"2024-07-10T02:11:01Z","title":"ViTime: A Visual Intelligence-Based Foundation Model for Time Series\n Forecasting","summary":" The success of large pretrained models in natural language processing (NLP)\nand computer vision (CV) has opened new avenues for constructing foundation\nmodels for time series forecasting (TSF). Traditional TSF foundation models\nrely heavily on numerical data fitting. In contrast, the human brain is\ninherently skilled at processing visual information and prefers to predict future\ntrends by observing visualized sequences. From a biomimetic perspective,\nutilizing models to directly process numerical sequences might not be the most\neffective route to achieving Artificial General Intelligence (AGI). This paper\nproposes ViTime, a novel Visual Intelligence-based foundation model for TSF.\nViTime overcomes the limitations of numerical time series data fitting by\nutilizing visual data processing paradigms and employs an innovative data\nsynthesis method during training, called Real Time Series (RealTS). Experiments\non a diverse set of previously unseen forecasting datasets demonstrate that\nViTime achieves state-of-the-art zero-shot performance, even surpassing the\nbest individually trained supervised models in some situations. These findings\nsuggest that visual intelligence can significantly enhance time series analysis\nand forecasting, paving the way for more advanced and versatile models in the\nfield. The code for our framework is accessible at\nhttps://github.com/IkeYang/ViTime.\n","authors":["Luoxiao Yang","Yun Wang","Xinqi Fan","Israel Cohen","Jingdong Chen","Yue Zhao","Zijun Zhang"],"pdf_url":"https://arxiv.org/pdf/2407.07311v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.07197v2","updated":"2024-08-14T07:49:55Z","published":"2023-06-12T15:54:52Z","title":"AROID: Improving Adversarial Robustness Through Online Instance-Wise\n Data Augmentation","summary":" Deep neural networks are vulnerable to adversarial examples. Adversarial\ntraining (AT) is an effective defense against adversarial examples. However, AT\nis prone to overfitting, which degrades robustness substantially. Recently, data\naugmentation (DA) was shown to be effective in mitigating robust overfitting if\nappropriately designed and optimized for AT. This work proposes a new method to\nautomatically learn online, instance-wise, DA policies to improve robust\ngeneralization for AT. This is the first automated DA method specific for\nrobustness. A novel policy learning objective, consisting of Vulnerability,\nAffinity and Diversity, is proposed and shown to be sufficiently effective and\nefficient to be practical for automatic DA generation during AT. Importantly,\nour method dramatically reduces the cost of policy search from the 5000 hours\nof AutoAugment and the 412 hours of IDBH to 9 hours, making automated DA more\npractical to use for adversarial robustness. This allows our method to\nefficiently explore a large search space for a more effective DA policy and\nevolve the policy as training progresses. Empirically, our method is shown to\noutperform all competitive DA methods across various model architectures and\ndatasets. 
Our DA policy reinforced vanilla AT to surpass several\nstate-of-the-art AT methods regarding both accuracy and robustness. It can also\nbe combined with those advanced AT methods to further boost robustness. Code\nand pre-trained models are available at https://github.com/TreeLLi/AROID.\n","authors":["Lin Li","Jianing Qiu","Michael Spratling"],"pdf_url":"https://arxiv.org/pdf/2306.07197v2.pdf","comment":"published at the IJCV in press"},{"id":"http://arxiv.org/abs/2408.07349v1","updated":"2024-08-14T07:47:25Z","published":"2024-08-14T07:47:25Z","title":"Automated Retinal Image Analysis and Medical Report Generation through\n Deep Learning","summary":" The increasing prevalence of retinal diseases poses a significant challenge\nto the healthcare system, as the demand for ophthalmologists surpasses the\navailable workforce. This imbalance creates a bottleneck in diagnosis and\ntreatment, potentially delaying critical care. Traditional methods of\ngenerating medical reports from retinal images rely on manual interpretation,\nwhich is time-consuming and prone to errors, further straining\nophthalmologists' limited resources. This thesis investigates the potential of\nArtificial Intelligence (AI) to automate medical report generation for retinal\nimages. AI can quickly analyze large volumes of image data, identifying subtle\npatterns essential for accurate diagnosis. By automating this process, AI\nsystems can greatly enhance the efficiency of retinal disease diagnosis,\nreducing doctors' workloads and enabling them to focus on more complex cases.\nThe proposed AI-based methods address key challenges in automated report\ngeneration: (1) Improved methods for medical keyword representation enhance the\nsystem's ability to capture nuances in medical terminology; (2) A multi-modal\ndeep learning approach captures interactions between textual keywords and\nretinal images, resulting in more comprehensive medical reports; (3) Techniques\nto enhance the interpretability of the AI-based report generation system,\nfostering trust and acceptance in clinical practice. These methods are\nrigorously evaluated using various metrics and achieve state-of-the-art\nperformance. This thesis demonstrates AI's potential to revolutionize retinal\ndisease diagnosis by automating medical report generation, ultimately improving\nclinical efficiency, diagnostic accuracy, and patient care.\n[https://github.com/Jhhuangkay/DeepOpht-Medical-Report-Generation-for-Retinal-Images-via-Deep-Models-and-Visual-Explanation]\n","authors":["Jia-Hong Huang"],"pdf_url":"https://arxiv.org/pdf/2408.07349v1.pdf","comment":"Ph.D. thesis, 124 pages"},{"id":"http://arxiv.org/abs/2408.04145v2","updated":"2024-08-14T07:43:06Z","published":"2024-08-08T01:12:21Z","title":"ComKD-CLIP: Comprehensive Knowledge Distillation for Contrastive\n Language-Image Pre-traning Model","summary":" Contrastive Language-Image Pre-training (CLIP) model excels in integrating\nsemantic information between images and text through contrastive learning\ntechniques. It has achieved remarkable performance in various multimodal tasks.\nHowever, the deployment of large CLIP models is hindered in resource-limited\nenvironments, while smaller models frequently fail to meet the performance\nbenchmarks required for practical applications. 
In this paper, we propose a\nnovel approach, ComKD-CLIP: Comprehensive Knowledge Distillation for\nContrastive Language-Image Pre-training Model, which aims to comprehensively\ndistill the knowledge from a large teacher CLIP model into a smaller student\nmodel, ensuring comparable performance with significantly reduced parameters.\nComKD-CLIP is composed of two key mechanisms: Image Feature Alignment (IFAlign)\nand Educational Attention (EduAttention). IFAlign makes the image features\nextracted by the student model closely match those extracted by the teacher\nmodel, enabling the student to learn the teacher's knowledge of extracting image\nfeatures. EduAttention explores the cross-relationships between text features\nextracted by the teacher model and image features extracted by the student\nmodel, enabling the student model to learn how the teacher model integrates\ntext-image features. In addition, ComKD-CLIP can refine the knowledge distilled\nfrom IFAlign and EduAttention by leveraging the text-image feature fusion\nresults of the teacher model, ensuring the student model accurately absorbs the\nteacher's knowledge. Extensive experiments conducted on 11 datasets have\ndemonstrated the superiority of the proposed method.\n","authors":["Yifan Chen","Xiaozhen Qiao","Zhe Sun","Xuelong Li"],"pdf_url":"https://arxiv.org/pdf/2408.04145v2.pdf","comment":"update"},{"id":"http://arxiv.org/abs/2408.07344v1","updated":"2024-08-14T07:37:24Z","published":"2024-08-14T07:37:24Z","title":"RTAT: A Robust Two-stage Association Tracker for Multi-Object Tracking","summary":" Data association is an essential part of tracking-by-detection based\nMulti-Object Tracking (MOT). Most trackers focus on how to design a better data\nassociation strategy to improve the tracking performance. Rule-based\nhandcrafted association methods are simple and highly efficient but lack\ngeneralization capability to deal with complex scenes. Learnt\nassociation methods can learn high-order contextual information to deal with\nvarious complex scenes, but they have the limitations of higher complexity and\ncost. To address these limitations, we propose a Robust Two-stage Association\nTracker, named RTAT. The first-stage association is performed between tracklets\nand detections to generate tracklets with high purity, and the second-stage\nassociation is performed between tracklets to form complete trajectories. For\nthe first-stage association, we use a simple data association strategy to\ngenerate tracklets with high purity by setting a low threshold for the matching\ncost in the assignment process. We conduct the tracklet association in the\nsecond stage based on the framework of a message-passing GNN. Our method models\ntracklet association as a series of edge classification problems in\nhierarchical graphs, which can recursively merge short tracklets into longer\nones. Our tracker RTAT ranks first on the test set of MOT17 and MOT20\nbenchmarks in most of the main MOT metrics: HOTA, IDF1, and AssA. 
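Returning to the ComKD-CLIP entry above: the abstract describes two distillation mechanisms, IFAlign (matching student image features to the teacher's) and EduAttention (cross-attention between teacher text features and student image features). The sketch below is only a rough, hedged reading of that description; the module names, tensor shapes, and the MSE/attention choices are assumptions, not the paper's implementation.

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

class IFAlign(nn.Module):
    """Pull student image features toward the (frozen) teacher's image features."""
    def __init__(self, student_dim: int, teacher_dim: int):
        super().__init__()
        self.proj = nn.Linear(student_dim, teacher_dim)  # match feature dimensions

    def forward(self, student_img_feat, teacher_img_feat):
        aligned = self.proj(student_img_feat)
        # simple alignment loss; the paper may use a different distance
        return F.mse_loss(aligned, teacher_img_feat.detach())

class EduAttention(nn.Module):
    """Cross-attention between teacher text features and student image features."""
    def __init__(self, dim: int, num_heads: int = 8):
        super().__init__()
        self.attn = nn.MultiheadAttention(dim, num_heads, batch_first=True)

    def forward(self, teacher_txt_feat, student_img_feat):
        # queries from teacher text tokens, keys/values from student image tokens (an assumption)
        fused, _ = self.attn(teacher_txt_feat, student_img_feat, student_img_feat)
        return fused

# dummy features: batch=4, 16 tokens, dim=512
student_img = torch.randn(4, 16, 512)
teacher_img = torch.randn(4, 16, 512)
teacher_txt = torch.randn(4, 16, 512)
align_loss = IFAlign(512, 512)(student_img, teacher_img)
fused = EduAttention(512)(teacher_txt, student_img)
print(align_loss.item(), fused.shape)
```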
We achieve\n67.2 HOTA, 84.7 IDF1, and 69.7 AssA on MOT17, and 66.2 HOTA, 82.5 IDF1, and\n68.1 AssA on MOT20.\n","authors":["Song Guo","Rujie Liu","Narishige Abe"],"pdf_url":"https://arxiv.org/pdf/2408.07344v1.pdf","comment":"ICPR2024"},{"id":"http://arxiv.org/abs/2408.07343v1","updated":"2024-08-14T07:37:07Z","published":"2024-08-14T07:37:07Z","title":"Gradient Alignment Improves Test-Time Adaptation for Medical Image\n Segmentation","summary":" Although recent years have witnessed significant advancements in medical\nimage segmentation, the pervasive issue of domain shift among medical images\nfrom diverse centres hinders the effective deployment of pre-trained models.\nMany Test-time Adaptation (TTA) methods have been proposed to address this\nissue by fine-tuning pre-trained models with test data during inference. These\nmethods, however, often suffer from less-satisfactory optimization due to\nsuboptimal optimization direction (dictated by the gradient) and fixed\nstep-size (predicated on the learning rate). In this paper, we propose the\nGradient alignment-based Test-time adaptation (GraTa) method to improve both\nthe gradient direction and learning rate in the optimization procedure. Unlike\nconventional TTA methods, which primarily optimize the pseudo gradient derived\nfrom a self-supervised objective, our method incorporates an auxiliary gradient\nwith the pseudo one to facilitate gradient alignment. Such gradient alignment\nenables the model to excavate the similarities between different gradients and\ncorrect the gradient direction to approximate the empirical gradient related to\nthe current segmentation task. Additionally, we design a dynamic learning rate\nbased on the cosine similarity between the pseudo and auxiliary gradients,\nthereby empowering the adaptive fine-tuning of pre-trained models on diverse\ntest data. Extensive experiments establish the effectiveness of the proposed\ngradient alignment and dynamic learning rate and substantiate the superiority\nof our GraTa method over other state-of-the-art TTA methods on a benchmark\nmedical image segmentation task. The code and weights of pre-trained source\nmodels will be available.\n","authors":["Ziyang Chen","Yiwen Ye","Yongsheng Pan","Yong Xia"],"pdf_url":"https://arxiv.org/pdf/2408.07343v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.07341v1","updated":"2024-08-14T07:34:12Z","published":"2024-08-14T07:34:12Z","title":"Robust Semi-supervised Multimodal Medical Image Segmentation via Cross\n Modality Collaboration","summary":" Multimodal learning leverages complementary information derived from\ndifferent modalities, thereby enhancing performance in medical image\nsegmentation. However, prevailing multimodal learning methods heavily rely on\nextensive well-annotated data from various modalities to achieve accurate\nsegmentation performance. This dependence often poses a challenge in clinical\nsettings due to limited availability of such data. Moreover, the inherent\nanatomical misalignment between different imaging modalities further\ncomplicates the endeavor to enhance segmentation performance. To address this\nproblem, we propose a novel semi-supervised multimodal segmentation framework\nthat is robust to scarce labeled data and misaligned modalities. Our framework\nemploys a novel cross modality collaboration strategy to distill\nmodality-independent knowledge, which is inherently associated with each\nmodality, and integrates this information into a unified fusion layer for\nfeature amalgamation. 
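Referring back to the GraTa entry above, which combines a pseudo gradient from a self-supervised objective with an auxiliary gradient and scales the step size by their cosine similarity, a minimal sketch of one such adaptation step might look as follows. The loss functions are user-supplied placeholders, and averaging the two gradients and clamping the similarity are illustrative assumptions.

```python
import torch
import torch.nn.functional as F

def gradient_aligned_step(model, pseudo_loss_fn, aux_loss_fn, batch, base_lr=1e-4):
    """One hedged test-time adaptation step: compare the pseudo and auxiliary
    gradients and modulate the learning rate by their cosine similarity."""
    params = [p for p in model.parameters() if p.requires_grad]

    # gradient of the self-supervised (pseudo) objective
    pseudo_grads = torch.autograd.grad(pseudo_loss_fn(model, batch), params)
    # gradient of an auxiliary objective (e.g., a consistency loss on augmented input)
    aux_grads = torch.autograd.grad(aux_loss_fn(model, batch), params)

    flat_p = torch.cat([g.flatten() for g in pseudo_grads])
    flat_a = torch.cat([g.flatten() for g in aux_grads])
    cos = F.cosine_similarity(flat_p, flat_a, dim=0)

    lr = base_lr * cos.clamp(min=0.0)  # shrink the step when the gradients disagree
    with torch.no_grad():
        for p, gp, ga in zip(params, pseudo_grads, aux_grads):
            p -= lr * (gp + ga) / 2    # simple average as the "aligned" direction
    return cos.item()
```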
With a channel-wise semantic consistency loss, our\nframework ensures alignment of modality-independent information from a\nfeature-wise perspective across modalities, thereby fortifying it against\nmisalignments in multimodal scenarios. Furthermore, our framework effectively\nintegrates contrastive consistent learning to regulate anatomical structures,\nfacilitating anatomical-wise prediction alignment on unlabeled data in\nsemi-supervised segmentation tasks. Our method achieves competitive performance\ncompared to other multimodal methods across three tasks: cardiac, abdominal\nmulti-organ, and thyroid-associated orbitopathy segmentations. It also\ndemonstrates outstanding robustness in scenarios involving scarce labeled data\nand misaligned modalities.\n","authors":["Xiaogen Zhon","Yiyou Sun","Min Deng","Winnie Chiu Wing Chu","Qi Dou"],"pdf_url":"https://arxiv.org/pdf/2408.07341v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.00816v3","updated":"2024-08-14T07:26:08Z","published":"2024-02-26T01:17:50Z","title":"Read and Think: An Efficient Step-wise Multimodal Language Model for\n Document Understanding and Reasoning","summary":" Understanding the contents of multimodal documents is essential to accurately\nextract relevant evidence and use it for reasoning. Existing document\nunderstanding models tend to generate answers with a single word or phrase\ndirectly, ignoring the source document's evidence and lacking interpretability.\nIn this work, we address the lack of step-wise capabilities through data\naugmentation and extension. Specifically, We use Multi-modal Large Language\nModels (MLLMs), which have strong visual understanding and reasoning abilities,\nas data generators to generate step-wise question-and-answer pairs for document\nimages and use a high-performance LLM as the error detector to filter out noisy\ndata. This step-wise data generation pipeline is implemented using both\ntemplate-based and few-shot methods. We then use the generated high-quality\ndata to train a humanized document understanding and reasoning model,\nspecifically designed to solve complex questions that require reasoning or\nmulti-hop question answering, dubbed DocAssistant. Experimental results\ndemonstrate the effectiveness and application value of step-wise generation,\nshowing a 5 improvement on InfoVQA with complex layouts and a 7 improvement on\nChartQA with complex reasoning, compared to directly generated answers. We hope\nour work highlights the potential of synthetic data and encourages further\nexploration of multi-modal document reasoning capabilities.\n","authors":["Jinxu Zhang"],"pdf_url":"https://arxiv.org/pdf/2403.00816v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.07337v1","updated":"2024-08-14T07:22:28Z","published":"2024-08-14T07:22:28Z","title":"KIND: Knowledge Integration and Diversion in Diffusion Models","summary":" Pre-trained models have become the preferred backbone due to the expansion of\nmodel parameters, with techniques like Parameter-Efficient Fine-Tuning (PEFTs)\ntypically fixing the parameters of these models. However, pre-trained models\nmay not always be optimal, especially when there are discrepancies between\ntraining tasks and target tasks, potentially resulting in negative transfer. To\naddress this, we introduce \\textbf{KIND}, which performs \\textbf{K}nowledge\n\\textbf{IN}tegration and \\textbf{D}iversion in diffusion models. 
KIND first\nintegrates knowledge by decomposing parameter matrices of models using $U$,\n$\\Sigma$, and $V$ matrices, formally inspired by singular value decomposition\n(SVD). Then it explicitly partitions the components of these matrices into\n\\textbf{learngenes} and \\textbf{tailors} to condense common and class-specific\nknowledge, respectively, through a class gate. In this way, KIND redefines\ntraditional pre-training methods by adjusting training objectives from\nmaximizing model performance on current tasks to condensing transferable common\nknowledge, leveraging the \\textit{Learngene} framework. We conduct experiments\non ImageNet-1K and compare KIND with PEFT and other learngene methods. Results\nindicate that KIND achieves state-of-the-art performance compared to other PEFT\nand learngene methods. Specifically, the images generated by KIND achieves more\nthan 6.54 and 1.07 decrease in FID and sFID on DiT-L/2, utilizing only 45.4M\ntrainable parameters and saving at least 35.4G FLOPs in computational cost.\n","authors":["Yucheng Xie","Fu Feng","Jing Wang","Xin Geng","Yong Rui"],"pdf_url":"https://arxiv.org/pdf/2408.07337v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.12437v3","updated":"2024-08-14T07:05:23Z","published":"2023-05-21T11:51:09Z","title":"SCP: Soft Conditional Prompt Learning for Aerial Video Action\n Recognition","summary":" We present a new learning approach, Soft Conditional Prompt Learning (SCP),\nwhich leverages the strengths of prompt learning for aerial video action\nrecognition. Our approach is designed to predict the action of each agent by\nhelping the models focus on the descriptions or instructions associated with\nactions in the input videos for aerial/robot visual perception. Our formulation\nsupports various prompts, including learnable prompts, auxiliary visual\ninformation, and large vision models to improve the recognition performance. We\npresent a soft conditional prompt method that learns to dynamically generate\nprompts from a pool of prompt experts under different video inputs. By sharing\nthe same objective with the task, our proposed SCP can optimize prompts that\nguide the model's predictions while explicitly learning input-invariant (prompt\nexperts pool) and input-specific (data-dependent) prompt knowledge. In\npractice, we observe a 3.17-10.2% accuracy improvement on the aerial video\ndatasets (Okutama, NECDrone), which consist of scenes with single-agent and\nmulti-agent actions. We further evaluate our approach on ground camera videos\nto verify the effectiveness and generalization and achieve a 1.0-3.6%\nimprovement on dataset SSV2. We integrate our method into the ROS2 as well.\n","authors":["Xijun Wang","Ruiqi Xian","Tianrui Guan","Fuxiao Liu","Dinesh Manocha"],"pdf_url":"https://arxiv.org/pdf/2305.12437v3.pdf","comment":"IROS2024"},{"id":"http://arxiv.org/abs/2408.03219v2","updated":"2024-08-14T06:25:50Z","published":"2024-08-06T14:25:23Z","title":"Learning to Learn without Forgetting using Attention","summary":" Continual learning (CL) refers to the ability to continually learn over time\nby accommodating new knowledge while retaining previously learned experience.\nWhile this concept is inherent in human learning, current machine learning\nmethods are highly prone to overwrite previously learned patterns and thus\nforget past experience. 
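For the KIND entry above, the decomposition step can be pictured as a plain SVD split of a weight matrix into a shared part and a residual part. This is a simplified, hedged reading: the rank split `k`, and treating the top singular directions as "learngenes" and the rest as "tailors", are illustrative assumptions; the class gate and training objective from the abstract are not shown.

```python
import torch

def split_learngene_tailor(weight: torch.Tensor, k: int):
    """Decompose a weight matrix with SVD and split its rank-1 components:
    the top-k singular directions act as a shared 'learngene' part,
    the remainder as a class-specific 'tailor' part (a simplified reading)."""
    U, S, Vh = torch.linalg.svd(weight, full_matrices=False)
    learngene = (U[:, :k] * S[:k]) @ Vh[:k, :]   # common, transferable component
    tailor    = (U[:, k:] * S[k:]) @ Vh[k:, :]   # residual, class-specific component
    return learngene, tailor

W = torch.randn(256, 512)
common, specific = split_learngene_tailor(W, k=64)
print(torch.allclose(common + specific, W, atol=1e-3))  # exact split up to numerics
```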
Instead, model parameters should be updated selectively\nand carefully, avoiding unnecessary forgetting while optimally leveraging\npreviously learned patterns to accelerate future learning. Since hand-crafting\neffective update mechanisms is difficult, we propose meta-learning a\ntransformer-based optimizer to enhance CL. This meta-learned optimizer uses\nattention to learn the complex relationships between model parameters across a\nstream of tasks, and is designed to generate effective weight updates for the\ncurrent task while preventing catastrophic forgetting on previously encountered\ntasks. Evaluations on benchmark datasets like SplitMNIST, RotatedMNIST, and\nSplitCIFAR-100 affirm the efficacy of the proposed approach in terms of both\nforward and backward transfer, even on small sets of labeled data, highlighting\nthe advantages of integrating a meta-learned optimizer within the continual\nlearning framework.\n","authors":["Anna Vettoruzzo","Joaquin Vanschoren","Mohamed-Rafik Bouguelia","Thorsteinn Rögnvaldsson"],"pdf_url":"https://arxiv.org/pdf/2408.03219v2.pdf","comment":"Published at the 3rd Conference on Lifelong Learning Agents (CoLLAs),\n 2024"},{"id":"http://arxiv.org/abs/2303.09735v2","updated":"2024-08-14T06:12:36Z","published":"2023-03-17T02:38:44Z","title":"SRFormerV2: Taking a Closer Look at Permuted Self-Attention for Image\n Super-Resolution","summary":" Previous works have shown that increasing the window size for\nTransformer-based image super-resolution models (e.g., SwinIR) can\nsignificantly improve the model performance. Still, the computation overhead is\nalso considerable when the window size gradually increases. In this paper, we\npresent SRFormer, a simple but novel method that can enjoy the benefit of large\nwindow self-attention but introduces even less computational burden. The core\nof our SRFormer is the permuted self-attention (PSA), which strikes an\nappropriate balance between the channel and spatial information for\nself-attention. Without any bells and whistles, we show that our SRFormer\nachieves a 33.86dB PSNR score on the Urban100 dataset, which is 0.46dB higher\nthan that of SwinIR but uses fewer parameters and computations. In addition, we\nalso attempt to scale up the model by further enlarging the window size and\nchannel numbers to explore the potential of Transformer-based models.\nExperiments show that our scaled model, named SRFormerV2, can further improve\nthe results and achieves state-of-the-art. We hope our simple and effective\napproach could be useful for future research in super-resolution model design.\nThe homepage is https://z-yupeng.github.io/SRFormer/.\n","authors":["Yupeng Zhou","Zhen Li","Chun-Le Guo","Li Liu","Ming-Ming Cheng","Qibin Hou"],"pdf_url":"https://arxiv.org/pdf/2303.09735v2.pdf","comment":"Previous version has been accepted by ICCV2023"},{"id":"http://arxiv.org/abs/2408.04831v2","updated":"2024-08-14T05:20:44Z","published":"2024-08-09T03:09:22Z","title":"Self-augmented Gaussian Splatting with Structure-aware Masks for\n Sparse-view 3D Reconstruction","summary":" Sparse-view 3D reconstruction stands as a formidable challenge in computer\nvision, aiming to build complete three-dimensional models from a limited array\nof viewing perspectives. This task confronts several difficulties: 1) the\nlimited number of input images that lack consistent information; 2) dependence\non the quality of input images; and 3) the substantial size of model\nparameters. 
To address these challenges, we propose a self-augmented\ncoarse-to-fine Gaussian splatting paradigm, enhanced with a structure-aware\nmask, for sparse-view 3D reconstruction. In particular, our method initially\nemploys a coarse Gaussian model to obtain a basic 3D representation from\nsparse-view inputs. Subsequently, we develop a fine Gaussian network to enhance\nconsistent and detailed representation of the output with both 3D geometry\naugmentation and perceptual view augmentation. During training, we design a\nstructure-aware masking strategy to further improve the model's robustness\nagainst sparse inputs and noise.Experimental results on the MipNeRF360 and\nOmniObject3D datasets demonstrate that the proposed method achieves\nstate-of-the-art performances for sparse input views in both perceptual quality\nand efficiency.\n","authors":["Lingbei Meng","Bi'an Du","Wei Hu"],"pdf_url":"https://arxiv.org/pdf/2408.04831v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.07303v1","updated":"2024-08-14T05:18:43Z","published":"2024-08-14T05:18:43Z","title":"Enhancing Visual Question Answering through Ranking-Based Hybrid\n Training and Multimodal Fusion","summary":" Visual Question Answering (VQA) is a challenging task that requires systems\nto provide accurate answers to questions based on image content. Current VQA\nmodels struggle with complex questions due to limitations in capturing and\nintegrating multimodal information effectively. To address these challenges, we\npropose the Rank VQA model, which leverages a ranking-inspired hybrid training\nstrategy to enhance VQA performance. The Rank VQA model integrates high-quality\nvisual features extracted using the Faster R-CNN model and rich semantic text\nfeatures obtained from a pre-trained BERT model. These features are fused\nthrough a sophisticated multimodal fusion technique employing multi-head\nself-attention mechanisms. Additionally, a ranking learning module is\nincorporated to optimize the relative ranking of answers, thus improving answer\naccuracy. The hybrid training strategy combines classification and ranking\nlosses, enhancing the model's generalization ability and robustness across\ndiverse datasets. Experimental results demonstrate the effectiveness of the\nRank VQA model. Our model significantly outperforms existing state-of-the-art\nmodels on standard VQA datasets, including VQA v2.0 and COCO-QA, in terms of\nboth accuracy and Mean Reciprocal Rank (MRR). The superior performance of Rank\nVQA is evident in its ability to handle complex questions that require\nunderstanding nuanced details and making sophisticated inferences from the\nimage and text. 
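A hedged sketch of the hybrid objective described in the Rank VQA abstract above, combining a classification loss over candidate answers with a margin-based ranking loss; the tensor shapes, margin, and weighting factor `alpha` are illustrative assumptions rather than the paper's settings.

```python
import torch
import torch.nn as nn

ce_loss = nn.CrossEntropyLoss()
rank_loss = nn.MarginRankingLoss(margin=0.2)

def hybrid_vqa_loss(answer_logits, target, pos_scores, neg_scores, alpha=0.5):
    """answer_logits: (B, num_answers) classification scores.
    pos_scores / neg_scores: (B,) ranking scores for correct vs. distractor answers."""
    cls = ce_loss(answer_logits, target)
    # the correct answer should be ranked above the distractor by a margin
    rnk = rank_loss(pos_scores, neg_scores, torch.ones_like(pos_scores))
    return alpha * cls + (1 - alpha) * rnk

logits = torch.randn(8, 3000)            # 8 questions, 3000 candidate answers
target = torch.randint(0, 3000, (8,))
pos, neg = torch.randn(8), torch.randn(8)
print(hybrid_vqa_loss(logits, target, pos, neg))
```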
This work highlights the effectiveness of a ranking-based\nhybrid training strategy in improving VQA performance and lays the groundwork\nfor further research in multimodal learning methods.\n","authors":["Peiyuan Chen","Zecheng Zhang","Yiping Dong","Li Zhou","Han Wang"],"pdf_url":"https://arxiv.org/pdf/2408.07303v1.pdf","comment":"Visual Question Answering, Rank VQA, Faster R-CNN, BERT, Multimodal\n Fusion, Ranking Learning, Hybrid Training Strategy"},{"id":"http://arxiv.org/abs/2408.06891v2","updated":"2024-08-14T05:16:46Z","published":"2024-08-13T13:38:32Z","title":"Automatic Feature Recognition and Dimensional Attributes Extraction From\n CAD Models for Hybrid Additive-Subtractive Manufacturing","summary":" The integration of Computer-Aided Design (CAD), Computer-Aided Process\nPlanning (CAPP), and Computer-Aided Manufacturing (CAM) plays a crucial role in\nmodern manufacturing, facilitating seamless transitions from digital designs to\nphysical products. However, a significant challenge within this integration is\nthe Automatic Feature Recognition (AFR) of CAD models, especially in the\ncontext of hybrid manufacturing that combines subtractive and additive\nmanufacturing processes. Traditional AFR methods, focused mainly on the\nidentification of subtractive (machined) features including holes, fillets,\nchamfers, pockets, and slots, fail to recognize features pertinent to additive\nmanufacturing. Furthermore, the traditional methods fall short in accurately\nextracting geometric dimensions and orientations, which are also key factors\nfor effective manufacturing process planning. This paper presents a novel\napproach for creating a synthetic CAD dataset that encompasses features\nrelevant to both additive and subtractive machining through Python Open\nCascade. The Hierarchical Graph Convolutional Neural Network (HGCNN) model is\nimplemented to accurately identify the composite additive-subtractive features\nwithin the synthetic CAD dataset. The key novelty and contribution of the\nproposed methodology lie in its ability to recognize a wide range of\nmanufacturing features, and precisely extracting their dimensions,\norientations, and stock sizes. The proposed model demonstrates remarkable\nfeature recognition accuracy exceeding 97% and a dimension extraction accuracy\nof 100% for identified features. Therefore, the proposed methodology enhances\nthe integration of CAD, CAPP, and CAM within hybrid manufacturing by providing\nprecise feature recognition and dimension extraction. It facilitates improved\nmanufacturing process planning, by enabling more informed decision-making.\n","authors":["Muhammad Tayyab Khan","Wenhe Feng","Lequn Chen","Ye Han Ng","Nicholas Yew Jin Tan","Seung Ki Moon"],"pdf_url":"https://arxiv.org/pdf/2408.06891v2.pdf","comment":"10 pages, 12 figures. This paper has been accepted for presentation\n at the ASME IDETC-CIE 2024 conference"},{"id":"http://arxiv.org/abs/2405.12223v3","updated":"2024-08-14T04:56:58Z","published":"2024-04-06T03:02:47Z","title":"Cascaded Multi-path Shortcut Diffusion Model for Medical Image\n Translation","summary":" Image-to-image translation is a vital component in medical imaging\nprocessing, with many uses in a wide range of imaging modalities and clinical\nscenarios. Previous methods include Generative Adversarial Networks (GANs) and\nDiffusion Models (DMs), which offer realism but suffer from instability and\nlack uncertainty estimation. 
Even though both GAN and DM methods have\nindividually exhibited their capability in medical image translation tasks, the\npotential of combining a GAN and DM to further improve translation performance\nand to enable uncertainty estimation remains largely unexplored. In this work,\nwe address these challenges by proposing a Cascade Multi-path Shortcut\nDiffusion Model (CMDM) for high-quality medical image translation and\nuncertainty estimation. To reduce the required number of iterations and ensure\nrobust performance, our method first obtains a conditional GAN-generated prior\nimage that will be used for the efficient reverse translation with a DM in the\nsubsequent step. Additionally, a multi-path shortcut diffusion strategy is\nemployed to refine translation results and estimate uncertainty. A cascaded\npipeline further enhances translation quality, incorporating residual averaging\nbetween cascades. We collected three different medical image datasets with two\nsub-tasks for each dataset to test the generalizability of our approach. Our\nexperimental results found that CMDM can produce high-quality translations\ncomparable to state-of-the-art methods while providing reasonable uncertainty\nestimations that correlate well with the translation error.\n","authors":["Yinchi Zhou","Tianqi Chen","Jun Hou","Huidong Xie","Nicha C. Dvornek","S. Kevin Zhou","David L. Wilson","James S. Duncan","Chi Liu","Bo Zhou"],"pdf_url":"https://arxiv.org/pdf/2405.12223v3.pdf","comment":"Accepted at Medical Image Analysis Journal"},{"id":"http://arxiv.org/abs/2312.13764v2","updated":"2024-08-14T04:05:23Z","published":"2023-12-21T11:43:41Z","title":"A Semantic Space is Worth 256 Language Descriptions: Make Stronger\n Segmentation Models with Descriptive Properties","summary":" This paper introduces ProLab, a novel approach using property-level label\nspace for creating strong interpretable segmentation models. Instead of relying\nsolely on category-specific annotations, ProLab uses descriptive properties\ngrounded in common sense knowledge for supervising segmentation models. It is\nbased on two core designs. First, we employ Large Language Models (LLMs) and\ncarefully crafted prompts to generate descriptions of all involved categories\nthat carry meaningful common sense knowledge and follow a structured format.\nSecond, we introduce a description embedding model preserving semantic\ncorrelation across descriptions and then cluster them into a set of descriptive\nproperties (e.g., 256) using K-Means. These properties are based on\ninterpretable common sense knowledge consistent with theories of human\nrecognition. We empirically show that our approach makes segmentation models\nperform stronger on five classic benchmarks (e.g., ADE20K, COCO-Stuff, Pascal\nContext, Cityscapes, and BDD). Our method also shows better scalability with\nextended training steps than category-level supervision. Our interpretable\nsegmentation framework also emerges with the generalization ability to segment\nout-of-domain or unknown categories using only in-domain descriptive\nproperties. Code is available at https://github.com/lambert-x/ProLab.\n","authors":["Junfei Xiao","Ziqi Zhou","Wenxuan Li","Shiyi Lan","Jieru Mei","Zhiding Yu","Alan Yuille","Yuyin Zhou","Cihang Xie"],"pdf_url":"https://arxiv.org/pdf/2312.13764v2.pdf","comment":"Preprint. 
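The ProLab entry above clusters description embeddings into a fixed number of descriptive properties (e.g., 256) with K-Means. A minimal sketch of that step, assuming the LLM-generated descriptions have already been embedded by some text encoder; the random array below is only a stand-in for those embeddings.

```python
import numpy as np
from sklearn.cluster import KMeans

# Stand-in for description embeddings produced by a text/description encoder.
rng = np.random.default_rng(0)
description_embeddings = rng.normal(size=(2000, 768)).astype(np.float32)

num_properties = 256  # the "e.g., 256" descriptive properties mentioned in the abstract
kmeans = KMeans(n_clusters=num_properties, n_init=10, random_state=0)
property_ids = kmeans.fit_predict(description_embeddings)

# Each description is now assigned to one property cluster; a category's label
# vector can then be built from the properties of its descriptions.
print(property_ids.shape, np.bincount(property_ids).max())
```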
Code is available at https://github.com/lambert-x/ProLab"},{"id":"http://arxiv.org/abs/2403.08651v5","updated":"2024-08-14T03:56:01Z","published":"2024-03-13T16:06:07Z","title":"HAIFIT: Human-to-AI Fashion Image Translation","summary":" In the realm of fashion design, sketches serve as the canvas for expressing\nan artist's distinctive drawing style and creative vision, capturing intricate\ndetails like stroke variations and texture nuances. The advent of\nsketch-to-image cross-modal translation technology has notably aided designers.\nHowever, existing methods often compromise these sketch details during image\ngeneration, resulting in images that deviate from the designer's intended\nconcept. This limitation hampers the ability to offer designers a precise\npreview of the final output. To overcome this challenge, we introduce HAIFIT, a\nnovel approach that transforms sketches into high-fidelity, lifelike clothing\nimages by integrating multi-scale features and capturing extensive feature map\ndependencies from diverse perspectives. Through extensive qualitative and\nquantitative evaluations conducted on our self-collected dataset, our method\ndemonstrates superior performance compared to existing methods in generating\nphotorealistic clothing images. Our method excels in preserving the distinctive\nstyle and intricate details essential for fashion design applications. In\naddition, our method also has obvious advantages in model training and\ninference speed, contributing to reducing designers' time costs and improving\ndesign efficiency.\n","authors":["Jianan Jiang","Xinglin Li","Weiren Yu","Di Wu"],"pdf_url":"https://arxiv.org/pdf/2403.08651v5.pdf","comment":"10 pages,8 figures"},{"id":"http://arxiv.org/abs/2408.07269v1","updated":"2024-08-14T03:35:11Z","published":"2024-08-14T03:35:11Z","title":"Image-Based Leopard Seal Recognition: Approaches and Challenges in\n Current Automated Systems","summary":" This paper examines the challenges and advancements in recognizing seals\nwithin their natural habitats using conventional photography, underscored by\nthe emergence of machine learning technologies. We used the leopard seal,\n\\emph{Hydrurga leptonyx}, a key species within Antarctic ecosystems, to review\nthe different available methods found. As apex predators, Leopard seals are\ncharacterized by their significant ecological role and elusive nature so\nstudying them is crucial to understand the health of their ecosystem.\nTraditional methods of monitoring seal species are often constrained by the\nlabor-intensive and time-consuming processes required for collecting data,\ncompounded by the limited insights these methods provide. The advent of machine\nlearning, particularly through the application of vision transformers, heralds\na new era of efficiency and precision in species monitoring. 
By leveraging\nstate-of-the-art approaches in detection, segmentation, and recognition within\ndigital imaging, this paper presents a synthesis of the current landscape,\nhighlighting both the cutting-edge methodologies and the predominant challenges\nfaced in accurately identifying seals through photographic data.\n","authors":["Jorge Yero Salazar","Pablo Rivas","Renato Borras-Chavez","Sarah Kienle"],"pdf_url":"https://arxiv.org/pdf/2408.07269v1.pdf","comment":"28th International Conference on Image Processing, Computer Vision, &\n Pattern Recognition (IPCV'24), Las Vegas, USA"},{"id":"http://arxiv.org/abs/2305.07895v6","updated":"2024-08-14T03:30:14Z","published":"2023-05-13T11:28:37Z","title":"On the Hidden Mystery of OCR in Large Multimodal Models","summary":" Large models have recently played a dominant role in natural language\nprocessing and multimodal vision-language learning. However, their\neffectiveness in text-related visual tasks remains relatively unexplored. In\nthis paper, we conducted a comprehensive evaluation of Large Multimodal Models,\nsuch as GPT4V and Gemini, in various text-related visual tasks including Text\nRecognition, Scene Text-Centric Visual Question Answering (VQA),\nDocument-Oriented VQA, Key Information Extraction (KIE), and Handwritten\nMathematical Expression Recognition (HMER). To facilitate the assessment of\nOptical Character Recognition (OCR) capabilities in Large Multimodal Models, we\npropose OCRBench, a comprehensive evaluation benchmark. OCRBench contains 29\ndatasets, making it the most comprehensive OCR evaluation benchmark available.\nFurthermore, our study reveals both the strengths and weaknesses of these\nmodels, particularly in handling multilingual text, handwritten text,\nnon-semantic text, and mathematical expression recognition. Most importantly,\nthe baseline results presented in this study could provide a foundational\nframework for the conception and assessment of innovative strategies targeted\nat enhancing zero-shot multimodal techniques. The evaluation pipeline and\nbenchmark are available at https://github.com/Yuliang-Liu/MultimodalOCR.\n","authors":["Yuliang Liu","Zhang Li","Mingxin Huang","Biao Yang","Wenwen Yu","Chunyuan Li","Xucheng Yin","Cheng-lin Liu","Lianwen Jin","Xiang Bai"],"pdf_url":"https://arxiv.org/pdf/2305.07895v6.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.00429v2","updated":"2024-08-14T03:28:01Z","published":"2023-08-01T10:15:15Z","title":"Patch-wise Auto-Encoder for Visual Anomaly Detection","summary":" Anomaly detection without priors of the anomalies is challenging. In the\nfield of unsupervised anomaly detection, traditional auto-encoder (AE) tends to\nfail based on the assumption that by training only on normal images, the model\nwill not be able to reconstruct abnormal images correctly. On the contrary, we\npropose a novel patch-wise auto-encoder (Patch AE) framework, which aims at\nenhancing the reconstruction ability of AE to anomalies instead of weakening\nit. Each patch of image is reconstructed by corresponding spatially distributed\nfeature vector of the learned feature representation, i.e., patch-wise\nreconstruction, which ensures anomaly-sensitivity of AE. Our method is simple\nand efficient. It advances the state-of-the-art performances on Mvtec AD\nbenchmark, which proves the effectiveness of our model. 
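For the patch-wise auto-encoder entry above, one simple way to picture patch-wise reconstruction is to score each patch by its reconstruction error. The sketch below is a hedged illustration of that scoring step only; the encoder/decoder and the exact error measure used by the paper are not reproduced here.

```python
import torch
import torch.nn.functional as F

def patchwise_anomaly_map(image, reconstruction, patch=16):
    """Average squared reconstruction error per non-overlapping patch.
    image / reconstruction: (B, C, H, W) with H, W divisible by `patch`."""
    err = (image - reconstruction) ** 2            # per-pixel error
    err = err.mean(dim=1, keepdim=True)            # average over channels
    # average-pool over each patch to get one anomaly score per patch
    return F.avg_pool2d(err, kernel_size=patch, stride=patch)

img = torch.rand(1, 3, 256, 256)
rec = img + 0.05 * torch.randn_like(img)           # pretend auto-encoder output
amap = patchwise_anomaly_map(img, rec)             # (1, 1, 16, 16) patch scores
print(amap.shape, amap.max().item())
```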
It shows great\npotential in practical industrial application scenarios.\n","authors":["Yajie Cui","Zhaoxiang Liu","Shiguo Lian"],"pdf_url":"https://arxiv.org/pdf/2308.00429v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.07266v1","updated":"2024-08-14T03:18:04Z","published":"2024-08-14T03:18:04Z","title":"Enhanced Scale-aware Depth Estimation for Monocular Endoscopic Scenes\n with Geometric Modeling","summary":" Scale-aware monocular depth estimation poses a significant challenge in\ncomputer-aided endoscopic navigation. However, existing depth estimation\nmethods that do not consider the geometric priors struggle to learn the\nabsolute scale from training with monocular endoscopic sequences. Additionally,\nconventional methods face difficulties in accurately estimating details on\ntissue and instruments boundaries. In this paper, we tackle these problems by\nproposing a novel enhanced scale-aware framework that only uses monocular\nimages with geometric modeling for depth estimation. Specifically, we first\npropose a multi-resolution depth fusion strategy to enhance the quality of\nmonocular depth estimation. To recover the precise scale between relative depth\nand real-world values, we further calculate the 3D poses of instruments in the\nendoscopic scenes by algebraic geometry based on the image-only geometric\nprimitives (i.e., boundaries and tip of instruments). Afterwards, the 3D poses\nof surgical instruments enable the scale recovery of relative depth maps. By\ncoupling scale factors and relative depth estimation, the scale-aware depth of\nthe monocular endoscopic scenes can be estimated. We evaluate the pipeline on\nin-house endoscopic surgery videos and simulated data. The results demonstrate\nthat our method can learn the absolute scale with geometric modeling and\naccurately estimate scale-aware depth for monocular scenes.\n","authors":["Ruofeng Wei","Bin Li","Kai Chen","Yiyao Ma","Yunhui Liu","Qi Dou"],"pdf_url":"https://arxiv.org/pdf/2408.07266v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03736v2","updated":"2024-08-14T03:06:56Z","published":"2024-04-04T18:05:18Z","title":"SC4D: Sparse-Controlled Video-to-4D Generation and Motion Transfer","summary":" Recent advances in 2D/3D generative models enable the generation of dynamic\n3D objects from a single-view video. Existing approaches utilize score\ndistillation sampling to form the dynamic scene as dynamic NeRF or dense 3D\nGaussians. However, these methods struggle to strike a balance among reference\nview alignment, spatio-temporal consistency, and motion fidelity under\nsingle-view conditions due to the implicit nature of NeRF or the intricate\ndense Gaussian motion prediction. To address these issues, this paper proposes\nan efficient, sparse-controlled video-to-4D framework named SC4D, that\ndecouples motion and appearance to achieve superior video-to-4D generation.\nMoreover, we introduce Adaptive Gaussian (AG) initialization and Gaussian\nAlignment (GA) loss to mitigate shape degeneration issue, ensuring the fidelity\nof the learned motion and shape. Comprehensive experimental results demonstrate\nthat our method surpasses existing methods in both quality and efficiency. 
In\naddition, facilitated by the disentangled modeling of motion and appearance of\nSC4D, we devise a novel application that seamlessly transfers the learned\nmotion onto a diverse array of 4D entities according to textual descriptions.\n","authors":["Zijie Wu","Chaohui Yu","Yanqin Jiang","Chenjie Cao","Fan Wang","Xiang Bai"],"pdf_url":"https://arxiv.org/pdf/2404.03736v2.pdf","comment":"Accepted by ECCV2024! Project Page: https://sc4d.github.io/ Code is\n available at: https://github.com/JarrentWu1031/SC4D"},{"id":"http://arxiv.org/abs/2408.07264v1","updated":"2024-08-14T03:06:04Z","published":"2024-08-14T03:06:04Z","title":"Lesion-aware network for diabetic retinopathy diagnosis","summary":" Deep learning brought boosts to auto diabetic retinopathy (DR) diagnosis,\nthus, greatly helping ophthalmologists for early disease detection, which\ncontributes to preventing disease deterioration that may eventually lead to\nblindness. It has been proved that convolutional neural network (CNN)-aided\nlesion identifying or segmentation benefits auto DR screening. The key to\nfine-grained lesion tasks mainly lies in: (1) extracting features being both\nsensitive to tiny lesions and robust against DR-irrelevant interference, and\n(2) exploiting and re-using encoded information to restore lesion locations\nunder extremely imbalanced data distribution. To this end, we propose a\nCNN-based DR diagnosis network with attention mechanism involved, termed\nlesion-aware network, to better capture lesion information from imbalanced\ndata. Specifically, we design the lesion-aware module (LAM) to capture\nnoise-like lesion areas across deeper layers, and the feature-preserve module\n(FPM) to assist shallow-to-deep feature fusion. Afterward, the proposed\nlesion-aware network (LANet) is constructed by embedding the LAM and FPM into\nthe CNN decoders for DR-related information utilization. The proposed LANet is\nthen further extended to a DR screening network by adding a classification\nlayer. Through experiments on three public fundus datasets with pixel-level\nannotations, our method outperforms the mainstream methods with an area under\ncurve of 0.967 in DR screening, and increases the overall average precision by\n7.6%, 2.1%, and 1.2% in lesion segmentation on three datasets. Besides, the\nablation study validates the effectiveness of the proposed sub-modules.\n","authors":["Xue Xia","Kun Zhan","Yuming Fang","Wenhui Jiang","Fei Shen"],"pdf_url":"https://arxiv.org/pdf/2408.07264v1.pdf","comment":"This is submitted version wihout improvements by reviewers. The final\n version is published on International Journal of Imaging Systems and\n Techonology (https://onlinelibrary.wiley.com/doi/10.1002/ima.22933)"},{"id":"http://arxiv.org/abs/2408.07262v1","updated":"2024-08-14T02:57:38Z","published":"2024-08-14T02:57:38Z","title":"Ensemble architecture in polyp segmentation","summary":" In this research, we revisit the architecture of semantic segmentation and\nevaluate the models excelling in polyp segmentation. We introduce an integrated\nframework that harnesses the advantages of different models to attain an\noptimal outcome. More specifically, we fuse the learned features from\nconvolutional and transformer models for prediction, and we view this approach\nas an ensemble technique to enhance model performance. Our experiments on polyp\nsegmentation reveal that the proposed architecture surpasses other top models,\nexhibiting improved learning capacity and resilience. 
The code is available at\nhttps://github.com/HuangDLab/EnFormer.\n","authors":["Hao-Yun Hsu","Yi-Ching Cheng","Guan-Hua Huang"],"pdf_url":"https://arxiv.org/pdf/2408.07262v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.13777v2","updated":"2024-08-14T02:51:54Z","published":"2023-08-26T06:03:06Z","title":"Self-Supervised Scalable Deep Compressed Sensing","summary":" Compressed sensing (CS) is a promising tool for reducing sampling costs.\nCurrent deep neural network (NN)-based CS methods face the challenges of\ncollecting labeled measurement-ground truth (GT) data and generalizing to real\napplications. This paper proposes a novel $\\mathbf{S}$elf-supervised\ns$\\mathbf{C}$alable deep CS method, comprising a deep $\\mathbf{L}$earning\nscheme called $\\mathbf{SCL}$ and a family of $\\mathbf{Net}$works named\n$\\mathbf{SCNet}$, which does not require GT and can handle arbitrary sampling\nratios and matrices once trained on a partial measurement set. Our SCL contains\na dual-domain loss and a four-stage recovery strategy. The former encourages a\ncross-consistency on two measurement parts and a sampling-reconstruction\ncycle-consistency regarding arbitrary ratios and matrices to maximize\ndata/information utilization. The latter can progressively leverage common\nsignal prior in external measurements and internal characteristics of test\nsamples and learned NNs to improve accuracy. SCNet combines both the explicit\nguidance from optimization algorithms with implicit regularization from\nadvanced NN blocks to learn a collaborative signal representation. Our\ntheoretical analyses and experiments on simulated and real captured data,\ncovering 1-/2-/3-D natural and scientific signals, demonstrate the\neffectiveness, superior performance, flexibility, and generalization ability of\nour method over existing self-supervised methods and its significant potential\nin competing against state-of-the-art supervised methods. Code is available at\nhttps://github.com/Guaishou74851/SCNet.\n","authors":["Bin Chen","Xuanyu Zhang","Shuai Liu","Yongbing Zhang","Jian Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.13777v2.pdf","comment":"Accepted by Internaltional Journal of Computer Vision"},{"id":"http://arxiv.org/abs/2403.06189v3","updated":"2024-08-14T02:38:55Z","published":"2024-03-10T12:11:34Z","title":"Harmonious Group Choreography with Trajectory-Controllable Diffusion","summary":" Creating group choreography from music has gained attention in cultural\nentertainment and virtual reality, aiming to coordinate visually cohesive and\ndiverse group movements. Despite increasing interest, recent works face\nchallenges in achieving aesthetically appealing choreography, primarily for two\nkey issues: multi-dancer collision and single-dancer foot slide. To address\nthese issues, we propose a Trajectory-Controllable Diffusion (TCDiff), a novel\napproach that harnesses non-overlapping trajectories to facilitate coherent\ndance movements. Specifically, to tackle dancer collisions, we introduce a\nDance-Beat Navigator capable of generating trajectories for multiple dancers\nbased on the music, complemented by a Distance-Consistency loss to maintain\nappropriate spacing among trajectories within a reasonable threshold. To\nmitigate foot sliding, we present a Footwork Adaptor that utilizes trajectory\ndisplacement from adjacent frames to enable flexible footwork, coupled with a\nRelative Forward-Kinematic loss to adjust the positioning of individual\ndancers' root nodes and joints. 
Extensive experiments demonstrate that our\nmethod achieves state-of-the-art results.\n","authors":["Yuqin Dai","Wanlu Zhu","Ronghui Li","Zeping Ren","Xiangzheng Zhou","Xiu Li","Jun Li","Jian Yang"],"pdf_url":"https://arxiv.org/pdf/2403.06189v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.07259v1","updated":"2024-08-14T02:26:46Z","published":"2024-08-14T02:26:46Z","title":"GRIF-DM: Generation of Rich Impression Fonts using Diffusion Models","summary":" Fonts are integral to creative endeavors, design processes, and artistic\nproductions. The appropriate selection of a font can significantly enhance\nartwork and endow advertisements with a higher level of expressivity. Despite\nthe availability of numerous diverse font designs online, traditional\nretrieval-based methods for font selection are increasingly being supplanted by\ngeneration-based approaches. These newer methods offer enhanced flexibility,\ncatering to specific user preferences and capturing unique stylistic\nimpressions. However, current impression font techniques based on Generative\nAdversarial Networks (GANs) necessitate the utilization of multiple auxiliary\nlosses to provide guidance during generation. Furthermore, these methods\ncommonly employ weighted summation for the fusion of impression-related\nkeywords. This leads to generic vectors with the addition of more impression\nkeywords, ultimately lacking in detail generation capacity. In this paper, we\nintroduce a diffusion-based method, termed GRIF-DM, to generate fonts that\nvividly embody specific impressions, utilizing an input consisting of a single\nletter and a set of descriptive impression keywords. The core innovation of\nGRIF-DM lies in the development of dual cross-attention modules, which\nprocess the characteristics of the letters and impression keywords\nindependently but synergistically, ensuring effective integration of both types\nof information. Our experimental results, conducted on the MyFonts dataset,\naffirm that this method is capable of producing realistic, vibrant, and\nhigh-fidelity fonts that are closely aligned with user specifications. This\nconfirms the potential of our approach to revolutionize font generation by\naccommodating a broad spectrum of user-driven design requirements. Our code is\npublicly available at \url{https://github.com/leitro/GRIF-DM}.\n","authors":["Lei Kang","Fei Yang","Kai Wang","Mohamed Ali Souibgui","Lluis Gomez","Alicia Fornés","Ernest Valveny","Dimosthenis Karatzas"],"pdf_url":"https://arxiv.org/pdf/2408.07259v1.pdf","comment":"Accepted to ECAI2024"},{"id":"http://arxiv.org/abs/2408.05577v2","updated":"2024-08-14T02:20:50Z","published":"2024-08-10T15:01:19Z","title":"Camera Perspective Transformation to Bird's Eye View via Spatial\n Transformer Model for Road Intersection Monitoring","summary":" Road intersection monitoring and control research often utilize bird's eye\nview (BEV) simulators. In real traffic settings, achieving a BEV akin to that\nin a simulator necessitates the deployment of drones or specific sensor\nmounting, which is neither feasible nor practical. Consequently, traffic\nintersection management remains confined to simulation environments given these\nconstraints. In this paper, we address the gap between simulated environments\nand real-world implementation by introducing a novel deep-learning model that\nconverts a single camera's perspective of a road intersection into a BEV. We\ncreated a simulation environment that closely resembles a real-world traffic\njunction. 
The proposed model transforms the vehicles into BEV images,\nfacilitating road intersection monitoring and control model processing.\nInspired by image transformation techniques, we propose a Spatial-Transformer\nDouble Decoder-UNet (SDD-UNet) model that aims to eliminate the transformed\nimage distortions. In addition, the model accurately estimates the vehicle's\npositions and enables the direct application of simulation-trained models in\nreal-world contexts. SDD-UNet model achieves an average dice similarity\ncoefficient (DSC) above 95% which is 40% better than the original UNet model.\nThe mean absolute error (MAE) is 0.102 and the centroid of the predicted mask\nis 0.14 meters displaced, on average, indicating high accuracy.\n","authors":["Rukesh Prajapati","Amr S. El-Wakeel"],"pdf_url":"https://arxiv.org/pdf/2408.05577v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.07253v1","updated":"2024-08-14T02:06:24Z","published":"2024-08-14T02:06:24Z","title":"All-around Neural Collapse for Imbalanced Classification","summary":" Neural Collapse (NC) presents an elegant geometric structure that enables\nindividual activations (features), class means and classifier (weights) vectors\nto reach \\textit{optimal} inter-class separability during the terminal phase of\ntraining on a \\textit{balanced} dataset. Once shifted to imbalanced\nclassification, such an optimal structure of NC can be readily destroyed by the\nnotorious \\textit{minority collapse}, where the classifier vectors\ncorresponding to the minority classes are squeezed. In response, existing works\nendeavor to recover NC typically by optimizing classifiers. However, we\ndiscover that this squeezing phenomenon is not only confined to classifier\nvectors but also occurs with class means.\n Consequently, reconstructing NC solely at the classifier aspect may be\nfutile, as the feature means remain compressed, leading to the violation of\ninherent \\textit{self-duality} in NC (\\textit{i.e.}, class means and classifier\nvectors converge mutually) and incidentally, resulting in an unsatisfactory\ncollapse of individual activations towards the corresponding class means. To\nshake off these dilemmas, we present a unified \\textbf{All}-around\n\\textbf{N}eural \\textbf{C}ollapse framework (AllNC), aiming to comprehensively\nrestore NC across multiple aspects including individual activations, class\nmeans and classifier vectors. We thoroughly analyze its effectiveness and\nverify on multiple benchmark datasets that it achieves state-of-the-art in both\nbalanced and imbalanced settings.\n","authors":["Enhao Zhang","Chaohua Li","Chuanxing Geng","Songcan Chen"],"pdf_url":"https://arxiv.org/pdf/2408.07253v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.08729v2","updated":"2024-08-14T01:46:33Z","published":"2024-07-11T17:58:10Z","title":"BiEquiFormer: Bi-Equivariant Representations for Global Point Cloud\n Registration","summary":" The goal of this paper is to address the problem of global point cloud\nregistration (PCR) i.e., finding the optimal alignment between point clouds\nirrespective of the initial poses of the scans. This problem is notoriously\nchallenging for classical optimization methods due to computational\nconstraints. First, we show that state-of-the-art deep learning methods suffer\nfrom huge performance degradation when the point clouds are arbitrarily placed\nin space. 
We propose that equivariant deep learning should be utilized for\nsolving this task and we characterize the specific type of bi-equivariance of\nPCR. Then, we design BiEquiformer a novel and scalable bi-equivariant pipeline\ni.e. equivariant to the independent transformations of the input point clouds.\nWhile a naive approach would process the point clouds independently we design\nexpressive bi-equivariant layers that fuse the information from both point\nclouds. This allows us to extract high-quality superpoint correspondences and\nin turn, robust point-cloud registration. Extensive comparisons against\nstate-of-the-art methods show that our method achieves comparable performance\nin the canonical setting and superior performance in the robust setting in both\nthe 3DMatch and the challenging low-overlap 3DLoMatch dataset.\n","authors":["Stefanos Pertigkiozoglou","Evangelos Chatzipantazis","Kostas Daniilidis"],"pdf_url":"https://arxiv.org/pdf/2407.08729v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.06840v2","updated":"2024-08-14T01:37:17Z","published":"2024-08-13T12:01:22Z","title":"Dynamic and Compressive Adaptation of Transformers From Images to Videos","summary":" Recently, the remarkable success of pre-trained Vision Transformers (ViTs)\nfrom image-text matching has sparked an interest in image-to-video adaptation.\nHowever, most current approaches retain the full forward pass for each frame,\nleading to a high computation overhead for processing entire videos. In this\npaper, we present InTI, a novel approach for compressive image-to-video\nadaptation using dynamic Inter-frame Token Interpolation. InTI aims to softly\npreserve the informative tokens without disrupting their coherent\nspatiotemporal structure. Specifically, each token pair at identical positions\nwithin neighbor frames is linearly aggregated into a new token, where the\naggregation weights are generated by a multi-scale context-aware network. In\nthis way, the information of neighbor frames can be adaptively compressed in a\npoint-by-point manner, thereby effectively reducing the number of processed\nframes by half each time. Importantly, InTI can be seamlessly integrated with\nexisting adaptation methods, achieving strong performance without extra-complex\ndesign. On Kinetics-400, InTI reaches a top-1 accuracy of 87.1 with a\nremarkable 37.5% reduction in GFLOPs compared to naive adaptation. When\ncombined with additional temporal modules, InTI achieves a top-1 accuracy of\n87.6 with a 37% reduction in GFLOPs. Similar conclusions have been verified in\nother common datasets.\n","authors":["Guozhen Zhang","Jingyu Liu","Shengming Cao","Xiaotong Zhao","Kevin Zhao","Kai Ma","Limin Wang"],"pdf_url":"https://arxiv.org/pdf/2408.06840v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.13672v6","updated":"2024-08-14T01:24:34Z","published":"2023-09-24T15:40:40Z","title":"RL-I2IT: Image-to-Image Translation with Deep Reinforcement Learning","summary":" Most existing Image-to-Image Translation (I2IT) methods generate images in a\nsingle run of a deep learning (DL) model. However, designing such a single-step\nmodel is always challenging, requiring a huge number of parameters and easily\nfalling into bad global minimums and overfitting. In this work, we reformulate\nI2IT as a step-wise decision-making problem via deep reinforcement learning\n(DRL) and propose a novel framework that performs RL-based I2IT (RL-I2IT). 
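Looking back at the InTI entry above, each pair of tokens at identical positions in neighbouring frames is linearly aggregated with learned weights, halving the number of frames. Below is a minimal sketch of that interpolation, where the small MLP generating the weights is a stand-in for the paper's multi-scale context-aware network.

```python
import torch
import torch.nn as nn

class TokenInterpolation(nn.Module):
    """Merge each pair of neighbouring frames into one frame of tokens
    via a learned, per-token convex combination (halves the frame count)."""
    def __init__(self, dim: int):
        super().__init__()
        self.weight_net = nn.Sequential(nn.Linear(2 * dim, dim), nn.GELU(),
                                        nn.Linear(dim, 1), nn.Sigmoid())

    def forward(self, tokens):                           # tokens: (B, T, N, D), T even
        a, b = tokens[:, 0::2], tokens[:, 1::2]          # neighbouring frames
        w = self.weight_net(torch.cat([a, b], dim=-1))   # (B, T//2, N, 1) in [0, 1]
        return w * a + (1 - w) * b                       # (B, T//2, N, D)

x = torch.randn(2, 8, 196, 768)        # 8 frames of 196 ViT tokens each
merged = TokenInterpolation(768)(x)
print(merged.shape)                    # torch.Size([2, 4, 196, 768])
```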
The\nkey feature in the RL-I2IT framework is to decompose a monolithic learning\nprocess into small steps with a lightweight model to progressively transform a\nsource image successively to a target image. Considering that it is challenging\nto handle high dimensional continuous state and action spaces in the\nconventional RL framework, we introduce meta policy with a new concept Plan to\nthe standard Actor-Critic model, which is of a lower dimension than the\noriginal image and can facilitate the actor to generate a tractable high\ndimensional action. In the RL-I2IT framework, we also employ a task-specific\nauxiliary learning strategy to stabilize the training process and improve the\nperformance of the corresponding task. Experiments on several I2IT tasks\ndemonstrate the effectiveness and robustness of the proposed method when facing\nhigh-dimensional continuous action space problems. Our implementation of the\nRL-I2IT framework is available at\nhttps://github.com/Algolzw/SPAC-Deformable-Registration.\n","authors":["Xin Wang","Ziwei Luo","Jing Hu","Chengming Feng","Shu Hu","Bin Zhu","Xi Wu","Hongtu Zhu","Xin Li","Siwei Lyu"],"pdf_url":"https://arxiv.org/pdf/2309.13672v6.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.07249v1","updated":"2024-08-14T01:24:09Z","published":"2024-08-14T01:24:09Z","title":"GQE: Generalized Query Expansion for Enhanced Text-Video Retrieval","summary":" In the rapidly expanding domain of web video content, the task of text-video\nretrieval has become increasingly critical, bridging the semantic gap between\ntextual queries and video data. This paper introduces a novel data-centric\napproach, Generalized Query Expansion (GQE), to address the inherent\ninformation imbalance between text and video, enhancing the effectiveness of\ntext-video retrieval systems. Unlike traditional model-centric methods that\nfocus on designing intricate cross-modal interaction mechanisms, GQE aims to\nexpand the text queries associated with videos both during training and testing\nphases. By adaptively segmenting videos into short clips and employing\nzero-shot captioning, GQE enriches the training dataset with comprehensive\nscene descriptions, effectively bridging the data imbalance gap. Furthermore,\nduring retrieval, GQE utilizes Large Language Models (LLM) to generate a\ndiverse set of queries and a query selection module to filter these queries\nbased on relevance and diversity, thus optimizing retrieval performance while\nreducing computational overhead. Our contributions include a detailed\nexamination of the information imbalance challenge, a novel approach to query\nexpansion in video-text datasets, and the introduction of a query selection\nstrategy that enhances retrieval accuracy without increasing computational\ncosts. 
GQE achieves state-of-the-art performance on several benchmarks,\nincluding MSR-VTT, MSVD, LSMDC, and VATEX, demonstrating the effectiveness of\naddressing text-video retrieval from a data-centric perspective.\n","authors":["Zechen Bai","Tianjun Xiao","Tong He","Pichao Wang","Zheng Zhang","Thomas Brox","Mike Zheng Shou"],"pdf_url":"https://arxiv.org/pdf/2408.07249v1.pdf","comment":"18 pages including appendix"},{"id":"http://arxiv.org/abs/2408.07246v1","updated":"2024-08-14T01:16:40Z","published":"2024-08-14T01:16:40Z","title":"Seeing and Understanding: Bridging Vision with Chemical Knowledge Via\n ChemVLM","summary":" In this technical report, we propose ChemVLM, the first open-source\nmultimodal large language model dedicated to the fields of chemistry, designed\nto address the incompatibility between chemical image understanding and text\nanalysis. Built upon the VIT-MLP-LLM architecture, we leverage ChemLLM-20B as\nthe foundational large model, endowing our model with robust capabilities in\nunderstanding and utilizing chemical text knowledge. Additionally, we employ\nInternVIT-6B as a powerful image encoder. We have curated high-quality data\nfrom the chemical domain, including molecules, reaction formulas, and chemistry\nexamination data, and compiled these into a bilingual multimodal\nquestion-answering dataset. We test the performance of our model on multiple\nopen-source benchmarks and three custom evaluation sets. Experimental results\ndemonstrate that our model achieves excellent performance, securing\nstate-of-the-art results in five out of six involved tasks. Our model can be\nfound at https://huggingface.co/AI4Chem/ChemVLM-26B.\n","authors":["Junxian Li","Di Zhang","Xunzhi Wang","Zeying Hao","Jingdi Lei","Qian Tan","Cai Zhou","Wei Liu","Weiyun Wang","Zhe Chen","Wenhai Wang","Wei Li","Shufei Zhang","Mao Su","Wanli Ouyang","Yuqiang Li","Dongzhan Zhou"],"pdf_url":"https://arxiv.org/pdf/2408.07246v1.pdf","comment":"Techical report"},{"id":"http://arxiv.org/abs/2408.07244v1","updated":"2024-08-14T00:56:51Z","published":"2024-08-14T00:56:51Z","title":"Sign language recognition based on deep learning and low-cost\n handcrafted descriptors","summary":" In recent years, deep learning techniques have been used to develop sign\nlanguage recognition systems, potentially serving as a communication tool for\nmillions of hearing-impaired individuals worldwide. However, there are inherent\nchallenges in creating such systems. Firstly, it is important to consider as\nmany linguistic parameters as possible in gesture execution to avoid ambiguity\nbetween words. Moreover, to facilitate the real-world adoption of the created\nsolution, it is essential to ensure that the chosen technology is realistic,\navoiding expensive, intrusive, or low-mobility sensors, as well as very complex\ndeep learning architectures that impose high computational requirements. Based\non this, our work aims to propose an efficient sign language recognition system\nthat utilizes low-cost sensors and techniques. To this end, an object detection\nmodel was trained specifically for detecting the interpreter's face and hands,\nensuring focus on the most relevant regions of the image and generating inputs\nwith higher semantic value for the classifier. Additionally, we introduced a\nnovel approach to obtain features representing hand location and movement by\nleveraging spatial information derived from centroid positions of bounding\nboxes, thereby enhancing sign discrimination. 
The results demonstrate the\nefficiency of our handcrafted features, increasing accuracy by 7.96% on the\nAUTSL dataset, while adding fewer than 700 thousand parameters and incurring\nless than 10 milliseconds of additional inference time. These findings\nhighlight the potential of our technique to strike a favorable balance between\ncomputational cost and accuracy, making it a promising approach for practical\nsign language recognition applications.\n","authors":["Alvaro Leandro Cavalcante Carneiro","Denis Henrique Pinheiro Salvadeo","Lucas de Brito Silva"],"pdf_url":"https://arxiv.org/pdf/2408.07244v1.pdf","comment":"28 pages, 12 figures, submitted to Image and Vision Computing Journal"},{"id":"http://arxiv.org/abs/2408.07243v1","updated":"2024-08-14T00:55:52Z","published":"2024-08-14T00:55:52Z","title":"Leveraging Perceptual Scores for Dataset Pruning in Computer Vision\n Tasks","summary":" In this paper we propose a score of an image to use for coreset selection in\nimage classification and semantic segmentation tasks. The score is the entropy\nof an image as approximated by the bits-per-pixel of its compressed version.\nThus the score is intrinsic to an image and does not require supervision or\ntraining. It is very simple to compute and readily available as all images are\nstored in a compressed format. The motivation behind our choice of score is\nthat most other scores proposed in literature are expensive to compute. More\nimportantly, we want a score that captures the perceptual complexity of an\nimage. Entropy is one such measure, images with clutter tend to have a higher\nentropy. However sampling only low entropy iconic images, for example, leads to\nbiased learning and an overall decrease in test performance with current deep\nlearning models. To mitigate the bias we use a graph based method that\nincreases the spatial diversity of the selected samples. We show that this\nsimple score yields good results, particularly for semantic segmentation tasks.\n","authors":["Raghavendra Singh"],"pdf_url":"https://arxiv.org/pdf/2408.07243v1.pdf","comment":"1st workshop on Dataset Distillation CVPR 2024"},{"id":"http://arxiv.org/abs/2408.07239v1","updated":"2024-08-14T00:08:28Z","published":"2024-08-14T00:08:28Z","title":"Enhancing Autonomous Vehicle Perception in Adverse Weather through Image\n Augmentation during Semantic Segmentation Training","summary":" Robust perception is crucial in autonomous vehicle navigation and\nlocalization. Visual processing tasks, like semantic segmentation, should work\nin varying weather conditions and during different times of day. Semantic\nsegmentation is where each pixel is assigned a class, which is useful for\nlocating overall features (1). Training a segmentation model requires large\namounts of data, and the labeling process for segmentation data is especially\ntedious. Additionally, many large datasets include only images taken in clear\nweather. This is a problem because training a model exclusively on clear\nweather data hinders performance in adverse weather conditions like fog or\nrain. We hypothesize that given a dataset of only clear days images, applying\nimage augmentation (such as random rain, fog, and brightness) during training\nallows for domain adaptation to diverse weather conditions. We used CARLA, a 3D\nrealistic autonomous vehicle simulator, to collect 1200 images in clear weather\ncomposed of 29 classes from 10 different towns (2). We also collected 1200\nimages of random weather effects. 
We trained encoder-decoder UNet models to\nperform semantic segmentation. Applying augmentations significantly improved\nsegmentation under weathered night conditions (p < 0.001). However, models\ntrained on weather data have significantly lower losses than those trained on\naugmented data in all conditions except for clear days. This shows there is\nroom for improvement in the domain adaptation approach. Future work should test\nmore types of augmentations and also use real-life images instead of CARLA.\nIdeally, the augmented model meets or exceeds the performance of the weather\nmodel.\n","authors":["Ethan Kou","Noah Curran"],"pdf_url":"https://arxiv.org/pdf/2408.07239v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15478v3","updated":"2024-08-14T22:39:41Z","published":"2023-11-27T01:41:25Z","title":"HawkI: Homography & Mutual Information Guidance for 3D-free Single Image\n to Aerial View","summary":" We present HawkI, for synthesizing aerial-view images from text and an\nexemplar image, without any additional multi-view or 3D information for\nfinetuning or at inference. HawkI uses techniques from classical computer\nvision and information theory. It seamlessly blends the visual features from\nthe input image within a pretrained text-to-2D image stable diffusion model with\na test-time optimization process for a careful bias-variance trade-off, which\nuses an Inverse Perspective Mapping (IPM) homography transformation to provide\nsubtle cues for aerial-view synthesis. At inference, HawkI employs a unique\nmutual information guidance formulation to steer the generated image towards\nfaithfully replicating the semantic details of the input image, while\nmaintaining a realistic aerial perspective. Mutual information guidance\nmaximizes the semantic consistency between the generated image and the input\nimage, without enforcing pixel-level correspondence between vastly different\nviewpoints. Through extensive qualitative and quantitative comparisons against\ntext + exemplar-image based methods and 3D/multi-view based novel-view\nsynthesis methods on proposed synthetic and real datasets, we demonstrate that\nour method achieves a significantly better bias-variance trade-off towards\ngenerating high-fidelity aerial-view images. Code and data are available at\nhttps://github.com/divyakraman/HawkI2024.\n","authors":["Divya Kothandaraman","Tianyi Zhou","Ming Lin","Dinesh Manocha"],"pdf_url":"https://arxiv.org/pdf/2311.15478v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.15894v2","updated":"2024-08-14T21:58:50Z","published":"2024-02-24T20:02:00Z","title":"Multi-graph Graph Matching for Coronary Artery Semantic Labeling","summary":" Coronary artery disease (CAD) stands as the leading cause of death worldwide,\nand invasive coronary angiography (ICA) remains the gold standard for assessing\nvascular anatomical information. However, deep learning-based methods encounter\nchallenges in generating semantic labels for arterial segments, primarily due\nto the morphological similarity between arterial branches and varying anatomy\nof the arterial system between different projection view angles and patients. To\naddress this challenge, we model the vascular tree as a graph and propose a\nmulti-graph graph matching (MGM) algorithm for coronary artery semantic\nlabeling. The MGM algorithm assesses the similarity between arteries in\nmultiple vascular tree graphs, considering the cycle consistency between each\npair of graphs. 
As a result, the unannotated arterial segments are\nappropriately labeled by matching them with annotated segments. Through the\nincorporation of anatomical graph structure, radiomics features, and semantic\nmapping, the proposed MGM model achieves an impressive accuracy of 0.9471 for\ncoronary artery semantic labeling using our multi-site dataset with 718 ICAs.\nWith the semantic labeled arteries, an overall accuracy of 0.9155 was achieved\nfor stenosis detection. The proposed MGM presents a novel tool for coronary\nartery analysis using multiple ICA-derived graphs, offering valuable insights\ninto vascular health and pathology.\n","authors":["Chen Zhao","Zhihui Xu","Pukar Baral","Michel Esposito","Weihua Zhou"],"pdf_url":"https://arxiv.org/pdf/2402.15894v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.06467v2","updated":"2024-08-14T21:53:32Z","published":"2024-08-12T19:42:09Z","title":"Generalization Enhancement Strategies to Enable Cross-year Cropland\n Mapping with Convolutional Neural Networks Trained Using Historical Samples","summary":" The accuracy of mapping agricultural fields across large areas is steadily\nimproving with high-resolution satellite imagery and deep learning (DL) models,\neven in regions where fields are small and geometrically irregular. However,\ndeveloping effective DL models often requires large, expensive label datasets,\ntypically available only for specific years or locations. This limits the\nability to create annual maps essential for agricultural monitoring, as domain\nshifts occur between years and regions due to changes in farming practices and\nenvironmental conditions. The challenge is to design a model flexible enough to\naccount for these shifts without needing yearly labels. While domain adaptation\ntechniques or semi-supervised training are common solutions, we explored\nenhancing the model's generalization power. Our results indicate that a\nholistic approach is essential, combining methods to improve generalization.\nSpecifically, using an area-based loss function, such as Tversky-focal loss\n(TFL), significantly improved predictions across multiple years. The use of\ndifferent augmentation techniques helped to encode different types of\ninvariance, particularly photometric augmentations encoded invariance to\nbrightness changes, though they increased false positives. The combination of\nphotometric augmentation, TFL loss, and MC-dropout produced the best results,\nalthough dropout alone led to more false negatives in subsequent year\npredictions. Additionally, the choice of input normalization had a significant\nimpact, with the best results obtained when statistics were calculated either\nlocally or across the entire dataset over all bands (lab and gab). We developed\na workflow that enabled a U-Net model to generate effective multi-year crop\nmaps over large areas. Our code, available at:\nhttps://github.com/agroimpacts/cnn-generalization-enhancement, will be\nregularly updated with improvements.\n","authors":["Sam Khallaghi","Rahebe Abedi","Hanan Abou Ali","Mary Dziedzorm Asipunu","Ismail Alatise","Nguyen Ha","Boka Luo","Cat Mai","Lei Song","Amos Wussah","Sitian Xiong","Qi Zhang","Lyndon D. 
Estes"],"pdf_url":"https://arxiv.org/pdf/2408.06467v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.00738v2","updated":"2024-08-14T21:18:38Z","published":"2024-08-01T17:35:58Z","title":"Virchow2: Scaling Self-Supervised Mixed Magnification Models in\n Pathology","summary":" Foundation models are rapidly being developed for computational pathology\napplications. However, it remains an open question which factors are most\nimportant for downstream performance, with data scale and diversity, model size,\nand training algorithm all playing a role. In this work, we propose algorithmic\nmodifications, tailored for pathology, and we present the result of scaling\nboth data and model size, surpassing previous studies in both dimensions. We\nintroduce two new models: Virchow2, a 632 million parameter vision transformer,\nand Virchow2G, a 1.9 billion parameter vision transformer, each trained with\n3.1 million histopathology whole slide images, with diverse tissues,\noriginating institutions, and stains. We achieve state-of-the-art performance\non 12 tile-level tasks, as compared to the top performing competing models. Our\nresults suggest that data diversity and domain-specific methods can outperform\nmodels that only scale in the number of parameters, but, on average,\nperformance benefits from the combination of domain-specific methods, data\nscale, and model scale.\n","authors":["Eric Zimmermann","Eugene Vorontsov","Julian Viret","Adam Casson","Michal Zelechowski","George Shaikovski","Neil Tenenholtz","James Hall","David Klimstra","Razik Yousfi","Thomas Fuchs","Nicolo Fusi","Siqi Liu","Kristen Severson"],"pdf_url":"https://arxiv.org/pdf/2408.00738v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.07815v1","updated":"2024-08-14T21:10:05Z","published":"2024-08-14T21:10:05Z","title":"Algebraic Representations for Faster Predictions in Convolutional Neural\n Networks","summary":" Convolutional neural networks (CNNs) are a popular choice of model for tasks\nin computer vision. When CNNs are made with many layers, resulting in a deep\nneural network, skip connections may be added to create an easier gradient\noptimization problem while retaining model expressiveness. In this paper, we\nshow that arbitrarily complex, trained, linear CNNs with skip connections can\nbe simplified into a single-layer model, resulting in greatly reduced\ncomputational requirements during prediction time. We also present a method for\ntraining nonlinear models with skip connections that are gradually removed\nthroughout training, giving the benefits of skip connections without requiring\ncomputational overhead during prediction time. These results are\ndemonstrated with practical examples on the Residual Network (ResNet)\narchitecture.\n","authors":["Johnny Joyce","Jan Verschelde"],"pdf_url":"https://arxiv.org/pdf/2408.07815v1.pdf","comment":"Accepted for publication in the proceedings of the 27th International\n Workshop on Computer Algebra in Scientific Computing (CASC 2024)"},{"id":"http://arxiv.org/abs/2312.00938v2","updated":"2024-08-14T20:21:01Z","published":"2023-12-01T21:36:14Z","title":"WATonoBus: Field-Tested All-Weather Autonomous Shuttle Technology","summary":" All-weather autonomous vehicle operation poses significant challenges,\nencompassing modules from perception and decision-making to path planning and\ncontrol. The complexity arises from the need to address adverse weather\nconditions such as rain, snow, and fog across the autonomy stack. 
Conventional\nmodel-based single-module approaches often lack holistic integration with\nupstream or downstream tasks. We tackle this problem by proposing a\nmulti-module and modular system architecture with considerations for adverse\nweather across the perception level, through features such as snow covered curb\ndetection, to decision-making and safety monitoring. Through daily weekday\nservice on the WATonoBus platform for almost two years, we demonstrate that our\nproposed approach is capable of addressing adverse weather conditions and\nprovide valuable insights from edge cases observed during operation.\n","authors":["Neel P. Bhatt","Ruihe Zhang","Minghao Ning","Ahmad Reza Alghooneh","Joseph Sun","Pouya Panahandeh","Ehsan Mohammadbagher","Ted Ecclestone","Ben MacCallum","Ehsan Hashemi","Amir Khajepour"],"pdf_url":"https://arxiv.org/pdf/2312.00938v2.pdf","comment":"8 pages, 10 figures. This work has been submitted to the ITSC for\n possible publication. Copyright may be transferred without notice, after\n which this version may no longer be accessible"},{"id":"http://arxiv.org/abs/2408.07791v1","updated":"2024-08-14T20:03:53Z","published":"2024-08-14T20:03:53Z","title":"An Efficient and Explanatory Image and Text Clustering System with\n Multimodal Autoencoder Architecture","summary":" We demonstrate the efficiencies and explanatory abilities of extensions to\nthe common tools of Autoencoders and LLM interpreters, in the novel context of\ncomparing different cultural approaches to the same international news event.\nWe develop a new Convolutional-Recurrent Variational Autoencoder (CRVAE) model\nthat extends the modalities of previous CVAE models, by using fully-connected\nlatent layers to embed in parallel the CNN encodings of video frames, together\nwith the LSTM encodings of their related text derived from audio. We\nincorporate the model within a larger system that includes frame-caption\nalignment, latent space vector clustering, and a novel LLM-based cluster\ninterpreter. We measure, tune, and apply this system to the task of summarizing\na video into three to five thematic clusters, with each theme described by ten\nLLM-produced phrases. We apply this system to two news topics, COVID-19 and the\nWinter Olympics, and five other topics are in progress.\n","authors":["Tiancheng Shi","Yuanchen Wei","John R. Kender"],"pdf_url":"https://arxiv.org/pdf/2408.07791v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.07790v1","updated":"2024-08-14T20:03:03Z","published":"2024-08-14T20:03:03Z","title":"Cropper: Vision-Language Model for Image Cropping through In-Context\n Learning","summary":" The goal of image cropping is to identify visually appealing crops within an\nimage. Conventional methods rely on specialized architectures trained on\nspecific datasets, which struggle to be adapted to new requirements. Recent\nbreakthroughs in large vision-language models (VLMs) have enabled visual\nin-context learning without explicit training. However, effective strategies\nfor vision downstream tasks with VLMs remain largely unclear and underexplored.\nIn this paper, we propose an effective approach to leverage VLMs for better\nimage cropping. First, we propose an efficient prompt retrieval mechanism for\nimage cropping to automate the selection of in-context examples. Second, we\nintroduce an iterative refinement strategy to iteratively enhance the predicted\ncrops. 
The proposed framework, named Cropper, is applicable to a wide range of\ncropping tasks, including free-form cropping, subject-aware cropping, and\naspect ratio-aware cropping. Extensive experiments and a user study demonstrate\nthat Cropper significantly outperforms state-of-the-art methods across several\nbenchmarks.\n","authors":["Seung Hyun Lee","Junjie Ke","Yinxiao Li","Junfeng He","Steven Hickson","Katie Datsenko","Sangpil Kim","Ming-Hsuan Yang","Irfan Essa","Feng Yang"],"pdf_url":"https://arxiv.org/pdf/2408.07790v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.07786v1","updated":"2024-08-14T19:49:19Z","published":"2024-08-14T19:49:19Z","title":"Perspectives: Comparison of Deep Learning Segmentation Models on\n Biophysical and Biomedical Data","summary":" Deep learning based approaches are now widely used across biophysics to help\nautomate a variety of tasks including image segmentation, feature selection,\nand deconvolution. However, the presence of multiple competing deep learning\narchitectures, each with its own unique advantages and disadvantages, makes it\nchallenging to select an architecture best suited for a specific application.\nAs such, we present a comprehensive comparison of common models. Here, we focus\non the task of segmentation assuming the typically small training dataset sizes\navailable from biophysics experiments and compare the following four commonly\nused architectures: convolutional neural networks, U-Nets, vision transformers,\nand vision state space models. In doing so, we establish criteria for\ndetermining optimal conditions under which each model excels, thereby offering\npractical guidelines for researchers and practitioners in the field.\n","authors":["J Shepard Bryan IV","Meyam Tavakoli","Steve Presse"],"pdf_url":"https://arxiv.org/pdf/2408.07786v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.07785v1","updated":"2024-08-14T19:36:54Z","published":"2024-08-14T19:36:54Z","title":"NeuroPapyri: A Deep Attention Embedding Network for Handwritten Papyri\n Retrieval","summary":" The intersection of computer vision and machine learning has emerged as a\npromising avenue for advancing historical research, facilitating a more\nprofound exploration of our past. However, the application of machine learning\napproaches in historical palaeography is often met with criticism due to their\nperceived ``black box'' nature. In response to this challenge, we introduce\nNeuroPapyri, an innovative deep learning-based model specifically designed for\nthe analysis of images containing ancient Greek papyri. To address concerns\nrelated to transparency and interpretability, the model incorporates an\nattention mechanism. This attention mechanism not only enhances the model's\nperformance but also provides a visual representation of the image regions that\nsignificantly contribute to the decision-making process. Specifically\ncalibrated for processing images of papyrus documents with lines of handwritten\ntext, the model utilizes individual attention maps to inform the presence or\nabsence of specific characters in the input image. This paper presents the\nNeuroPapyri model, including its architecture and training methodology. Results\nfrom the evaluation demonstrate NeuroPapyri's efficacy in document retrieval,\nshowcasing its potential to advance the analysis of historical manuscripts.\n","authors":["Giuseppe De Gregorio","Simon Perrin","Rodrigo C. G. 
Pena","Isabelle Marthot-Santaniello","Harold Mouchère"],"pdf_url":"https://arxiv.org/pdf/2408.07785v1.pdf","comment":null}],"Information Retrieval":[{"id":"http://arxiv.org/abs/2408.07650v1","updated":"2024-08-14T16:21:28Z","published":"2024-08-14T16:21:28Z","title":"Exact Trajectory Similarity Search With N-tree: An Efficient Metric\n Index for kNN and Range Queries","summary":" Similarity search is the problem of finding in a collection of objects those\nthat are similar to a given query object. It is a fundamental problem in modern\napplications and the objects considered may be as diverse as locations in\nspace, text documents, images, twitter messages, or trajectories of moving\nobjects.\n In this paper we are motivated by the latter application. Trajectories are\nrecorded movements of mobile objects such as vehicles, animals, public\ntransportation, or parts of the human body. We propose a novel distance\nfunction called DistanceAvg to capture the similarity of such movements. To be\npractical, it is necessary to provide indexing for this distance measure.\n Fortunately we do not need to start from scratch. A generic and unifying\napproach is metric space, which organizes the set of objects solely by a\ndistance (similarity) function with certain natural properties. Our function\nDistanceAvg is a metric.\n Although metric indexes have been studied for decades and many such\nstructures are available, they do not offer the best performance with\ntrajectories. In this paper we propose a new design, which outperforms the best\nexisting indexes for kNN queries and is equally good for range queries. It is\nespecially suitable for expensive distance functions as they occur in\ntrajectory similarity search. In many applications, kNN queries are more\npractical than range queries as it may be difficult to determine an appropriate\nsearch radius. Our index provides exact result sets for the given distance\nfunction.\n","authors":["Ralf Hartmut Güting","Suvam Kumar Das","Fabio Valdés","Suprio Ray"],"pdf_url":"https://arxiv.org/pdf/2408.07650v1.pdf","comment":"54 pages, 26 figures"},{"id":"http://arxiv.org/abs/2408.07630v1","updated":"2024-08-14T15:56:27Z","published":"2024-08-14T15:56:27Z","title":"Towards Fair and Rigorous Evaluations: Hyperparameter Optimization for\n Top-N Recommendation Task with Implicit Feedback","summary":" The widespread use of the internet has led to an overwhelming amount of data,\nwhich has resulted in the problem of information overload. Recommender systems\nhave emerged as a solution to this problem by providing personalized\nrecommendations to users based on their preferences and historical data.\nHowever, as recommendation models become increasingly complex, finding the best\nhyperparameter combination for different models has become a challenge. The\nhigh-dimensional hyperparameter search space poses numerous challenges for\nresearchers, and failure to disclose hyperparameter settings may impede the\nreproducibility of research results. In this paper, we investigate the Top-N\nimplicit recommendation problem and focus on optimizing the benchmark\nrecommendation algorithm commonly used in comparative experiments using\nhyperparameter optimization algorithms. We propose a research methodology that\nfollows the principles of a fair comparison, employing seven types of\nhyperparameter search algorithms to fine-tune six common recommendation\nalgorithms on three datasets. 
We have identified the most suitable\nhyperparameter search algorithms for various recommendation algorithms on\ndifferent types of datasets as a reference for later study. This study\ncontributes to algorithmic research in recommender systems based on\nhyperparameter optimization, providing a fair basis for comparison.\n","authors":["Hui Fang","Xu Feng","Lu Qin","Zhu Sun"],"pdf_url":"https://arxiv.org/pdf/2408.07630v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.02354v3","updated":"2024-08-14T15:19:41Z","published":"2024-08-05T10:02:29Z","title":"RECE: Reduced Cross-Entropy Loss for Large-Catalogue Sequential\n Recommenders","summary":" Scalability is a major challenge in modern recommender systems. In sequential\nrecommendations, full Cross-Entropy (CE) loss achieves state-of-the-art\nrecommendation quality but consumes excessive GPU memory with large item\ncatalogs, limiting its practicality. Using a GPU-efficient locality-sensitive\nhashing-like algorithm for approximating large tensor of logits, this paper\nintroduces a novel RECE (REduced Cross-Entropy) loss. RECE significantly\nreduces memory consumption while allowing one to enjoy the state-of-the-art\nperformance of full CE loss. Experimental results on various datasets show that\nRECE cuts training peak memory usage by up to 12 times compared to existing\nmethods while retaining or exceeding performance metrics of CE loss. The\napproach also opens up new possibilities for large-scale applications in other\ndomains.\n","authors":["Danil Gusak","Gleb Mezentsev","Ivan Oseledets","Evgeny Frolov"],"pdf_url":"https://arxiv.org/pdf/2408.02354v3.pdf","comment":"5 pages, accepted for CIKM'24"},{"id":"http://arxiv.org/abs/2408.07611v1","updated":"2024-08-14T15:19:16Z","published":"2024-08-14T15:19:16Z","title":"WeKnow-RAG: An Adaptive Approach for Retrieval-Augmented Generation\n Integrating Web Search and Knowledge Graphs","summary":" Large Language Models (LLMs) have greatly contributed to the development of\nadaptive intelligent agents and are positioned as an important way to achieve\nArtificial General Intelligence (AGI). However, LLMs are prone to produce\nfactually incorrect information and often produce \"phantom\" content that\nundermines their reliability, which poses a serious challenge for their\ndeployment in real-world scenarios. Enhancing LLMs by combining external\ndatabases and information retrieval mechanisms is an effective path. To address\nthe above challenges, we propose a new approach called WeKnow-RAG, which\nintegrates Web search and Knowledge Graphs into a \"Retrieval-Augmented\nGeneration (RAG)\" system. First, the accuracy and reliability of LLM responses\nare improved by combining the structured representation of Knowledge Graphs\nwith the flexibility of dense vector retrieval. WeKnow-RAG then utilizes\ndomain-specific knowledge graphs to satisfy a variety of queries and domains,\nthereby improving performance on factual information and complex reasoning\ntasks by employing multi-stage web page retrieval techniques using both sparse\nand dense retrieval methods. Our approach effectively balances the efficiency\nand accuracy of information retrieval, thus improving the overall retrieval\nprocess. Finally, we also integrate a self-assessment mechanism for the LLM to\nevaluate the trustworthiness of the answers it generates. 
Our approach proves\nits outstanding effectiveness in a wide range of offline experiments and online\nsubmissions.\n","authors":["Weijian Xie","Xuefeng Liang","Yuhui Liu","Kaihua Ni","Hong Cheng","Zetian Hu"],"pdf_url":"https://arxiv.org/pdf/2408.07611v1.pdf","comment":"8 pages, 2 figures, technical report for 3rd place in Task 3 of Meta\n KDD Cup 2024 CRAG Challenge"},{"id":"http://arxiv.org/abs/2406.03109v3","updated":"2024-08-14T14:46:43Z","published":"2024-06-05T09:57:58Z","title":"CAPRI-FAIR: Integration of Multi-sided Fairness in Contextual POI\n Recommendation Framework","summary":" Point-of-interest (POI) recommendation considers spatio-temporal factors like\ndistance, peak hours, and user check-ins. Given their influence on both\nconsumer experience and POI business, it's crucial to consider fairness from\nmultiple perspectives. Unfortunately, these systems often provide less accurate\nrecommendations to inactive users and less exposure to unpopular POIs. This\npaper develops a post-filter method that includes provider and consumer\nfairness in existing models, aiming to balance fairness metrics like item\nexposure with performance metrics such as precision and distance. Experiments\nshow that a linear scoring model for provider fairness in re-scoring items\noffers the best balance between performance and long-tail exposure, sometimes\nwithout much precision loss. Addressing consumer fairness by recommending more\npopular POIs to inactive users increased precision in some models and datasets.\nHowever, combinations that reached the Pareto front of consumer and provider\nfairness resulted in the lowest precision values, highlighting that tradeoffs\ndepend greatly on the model and dataset.\n","authors":["Francis Zac dela Cruz","Flora D. Salim","Yonchanok Khaokaew","Jeffrey Chan"],"pdf_url":"https://arxiv.org/pdf/2406.03109v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.05736v2","updated":"2024-08-14T14:14:02Z","published":"2024-05-09T12:52:22Z","title":"Optimal Baseline Corrections for Off-Policy Contextual Bandits","summary":" The off-policy learning paradigm allows for recommender systems and general\nranking applications to be framed as decision-making problems, where we aim to\nlearn decision policies that optimize an unbiased offline estimate of an online\nreward metric. With unbiasedness comes potentially high variance, and prevalent\nmethods exist to reduce estimation variance. These methods typically make use\nof control variates, either additive (i.e., baseline corrections or doubly\nrobust methods) or multiplicative (i.e., self-normalisation). Our work unifies\nthese approaches by proposing a single framework built on their equivalence in\nlearning scenarios. The foundation of our framework is the derivation of an\nequivalent baseline correction for all of the existing control variates.\nConsequently, our framework enables us to characterize the variance-optimal\nunbiased estimator and provide a closed-form solution for it. This optimal\nestimator brings significantly improved performance in both evaluation and\nlearning, and minimizes data requirements. 
Empirical observations corroborate\nour theoretical findings.\n","authors":["Shashank Gupta","Olivier Jeunen","Harrie Oosterhuis","Maarten de Rijke"],"pdf_url":"https://arxiv.org/pdf/2405.05736v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.07542v1","updated":"2024-08-14T13:22:14Z","published":"2024-08-14T13:22:14Z","title":"New Curriculum, New Chance -- Retrieval Augmented Generation for Lesson\n Planning in Ugandan Secondary Schools. Prototype Quality Evaluation","summary":" Introduction: Poor educational quality in Secondary Schools is still regarded\nas one of the major struggles in 21st century Uganda - especially in rural\nareas. Research identifies several problems, including low quality or absent\nteacher lesson planning. As the government pushes towards the implementation of\na new curriculum, existing lesson plans become obsolete and the problem is\nworsened. Using a Retrieval Augmented Generation approach, we developed a\nprototype that generates customized lesson plans based on the\ngovernment-accredited textbooks. This helps teachers create lesson plans more\nefficiently and with better quality, ensuring they are fully aligned with the new\ncurriculum and the competence-based learning approach.\n Methods: The prototype was created using Cohere LLM and Sentence Embeddings,\nand LangChain Framework - and thereafter made available on a public website.\nVector stores were trained for three new curriculum textbooks (ICT,\nMathematics, History), all at Secondary 1 Level. Twenty-four lesson plans were\ngenerated following a pseudo-random generation protocol, based on the suggested\nperiods in the textbooks. The lesson plans were analyzed regarding their\ntechnical quality by three independent raters following the Lesson Plan\nAnalysis Protocol (LPAP) by Ndihokubwayo et al. (2022) that is specifically\ndesigned for East Africa and competence-based curriculums.\n Results: Evaluation of 24 lesson plans using the LPAP resulted in an average\nquality of between 75 and 80%, corresponding to \"very good lesson plan\". None\nof the lesson plans scored below 65%, although one lesson plan could be argued\nto have been missing the topic. In conclusion, the quality of the generated\nlesson plans is at least comparable to, if not better than, those created by\nhumans, as demonstrated in a study in Rwanda, whereby no lesson plan even\nreached the benchmark of 50%.\n","authors":["Simon Kloker","Herbertson Bukoli","Twaha Kateete"],"pdf_url":"https://arxiv.org/pdf/2408.07542v1.pdf","comment":"Presented at Ndejje University Second Annual Research Dissemination\n Symposium 2024"},{"id":"http://arxiv.org/abs/2408.05840v2","updated":"2024-08-14T11:07:17Z","published":"2024-08-11T18:22:12Z","title":"Iterative Improvement of an Additively Regularized Topic Model","summary":" Topic modelling is fundamentally a soft clustering problem (of known objects\n-- documents, over unknown clusters -- topics). That is, the task is\nill-posed. In particular, the topic models are unstable and incomplete.\nAll this leads to the fact that the process of finding a good topic model\n(repeated hyperparameter selection, model training, and topic quality\nassessment) can be particularly long and labor-intensive. We aim to simplify\nthe process, to make it more deterministic and provable. To this end, we\npresent a method for iterative training of a topic model. 
The essence of the\nmethod is that a series of related topic models are trained so that each\nsubsequent model is at least as good as the previous one, i.e., that it retains\nall the good topics found earlier. The connection between the models is\nachieved by additive regularization. The result of this iterative training is\nthe last topic model in the series, which we call the iteratively updated\nadditively regularized topic model (ITAR). Experiments conducted on several\ncollections of natural language texts show that the proposed ITAR model\nperforms better than other popular topic models (LDA, ARTM, BERTopic), its\ntopics are diverse, and its perplexity (ability to \"explain\" the underlying\ndata) is moderate.\n","authors":["Alex Gorbulev","Vasiliy Alekseev","Konstantin Vorontsov"],"pdf_url":"https://arxiv.org/pdf/2408.05840v2.pdf","comment":"Fix HTML view. That is, fix the heap (strikethrough) order of .tex\n files using the auxiliary Arxiv Readme XXX"},{"id":"http://arxiv.org/abs/2408.07427v1","updated":"2024-08-14T10:03:40Z","published":"2024-08-14T10:03:40Z","title":"Beyond Inter-Item Relations: Dynamic Adaptive Mixture-of-Experts for\n LLM-Based Sequential Recommendation","summary":" Sequential recommender system (SRS) predicts the next items that users may\nprefer based on user historical interaction sequences. Inspired by the rise of\nlarge language models (LLMs) in various AI applications, there is a surge of\nwork on LLM-based SRS. Despite their attractive performance, existing LLM-based\nSRS still exhibit some limitations, including neglecting intra-item relations,\nignoring long-term collaborative knowledge and using inflexible architecture\ndesigns for adaption. To alleviate these issues, we propose an LLM-based SRS\nnamed MixRec. Built on top of coarse-grained adaption for capturing inter-item\nrelations, MixRec is further enhanced with (1) context masking that models\nintra-item relations to help LLM better understand token and item semantics in\nthe context of SRS, (2) collaborative knowledge injection that helps LLM\nincorporate long-term collaborative knowledge, and (3) a dynamic adaptive\nmixture-of-experts design that can flexibly choose expert architectures based\non Bayesian optimization to better incorporate different sequential\ninformation. Extensive experiments demonstrate that MixRec can effectively\nhandle sequential recommendation in a dynamic and adaptive manner.\n","authors":["CanYi Liu","Wei Li"," Youchen"," Zhang","Hui Li","Rongrong Ji"],"pdf_url":"https://arxiv.org/pdf/2408.07427v1.pdf","comment":"11 pages, 14 figures"},{"id":"http://arxiv.org/abs/2408.01107v2","updated":"2024-08-14T09:54:24Z","published":"2024-08-02T08:37:03Z","title":"BioRAG: A RAG-LLM Framework for Biological Question Reasoning","summary":" The question-answering system for Life science research, which is\ncharacterized by the rapid pace of discovery, evolving insights, and complex\ninteractions among knowledge entities, presents unique challenges in\nmaintaining a comprehensive knowledge warehouse and accurate information\nretrieval. To address these issues, we introduce BioRAG, a novel\nRetrieval-Augmented Generation (RAG) with the Large Language Models (LLMs)\nframework. 
Our approach starts with parsing, indexing, and segmenting an\nextensive collection of 22 million scientific papers as the basic knowledge,\nfollowed by training a specialized embedding model tailored to this domain.\nAdditionally, we enhance the vector retrieval process by incorporating a\ndomain-specific knowledge hierarchy, which aids in modeling the intricate\ninterrelationships among each query and context. For queries requiring the most\ncurrent information, BioRAG deconstructs the question and employs an iterative\nretrieval process incorporated with the search engine for step-by-step\nreasoning. Rigorous experiments have demonstrated that our model outperforms\nfine-tuned LLM, LLM with search engines, and other scientific RAG frameworks\nacross multiple life science question-answering tasks.\n","authors":["Chengrui Wang","Qingqing Long","Meng Xiao","Xunxin Cai","Chengjun Wu","Zhen Meng","Xuezhi Wang","Yuanchun Zhou"],"pdf_url":"https://arxiv.org/pdf/2408.01107v2.pdf","comment":"12 pages, 7 figures"},{"id":"http://arxiv.org/abs/2407.21034v2","updated":"2024-08-14T08:52:29Z","published":"2024-07-17T06:51:24Z","title":"Watermarking Recommender Systems","summary":" Recommender systems embody significant commercial value and represent crucial\nintellectual property. However, the integrity of these systems is constantly\nchallenged by malicious actors seeking to steal their underlying models.\nSafeguarding against such threats is paramount to upholding the rights and\ninterests of the model owner. While model watermarking has emerged as a potent\ndefense mechanism in various domains, its direct application to recommender\nsystems remains unexplored and non-trivial. In this paper, we address this gap\nby introducing Autoregressive Out-of-distribution Watermarking (AOW), a novel\ntechnique tailored specifically for recommender systems. Our approach entails\nselecting an initial item and querying it through the oracle model, followed by\nthe selection of subsequent items with small prediction scores. This iterative\nprocess generates a watermark sequence autoregressively, which is then\ningrained into the model's memory through training. To assess the efficacy of\nthe watermark, the model is tasked with predicting the subsequent item given a\ntruncated watermark sequence. Through extensive experimentation and analysis,\nwe demonstrate the superior performance and robust properties of AOW. Notably,\nour watermarking technique exhibits high-confidence extraction capabilities and\nmaintains effectiveness even in the face of distillation and fine-tuning\nprocesses.\n","authors":["Sixiao Zhang","Cheng Long","Wei Yuan","Hongxu Chen","Hongzhi Yin"],"pdf_url":"https://arxiv.org/pdf/2407.21034v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.07222v2","updated":"2024-08-14T08:38:00Z","published":"2023-08-14T15:47:36Z","title":"MM-GEF: Multi-modal representation meet collaborative filtering","summary":" In modern e-commerce, item content features in various modalities offer\naccurate yet comprehensive information to recommender systems. The majority of\nprevious work either focuses on learning effective item representation during\nmodelling user-item interactions, or exploring item-item relationships by\nanalysing multi-modal features. Those methods, however, fail to incorporate the\ncollaborative item-user-item relationships into the multi-modal feature-based\nitem structure. 
In this work, we propose a graph-based item structure\nenhancement method MM-GEF: Multi-Modal recommendation with Graph Early-Fusion,\nwhich effectively combines the latent item structure underlying multi-modal\ncontents with the collaborative signals. Instead of processing the content\nfeature in different modalities separately, we show that the early-fusion of\nmulti-modal features provides significant improvement. MM-GEF learns refined\nitem representations by injecting structural information obtained from both\nmulti-modal and collaborative signals. Through extensive experiments on four\npublicly available datasets, we demonstrate systematical improvements of our\nmethod over state-of-the-art multi-modal recommendation methods.\n","authors":["Hao Wu","Alejandro Ariza-Casabona","Bartłomiej Twardowski","Tri Kurniawan Wijaya"],"pdf_url":"https://arxiv.org/pdf/2308.07222v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.08796v3","updated":"2024-08-14T08:06:28Z","published":"2024-04-12T20:03:06Z","title":"The Elephant in the Room: Rethinking the Usage of Pre-trained Language\n Model in Sequential Recommendation","summary":" Sequential recommendation (SR) has seen significant advancements with the\nhelp of Pre-trained Language Models (PLMs). Some PLM-based SR models directly\nuse PLM to encode user historical behavior's text sequences to learn user\nrepresentations, while there is seldom an in-depth exploration of the\ncapability and suitability of PLM in behavior sequence modeling. In this work,\nwe first conduct extensive model analyses between PLMs and PLM-based SR models,\ndiscovering great underutilization and parameter redundancy of PLMs in behavior\nsequence modeling. Inspired by this, we explore different lightweight usages of\nPLMs in SR, aiming to maximally stimulate the ability of PLMs for SR while\nsatisfying the efficiency and usability demands of practical systems. We\ndiscover that adopting behavior-tuned PLMs for item initializations of\nconventional ID-based SR models is the most economical framework of PLM-based\nSR, which would not bring in any additional inference cost but could achieve a\ndramatic performance boost compared with the original version. Extensive\nexperiments on five datasets show that our simple and universal framework leads\nto significant improvement compared to classical SR and SOTA PLM-based SR\nmodels without additional inference costs. Our code can be found in\nhttps://github.com/777pomingzi/Rethinking-PLM-in-RS.\n","authors":["Zekai Qu","Ruobing Xie","Chaojun Xiao","Xingwu Sun","Zhanhui Kang"],"pdf_url":"https://arxiv.org/pdf/2404.08796v3.pdf","comment":"Accepted at RecSys 2024"},{"id":"http://arxiv.org/abs/2310.14483v2","updated":"2024-08-14T07:42:30Z","published":"2023-10-23T01:29:18Z","title":"Chain-of-Factors Paper-Reviewer Matching","summary":" With the rapid increase in paper submissions to academic conferences, the\nneed for automated and accurate paper-reviewer matching is more critical than\never. Previous efforts in this area have considered various factors to assess\nthe relevance of a reviewer's expertise to a paper, such as the semantic\nsimilarity, shared topics, and citation connections between the paper and the\nreviewer's previous works. However, most of these studies focus on only one\nfactor, resulting in an incomplete evaluation of the paper-reviewer relevance.\nTo address this issue, we propose a unified model for paper-reviewer matching\nthat jointly considers semantic, topic, and citation factors. 
To be specific,\nduring training, we instruction-tune a contextualized language model shared\nacross all factors to capture their commonalities and characteristics; during\ninference, we chain the three factors to enable step-by-step, coarse-to-fine\nsearch for qualified reviewers given a submission. Experiments on four datasets\n(one of which is newly contributed by us) spanning various fields such as\nmachine learning, computer vision, information retrieval, and data mining\nconsistently demonstrate the effectiveness of our proposed Chain-of-Factors\nmodel in comparison with state-of-the-art paper-reviewer matching methods and\nscientific pre-trained language models.\n","authors":["Yu Zhang","Yanzhen Shen","SeongKu Kang","Xiusi Chen","Bowen Jin","Jiawei Han"],"pdf_url":"https://arxiv.org/pdf/2310.14483v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.00816v3","updated":"2024-08-14T07:26:08Z","published":"2024-02-26T01:17:50Z","title":"Read and Think: An Efficient Step-wise Multimodal Language Model for\n Document Understanding and Reasoning","summary":" Understanding the contents of multimodal documents is essential to accurately\nextract relevant evidence and use it for reasoning. Existing document\nunderstanding models tend to generate answers with a single word or phrase\ndirectly, ignoring the source document's evidence and lacking interpretability.\nIn this work, we address the lack of step-wise capabilities through data\naugmentation and extension. Specifically, We use Multi-modal Large Language\nModels (MLLMs), which have strong visual understanding and reasoning abilities,\nas data generators to generate step-wise question-and-answer pairs for document\nimages and use a high-performance LLM as the error detector to filter out noisy\ndata. This step-wise data generation pipeline is implemented using both\ntemplate-based and few-shot methods. We then use the generated high-quality\ndata to train a humanized document understanding and reasoning model,\nspecifically designed to solve complex questions that require reasoning or\nmulti-hop question answering, dubbed DocAssistant. Experimental results\ndemonstrate the effectiveness and application value of step-wise generation,\nshowing a 5 improvement on InfoVQA with complex layouts and a 7 improvement on\nChartQA with complex reasoning, compared to directly generated answers. We hope\nour work highlights the potential of synthetic data and encourages further\nexploration of multi-modal document reasoning capabilities.\n","authors":["Jinxu Zhang"],"pdf_url":"https://arxiv.org/pdf/2403.00816v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.06643v2","updated":"2024-08-14T06:18:03Z","published":"2024-08-13T05:27:22Z","title":"BMX: Entropy-weighted Similarity and Semantic-enhanced Lexical Search","summary":" BM25, a widely-used lexical search algorithm, remains crucial in information\nretrieval despite the rise of pre-trained and large language models\n(PLMs/LLMs). However, it neglects query-document similarity and lacks semantic\nunderstanding, limiting its performance. We revisit BM25 and introduce BMX, a\nnovel extension of BM25 incorporating entropy-weighted similarity and semantic\nenhancement techniques. Extensive experiments demonstrate that BMX consistently\noutperforms traditional BM25 and surpasses PLM/LLM-based dense retrieval in\nlong-context and real-world retrieval benchmarks. 
This study bridges the gap\nbetween classical lexical search and modern semantic approaches, offering a\npromising direction for future information retrieval research. The reference\nimplementation of BMX can be found in Baguetter, which was created in the\ncontext of this work. The code can be found here:\nhttps://github.com/mixedbread-ai/baguetter.\n","authors":["Xianming Li","Julius Lipp","Aamir Shakir","Rui Huang","Jing Li"],"pdf_url":"https://arxiv.org/pdf/2408.06643v2.pdf","comment":"correct the affiliation order"},{"id":"http://arxiv.org/abs/2405.14359v2","updated":"2024-08-14T05:33:07Z","published":"2024-05-23T09:34:28Z","title":"Look into the Future: Deep Contextualized Sequential Recommendation","summary":" Sequential recommendation aims to estimate how a user's interests evolve over\ntime via uncovering valuable patterns from user behavior history. Many previous\nsequential models have solely relied on users' historical information to model\nthe evolution of their interests, neglecting the crucial role that future\ninformation plays in accurately capturing these dynamics. However, effectively\nincorporating future information in sequential modeling is non-trivial since it\nis impossible to make the current-step prediction for any target user by\nleveraging his future data. In this paper, we propose a novel framework of\nsequential recommendation called Look into the Future (LIFT), which builds and\nleverages the contexts of sequential recommendation. In LIFT, the context of a\ntarget user's interaction is represented based on i) his own past behaviors and\nii) the past and future behaviors of the retrieved similar interactions from\nother users. As such, the learned context will be more informative and\neffective in predicting the target user's behaviors in sequential\nrecommendation without temporal data leakage. Furthermore, in order to exploit\nthe intrinsic information embedded within the context itself, we introduce an\ninnovative pretraining methodology incorporating behavior masking. In our\nextensive experiments on five real-world datasets, LIFT achieves significant\nperformance improvement on click-through rate prediction and rating prediction\ntasks in sequential recommendation over strong baselines, demonstrating that\nretrieving and leveraging relevant contexts from the global user pool greatly\nbenefits sequential recommendation. The experiment code is provided at\nhttps://anonymous.4open.science/r/LIFT-277C/Readme.md.\n","authors":["Lei Zheng","Ning Li","Yanhuan Huang","Ruiwen Xu","Weinan Zhang","Yong Yu"],"pdf_url":"https://arxiv.org/pdf/2405.14359v2.pdf","comment":"arXiv admin note: text overlap with arXiv:2404.18304 by other authors"},{"id":"http://arxiv.org/abs/2406.09021v2","updated":"2024-08-14T04:52:02Z","published":"2024-06-13T11:55:40Z","title":"Contextual Distillation Model for Diversified Recommendation","summary":" The diversity of recommendation is equally crucial as accuracy in improving\nuser experience. Existing studies, e.g., Determinantal Point Process (DPP) and\nMaximal Marginal Relevance (MMR), employ a greedy paradigm to iteratively\nselect items that optimize both accuracy and diversity. However, prior methods\ntypically exhibit quadratic complexity, limiting their applications to the\nre-ranking stage and are not applicable to other recommendation stages with a\nlarger pool of candidate items, such as the pre-ranking and ranking stages. 
In\nthis paper, we propose the Contextual Distillation Model (CDM), an efficient\nrecommendation model that addresses diversification, suitable for\ndeployment in all stages of industrial recommendation pipelines. Specifically,\nCDM utilizes the candidate items in the same user request as context to enhance\nthe diversification of the results. We propose a contrastive context encoder\nthat employs attention mechanisms to model both positive and negative contexts.\nFor the training of CDM, we compare each target item with its context embedding\nand utilize the knowledge distillation framework to learn the win probability\nof each target item under the MMR algorithm, where the teacher is derived from\nMMR outputs. During inference, ranking is performed through a linear\ncombination of the recommendation and student model scores, ensuring both\ndiversity and efficiency. We perform offline evaluations on two industrial\ndatasets and conduct an online A/B test of CDM on the short-video platform\nKuaiShou. The considerable enhancements observed in both recommendation quality\nand diversity, as shown by metrics, provide strong evidence for the\neffectiveness of CDM.\n","authors":["Fan Li","Xu Si","Shisong Tang","Dingmin Wang","Kunyan Han","Bing Han","Guorui Zhou","Yang Song","Hechang Chen"],"pdf_url":"https://arxiv.org/pdf/2406.09021v2.pdf","comment":"accepted by KDD 2024 v2"},{"id":"http://arxiv.org/abs/2408.07249v1","updated":"2024-08-14T01:24:09Z","published":"2024-08-14T01:24:09Z","title":"GQE: Generalized Query Expansion for Enhanced Text-Video Retrieval","summary":" In the rapidly expanding domain of web video content, the task of text-video\nretrieval has become increasingly critical, bridging the semantic gap between\ntextual queries and video data. This paper introduces a novel data-centric\napproach, Generalized Query Expansion (GQE), to address the inherent\ninformation imbalance between text and video, enhancing the effectiveness of\ntext-video retrieval systems. Unlike traditional model-centric methods that\nfocus on designing intricate cross-modal interaction mechanisms, GQE aims to\nexpand the text queries associated with videos both during training and testing\nphases. By adaptively segmenting videos into short clips and employing\nzero-shot captioning, GQE enriches the training dataset with comprehensive\nscene descriptions, effectively bridging the data imbalance gap. Furthermore,\nduring retrieval, GQE utilizes Large Language Models (LLM) to generate a\ndiverse set of queries and a query selection module to filter these queries\nbased on relevance and diversity, thus optimizing retrieval performance while\nreducing computational overhead. Our contributions include a detailed\nexamination of the information imbalance challenge, a novel approach to query\nexpansion in video-text datasets, and the introduction of a query selection\nstrategy that enhances retrieval accuracy without increasing computational\ncosts. 
GQE achieves state-of-the-art performance on several benchmarks,\nincluding MSR-VTT, MSVD, LSMDC, and VATEX, demonstrating the effectiveness of\naddressing text-video retrieval from a data-centric perspective.\n","authors":["Zechen Bai","Tianjun Xiao","Tong He","Pichao Wang","Zheng Zhang","Thomas Brox","Mike Zheng Shou"],"pdf_url":"https://arxiv.org/pdf/2408.07249v1.pdf","comment":"18 pages including appendix"},{"id":"http://arxiv.org/abs/2407.15462v2","updated":"2024-08-14T00:57:42Z","published":"2024-07-22T08:19:34Z","title":"Efficient Retrieval with Learned Similarities","summary":" Retrieval plays a fundamental role in recommendation systems, search, and\nnatural language processing by efficiently finding relevant items from a large\ncorpus given a query. Dot products have been widely used as the similarity\nfunction in such retrieval tasks, thanks to Maximum Inner Product Search (MIPS)\nthat enabled efficient retrieval based on dot products. However,\nstate-of-the-art retrieval algorithms have migrated to learned similarities.\nSuch algorithms vary in form; the queries can be represented with multiple\nembeddings, complex neural networks can be deployed, the item ids can be\ndecoded directly from queries using beam search, and multiple approaches can be\ncombined in hybrid solutions. Unfortunately, we lack efficient solutions for\nretrieval in these state-of-the-art setups. Our work investigates techniques\nfor approximate nearest neighbor search with learned similarity functions. We\nfirst prove that Mixture-of-Logits (MoL) is a universal approximator, and can\nexpress all learned similarity functions. We next propose techniques to\nretrieve the approximate top K results using MoL with a tight bound. We finally\ncompare our techniques with existing approaches, showing that MoL sets new\nstate-of-the-art results on recommendation retrieval tasks, and our approximate\ntop-k retrieval with learned similarities outperforms baselines by up to two\norders of magnitude in latency, while achieving > .99 recall rate of exact\nalgorithms.\n","authors":["Bailu Ding","Jiaqi Zhai"],"pdf_url":"https://arxiv.org/pdf/2407.15462v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.04884v2","updated":"2024-08-14T22:17:10Z","published":"2024-08-09T06:17:18Z","title":"Enhancing Relevance of Embedding-based Retrieval at Walmart","summary":" Embedding-based neural retrieval (EBR) is an effective search retrieval\nmethod in product search for tackling the vocabulary gap between customer\nsearch queries and products. The initial launch of our EBR system at Walmart\nyielded significant gains in relevance and add-to-cart rates [1]. However,\ndespite EBR generally retrieving more relevant products for reranking, we have\nobserved numerous instances of relevance degradation. Enhancing retrieval\nperformance is crucial, as it directly influences product reranking and affects\nthe customer shopping experience. Factors contributing to these degradations\ninclude false positives/negatives in the training data and the inability to\nhandle query misspellings. To address these issues, we present several\napproaches to further strengthen the capabilities of our EBR model in terms of\nretrieval relevance. We introduce a Relevance Reward Model (RRM) based on human\nrelevance feedback. We utilize RRM to remove noise from the training data and\ndistill it into our EBR model through a multi-objective loss. 
In addition, we\npresent the techniques to increase the performance of our EBR model, such as\ntypo-aware training, and semi-positive generation. The effectiveness of our EBR\nis demonstrated through offline relevance evaluation, online AB tests, and\nsuccessful deployments to live production.\n [1] Alessandro Magnani, Feng Liu, Suthee Chaidaroon, Sachin Yadav, Praveen\nReddy Suram, Ajit Puthenputhussery, Sijie Chen, Min Xie, Anirudh Kashi, Tony\nLee, et al. 2022. Semantic retrieval at walmart. In Proceedings of the 28th ACM\nSIGKDD Conference on Knowledge Discovery and Data Mining. 3495-3503.\n","authors":["Juexin Lin","Sachin Yadav","Feng Liu","Nicholas Rossi","Praveen R. Suram","Satya Chembolu","Prijith Chandran","Hrushikesh Mohapatra","Tony Lee","Alessandro Magnani","Ciya Liao"],"pdf_url":"https://arxiv.org/pdf/2408.04884v2.pdf","comment":"8 pages, 3 figures, CIKM 2024"},{"id":"http://arxiv.org/abs/2408.07759v1","updated":"2024-08-14T18:19:35Z","published":"2024-08-14T18:19:35Z","title":"SWaT: Statistical Modeling of Video Watch Time through User Behavior\n Analysis","summary":" The significance of estimating video watch time has been highlighted by the\nrising importance of (short) video recommendation, which has become a core\nproduct of mainstream social media platforms. Modeling video watch time,\nhowever, has been challenged by the complexity of user-video interaction, such\nas different user behavior modes in watching the recommended videos and varying\nwatching probabilities over the video horizon. Despite the importance and\nchallenges, existing literature on modeling video watch time mostly focuses on\nrelatively black-box mechanical enhancement of the classical\nregression/classification losses, without factoring in user behavior in a\nprincipled manner. In this paper, we for the first time take on a user-centric\nperspective to model video watch time, from which we propose a white-box\nstatistical framework that directly translates various user behavior\nassumptions in watching (short) videos into statistical watch time models.\nThese behavior assumptions are portrayed by our domain knowledge on users'\nbehavior modes in video watching. We further employ bucketization to cope with\nuser's non-stationary watching probability over the video horizon, which\nadditionally helps to respect the constraint of video length and facilitate the\npractical compatibility between the continuous regression event of watch time\nand other binary classification events. We test our models extensively on two\npublic datasets, a large-scale offline industrial dataset, and an online A/B\ntest on a short video platform with hundreds of millions of daily-active users.\nOn all experiments, our models perform competitively against strong relevant\nbaselines, demonstrating the efficacy of our user-centric perspective and\nproposed framework.\n","authors":["Shentao Yang","Haichuan Yang","Linna Du","Adithya Ganesh","Bo Peng","Boying Liu","Serena Li","Ji Liu"],"pdf_url":"https://arxiv.org/pdf/2408.07759v1.pdf","comment":null}],"Machine Learning":[{"id":"http://arxiv.org/abs/2308.00142v2","updated":"2024-08-14T17:57:05Z","published":"2023-07-31T20:19:36Z","title":"Semi-Supervised Laplace Learning on Stiefel Manifolds","summary":" Motivated by the need to address the degeneracy of canonical Laplace learning\nalgorithms in low label rates, we propose to reformulate graph-based\nsemi-supervised learning as a nonconvex generalization of a \\emph{Trust-Region\nSubproblem} (TRS). 
This reformulation is motivated by the well-posedness of\nLaplacian eigenvectors in the limit of infinite unlabeled data. To solve this\nproblem, we first show that a first-order condition implies the solution of a\nmanifold alignment problem and that solutions to the classical \emph{Orthogonal\nProcrustes} problem can be used to efficiently find good classifiers that are\namenable to further refinement. To tackle refinement, we develop the framework\nof Sequential Subspace Optimization for graph-based SSL. Next, we address the\ncriticality of selecting supervised samples at low-label rates. We characterize\ninformative samples with a novel measure of centrality derived from the\nprincipal eigenvectors of a certain submatrix of the graph Laplacian. We\ndemonstrate that our framework achieves lower classification error compared to\nrecent state-of-the-art and classical semi-supervised learning methods at\nextremely low, medium, and high label rates.\n","authors":["Chester Holtz","Pengwen Chen","Alexander Cloninger","Chung-Kuan Cheng","Gal Mishne"],"pdf_url":"https://arxiv.org/pdf/2308.00142v2.pdf","comment":"arXiv admin note: text overlap with arXiv:2006.11184 by other authors"},{"id":"http://arxiv.org/abs/2408.07694v1","updated":"2024-08-14T17:50:27Z","published":"2024-08-14T17:50:27Z","title":"End-to-end Semantic-centric Video-based Multimodal Affective Computing","summary":" In the pathway toward Artificial General Intelligence (AGI), understanding\nhuman affection is essential to enhancing machines' cognitive abilities. To\nachieve more sensual human-AI interaction, Multimodal Affective Computing\n(MAC) in human-spoken videos has attracted increasing attention. However,\nprevious methods are mainly devoted to designing multimodal fusion algorithms,\nsuffering from two issues: semantic imbalance caused by diverse pre-processing\noperations and semantic mismatch arising from inconsistent affective content\nacross modalities compared with the multimodal ground truth.\nBesides, the use of manual feature extractors prevents them from building an\nend-to-end pipeline for multiple MAC downstream tasks. To address the above\nchallenges, we propose a novel end-to-end framework named SemanticMAC to\ncompute multimodal semantic-centric affection for human-spoken videos. We\nfirst employ a pre-trained Transformer model in multimodal data pre-processing\nand design an Affective Perceiver module to capture unimodal affective\ninformation. Moreover, we present a semantic-centric approach to unify\nmultimodal representation learning in three ways, including gated feature\ninteraction, multi-task pseudo label generation, and intra-/inter-sample\ncontrastive learning. Finally, SemanticMAC effectively learns specific- and\nshared-semantic representations under the guidance of semantic-centric labels.\nExtensive experimental results demonstrate that our approach surpasses the\nstate-of-the-art methods on 7 public datasets in four MAC downstream tasks.\n","authors":["Ronghao Lin","Ying Zeng","Sijie Mai","Haifeng Hu"],"pdf_url":"https://arxiv.org/pdf/2408.07694v1.pdf","comment":"Under Review"},{"id":"http://arxiv.org/abs/2310.03710v2","updated":"2024-08-14T17:39:59Z","published":"2023-10-05T17:36:16Z","title":"Agent Instructs Large Language Models to be General Zero-Shot Reasoners","summary":" We introduce a method to improve the zero-shot reasoning abilities of large\nlanguage models on general language understanding tasks. 
Specifically, we build\nan autonomous agent to instruct the reasoning process of large language models.\nWe show this approach further unleashes the zero-shot reasoning abilities of\nlarge language models to more tasks. We study the performance of our method on\na wide set of datasets spanning generation, classification, and reasoning. We\nshow that our method generalizes to most tasks and obtains state-of-the-art\nzero-shot performance on 20 of the 29 datasets that we evaluate. For instance,\nour method boosts the performance of state-of-the-art large language models by\na large margin, including Vicuna-13b (13.3%), Llama-2-70b-chat (23.2%), and\nGPT-3.5 Turbo (17.0%). Compared to zero-shot chain of thought, our improvement\nin reasoning is striking, with an average increase of 10.5%. With our method,\nLlama-2-70b-chat outperforms zero-shot GPT-3.5 Turbo by 10.2%.\n","authors":["Nicholas Crispino","Kyle Montgomery","Fankun Zeng","Dawn Song","Chenguang Wang"],"pdf_url":"https://arxiv.org/pdf/2310.03710v2.pdf","comment":"Accepted to ICML 2024"},{"id":"http://arxiv.org/abs/2407.21090v2","updated":"2024-08-14T17:35:21Z","published":"2024-07-30T16:56:21Z","title":"Learning Optimal Signal Temporal Logic Decision Trees for\n Classification: A Max-Flow MILP Formulation","summary":" This paper presents a novel framework for inferring timed temporal logic\nproperties from data. The dataset comprises pairs of finite-time system traces\nand corresponding labels, denoting whether the traces demonstrate specific\ndesired behaviors, e.g. whether the ship follows a safe route or not. Our\nproposed approach leverages decision-tree-based methods to infer Signal\nTemporal Logic classifiers using primitive formulae. We formulate the inference\nprocess as a mixed integer linear programming optimization problem, recursively\ngenerating constraints to determine both data classification and tree\nstructure. Applying a max-flow algorithm on the resultant tree transforms the\nproblem into a global optimization challenge, leading to improved\nclassification rates compared to prior methodologies. Moreover, we introduce a\ntechnique to reduce the number of constraints by exploiting the symmetry\ninherent in STL primitives, which enhances the algorithm's time performance and\ninterpretability. To assess our algorithm's effectiveness and classification\nperformance, we conduct three case studies involving two-class, multi-class,\nand complex formula classification scenarios.\n","authors":["Kaier Liang","Gustavo A. Cardona","Disha Kamale","Cristian-Ioan Vasile"],"pdf_url":"https://arxiv.org/pdf/2407.21090v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.07680v1","updated":"2024-08-14T17:28:58Z","published":"2024-08-14T17:28:58Z","title":"A Spitting Image: Modular Superpixel Tokenization in Vision Transformers","summary":" Vision Transformer (ViT) architectures traditionally employ a grid-based\napproach to tokenization independent of the semantic content of an image. We\npropose a modular superpixel tokenization strategy which decouples tokenization\nand feature extraction; a shift from contemporary approaches where these are\ntreated as an undifferentiated whole. Using on-line content-aware tokenization\nand scale- and shape-invariant positional embeddings, we perform experiments\nand ablations that contrast our approach with patch-based tokenization and\nrandomized partitions as baselines. 
We show that our method significantly\nimproves the faithfulness of attributions and gives pixel-level granularity on\nzero-shot unsupervised dense prediction tasks, while maintaining predictive\nperformance in classification tasks. Our approach provides a modular\ntokenization framework commensurable with standard architectures, extending the\nspace of ViTs to a larger class of semantically-rich models.\n","authors":["Marius Aasan","Odd Kolbjørnsen","Anne Schistad Solberg","Adín Ramirez Rivera"],"pdf_url":"https://arxiv.org/pdf/2408.07680v1.pdf","comment":"To appear in ECCV (MELEX) 2024 Workshop Proceedings"},{"id":"http://arxiv.org/abs/2308.09766v3","updated":"2024-08-14T17:20:19Z","published":"2023-08-18T18:30:33Z","title":"Time Series Predictions in Unmonitored Sites: A Survey of Machine\n Learning Techniques in Water Resources","summary":" Prediction of dynamic environmental variables in unmonitored sites remains a\nlong-standing challenge for water resources science. The majority of the\nworld's freshwater resources have inadequate monitoring of critical\nenvironmental variables needed for management. Yet, the need to have widespread\npredictions of hydrological variables such as river flow and water quality has\nbecome increasingly urgent due to climate and land use change over the past\ndecades, and their associated impacts on water resources. Modern machine\nlearning methods increasingly outperform their process-based and empirical\nmodel counterparts for hydrologic time series prediction with their ability to\nextract information from large, diverse data sets. We review relevant\nstate-of-the-art applications of machine learning for streamflow, water\nquality, and other water resources prediction and discuss opportunities to\nimprove the use of machine learning with emerging methods for incorporating\nwatershed characteristics into deep learning models, transfer learning, and\nincorporating process knowledge into machine learning models. The analysis here\nsuggests most prior efforts have been focused on deep learning\nframeworks built on many sites for predictions at daily time scales in the\nUnited States, but that comparisons between different classes of machine\nlearning methods are few and inadequate. We identify several open questions for\ntime series predictions in unmonitored sites that include incorporating dynamic\ninputs and site characteristics, mechanistic understanding and spatial context,\nand explainable AI techniques in modern machine learning frameworks.\n","authors":["Jared D. Willard","Charuleka Varadharajan","Xiaowei Jia","Vipin Kumar"],"pdf_url":"https://arxiv.org/pdf/2308.09766v3.pdf","comment":"39 pages, 4 figures, 1 table, Accepted to Environmental Data Science"},{"id":"http://arxiv.org/abs/2408.07673v1","updated":"2024-08-14T17:16:50Z","published":"2024-08-14T17:16:50Z","title":"Deep Learning: a Heuristic Three-stage Mechanism for Grid Searches to\n Optimize the Future Risk Prediction of Breast Cancer Metastasis Using\n EHR-based Clinical Data","summary":" A grid search, at the cost of training and testing a large number of models,\nis an effective way to optimize the prediction performance of deep learning\nmodels. A challenging task concerning grid search is time management.\nWithout a good time management scheme, a grid search can easily be set off as a\nmission that will not finish in our lifetime. 
In this study, we introduce a\nheuristic three-stage mechanism for managing the running time of low-budget\ngrid searches, and the sweet-spot grid search (SSGS) and randomized grid search\n(RGS) strategies for improving model prediction performance, in predicting the\n5-year, 10-year, and 15-year risk of breast cancer metastasis. We develop deep\nfeedforward neural network (DFNN) models and optimize them through grid\nsearches. We conduct eight cycles of grid searches by applying our three-stage\nmechanism and SSGS and RGS strategies. We conduct various SHAP analyses\nincluding unique ones that interpret the importance of the DFNN-model\nhyperparameters. Our results show that grid search can greatly improve model\nprediction. The grid searches we conducted improved the risk prediction of\n5-year, 10-year, and 15-year breast cancer metastasis by 18.6%, 16.3%, and\n17.3% respectively, over the average performance of all corresponding models we\ntrained. We not only demonstrate best model performance but also characterize\ngrid searches from various aspects such as their capabilities of discovering\ndecent models and the unit grid search time. The three-stage mechanism worked\neffectively. It made our low-budget grid searches feasible and manageable, and\nin the meantime helped improve model prediction performance. Our SHAP analyses\nidentified both clinical risk factors important for the prediction of future\nrisk of breast cancer metastasis, and DFNN-model hyperparameters important to\nthe prediction of performance scores.\n","authors":["Xia Jiang","Yijun Zhou","Chuhan Xu","Adam Brufsky","Alan Wells"],"pdf_url":"https://arxiv.org/pdf/2408.07673v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.07666v1","updated":"2024-08-14T16:58:48Z","published":"2024-08-14T16:58:48Z","title":"Model Merging in LLMs, MLLMs, and Beyond: Methods, Theories,\n Applications and Opportunities","summary":" Model merging is an efficient empowerment technique in the machine learning\ncommunity that does not require the collection of raw training data and does\nnot require expensive computation. As model merging becomes increasingly\nprevalent across various fields, it is crucial to understand the available\nmodel merging techniques comprehensively. However, there is a significant gap\nin the literature regarding a systematic and thorough review of these\ntechniques. This survey provides a comprehensive overview of model merging\nmethods and theories, their applications in various domains and settings, and\nfuture research directions. Specifically, we first propose a new taxonomic\napproach that exhaustively discusses existing model merging methods. Secondly,\nwe discuss the application of model merging techniques in large language\nmodels, multimodal large language models, and 10+ machine learning subfields,\nincluding continual learning, multi-task learning, few-shot learning, etc.\nFinally, we highlight the remaining challenges of model merging and discuss\nfuture research directions. 
A comprehensive list of papers about model merging\nis available at\n\\url{https://github.com/EnnengYang/Awesome-Model-Merging-Methods-Theories-Applications}.\n","authors":["Enneng Yang","Li Shen","Guibing Guo","Xingwei Wang","Xiaochun Cao","Jie Zhang","Dacheng Tao"],"pdf_url":"https://arxiv.org/pdf/2408.07666v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.07661v1","updated":"2024-08-14T16:49:25Z","published":"2024-08-14T16:49:25Z","title":"Interpretable Graph Neural Networks for Heterogeneous Tabular Data","summary":" Many machine learning algorithms for tabular data produce black-box models,\nwhich prevent users from understanding the rationale behind the model\npredictions. In their unconstrained form, graph neural networks fall into this\ncategory, and they have further limited abilities to handle heterogeneous data.\nTo overcome these limitations, an approach is proposed, called IGNH\n(Interpretable Graph Neural Network for Heterogeneous tabular data), which\nhandles both categorical and numerical features, while constraining the\nlearning process to generate exact feature attributions together with the\npredictions. A large-scale empirical investigation is presented, showing that\nthe feature attributions provided by IGNH align with Shapley values that are\ncomputed post hoc. Furthermore, the results show that IGNH outperforms two\npowerful machine learning algorithms for tabular data, Random Forests and\nTabNet, while reaching a similar level of performance as XGBoost.\n","authors":["Amr Alkhatib","Henrik Boström"],"pdf_url":"https://arxiv.org/pdf/2408.07661v1.pdf","comment":"Accepted at 27th International Conference on Discovery Science 2024"},{"id":"http://arxiv.org/abs/2408.07660v1","updated":"2024-08-14T16:44:56Z","published":"2024-08-14T16:44:56Z","title":"Off-Policy Reinforcement Learning with High Dimensional Reward","summary":" Conventional off-policy reinforcement learning (RL) focuses on maximizing the\nexpected return of scalar rewards. Distributional RL (DRL), in contrast,\nstudies the distribution of returns with the distributional Bellman operator in\na Euclidean space, leading to highly flexible choices for utility. This paper\nestablishes robust theoretical foundations for DRL. We prove the contraction\nproperty of the Bellman operator even when the reward space is an\ninfinite-dimensional separable Banach space. Furthermore, we demonstrate that\nthe behavior of high- or infinite-dimensional returns can be effectively\napproximated using a lower-dimensional Euclidean space. Leveraging these\ntheoretical insights, we propose a novel DRL algorithm that tackles problems\nwhich have been previously intractable using conventional reinforcement\nlearning approaches.\n","authors":["Dong Neuck Lee","Michael R. Kosorok"],"pdf_url":"https://arxiv.org/pdf/2408.07660v1.pdf","comment":"24 pages, 12 figures"},{"id":"http://arxiv.org/abs/2402.11124v3","updated":"2024-08-14T16:39:37Z","published":"2024-02-16T23:17:00Z","title":"Implicit Causal Representation Learning via Switchable Mechanisms","summary":" Learning causal representations from observational and interventional data in\nthe absence of known ground-truth graph structures necessitates implicit latent\ncausal representation learning. Implicit learning of causal mechanisms\ntypically involves two categories of interventional data: hard and soft\ninterventions. In real-world scenarios, soft interventions are often more\nrealistic than hard interventions, as the latter require fully controlled\nenvironments. 
Unlike hard interventions, which directly force changes in a\ncausal variable, soft interventions exert influence indirectly by affecting the\ncausal mechanism. However, the subtlety of soft interventions imposes several\nchallenges for learning causal models. One challenge is that soft\ninterventions' effects are ambiguous, since parental relations remain intact.\nIn this paper, we tackle the challenges of learning causal models using soft\ninterventions while retaining implicit modelling. We propose ICLR-SM, which\nmodels the effects of soft interventions by employing a causal mechanism switch\nvariable designed to toggle between different causal mechanisms. In our\nexperiments, we consistently observe improved learning of identifiable, causal\nrepresentations, compared to baseline approaches.\n","authors":["Shayan Shirahmad Gale Bagi","Zahra Gharaee","Oliver Schulte","Mark Crowley"],"pdf_url":"https://arxiv.org/pdf/2402.11124v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.07654v1","updated":"2024-08-14T16:29:07Z","published":"2024-08-14T16:29:07Z","title":"Graph Triple Attention Network: A Decoupled Perspective","summary":" Graph Transformers (GTs) have recently achieved significant success in the\ngraph domain by effectively capturing both long-range dependencies and graph\ninductive biases. However, these methods face two primary challenges: (1)\nmulti-view chaos, which results from coupling multi-view information\n(positional, structural, attribute), thereby impeding flexible usage and the\ninterpretability of the propagation process; (2) local-global chaos, which\narises from coupling local message passing with global attention, leading to\nissues of overfitting and over-globalizing. To address these challenges, we\npropose a high-level decoupled perspective of GTs, breaking them down into\nthree components and two interaction levels: positional attention, structural\nattention, and attribute attention, alongside local and global interaction.\nBased on this decoupled perspective, we design a decoupled graph triple\nattention network named DeGTA, which separately computes multi-view attentions\nand adaptively integrates multi-view local and global information. This\napproach offers three key advantages: enhanced interpretability, flexible\ndesign, and adaptive integration of local and global information. Through\nextensive experiments, DeGTA achieves state-of-the-art performance across\nvarious datasets and tasks, including node classification and graph\nclassification. Comprehensive ablation studies demonstrate that decoupling is\nessential for improving performance and enhancing interpretability. Our code is\navailable at: https://github.com/wangxiaotang0906/DeGTA\n","authors":["Xiaotang Wang","Yun Zhu","Haizhou Shi","Yongchao Liu","Chuntao Hong"],"pdf_url":"https://arxiv.org/pdf/2408.07654v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.07647v1","updated":"2024-08-14T16:18:51Z","published":"2024-08-14T16:18:51Z","title":"Adaptive Behavioral AI: Reinforcement Learning to Enhance Pharmacy\n Services","summary":" Pharmacies are critical in healthcare systems, particularly in low- and\nmiddle-income countries. Equipping pharmacists with the right behavioral\ninterventions or nudges can enhance their skills, public health awareness, and\npharmacy inventory management, ensuring access to essential medicines that\nultimately benefit their patients. 
We introduce a reinforcement learning\noperational system to deliver personalized behavioral interventions through\nmobile health applications. We illustrate its potential by discussing a series\nof initial experiments run with SwipeRx, an all-in-one app for pharmacists,\nincluding B2B e-commerce, in Indonesia. The proposed method has broader\napplications extending beyond pharmacy operations to optimize healthcare\ndelivery.\n","authors":["Ana Fernández del Río","Michael Brennan Leong","Paulo Saraiva","Ivan Nazarov","Aditya Rastogi","Moiz Hassan","Dexian Tang","África Periáñez"],"pdf_url":"https://arxiv.org/pdf/2408.07647v1.pdf","comment":"Presented at The First Workshop on AI Behavioral Science (AIBS'24) at\n KDD 2024, August 25, Barcelona, Spain"},{"id":"http://arxiv.org/abs/2408.07644v1","updated":"2024-08-14T16:16:51Z","published":"2024-08-14T16:16:51Z","title":"SigmaRL: A Sample-Efficient and Generalizable Multi-Agent Reinforcement\n Learning Framework for Motion Planning","summary":" This paper introduces an open-source, decentralized framework named SigmaRL,\ndesigned to enhance both sample efficiency and generalization of multi-agent\nReinforcement Learning (RL) for motion planning of connected and automated\nvehicles. Most RL agents exhibit a limited capacity to generalize, often\nfocusing narrowly on specific scenarios, and are usually evaluated in similar\nor even the same scenarios seen during training. Various methods have been\nproposed to address these challenges, including experience replay and\nregularization. However, how observation design in RL affects sample efficiency\nand generalization remains an under-explored area. We address this gap by\nproposing five strategies to design information-dense observations, focusing on\ngeneral features that are applicable to most traffic scenarios. We train our RL\nagents using these strategies on an intersection and evaluate their\ngeneralization through numerical experiments across completely unseen traffic\nscenarios, including a new intersection, an on-ramp, and a roundabout.\nIncorporating these information-dense observations reduces training times to\nunder one hour on a single CPU, and the evaluation results reveal that our RL\nagents can effectively zero-shot generalize. Code:\ngithub.com/cas-lab-munich/SigmaRL\n","authors":["Jianye Xu","Pan Hu","Bassam Alrifaee"],"pdf_url":"https://arxiv.org/pdf/2408.07644v1.pdf","comment":"8 pages, 5 figures, accepted for presentation at the IEEE\n International Conference on Intelligent Transportation Systems (ITSC) 2024"},{"id":"http://arxiv.org/abs/2403.07854v2","updated":"2024-08-14T16:08:45Z","published":"2024-03-12T17:44:45Z","title":"Distilling the Knowledge in Data Pruning","summary":" With the increasing size of datasets used for training neural networks, data\npruning becomes an attractive field of research. However, most current data\npruning algorithms are limited in their ability to preserve accuracy compared\nto models trained on the full data, especially in high pruning regimes. In this\npaper we explore the application of data pruning while incorporating knowledge\ndistillation (KD) when training on a pruned subset. That is, rather than\nrelying solely on ground-truth labels, we also use the soft predictions from a\nteacher network pre-trained on the complete data. By integrating KD into\ntraining, we demonstrate significant improvement across datasets, pruning\nmethods, and on all pruning fractions. 
We first establish a theoretical\nmotivation for employing self-distillation to improve training on pruned data.\nThen, we empirically make a compelling and highly practical observation: using\nKD, simple random pruning is comparable or superior to sophisticated pruning\nmethods across all pruning regimes. On ImageNet for example, we achieve\nsuperior accuracy despite training on a random subset of only 50% of the data.\nAdditionally, we demonstrate a crucial connection between the pruning factor\nand the optimal knowledge distillation weight. This helps mitigate the impact\nof samples with noisy labels and low-quality images retained by typical pruning\nalgorithms. Finally, we make an intriguing observation: when using lower\npruning fractions, larger teachers lead to accuracy degradation, while\nsurprisingly, employing teachers with a smaller capacity than the student's may\nimprove results. Our code will be made available.\n","authors":["Emanuel Ben-Baruch","Adam Botach","Igor Kviatkovsky","Manoj Aggarwal","Gérard Medioni"],"pdf_url":"https://arxiv.org/pdf/2403.07854v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.12435v3","updated":"2024-08-14T16:01:50Z","published":"2023-11-21T08:44:38Z","title":"Fair Enough? A map of the current limitations of the requirements to\n have fair algorithms","summary":" In recent years, the increase in the usage and efficiency of Artificial\nIntelligence and, more in general, of Automated Decision-Making systems has\nbrought with it an increasing and welcome awareness of the risks associated\nwith such systems. One such risk is that of perpetuating or even amplifying\nbias and unjust disparities present in the data from which many of these\nsystems learn to adjust and optimise their decisions. This awareness has on the\none hand encouraged several scientific communities to come up with more and\nmore appropriate ways and methods to assess, quantify, and possibly mitigate\nsuch biases and disparities. On the other hand, it has prompted more and more\nlayers of society, including policy makers, to call for fair algorithms. We\nbelieve that while much excellent and multidisciplinary research is currently\nbeing conducted, what is still fundamentally missing is the awareness that\nhaving fair algorithms is per se a nearly meaningless requirement that needs to\nbe complemented with many additional social choices to become actionable.\nNamely, there is a hiatus between what society is demanding from Automated\nDecision-Making systems, and what this demand actually means in real-world\nscenarios. In this work, we outline the key features of such a hiatus and\npinpoint a set of crucial open points that we as a society must address in\norder to give a concrete meaning to the increasing demand for fairness in\nAutomated Decision-Making systems.\n","authors":["Daniele Regoli","Alessandro Castelnovo","Nicole Inverardi","Gabriele Nanino","Ilaria Penco"],"pdf_url":"https://arxiv.org/pdf/2311.12435v3.pdf","comment":"20 pages, 2 figures, 2 tables. V2: added reference, update info on AI\n Act. V3: changed structure of open points, update info on AI Act and other\n legislation"},{"id":"http://arxiv.org/abs/2408.07636v1","updated":"2024-08-14T16:01:02Z","published":"2024-08-14T16:01:02Z","title":"Drug Discovery SMILES-to-Pharmacokinetics Diffusion Models with Deep\n Molecular Understanding","summary":" Artificial intelligence (AI) is increasingly used in every stage of drug\ndevelopment. 
One challenge facing drug discovery AI is that drug\npharmacokinetic (PK) datasets are often collected independently from each\nother, often with limited overlap, creating data overlap sparsity. Data\nsparsity makes data curation difficult for researchers looking to answer\nresearch questions in poly-pharmacy, drug combination research, and\nhigh-throughput screening. We propose Imagand, a novel\nSMILES-to-Pharmacokinetic (S2PK) diffusion model capable of generating an array\nof PK target properties conditioned on SMILES inputs. We show that\nImagand-generated synthetic PK data closely resembles real data univariate and\nbivariate distributions, and improves performance for downstream tasks. Imagand\nis a promising solution for data overlap sparsity and allows researchers to\nefficiently generate ligand PK data for drug discovery research. Code is\navailable at \\url{https://github.com/bing1100/Imagand}.\n","authors":["Bing Hu","Anita Layton","Helen Chen"],"pdf_url":"https://arxiv.org/pdf/2408.07636v1.pdf","comment":"13 pages, 5 figures, 4 tables"},{"id":"http://arxiv.org/abs/2402.17762v2","updated":"2024-08-14T16:00:49Z","published":"2024-02-27T18:55:17Z","title":"Massive Activations in Large Language Models","summary":" We observe an empirical phenomenon in Large Language Models (LLMs) -- very\nfew activations exhibit significantly larger values than others (e.g., 100,000\ntimes larger). We call them massive activations. First, we demonstrate the\nwidespread existence of massive activations across various LLMs and\ncharacterize their locations. Second, we find their values largely stay\nconstant regardless of the input, and they function as indispensable bias terms\nin LLMs. Third, these massive activations lead to the concentration of\nattention probabilities to their corresponding tokens, and further, implicit\nbias terms in the self-attention output. Last, we also study massive\nactivations in Vision Transformers. Code is available at\nhttps://github.com/locuslab/massive-activations.\n","authors":["Mingjie Sun","Xinlei Chen","J. Zico Kolter","Zhuang Liu"],"pdf_url":"https://arxiv.org/pdf/2402.17762v2.pdf","comment":"First Conference on Language Modeling (COLM), 2024. Website at\n https://eric-mingjie.github.io/massive-activations/index.html"},{"id":"http://arxiv.org/abs/2408.07630v1","updated":"2024-08-14T15:56:27Z","published":"2024-08-14T15:56:27Z","title":"Towards Fair and Rigorous Evaluations: Hyperparameter Optimization for\n Top-N Recommendation Task with Implicit Feedback","summary":" The widespread use of the internet has led to an overwhelming amount of data,\nwhich has resulted in the problem of information overload. Recommender systems\nhave emerged as a solution to this problem by providing personalized\nrecommendations to users based on their preferences and historical data.\nHowever, as recommendation models become increasingly complex, finding the best\nhyperparameter combination for different models has become a challenge. The\nhigh-dimensional hyperparameter search space poses numerous challenges for\nresearchers, and failure to disclose hyperparameter settings may impede the\nreproducibility of research results. In this paper, we investigate the Top-N\nimplicit recommendation problem and focus on optimizing the benchmark\nrecommendation algorithm commonly used in comparative experiments using\nhyperparameter optimization algorithms. 
We propose a research methodology that\nfollows the principles of a fair comparison, employing seven types of\nhyperparameter search algorithms to fine-tune six common recommendation\nalgorithms on three datasets. We have identified the most suitable\nhyperparameter search algorithms for various recommendation algorithms on\ndifferent types of datasets as a reference for later study. This study\ncontributes to algorithmic research in recommender systems based on\nhyperparameter optimization, providing a fair basis for comparison.\n","authors":["Hui Fang","Xu Feng","Lu Qin","Zhu Sun"],"pdf_url":"https://arxiv.org/pdf/2408.07630v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.07629v1","updated":"2024-08-14T15:55:31Z","published":"2024-08-14T15:55:31Z","title":"Optimizing HIV Patient Engagement with Reinforcement Learning in\n Resource-Limited Settings","summary":" By providing evidence-based clinical decision support, digital tools and\nelectronic health records can revolutionize patient management, especially in\nresource-poor settings where fewer health workers are available and often need\nmore training. When these tools are integrated with AI, they can offer\npersonalized support and adaptive interventions, effectively connecting\ncommunity health workers (CHWs) and healthcare facilities. The CHARM (Community\nHealth Access & Resource Management) app is an AI-native mobile app for CHWs.\nDeveloped through a joint partnership of Causal Foundry (CF) and\nmothers2mothers (m2m), CHARM empowers CHWs, mainly local women, by streamlining\ncase management, enhancing learning, and improving communication. This paper\ndetails CHARM's development, integration, and upcoming reinforcement\nlearning-based adaptive interventions, all aimed at enhancing health worker\nengagement, efficiency, and patient outcomes, thereby enhancing CHWs'\ncapabilities and community health.\n","authors":["África Periáñez","Kathrin Schmitz","Lazola Makhupula","Moiz Hassan","Moeti Moleko","Ana Fernández del Río","Ivan Nazarov","Aditya Rastogi","Dexian Tang"],"pdf_url":"https://arxiv.org/pdf/2408.07629v1.pdf","comment":"Presented at the 7th epiDAMIK ACM SIGKDD International Workshop on\n Epidemiology meets Data Mining and Knowledge Discovery, August 26, 2024,\n Barcelona, Spain"},{"id":"http://arxiv.org/abs/2408.07624v1","updated":"2024-08-14T15:44:56Z","published":"2024-08-14T15:44:56Z","title":"Battery GraphNets : Relational Learning for Lithium-ion Batteries(LiBs)\n Life Estimation","summary":" Battery life estimation is critical for optimizing battery performance and\nguaranteeing minimal degradation for better efficiency and reliability of\nbattery-powered systems. The existing methods to predict the Remaining Useful\nLife(RUL) of Lithium-ion Batteries (LiBs) neglect the relational dependencies\nof the battery parameters to model the nonlinear degradation trajectories. We\npresent the Battery GraphNets framework that jointly learns to incorporate a\ndiscrete dependency graph structure between battery parameters to capture the\ncomplex interactions and the graph-learning algorithm to model the intrinsic\nbattery degradation for RUL prognosis. The proposed method outperforms several\npopular methods by a significant margin on publicly available battery datasets\nand achieves SOTA performance. 
We report ablation studies to support the\nefficacy of our approach.\n","authors":["Sakhinana Sagar Srinivas","Rajat Kumar Sarkar","Venkataramana Runkana"],"pdf_url":"https://arxiv.org/pdf/2408.07624v1.pdf","comment":"Accepted in Workshop on Graph Learning for Industrial Applications :\n Finance, Crime Detection, Medicine, and Social Media (NeurIPS 2022)"},{"id":"http://arxiv.org/abs/2408.07623v1","updated":"2024-08-14T15:44:51Z","published":"2024-08-14T15:44:51Z","title":"Latent Anomaly Detection Through Density Matrices","summary":" This paper introduces a novel anomaly detection framework that combines the\nrobust statistical principles of density-estimation-based anomaly detection\nmethods with the representation-learning capabilities of deep learning models.\nThe method originating from this framework is presented in two different\nversions: a shallow approach employing a density-estimation model based on\nadaptive Fourier features and density matrices, and a deep approach that\nintegrates an autoencoder to learn a low-dimensional representation of the\ndata. By estimating the density of new samples, both methods are able to\ncompute normality scores. The methods can be seamlessly integrated into an end-to-end\narchitecture and optimized using gradient-based optimization techniques. To\nevaluate their performance, extensive experiments were conducted on various\nbenchmark datasets. The results demonstrate that both versions of the method\ncan achieve comparable or superior performance when compared to other\nstate-of-the-art methods. Notably, the shallow approach performs better on\ndatasets with fewer dimensions, while the autoencoder-based approach shows\nimproved performance on datasets with higher dimensions.\n","authors":["Joseph Gallego-Mejia","Oscar Bustos-Brinez","Fabio A. González"],"pdf_url":"https://arxiv.org/pdf/2408.07623v1.pdf","comment":"arXiv admin note: substantial text overlap with arXiv:2211.08525"},{"id":"http://arxiv.org/abs/2408.02117v2","updated":"2024-08-14T15:25:32Z","published":"2024-08-04T19:14:36Z","title":"Value-Based Rationales Improve Social Experience: A Multiagent\n Simulation Study","summary":" We propose Exanna, a framework to realize agents that incorporate values in\ndecision making. An Exanna agent considers the values of itself and others when\nproviding rationales for its actions and evaluating the rationales provided by\nothers. Via multiagent simulation, we demonstrate that considering values in\ndecision making and producing rationales, especially for norm-deviating\nactions, leads to (1) higher conflict resolution, (2) better social experience,\n(3) higher privacy, and (4) higher flexibility.\n","authors":["Sz-Ting Tzeng","Nirav Ajmeri","Munindar P. Singh"],"pdf_url":"https://arxiv.org/pdf/2408.02117v2.pdf","comment":"13 pages, 13 figures, 13 tables (and supplementary material with\n reproducibility and additional results), accepted at ECAI 2024"},{"id":"http://arxiv.org/abs/2408.02354v3","updated":"2024-08-14T15:19:41Z","published":"2024-08-05T10:02:29Z","title":"RECE: Reduced Cross-Entropy Loss for Large-Catalogue Sequential\n Recommenders","summary":" Scalability is a major challenge in modern recommender systems. In sequential\nrecommendations, full Cross-Entropy (CE) loss achieves state-of-the-art\nrecommendation quality but consumes excessive GPU memory with large item\ncatalogs, limiting its practicality. 
Using a GPU-efficient locality-sensitive\nhashing-like algorithm for approximating large tensor of logits, this paper\nintroduces a novel RECE (REduced Cross-Entropy) loss. RECE significantly\nreduces memory consumption while allowing one to enjoy the state-of-the-art\nperformance of full CE loss. Experimental results on various datasets show that\nRECE cuts training peak memory usage by up to 12 times compared to existing\nmethods while retaining or exceeding performance metrics of CE loss. The\napproach also opens up new possibilities for large-scale applications in other\ndomains.\n","authors":["Danil Gusak","Gleb Mezentsev","Ivan Oseledets","Evgeny Frolov"],"pdf_url":"https://arxiv.org/pdf/2408.02354v3.pdf","comment":"5 pages, accepted for CIKM'24"},{"id":"http://arxiv.org/abs/2407.15793v3","updated":"2024-08-14T15:12:07Z","published":"2024-07-22T16:51:28Z","title":"CLIP with Generative Latent Replay: a Strong Baseline for Incremental\n Learning","summary":" With the emergence of Transformers and Vision-Language Models (VLMs) such as\nCLIP, fine-tuning large pre-trained models has recently become a prevalent\nstrategy in Continual Learning. This has led to the development of numerous\nprompting strategies to adapt transformer-based models without incurring\ncatastrophic forgetting. However, these strategies often compromise the\noriginal zero-shot capabilities of the pre-trained CLIP model and struggle to\nadapt to domains that significantly deviate from the pre-training data. In this\nwork, we propose Continual Generative training for Incremental prompt-Learning,\na simple and novel approach to mitigate forgetting while adapting CLIP.\nBriefly, we employ Variational Autoencoders (VAEs) to learn class-conditioned\ndistributions within the embedding space of the visual encoder. We then exploit\nthese distributions to sample new synthetic visual embeddings and train the\ncorresponding class-specific textual prompts during subsequent tasks. Through\nextensive experiments on different domains, we show that such a generative\nreplay approach can adapt to new tasks while improving zero-shot capabilities,\nevaluated using a novel metric tailored for CL scenarios. Notably, further\nanalysis reveals that our approach can bridge the gap with joint prompt tuning.\nThe codebase is available at https://github.com/aimagelab/mammoth.\n","authors":["Emanuele Frascaroli","Aniello Panariello","Pietro Buzzega","Lorenzo Bonicelli","Angelo Porrello","Simone Calderara"],"pdf_url":"https://arxiv.org/pdf/2407.15793v3.pdf","comment":"15 pages, 1 figure. Accepted at the The 35th British Machine Vision\n Conference 2024 (BMVC 2024), Glasgow, UK"},{"id":"http://arxiv.org/abs/2407.11917v2","updated":"2024-08-14T14:46:32Z","published":"2024-07-16T17:09:47Z","title":"Global Optimisation of Black-Box Functions with Generative Models in the\n Wasserstein Space","summary":" We propose a new uncertainty estimator for gradient-free optimisation of\nblack-box simulators using deep generative surrogate models. Optimisation of\nthese simulators is especially challenging for stochastic simulators and higher\ndimensions. To address these issues, we utilise a deep generative surrogate\napproach to model the black box response for the entire parameter space. We\nthen leverage this knowledge to estimate the proposed uncertainty based on the\nWasserstein distance - the Wasserstein uncertainty. 
This approach is employed\nin a posterior agnostic gradient-free optimisation algorithm that minimises\nregret over the entire parameter space. A series of tests were conducted to\ndemonstrate that our method is more robust to the shape of both the black box\nfunction and the stochastic response of the black box than state-of-the-art\nmethods, such as efficient global optimisation with a deep Gaussian process\nsurrogate.\n","authors":["Tigran Ramazyan","Mikhail Hushchyn","Denis Derkach"],"pdf_url":"https://arxiv.org/pdf/2407.11917v2.pdf","comment":"European Conference on Artificial Intelligence 2024 Main Track"},{"id":"http://arxiv.org/abs/2408.07588v1","updated":"2024-08-14T14:40:00Z","published":"2024-08-14T14:40:00Z","title":"\"How Big is Big Enough?\" Adjusting Model Size in Continual Gaussian\n Processes","summary":" For many machine learning methods, creating a model requires setting a\nparameter that controls the model's capacity before training, e.g., the number of\nneurons in DNNs or inducing points in GPs. Increasing capacity improves\nperformance until all the information from the dataset is captured. After this\npoint, computational cost keeps increasing, without improved performance. This\nleads to the question \"How big is big enough?\" We investigate this problem\nfor Gaussian processes (single-layer neural networks) in continual learning.\nHere, data becomes available incrementally, and the final dataset size will\ntherefore not be known before training, preventing the use of heuristics for\nsetting the model size. We provide a method that automatically adjusts this,\nwhile maintaining near-optimal performance, and show that a single\nhyperparameter setting for our method performs well across datasets with a wide\nrange of properties.\n","authors":["Guiomar Pescador-Barrios","Sarah Filippi","Mark van der Wilk"],"pdf_url":"https://arxiv.org/pdf/2408.07588v1.pdf","comment":"9 pages main, 19 pages total, 9 figures, 3 tables, preprint"},{"id":"http://arxiv.org/abs/2408.07587v1","updated":"2024-08-14T14:36:28Z","published":"2024-08-14T14:36:28Z","title":"FedQUIT: On-Device Federated Unlearning via a Quasi-Competent Virtual\n Teacher","summary":" Federated Learning (FL) promises better privacy guarantees for individuals'\ndata when machine learning models are collaboratively trained. When an FL\nparticipant exercises its right to be forgotten, i.e., to detach from the FL\nframework it has participated in and to remove its past contributions to the\nglobal model, the FL solution should perform all the necessary steps to make it\npossible without sacrificing the overall performance of the global model, which\nis not currently supported in state-of-the-art solutions. In this paper,\nwe propose FedQUIT, a novel algorithm that uses knowledge distillation to scrub\nthe contribution of the forgetting data from an FL global model while\npreserving its generalization ability. FedQUIT directly works on clients'\ndevices and does not require sharing additional information compared with a\nregular FL process, nor does it assume the availability of publicly available\nproxy data. Our solution is efficient, effective, and applicable in both\ncentralized and federated settings. 
Our experimental results show that, on\naverage, FedQUIT requires less than 2.5% additional communication rounds to\nrecover generalization performance after unlearning, obtaining a sanitized\nglobal model whose predictions are comparable to those of a global model that\nhas never seen the data to be forgotten.\n","authors":["Alessio Mora","Lorenzo Valerio","Paolo Bellavista","Andrea Passarella"],"pdf_url":"https://arxiv.org/pdf/2408.07587v1.pdf","comment":"Submitted to The 39th Annual AAAI Conference on Artificial\n Intelligence (AAAI-25)"},{"id":"http://arxiv.org/abs/2408.07580v1","updated":"2024-08-14T14:24:56Z","published":"2024-08-14T14:24:56Z","title":"Theoretical and Practical Progress in Hyperspectral Pixel Unmixing with\n Large Spectral Libraries from a Sparse Perspective","summary":" Hyperspectral unmixing is the process of determining the presence of\nindividual materials and their respective abundances from an observed pixel\nspectrum. Unmixing is a fundamental process in hyperspectral image analysis,\nand is growing in importance as increasingly large spectral libraries are\ncreated and used. Unmixing is typically done with ordinary least squares (OLS)\nregression. However, when unmixing with large spectral libraries where the materials\npresent in a pixel are not known a priori, solving for the coefficients in OLS\nrequires inverting a non-invertible matrix derived from the large spectral library. A\nnumber of regression methods are available that can produce a numerical\nsolution using regularization, but with considerably varied effectiveness.\nAlso, simple methods that are unpopular in the statistics literature (i.e.\nstep-wise regression) are used with some level of effectiveness in\nhyperspectral analysis. In this paper, we provide a thorough performance\nevaluation of the methods considered, evaluating methods based on how often\nthey select the correct materials in the models. Investigated methods include\nordinary least squares regression, non-negative least squares regression, ridge\nregression, lasso regression, step-wise regression and Bayesian model\naveraging. We evaluated these unmixing approaches using multiple criteria:\nincorporation of non-negative abundances, model size, accurate mineral\ndetection and root mean squared error (RMSE). We provide a taxonomy of the\nregression methods, showing that most methods can be understood as Bayesian\nmethods with specific priors. We conclude that methods that can be derived with\npriors that correspond to the phenomenology of hyperspectral imagery outperform\nthose with priors that are optimal for prediction performance under the\nassumptions of ordinary least squares linear regression.\n","authors":["Jade Preston","William Basener"],"pdf_url":"https://arxiv.org/pdf/2408.07580v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.07579v1","updated":"2024-08-14T14:23:12Z","published":"2024-08-14T14:23:12Z","title":"TabularBench: Benchmarking Adversarial Robustness for Tabular Deep\n Learning in Real-world Use-cases","summary":" While adversarial robustness in computer vision is a mature research field,\nfewer researchers have tackled the evasion attacks against tabular deep\nlearning, and even fewer investigated robustification mechanisms and reliable\ndefenses. We hypothesize that this lag in the research on tabular adversarial\nattacks is in part due to the lack of standardized benchmarks. To fill this\ngap, we propose TabularBench, the first comprehensive benchmark of robustness\nof tabular deep learning classification models. 
We evaluated adversarial\nrobustness with CAA, an ensemble of gradient and search attacks which was\nrecently demonstrated as the most effective attack against a tabular model. In\naddition to our open benchmark (https://github.com/serval-uni-lu/tabularbench)\nwhere we welcome submissions of new models and defenses, we implement 7\nrobustification mechanisms inspired by state-of-the-art defenses in computer\nvision and propose the largest benchmark of robust tabular deep learning over\n200 models across five critical scenarios in finance, healthcare and security.\nWe curated real datasets for each use case, augmented with hundreds of\nthousands of realistic synthetic inputs, and trained and assessed our models\nwith and without data augmentations. We open-source our library that provides\nAPI access to all our pre-trained robust tabular models, and the largest\ndatasets of real and synthetic tabular inputs. Finally, we analyze the impact\nof various defenses on the robustness and provide actionable insights to design\nnew defenses and robustification mechanisms.\n","authors":["Thibault Simonetto","Salah Ghamizi","Maxime Cordy"],"pdf_url":"https://arxiv.org/pdf/2408.07579v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.07578v1","updated":"2024-08-14T14:18:51Z","published":"2024-08-14T14:18:51Z","title":"A Nested Graph Reinforcement Learning-based Decision-making Strategy for\n Eco-platooning","summary":" Platooning technology is renowned for its precise vehicle control, traffic\nflow optimization, and energy efficiency enhancement. However, in large-scale\nmixed platoons, vehicle heterogeneity and unpredictable traffic conditions lead\nto virtual bottlenecks. These bottlenecks result in reduced traffic throughput\nand increased energy consumption within the platoon. To address these\nchallenges, we introduce a decision-making strategy based on nested graph\nreinforcement learning. This strategy improves collaborative decision-making,\nensuring energy efficiency and alleviating congestion. We propose a theory of\nnested traffic graph representation that maps dynamic interactions between\nvehicles and platoons in non-Euclidean spaces. By incorporating spatio-temporal\nweighted graph into a multi-head attention mechanism, we further enhance the\nmodel's capacity to process both local and global data. Additionally, we have\ndeveloped a nested graph reinforcement learning framework to enhance the\nself-iterative learning capabilities of platooning. Using the I-24 dataset, we\ndesigned and conducted comparative algorithm experiments, generalizability\ntesting, and permeability ablation experiments, thereby validating the proposed\nstrategy's effectiveness. Compared to the baseline, our strategy increases\nthroughput by 10% and decreases energy use by 9%. 
Specifically, increasing the\npenetration rate of CAVs significantly enhances traffic throughput, though it\nalso increases energy consumption.\n","authors":["Xin Gao","Xueyuan Li","Hao Liu","Ao Li","Zhaoyang Ma","Zirui Li"],"pdf_url":"https://arxiv.org/pdf/2408.07578v1.pdf","comment":"14 pages, 18 figures"},{"id":"http://arxiv.org/abs/2405.05736v2","updated":"2024-08-14T14:14:02Z","published":"2024-05-09T12:52:22Z","title":"Optimal Baseline Corrections for Off-Policy Contextual Bandits","summary":" The off-policy learning paradigm allows for recommender systems and general\nranking applications to be framed as decision-making problems, where we aim to\nlearn decision policies that optimize an unbiased offline estimate of an online\nreward metric. With unbiasedness comes potentially high variance, and prevalent\nmethods exist to reduce estimation variance. These methods typically make use\nof control variates, either additive (i.e., baseline corrections or doubly\nrobust methods) or multiplicative (i.e., self-normalisation). Our work unifies\nthese approaches by proposing a single framework built on their equivalence in\nlearning scenarios. The foundation of our framework is the derivation of an\nequivalent baseline correction for all of the existing control variates.\nConsequently, our framework enables us to characterize the variance-optimal\nunbiased estimator and provide a closed-form solution for it. This optimal\nestimator brings significantly improved performance in both evaluation and\nlearning, and minimizes data requirements. Empirical observations corroborate\nour theoretical findings.\n","authors":["Shashank Gupta","Olivier Jeunen","Harrie Oosterhuis","Maarten de Rijke"],"pdf_url":"https://arxiv.org/pdf/2405.05736v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.01686v2","updated":"2024-08-14T14:11:25Z","published":"2023-11-03T03:18:40Z","title":"Disentangled Representation Learning with Transmitted Information\n Bottleneck","summary":" Encoding only the task-related information from the raw data, \\ie,\ndisentangled representation learning, can greatly contribute to the robustness\nand generalizability of models. Although significant advances have been made by\nregularizing the information in representations with information theory, two\nmajor challenges remain: 1) the representation compression inevitably leads to\nperformance drop; 2) the disentanglement constraints on representations are in\ncomplicated optimization. To these issues, we introduce Bayesian networks with\ntransmitted information to formulate the interaction among input and\nrepresentations during disentanglement. Building upon this framework, we\npropose \\textbf{DisTIB} (\\textbf{T}ransmitted \\textbf{I}nformation\n\\textbf{B}ottleneck for \\textbf{Dis}entangled representation learning), a novel\nobjective that navigates the balance between information compression and\npreservation. We employ variational inference to derive a tractable estimation\nfor DisTIB. This estimation can be simply optimized via standard gradient\ndescent with a reparameterization trick. 
Moreover, we theoretically prove that\nDisTIB can achieve optimal disentanglement, underscoring its superior efficacy.\nTo solidify our claims, we conduct extensive experiments on various downstream\ntasks to demonstrate the appealing efficacy of DisTIB and validate our\ntheoretical analyses.\n","authors":["Zhuohang Dang","Minnan Luo","Chengyou Jia","Guang Dai","Jihong Wang","Xiaojun Chang","Jingdong Wang"],"pdf_url":"https://arxiv.org/pdf/2311.01686v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.07569v1","updated":"2024-08-14T14:06:13Z","published":"2024-08-14T14:06:13Z","title":"Multi-task Heterogeneous Graph Learning on Electronic Health Records","summary":" Learning electronic health records (EHRs) has received emerging attention\nbecause of its capability to facilitate accurate medical diagnosis. Since the\nEHRs contain enriched information specifying complex interactions between\nentities, modeling EHRs with graphs is shown to be effective in practice. The\nEHRs, however, present a great degree of heterogeneity, sparsity, and\ncomplexity, which hamper the performance of most of the models applied to them.\nMoreover, existing approaches modeling EHRs often focus on learning the\nrepresentations for a single task, overlooking the multi-task nature of EHR\nanalysis problems and resulting in limited generalizability across different\ntasks. In view of these limitations, we propose a novel framework for EHR\nmodeling, namely MulT-EHR (Multi-Task EHR), which leverages a heterogeneous\ngraph to mine the complex relations and model the heterogeneity in the EHRs. To\nmitigate the large degree of noise, we introduce a denoising module based on\nthe causal inference framework to adjust for severe confounding effects and\nreduce noise in the EHR data. Additionally, since our model adopts a single\ngraph neural network for simultaneous multi-task prediction, we design a\nmulti-task learning module to leverage the inter-task knowledge to regularize\nthe training process. Extensive empirical studies on MIMIC-III and MIMIC-IV\ndatasets validate that the proposed method consistently outperforms the\nstate-of-the-art designs in four popular EHR analysis tasks -- drug\nrecommendation, and predictions of the length of stay, mortality, and\nreadmission. Thorough ablation studies demonstrate the robustness of our method\nupon variations to key components and hyperparameters.\n","authors":["Tsai Hor Chan","Guosheng Yin","Kyongtae Bae","Lequan Yu"],"pdf_url":"https://arxiv.org/pdf/2408.07569v1.pdf","comment":"Accepted by Neural Networks"},{"id":"http://arxiv.org/abs/2408.07558v1","updated":"2024-08-14T13:43:59Z","published":"2024-08-14T13:43:59Z","title":"Sonic: Fast and Transferable Data Poisoning on Clustering Algorithms","summary":" Data poisoning attacks on clustering algorithms have received limited\nattention, with existing methods struggling to scale efficiently as dataset\nsizes and feature counts increase. These attacks typically require\nre-clustering the entire dataset multiple times to generate predictions and\nassess the attacker's objectives, significantly hindering their scalability.\nThis paper addresses these limitations by proposing Sonic, a novel genetic data\npoisoning attack that leverages incremental and scalable clustering algorithms,\ne.g., FISHDBC, as surrogates to accelerate poisoning attacks against\ngraph-based and density-based clustering methods, such as HDBSCAN. 
We\nempirically demonstrate the effectiveness and efficiency of Sonic in poisoning\nthe target clustering algorithms. We then conduct a comprehensive analysis of\nthe factors affecting the scalability and transferability of poisoning attacks\nagainst clustering algorithms, and we conclude by examining the robustness of\nhyperparameters in our attack strategy Sonic.\n","authors":["Francesco Villani","Dario Lazzaro","Antonio Emanuele Cinà","Matteo Dell'Amico","Battista Biggio","Fabio Roli"],"pdf_url":"https://arxiv.org/pdf/2408.07558v1.pdf","comment":"preprint paper"},{"id":"http://arxiv.org/abs/2408.07556v1","updated":"2024-08-14T13:43:22Z","published":"2024-08-14T13:43:22Z","title":"PolyCL: Contrastive Learning for Polymer Representation Learning via\n Explicit and Implicit Augmentations","summary":" Polymers play a crucial role in a wide array of applications due to their\ndiverse and tunable properties. Establishing the relationship between polymer\nrepresentations and their properties is crucial to the computational design and\nscreening of potential polymers via machine learning. The quality of the\nrepresentation significantly influences the effectiveness of these\ncomputational methods. Here, we present a self-supervised contrastive learning\nparadigm, PolyCL, for learning high-quality polymer representation without the\nneed for labels. Our model combines explicit and implicit augmentation\nstrategies for improved learning performance. The results demonstrate that our\nmodel achieves either better, or highly competitive, performances on transfer\nlearning tasks as a feature extractor without an overcomplicated training\nstrategy or hyperparameter optimisation. Further enhancing the efficacy of our\nmodel, we conducted extensive analyses on various augmentation combinations\nused in contrastive learning. This led to identifying the most effective\ncombination to maximise PolyCL's performance.\n","authors":["Jiajun Zhou","Yijie Yang","Austin M. Mroz","Kim E. Jelfs"],"pdf_url":"https://arxiv.org/pdf/2408.07556v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.01766v3","updated":"2024-08-14T13:41:02Z","published":"2024-01-31T14:52:02Z","title":"LLM Voting: Human Choices and AI Collective Decision Making","summary":" This paper investigates the voting behaviors of Large Language Models (LLMs),\nspecifically GPT-4 and LLaMA-2, their biases, and how they align with human\nvoting patterns. Our methodology involved using a dataset from a human voting\nexperiment to establish a baseline for human preferences and conducting a\ncorresponding experiment with LLM agents. We observed that the choice of voting\nmethods and the presentation order influenced LLM voting outcomes. We found\nthat varying the persona can reduce some of these biases and enhance alignment\nwith human choices. While the Chain-of-Thought approach did not improve\nprediction accuracy, it has potential for AI explainability in the voting\nprocess. We also identified a trade-off between preference diversity and\nalignment accuracy in LLMs, influenced by different temperature settings. Our\nfindings indicate that LLMs may lead to less diverse collective outcomes and\nbiased assumptions when used in voting scenarios, emphasizing the need for\ncautious integration of LLMs into democratic processes.\n","authors":["Joshua C. Yang","Damian Dailisan","Marcin Korecki","Carina I. 
Hausladen","Dirk Helbing"],"pdf_url":"https://arxiv.org/pdf/2402.01766v3.pdf","comment":"Accepted in AAAI Conference on AI, Ethics, and Society (AIES)"},{"id":"http://arxiv.org/abs/2406.03946v2","updated":"2024-08-14T13:39:47Z","published":"2024-06-06T10:45:19Z","title":"A Probabilistic Approach to Learning the Degree of Equivariance in\n Steerable CNNs","summary":" Steerable convolutional neural networks (SCNNs) enhance task performance by\nmodelling geometric symmetries through equivariance constraints on weights.\nYet, unknown or varying symmetries can lead to overconstrained weights and\ndecreased performance. To address this, this paper introduces a probabilistic\nmethod to learn the degree of equivariance in SCNNs. We parameterise the degree\nof equivariance as a likelihood distribution over the transformation group\nusing Fourier coefficients, offering the option to model layer-wise and shared\nequivariance. These likelihood distributions are regularised to ensure an\ninterpretable degree of equivariance across the network. Advantages include the\napplicability to many types of equivariant networks through the flexible\nframework of SCNNs and the ability to learn equivariance with respect to any\nsubgroup of any compact group without requiring additional layers. Our\nexperiments reveal competitive performance on datasets with mixed symmetries,\nwith learnt likelihood distributions that are representative of the underlying\ndegree of equivariance.\n","authors":["Lars Veefkind","Gabriele Cesa"],"pdf_url":"https://arxiv.org/pdf/2406.03946v2.pdf","comment":"9 pages, published at ICML 2024 as main conference paper"},{"id":"http://arxiv.org/abs/2305.02022v2","updated":"2024-08-14T13:37:29Z","published":"2023-05-03T10:20:26Z","title":"A Data-Driven Defense against Edge-case Model Poisoning Attacks on\n Federated Learning","summary":" Federated Learning systems are increasingly subjected to a multitude of model\npoisoning attacks from clients. Among these, edge-case attacks that target a\nsmall fraction of the input space are nearly impossible to detect using\nexisting defenses, leading to a high attack success rate. We propose an\neffective defense using an external defense dataset, which provides information\nabout the attack target. The defense dataset contains a mix of poisoned and\nclean examples, with only a few known to be clean. The proposed method,\nDataDefense, uses this dataset to learn a poisoned data detector model which\nmarks each example in the defense dataset as poisoned or clean. It also learns\na client importance model that estimates the probability of a client update\nbeing malicious. The global model is then updated as a weighted average of the\nclient models' updates. The poisoned data detector and the client importance\nmodel parameters are updated using an alternating minimization strategy over\nthe Federated Learning rounds. Extensive experiments on standard attack\nscenarios demonstrate that DataDefense can defend against model poisoning\nattacks where other state-of-the-art defenses fail. In particular, DataDefense\nis able to reduce the attack success rate by at least ~ 40% on standard attack\nsetups and by more than 80% on some setups. 
Furthermore, DataDefense requires\nvery few defense examples (as few as five) to achieve a near-optimal reduction\nin attack success rate.\n","authors":["Kiran Purohit","Soumi Das","Sourangshu Bhattacharya","Santu Rana"],"pdf_url":"https://arxiv.org/pdf/2305.02022v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.07547v1","updated":"2024-08-14T13:36:17Z","published":"2024-08-14T13:36:17Z","title":"PeriodWave: Multi-Period Flow Matching for High-Fidelity Waveform\n Generation","summary":" Recently, universal waveform generation tasks have been investigated\nconditioned on various out-of-distribution scenarios. Although GAN-based\nmethods have shown their strength in fast waveform generation, they are\nvulnerable to train-inference mismatch scenarios such as two-stage\ntext-to-speech. Meanwhile, diffusion-based models have shown their powerful\ngenerative performance in other domains; however, they stay out of the\nlimelight due to slow inference speed in waveform generation tasks. Above all,\nthere is no generator architecture that can explicitly disentangle the natural\nperiodic features of high-resolution waveform signals. In this paper, we\npropose PeriodWave, a novel universal waveform generation model. First, we\nintroduce a period-aware flow matching estimator that can capture the periodic\nfeatures of the waveform signal when estimating the vector fields.\nAdditionally, we utilize a multi-period estimator that avoids overlaps to\ncapture different periodic features of waveform signals. Although increasing\nthe number of periods can improve the performance significantly, this requires\nmore computational costs. To reduce this issue, we also propose a single\nperiod-conditional universal estimator that can feed-forward parallel by\nperiod-wise batch inference. Additionally, we utilize discrete wavelet\ntransform to losslessly disentangle the frequency information of waveform\nsignals for high-frequency modeling, and introduce FreeU to reduce the\nhigh-frequency noise for waveform generation. The experimental results\ndemonstrated that our model outperforms the previous models both in\nMel-spectrogram reconstruction and text-to-speech tasks. All source code will\nbe available at \\url{https://github.com/sh-lee-prml/PeriodWave}.\n","authors":["Sang-Hoon Lee","Ha-Yeong Choi","Seong-Whan Lee"],"pdf_url":"https://arxiv.org/pdf/2408.07547v1.pdf","comment":"24 pages, 16 tables, 4 figures"},{"id":"http://arxiv.org/abs/2408.07545v1","updated":"2024-08-14T13:31:32Z","published":"2024-08-14T13:31:32Z","title":"$χ$SPN: Characteristic Interventional Sum-Product Networks for Causal\n Inference in Hybrid Domains","summary":" Causal inference in hybrid domains, characterized by a mixture of discrete\nand continuous variables, presents a formidable challenge. We take a step\ntowards this direction and propose Characteristic Interventional Sum-Product\nNetwork ($\\chi$SPN) that is capable of estimating interventional distributions\nin presence of random variables drawn from mixed distributions. $\\chi$SPN uses\ncharacteristic functions in the leaves of an interventional SPN (iSPN) thereby\nproviding a unified view for discrete and continuous random variables through\nthe Fourier-Stieltjes transform of the probability measures. A neural network\nis used to estimate the parameters of the learned iSPN using the intervened\ndata. 
Our experiments on 3 synthetic heterogeneous datasets suggest that\n$\\chi$SPN can effectively capture the interventional distributions for both\ndiscrete and continuous variables while being expressive and causally adequate.\nWe also show that $\\chi$SPN generalize to multiple interventions while being\ntrained only on a single intervention data.\n","authors":["Harsh Poonia","Moritz Willig","Zhongjie Yu","Matej Zečević","Kristian Kersting","Devendra Singh Dhami"],"pdf_url":"https://arxiv.org/pdf/2408.07545v1.pdf","comment":"17 pages, 11 figures. Accepted as poster at UAI (Uncertainty in\n Artificial Intelligence) 2024"},{"id":"http://arxiv.org/abs/2403.09805v2","updated":"2024-08-14T13:22:50Z","published":"2024-03-14T18:52:34Z","title":"On the Utility of 3D Hand Poses for Action Recognition","summary":" 3D hand pose is an underexplored modality for action recognition. Poses are\ncompact yet informative and can greatly benefit applications with limited\ncompute budgets. However, poses alone offer an incomplete understanding of\nactions, as they cannot fully capture objects and environments with which\nhumans interact. We propose HandFormer, a novel multimodal transformer, to\nefficiently model hand-object interactions. HandFormer combines 3D hand poses\nat a high temporal resolution for fine-grained motion modeling with sparsely\nsampled RGB frames for encoding scene semantics. Observing the unique\ncharacteristics of hand poses, we temporally factorize hand modeling and\nrepresent each joint by its short-term trajectories. This factorized pose\nrepresentation combined with sparse RGB samples is remarkably efficient and\nhighly accurate. Unimodal HandFormer with only hand poses outperforms existing\nskeleton-based methods at 5x fewer FLOPs. With RGB, we achieve new\nstate-of-the-art performance on Assembly101 and H2O with significant\nimprovements in egocentric action recognition.\n","authors":["Md Salman Shamil","Dibyadip Chatterjee","Fadime Sener","Shugao Ma","Angela Yao"],"pdf_url":"https://arxiv.org/pdf/2403.09805v2.pdf","comment":"ECCV 2024; https://s-shamil.github.io/HandFormer/"},{"id":"http://arxiv.org/abs/2408.07542v1","updated":"2024-08-14T13:22:14Z","published":"2024-08-14T13:22:14Z","title":"New Curriculum, New Chance -- Retrieval Augmented Generation for Lesson\n Planning in Ugandan Secondary Schools. Prototype Quality Evaluation","summary":" Introduction: Poor educational quality in Secondary Schools is still regarded\nas one of the major struggles in 21st century Uganda - especially in rural\nareas. Research identifies several problems, including low quality or absent\nteacher lesson planning. As the government pushes towards the implementation of\na new curriculum, exiting lesson plans become obsolete and the problem is\nworsened. Using a Retrieval Augmented Generation approach, we developed a\nprototype that generates customized lesson plans based on the\ngovernment-accredited textbooks. This helps teachers create lesson plans more\nefficiently and with better quality, ensuring they are fully aligned the new\ncurriculum and the competence-based learning approach.\n Methods: The prototype was created using Cohere LLM and Sentence Embeddings,\nand LangChain Framework - and thereafter made available on a public website.\nVector stores were trained for three new curriculum textbooks (ICT,\nMathematics, History), all at Secondary 1 Level. Twenty-four lessons plans were\ngenerated following a pseudo-random generation protocol, based on the suggested\nperiods in the textbooks. 
The lesson plans were analyzed regarding their\ntechnical quality by three independent raters following the Lesson Plan\nAnalysis Protocol (LPAP) by Ndihokubwayo et al. (2022) that is specifically\ndesigned for East Africa and competence-based curriculums.\n Results: Evaluation of 24 lesson plans using the LPAP resulted in an average\nquality of between 75 and 80%, corresponding to \"very good lesson plan\". None\nof the lesson plans scored below 65%, although one lesson plan could be argued\nto have been missing the topic. In conclusion, the quality of the generated\nlesson plans is at least comparable, if not better, than those created by\nhumans, as demonstrated in a study in Rwanda, whereby no lesson plan even\nreached the benchmark of 50%.\n","authors":["Simon Kloker","Herbertson Bukoli","Twaha Kateete"],"pdf_url":"https://arxiv.org/pdf/2408.07542v1.pdf","comment":"Presented at Ndejje University Second Annual Research Dissemination\n Symposium 2024"},{"id":"http://arxiv.org/abs/2408.07531v1","updated":"2024-08-14T13:03:41Z","published":"2024-08-14T13:03:41Z","title":"Development of a Multi-Agent Clinical Decision Support System for Korean\n Triage and Acuity Scale (KTAS)-Based Triage and Treatment Planning in\n Emergency Departments","summary":" Emergency department (ED) overcrowding and the complexity of rapid\ndecision-making in critical care settings pose significant challenges to\nhealthcare systems worldwide. While clinical decision support systems (CDSS)\nhave shown promise, the integration of large language models (LLMs) offers new\npossibilities for enhancing triage accuracy and clinical decision-making. This\nstudy presents an LLM-driven CDSS designed to assist ED physicians and nurses\nin patient triage, treatment planning, and overall emergency care management.\n We developed a multi-agent CDSS utilizing Llama-3-70b as the base LLM,\norchestrated by CrewAI and Langchain. The system comprises four AI agents\nemulating key ED roles: Triage Nurse, Emergency Physician, Pharmacist, and ED\nCoordinator. It incorporates the Korean Triage and Acuity Scale (KTAS) for\ntriage assessment and integrates with the RxNorm API for medication management.\n The model was evaluated using the Asclepius dataset, with performance\nassessed by a clinical emergency medicine specialist. The CDSS demonstrated\nhigh accuracy in triage decision-making compared to the baseline of a\nsingle-agent system. Furthermore, the system exhibited strong performance in\ncritical areas, including primary diagnosis, critical findings identification,\ndisposition decision-making, treatment planning, and resource allocation.\n Our multi-agent CDSS demonstrates significant potential for supporting\ncomprehensive emergency care management. By leveraging state-of-the-art AI\ntechnologies, this system offers a scalable and adaptable tool that could\nenhance emergency medical care delivery, potentially alleviating ED\novercrowding and improving patient outcomes. 
This work contributes to the\ngrowing field of AI applications in emergency medicine and offers a promising\ndirection for future research and clinical implementation.\n","authors":["Seungjun Han","Wongyung Choi"],"pdf_url":"https://arxiv.org/pdf/2408.07531v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.17385v2","updated":"2024-08-14T13:01:52Z","published":"2024-07-24T16:07:57Z","title":"Causal modelling without introducing counterfactuals or abstract\n distributions","summary":" The most common approach to causal modelling is the potential outcomes\nframework due to Neyman and Rubin. In this framework, outcomes of\ncounterfactual treatments are assumed to be well-defined. This metaphysical\nassumption is often thought to be problematic yet indispensable. The\nconventional approach relies not only on counterfactuals but also on abstract\nnotions of distributions and assumptions of independence that are not directly\ntestable. In this paper, we construe causal inference as treatment-wise\npredictions for finite populations where all assumptions are testable; this\nmeans that one can not only test predictions themselves (without any\nfundamental problem) but also investigate sources of error when they fail. The\nnew framework highlights the model-dependence of causal claims as well as the\ndifference between statistical and scientific inference.\n","authors":["Benedikt Höltgen","Robert C. Williamson"],"pdf_url":"https://arxiv.org/pdf/2407.17385v2.pdf","comment":"Presented at the Humans, Algorithmic Decision-Making and Society\n Workshop at ICML 2024"},{"id":"http://arxiv.org/abs/2408.07526v1","updated":"2024-08-14T13:01:30Z","published":"2024-08-14T13:01:30Z","title":"Learning-based Models for Vulnerability Detection: An Extensive Study","summary":" Though many deep learning-based models have made great progress in\nvulnerability detection, we have no good understanding of these models, which\nlimits the further advancement of model capability, understanding of the\nmechanism of model detection, and efficiency and safety of practical\napplication of models. In this paper, we extensively and comprehensively\ninvestigate two types of state-of-the-art learning-based approaches\n(sequence-based and graph-based) by conducting experiments on a recently built\nlarge-scale dataset. We investigate seven research questions from five\ndimensions, namely model capabilities, model interpretation, model stability,\nease of use of model, and model economy. We experimentally demonstrate the\npriority of sequence-based models and the limited abilities of both LLM\n(ChatGPT) and graph-based models. We explore the types of vulnerability that\nlearning-based models skilled in and reveal the instability of the models\nthough the input is subtlely semantical-equivalently changed. We empirically\nexplain what the models have learned. We summarize the pre-processing as well\nas requirements for easily using the models. 
Finally, we initially induce the\nvital information for economically and safely practical usage of these models.\n","authors":["Chao Ni","Liyu Shen","Xiaodan Xu","Xin Yin","Shaohua Wang"],"pdf_url":"https://arxiv.org/pdf/2408.07526v1.pdf","comment":"13 pages, 5 figures"},{"id":"http://arxiv.org/abs/2408.07522v1","updated":"2024-08-14T12:56:17Z","published":"2024-08-14T12:56:17Z","title":"Optimising MFCC parameters for the automatic detection of respiratory\n diseases","summary":" Voice signals originating from the respiratory tract are utilized as valuable\nacoustic biomarkers for the diagnosis and assessment of respiratory diseases.\nAmong the employed acoustic features, Mel Frequency Cepstral Coefficients\n(MFCC) is widely used for automatic analysis, with MFCC extraction commonly\nrelying on default parameters. However, no comprehensive study has\nsystematically investigated the impact of MFCC extraction parameters on\nrespiratory disease diagnosis. In this study, we address this gap by examining\nthe effects of key parameters, namely the number of coefficients, frame length,\nand hop length between frames, on respiratory condition examination. Our\ninvestigation uses four datasets: the Cambridge COVID-19 Sound database, the\nCoswara dataset, the Saarbrucken Voice Disorders (SVD) database, and a TACTICAS\ndataset. The Support Vector Machine (SVM) is employed as the classifier, given\nits widespread adoption and efficacy. Our findings indicate that the accuracy\nof MFCC decreases as hop length increases, and the optimal number of\ncoefficients is observed to be approximately 30. The performance of MFCC varies\nwith frame length across the datasets: for the COVID-19 datasets (Cambridge\nCOVID-19 Sound database and Coswara dataset), performance declines with longer\nframe lengths, while for the SVD dataset, performance improves with increasing\nframe length (from 50 ms to 500 ms). Furthermore, we investigate the optimized\ncombination of these parameters and observe substantial enhancements in\naccuracy. Compared to the worst combination, the SVM model achieves an accuracy\nof 81.1%, 80.6%, and 71.7%, with improvements of 19.6%, 16.10%, and 14.90% for\nthe Cambridge COVID-19 Sound database, the Coswara dataset, and the SVD dataset\nrespectively.\n","authors":["Yuyang Yan","Sami O. Simons","Loes van Bemmel","Lauren Reinders","Frits M. E. Franssen","Visara Urovi"],"pdf_url":"https://arxiv.org/pdf/2408.07522v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.04093v3","updated":"2024-08-14T12:47:31Z","published":"2024-08-07T21:16:55Z","title":"Tree Attention: Topology-aware Decoding for Long-Context Attention on\n GPU clusters","summary":" Self-attention is the core mathematical operation of modern transformer\narchitectures and is also a significant computational bottleneck due to its\nquadratic complexity in the sequence length. In this work, we derive the scalar\nenergy function whose gradient computes the self-attention block, thus\nelucidating the theoretical underpinnings of self-attention, providing a\nBayesian interpretation of the operation and linking it closely with\nenergy-based models such as Hopfield Networks. Our formulation reveals that the\nreduction across the sequence axis can be efficiently computed in parallel\nthrough a tree reduction. 
Our algorithm, for parallelizing attention\ncomputation across multiple GPUs enables cross-device decoding to be performed\nasymptotically faster (up to 8x faster in our experiments) than alternative\napproaches such as Ring Attention, while also requiring significantly less\ncommunication volume and incurring 2x less peak memory. Our code is publicly\navailable here: \\url{https://github.com/Zyphra/tree_attention}.\n","authors":["Vasudev Shyam","Jonathan Pilault","Emily Shepperd","Quentin Anthony","Beren Millidge"],"pdf_url":"https://arxiv.org/pdf/2408.04093v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.09477v2","updated":"2024-08-14T12:43:52Z","published":"2024-03-14T15:19:19Z","title":"VIRUS-NeRF -- Vision, InfraRed and UltraSonic based Neural Radiance\n Fields","summary":" Autonomous mobile robots are an increasingly integral part of modern factory\nand warehouse operations. Obstacle detection, avoidance and path planning are\ncritical safety-relevant tasks, which are often solved using expensive LiDAR\nsensors and depth cameras. We propose to use cost-effective low-resolution\nranging sensors, such as ultrasonic and infrared time-of-flight sensors by\ndeveloping VIRUS-NeRF - Vision, InfraRed, and UltraSonic based Neural Radiance\nFields. Building upon Instant Neural Graphics Primitives with a Multiresolution\nHash Encoding (Instant-NGP), VIRUS-NeRF incorporates depth measurements from\nultrasonic and infrared sensors and utilizes them to update the occupancy grid\nused for ray marching. Experimental evaluation in 2D demonstrates that\nVIRUS-NeRF achieves comparable mapping performance to LiDAR point clouds\nregarding coverage. Notably, in small environments, its accuracy aligns with\nthat of LiDAR measurements, while in larger ones, it is bounded by the utilized\nultrasonic sensors. An in-depth ablation study reveals that adding ultrasonic\nand infrared sensors is highly effective when dealing with sparse data and low\nview variation. Further, the proposed occupancy grid of VIRUS-NeRF improves the\nmapping capabilities and increases the training speed by 46% compared to\nInstant-NGP. Overall, VIRUS-NeRF presents a promising approach for\ncost-effective local mapping in mobile robotics, with potential applications in\nsafety and navigation tasks. The code can be found at\nhttps://github.com/ethz-asl/virus nerf.\n","authors":["Nicolaj Schmid","Cornelius von Einem","Cesar Cadena","Roland Siegwart","Lorenz Hruby","Florian Tschopp"],"pdf_url":"https://arxiv.org/pdf/2403.09477v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.07511v1","updated":"2024-08-14T12:40:57Z","published":"2024-08-14T12:40:57Z","title":"Protected Test-Time Adaptation via Online Entropy Matching: A Betting\n Approach","summary":" We present a novel approach for test-time adaptation via online\nself-training, consisting of two components. First, we introduce a statistical\nframework that detects distribution shifts in the classifier's entropy values\nobtained on a stream of unlabeled samples. Second, we devise an online\nadaptation mechanism that utilizes the evidence of distribution shifts captured\nby the detection tool to dynamically update the classifier's parameters. The\nresulting adaptation process drives the distribution of test entropy values\nobtained from the self-trained classifier to match those of the source domain,\nbuilding invariance to distribution shifts. This approach departs from the\nconventional self-training method, which focuses on minimizing the classifier's\nentropy. 
Our approach combines concepts in betting martingales and online\nlearning to form a detection tool capable of quickly reacting to distribution\nshifts. We then reveal a tight relation between our adaptation scheme and\noptimal transport, which forms the basis of our novel self-supervised loss.\nExperimental results demonstrate that our approach improves test-time accuracy\nunder distribution shifts while maintaining accuracy and calibration in their\nabsence, outperforming leading entropy minimization methods across various\nscenarios.\n","authors":["Yarin Bar","Shalev Shaer","Yaniv Romano"],"pdf_url":"https://arxiv.org/pdf/2408.07511v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.07507v1","updated":"2024-08-14T12:35:41Z","published":"2024-08-14T12:35:41Z","title":"Decoder ensembling for learned latent geometries","summary":" Latent space geometry provides a rigorous and empirically valuable framework\nfor interacting with the latent variables of deep generative models. This\napproach reinterprets Euclidean latent spaces as Riemannian through a pull-back\nmetric, allowing for a standard differential geometric analysis of the latent\nspace. Unfortunately, data manifolds are generally compact and easily\ndisconnected or filled with holes, suggesting a topological mismatch to the\nEuclidean latent space. The most established solution to this mismatch is to\nlet uncertainty be a proxy for topology, but in neural network models, this is\noften realized through crude heuristics that lack principle and generally do\nnot scale to high-dimensional representations. We propose using ensembles of\ndecoders to capture model uncertainty and show how to easily compute geodesics\non the associated expected manifold. Empirically, we find this simple and\nreliable, thereby coming one step closer to easy-to-use latent geometries.\n","authors":["Stas Syrota","Pablo Moreno-Muñoz","Søren Hauberg"],"pdf_url":"https://arxiv.org/pdf/2408.07507v1.pdf","comment":"International Conference on Machine Learning, ELLIS Workshop on\n Geometry-grounded Representation Learning and Generative Modeling"},{"id":"http://arxiv.org/abs/2408.07503v1","updated":"2024-08-14T12:30:51Z","published":"2024-08-14T12:30:51Z","title":"Faster Stochastic Optimization with Arbitrary Delays via Asynchronous\n Mini-Batching","summary":" We consider the problem of asynchronous stochastic optimization, where an\noptimization algorithm makes updates based on stale stochastic gradients of the\nobjective that are subject to an arbitrary (possibly adversarial) sequence of\ndelays. We present a procedure which, for any given $q \\in (0,1]$, transforms\nany standard stochastic first-order method to an asynchronous method with\nconvergence guarantee depending on the $q$-quantile delay of the sequence. This\napproach leads to convergence rates of the form $O(\\tau_q/qT+\\sigma/\\sqrt{qT})$\nfor non-convex and $O(\\tau_q^2/(q T)^2+\\sigma/\\sqrt{qT})$ for convex smooth\nproblems, where $\\tau_q$ is the $q$-quantile delay, generalizing and improving\non existing results that depend on the average delay. We further show a method\nthat automatically adapts to all quantiles simultaneously, without any prior\nknowledge of the delays, achieving convergence rates of the form $O(\\inf_{q}\n\\tau_q/qT+\\sigma/\\sqrt{qT})$ for non-convex and $O(\\inf_{q} \\tau_q^2/(q\nT)^2+\\sigma/\\sqrt{qT})$ for convex smooth problems. 
Our technique is based on\nasynchronous mini-batching with a careful batch-size selection and filtering of\nstale gradients.\n","authors":["Amit Attia","Ofir Gaash","Tomer Koren"],"pdf_url":"https://arxiv.org/pdf/2408.07503v1.pdf","comment":"18 pages"},{"id":"http://arxiv.org/abs/2408.06425v2","updated":"2024-08-14T12:22:04Z","published":"2024-08-12T18:04:59Z","title":"Bayesian Learning in a Nonlinear Multiscale State-Space Model","summary":" The ubiquity of multiscale interactions in complex systems is\nwell-recognized, with development and heredity serving as a prime example of\nhow processes at different temporal scales influence one another. This work\nintroduces a novel multiscale state-space model to explore the dynamic\ninterplay between systems interacting across different time scales, with\nfeedback between each scale. We propose a Bayesian learning framework to\nestimate unknown states by learning the unknown process noise covariances\nwithin this multiscale model. We develop a Particle Gibbs with Ancestor\nSampling (PGAS) algorithm for inference and demonstrate through simulations the\nefficacy of our approach.\n","authors":["Nayely Vélez-Cruz","Manfred D. Laubichler"],"pdf_url":"https://arxiv.org/pdf/2408.06425v2.pdf","comment":"Corrected a typo"},{"id":"http://arxiv.org/abs/2408.07494v1","updated":"2024-08-14T12:19:25Z","published":"2024-08-14T12:19:25Z","title":"QirK: Question Answering via Intermediate Representation on Knowledge\n Graphs","summary":" We demonstrate QirK, a system for answering natural language questions on\nKnowledge Graphs (KG). QirK can answer structurally complex questions that are\nstill beyond the reach of emerging Large Language Models (LLMs). It does so\nusing a unique combination of database technology, LLMs, and semantic search\nover vector embeddings. The glue for these components is an intermediate\nrepresentation (IR). The input question is mapped to IR using LLMs, which is\nthen repaired into a valid relational database query with the aid of a semantic\nsearch on vector embeddings. This allows a practical synthesis of LLM\ncapabilities and KG reliability.\n A short video demonstrating QirK is available at\nhttps://youtu.be/6c81BLmOZ0U.\n","authors":["Jan Luca Scheerer","Anton Lykov","Moe Kayali","Ilias Fountalis","Dan Olteanu","Nikolaos Vasiloglou","Dan Suciu"],"pdf_url":"https://arxiv.org/pdf/2408.07494v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.17395v2","updated":"2024-08-14T12:17:38Z","published":"2024-07-24T16:17:14Z","title":"Why we should not (always) assume data generating distributions in\n Machine Learning","summary":" Machine Learning research, as most of Statistics, heavily relies on the\nconcept of a data-generating probability distribution. As data points are\nthought to be sampled from such a distribution, we can learn from observed data\nabout this distribution and, thus, predict future data points drawn from it\n(with some probability of success). Drawing on scholarship across disciplines,\nwe here argue that this framework is not always a good model. Not only do such\ntrue probability distributions not exist; the framework can also be misleading\nand obscure both the choices made and the goals pursued in machine learning\npractice. We suggest an alternative framework that focuses on finite\npopulations rather than abstract distributions; while classical learning theory\ncan be left almost unchanged, it opens new opportunities, especially to model\nsampling. 
We compile these considerations into five reasons for modelling\nmachine learning -- in some settings -- with finite distributions rather than\ngenerative distributions, both to be more faithful to practice and to provide\nnovel theoretical insights.\n","authors":["Benedikt Höltgen","Robert C. Williamson"],"pdf_url":"https://arxiv.org/pdf/2407.17395v2.pdf","comment":"Presented at the Humans, Algorithmic Decision-Making and Society\n Workshop at ICML 2024"},{"id":"http://arxiv.org/abs/2408.06960v2","updated":"2024-08-14T12:11:12Z","published":"2024-08-13T15:17:03Z","title":"Measuring User Understanding in Dialogue-based XAI Systems","summary":" The field of eXplainable Artificial Intelligence (XAI) is increasingly\nrecognizing the need to personalize and/or interactively adapt the explanation\nto better reflect users' explanation needs. While dialogue-based approaches to\nXAI have been proposed recently, the state-of-the-art in XAI is still\ncharacterized by what we call one-shot, non-personalized and one-way\nexplanations. In contrast, dialogue-based systems that can adapt explanations\nthrough interaction with a user promise to be superior to GUI-based or\ndashboard explanations as they offer a more intuitive way of requesting\ninformation. In general, while interactive XAI systems are often evaluated in\nterms of user satisfaction, there are limited studies that access user's\nobjective model understanding. This is in particular the case for\ndialogue-based XAI approaches. In this paper, we close this gap by carrying out\ncontrolled experiments within a dialogue framework in which we measure\nunderstanding of users in three phases by asking them to simulate the\npredictions of the model they are learning about. By this, we can quantify the\nlevel of (improved) understanding w.r.t. how the model works, comparing the\nstate prior, and after the interaction. We further analyze the data to reveal\npatterns of how the interaction between groups with high vs. low understanding\ngain differ. Overall, our work thus contributes to our understanding about the\neffectiveness of XAI approaches.\n","authors":["Dimitry Mindlin","Amelie Sophie Robrecht","Michael Morasch","Philipp Cimiano"],"pdf_url":"https://arxiv.org/pdf/2408.06960v2.pdf","comment":"Accepted at the ECAI 2024 main conference - final version and code\n coming soon. 9 pages, 5 figures"},{"id":"http://arxiv.org/abs/2408.07480v1","updated":"2024-08-14T11:53:18Z","published":"2024-08-14T11:53:18Z","title":"Adaptive Basis Function Selection for Computationally Efficient\n Predictions","summary":" Basis Function (BF) expansions are a cornerstone of any engineer's toolbox\nfor computational function approximation which shares connections with both\nneural networks and Gaussian processes. Even though BF expansions are an\nintuitive and straightforward model to use, they suffer from quadratic\ncomputational complexity in the number of BFs if the predictive variance is to\nbe computed. We develop a method to automatically select the most important BFs\nfor prediction in a sub-domain of the model domain. This significantly reduces\nthe computational complexity of computing predictions while maintaining\npredictive accuracy. 
The proposed method is demonstrated using two numerical\nexamples, where reductions up to 50-75% are possible without significantly\nreducing the predictive accuracy.\n","authors":["Anton Kullberg","Frida Viset","Isaac Skog","Gustaf Hendeby"],"pdf_url":"https://arxiv.org/pdf/2408.07480v1.pdf","comment":"5 pages, accepted for publication in IEEE Signal Processing Letters"},{"id":"http://arxiv.org/abs/2303.07393v4","updated":"2024-08-14T11:37:15Z","published":"2023-03-13T18:15:52Z","title":"Many learning agents interacting with an agent-based market model","summary":" We consider the dynamics and the interactions of multiple reinforcement\nlearning optimal execution trading agents interacting with a reactive\nAgent-Based Model (ABM) of a financial market in event time. The model\nrepresents a market ecology with 3-trophic levels represented by: optimal\nexecution learning agents, minimally intelligent liquidity takers, and fast\nelectronic liquidity providers. The optimal execution agent classes include\nbuying and selling agents that can either use a combination of limit orders and\nmarket orders, or only trade using market orders. The reward function\nexplicitly balances trade execution slippage against the penalty of not\nexecuting the order timeously. This work demonstrates how multiple competing\nlearning agents impact a minimally intelligent market simulation as functions\nof the number of agents, the size of agents' initial orders, and the state\nspaces used for learning. We use phase space plots to examine the dynamics of\nthe ABM, when various specifications of learning agents are included. Further,\nwe examine whether the inclusion of optimal execution agents that can learn is\nable to produce dynamics with the same complexity as empirical data. We find\nthat the inclusion of optimal execution agents changes the stylised facts\nproduced by ABM to conform more with empirical data, and are a necessary\ninclusion for ABMs investigating market micro-structure. However, including\nexecution agents to chartist-fundamentalist-noise ABMs is insufficient to\nrecover the complexity observed in empirical data.\n","authors":["Matthew Dicks","Andrew Paskaramoorthy","Tim Gebbie"],"pdf_url":"https://arxiv.org/pdf/2303.07393v4.pdf","comment":"16 pages, 8 figures, 5 tables, enhanced discussion and figures"},{"id":"http://arxiv.org/abs/2408.07472v1","updated":"2024-08-14T11:31:32Z","published":"2024-08-14T11:31:32Z","title":"Unsupervised Blind Joint Dereverberation and Room Acoustics Estimation\n with Diffusion Models","summary":" This paper presents an unsupervised method for single-channel blind\ndereverberation and room impulse response (RIR) estimation, called BUDDy. The\nalgorithm is rooted in Bayesian posterior sampling: it combines a likelihood\nmodel enforcing fidelity to the reverberant measurement, and an anechoic speech\nprior implemented by an unconditional diffusion model. We design a parametric\nfilter representing the RIR, with exponential decay for each frequency subband.\nRoom acoustics estimation and speech dereverberation are jointly carried out,\nas the filter parameters are iteratively estimated and the speech utterance\nrefined along the reverse diffusion trajectory. In a blind scenario where the\nroom impulse response is unknown, BUDDy successfully performs speech\ndereverberation in various acoustic scenarios, significantly outperforming\nother blind unsupervised baselines. 
Unlike supervised methods, which often\nstruggle to generalize, BUDDy seamlessly adapts to different acoustic\nconditions. This paper extends our previous work by offering new experimental\nresults and insights into the algorithm's performance and versatility. We first\ninvestigate the robustness of informed dereverberation methods to RIR\nestimation errors, to motivate the joint acoustic estimation and\ndereverberation paradigm. Then, we demonstrate the adaptability of our method\nto high-resolution singing voice dereverberation, study its performance in RIR\nestimation, and conduct subjective evaluation experiments to validate the\nperceptual quality of the results, among other contributions. Audio samples and\ncode can be found online.\n","authors":["Jean-Marie Lemercier","Eloi Moliner","Simon Welker","Vesa Välimäki","Timo Gerkmann"],"pdf_url":"https://arxiv.org/pdf/2408.07472v1.pdf","comment":"Submitted to IEEE/ACM Transactions on Audio, Speech and Language\n Processing"},{"id":"http://arxiv.org/abs/2406.16908v3","updated":"2024-08-14T11:07:41Z","published":"2024-06-04T10:53:56Z","title":"Using Explainable AI for EEG-based Reduced Montage Neonatal Seizure\n Detection","summary":" The neonatal period is the most vulnerable time for the development of\nseizures. Seizures in the immature brain lead to detrimental consequences,\ntherefore require early diagnosis. The gold-standard for neonatal seizure\ndetection currently relies on continuous video-EEG monitoring; which involves\nrecording multi-channel electroencephalogram (EEG) alongside real-time video\nmonitoring within a neonatal intensive care unit (NICU). However, video-EEG\nmonitoring technology requires clinical expertise and is often limited to\ntechnologically advanced and resourceful settings. Cost-effective new\ntechniques could help the medical fraternity make an accurate diagnosis and\nadvocate treatment without delay. In this work, a novel explainable deep\nlearning model to automate the neonatal seizure detection process with a\nreduced EEG montage is proposed, which employs convolutional nets, graph\nattention layers, and fully connected layers. Beyond its ability to detect\nseizures in real-time with a reduced montage, this model offers the unique\nadvantage of real-time interpretability. By evaluating the performance on the\nZenodo dataset with 10-fold cross-validation, the presented model achieves an\nabsolute improvement of 8.31% and 42.86% in area under curve (AUC) and recall,\nrespectively.\n","authors":["Dinuka Sandun Udayantha","Kavindu Weerasinghe","Nima Wickramasinghe","Akila Abeyratne","Kithmin Wickremasinghe","Jithangi Wanigasinghe","Anjula De Silva","Chamira U. S. Edussooriya"],"pdf_url":"https://arxiv.org/pdf/2406.16908v3.pdf","comment":"Paper is accepted to IEEE International Conference on Systems, Man,\n and Cybernetics (SMC) 2024. Final Version"},{"id":"http://arxiv.org/abs/2408.07453v1","updated":"2024-08-14T10:46:15Z","published":"2024-08-14T10:46:15Z","title":"Fact or Fiction? Improving Fact Verification with Knowledge Graphs\n through Simplified Subgraph Retrievals","summary":" Despite recent success in natural language processing (NLP), fact\nverification still remains a difficult task. Due to misinformation spreading\nincreasingly fast, attention has been directed towards automatically verifying\nthe correctness of claims. In the domain of NLP, this is usually done by\ntraining supervised machine learning models to verify claims by utilizing\nevidence from trustworthy corpora. 
We present efficient methods for verifying\nclaims on a dataset where the evidence is in the form of structured knowledge\ngraphs. We use the FactKG dataset, which is constructed from the DBpedia\nknowledge graph extracted from Wikipedia. By simplifying the evidence retrieval\nprocess, from fine-tuned language models to simple logical retrievals, we are\nable to construct models that both require less computational resources and\nachieve better test-set accuracy.\n","authors":["Tobias A. Opsahl"],"pdf_url":"https://arxiv.org/pdf/2408.07453v1.pdf","comment":"10 pages, 3 figures, appendix"},{"id":"http://arxiv.org/abs/2408.07438v1","updated":"2024-08-14T10:15:34Z","published":"2024-08-14T10:15:34Z","title":"Achieving Data Efficient Neural Networks with Hybrid Concept-based\n Models","summary":" Most datasets used for supervised machine learning consist of a single label\nper data point. However, in cases where more information than just the class\nlabel is available, would it be possible to train models more efficiently? We\nintroduce two novel model architectures, which we call hybrid concept-based\nmodels, that train using both class labels and additional information in the\ndataset referred to as concepts. In order to thoroughly assess their\nperformance, we introduce ConceptShapes, an open and flexible class of datasets\nwith concept labels. We show that the hybrid concept-based models outperform\nstandard computer vision models and previously proposed concept-based models\nwith respect to accuracy, especially in sparse data settings. We also introduce\nan algorithm for performing adversarial concept attacks, where an image is\nperturbed in a way that does not change a concept-based model's concept\npredictions, but changes the class prediction. The existence of such\nadversarial examples raises questions about the interpretable qualities\npromised by concept-based models.\n","authors":["Tobias A. Opsahl","Vegard Antun"],"pdf_url":"https://arxiv.org/pdf/2408.07438v1.pdf","comment":"11 pages, 8 figures, appendix"},{"id":"http://arxiv.org/abs/2408.07435v1","updated":"2024-08-14T10:12:15Z","published":"2024-08-14T10:12:15Z","title":"Real-world validation of safe reinforcement learning, model predictive\n control and decision tree-based home energy management systems","summary":" Recent advancements in machine learning based energy management approaches,\nspecifically reinforcement learning with a safety layer (OptLayerPolicy) and a\nmetaheuristic algorithm generating a decision tree control policy (TreeC), have\nshown promise. However, their effectiveness has only been demonstrated in\ncomputer simulations. This paper presents the real-world validation of these\nmethods, comparing against model predictive control and simple rule-based\ncontrol benchmark. The experiments were conducted on the electrical\ninstallation of 4 reproductions of residential houses, which all have their own\nbattery, photovoltaic and dynamic load system emulating a non-controllable\nelectrical load and a controllable electric vehicle charger. The results show\nthat the simple rules, TreeC, and model predictive control-based methods\nachieved similar costs, with a difference of only 0.6%. The reinforcement\nlearning based method, still in its training phase, obtained a cost 25.5\\%\nhigher to the other methods. 
Additional simulations show that the costs can be\nfurther reduced by using a more representative training dataset for TreeC and\naddressing errors in the model predictive control implementation caused by its\nreliance on accurate data from various sources. The OptLayerPolicy safety layer\nallows safe online training of a reinforcement learning agent in the\nreal-world, given an accurate constraint function formulation. The proposed\nsafety layer method remains error-prone, nonetheless, it is found beneficial\nfor all investigated methods. The TreeC method, which does require building a\nrealistic simulation for training, exhibits the safest operational performance,\nexceeding the grid limit by only 27.1 Wh compared to 593.9 Wh for reinforcement\nlearning.\n","authors":["Julian Ruddick","Glenn Ceusters","Gilles Van Kriekinge","Evgenii Genov","Thierry Coosemans","Maarten Messagie"],"pdf_url":"https://arxiv.org/pdf/2408.07435v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.07765v2","updated":"2024-08-14T09:52:09Z","published":"2024-07-10T15:43:30Z","title":"Ramsey Theorems for Trees and a General 'Private Learning Implies Online\n Learning' Theorem","summary":" This work continues to investigate the link between differentially private\n(DP) and online learning. Alon, Livni, Malliaris, and Moran (2019) showed that\nfor binary concept classes, DP learnability of a given class implies that it\nhas a finite Littlestone dimension (equivalently, that it is online learnable).\nTheir proof relies on a model-theoretic result by Hodges (1997), which\ndemonstrates that any binary concept class with a large Littlestone dimension\ncontains a large subclass of thresholds. In a follow-up work, Jung, Kim, and\nTewari (2020) extended this proof to multiclass PAC learning with a bounded\nnumber of labels. Unfortunately, Hodges's result does not apply in other\nnatural settings such as multiclass PAC learning with an unbounded label space,\nand PAC learning of partial concept classes.\n This naturally raises the question of whether DP learnability continues to\nimply online learnability in more general scenarios: indeed, Alon, Hanneke,\nHolzman, and Moran (2021) explicitly leave it as an open question in the\ncontext of partial concept classes, and the same question is open in the\ngeneral multiclass setting. In this work, we give a positive answer to these\nquestions showing that for general classification tasks, DP learnability\nimplies online learnability. Our proof reasons directly about Littlestone\ntrees, without relying on thresholds. We achieve this by establishing several\nRamsey-type theorems for trees, which might be of independent interest.\n","authors":["Simone Fioravanti","Steve Hanneke","Shay Moran","Hilla Schefler","Iska Tsubari"],"pdf_url":"https://arxiv.org/pdf/2407.07765v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.06400v2","updated":"2024-08-14T09:18:38Z","published":"2024-08-12T11:09:25Z","title":"MetMamba: Regional Weather Forecasting with Spatial-Temporal Mamba Model","summary":" Deep Learning based Weather Prediction (DLWP) models have been improving\nrapidly over the last few years, surpassing state of the art numerical weather\nforecasts by significant margins. While much of the optimization effort is\nfocused on training curriculum to extend forecast range in the global context,\ntwo aspects remains less explored: limited area modeling and better backbones\nfor weather forecasting. 
We show in this paper that MetMamba, a DLWP model\nbuilt on a state-of-the-art state-space model, Mamba, offers notable\nperformance gains and unique advantages over other popular backbones using\ntraditional attention mechanisms and neural operators. We also demonstrate the\nfeasibility of deep learning based limited area modeling via coupled training\nwith a global host model.\n","authors":["Haoyu Qin","Yungang Chen","Qianchuan Jiang","Pengchao Sun","Xiancai Ye","Chao Lin"],"pdf_url":"https://arxiv.org/pdf/2408.06400v2.pdf","comment":"Typo and grammar; Minor elaboration and clarifications; Use full\n organization name in the author section"},{"id":"http://arxiv.org/abs/2408.07394v1","updated":"2024-08-14T09:13:27Z","published":"2024-08-14T09:13:27Z","title":"Sum-Product-Set Networks","summary":" Daily internet communication relies heavily on tree-structured graphs,\nembodied by popular data formats such as XML and JSON. However, many recent\ngenerative (probabilistic) models utilize neural networks to learn a\nprobability distribution over undirected cyclic graphs. This assumption of a\ngeneric graph structure brings various computational challenges, and, more\nimportantly, the presence of non-linearities in neural networks does not permit\ntractable probabilistic inference. We address these problems by proposing\nsum-product-set networks, an extension of probabilistic circuits from\nunstructured tensor data to tree-structured graph data. To this end, we use\nrandom finite sets to reflect a variable number of nodes and edges in the graph\nand to allow for exact and efficient inference. We demonstrate that our\ntractable model performs comparably to various intractable models based on\nneural networks.\n","authors":["Milan Papež","Martin Rektoris","Tomáš Pevný","Václav Šmídl"],"pdf_url":"https://arxiv.org/pdf/2408.07394v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.09828v2","updated":"2024-08-14T09:11:33Z","published":"2024-04-15T14:26:00Z","title":"Interaction as Explanation: A User Interaction-based Method for\n Explaining Image Classification Models","summary":" In computer vision, explainable AI (xAI) methods seek to mitigate the\n'black-box' problem by making the decision-making process of deep learning\nmodels more interpretable and transparent. Traditional xAI methods concentrate\non visualizing input features that influence model predictions, providing\ninsights primarily suited for experts. In this work, we present an\ninteraction-based xAI method that enhances user comprehension of image\nclassification models through their interaction. Thus, we developed a web-based\nprototype allowing users to modify images via painting and erasing, thereby\nobserving changes in classification results. Our approach enables users to\ndiscern critical features influencing the model's decision-making process,\naligning their mental models with the model's logic. Experiments conducted with\nfive images demonstrate the potential of the method to reveal feature\nimportance through user interaction. 
Our work contributes a novel perspective\nto xAI by centering on end-user engagement and understanding, paving the way\nfor more intuitive and accessible explainability in AI systems.\n","authors":["Hyeonggeun Yun"],"pdf_url":"https://arxiv.org/pdf/2404.09828v2.pdf","comment":"IJCAI 2024 (International Joint Conference on Artificial Intelligence\n 2024) Workshop on Explainable Artificial Intelligence (XAI)"},{"id":"http://arxiv.org/abs/2408.07388v1","updated":"2024-08-14T09:08:43Z","published":"2024-08-14T09:08:43Z","title":"DPSNN: Spiking Neural Network for Low-Latency Streaming Speech\n Enhancement","summary":" Speech enhancement (SE) improves communication in noisy environments,\naffecting areas such as automatic speech recognition, hearing aids, and\ntelecommunications. With these domains typically being power-constrained and\nevent-based while requiring low latency, neuromorphic algorithms in the form of\nspiking neural networks (SNNs) have great potential. Yet, current effective SNN\nsolutions require a contextual sampling window imposing substantial latency,\ntypically around 32ms, too long for many applications. Inspired by Dual-Path\nSpiking Neural Networks (DPSNNs) in classical neural networks, we develop a\ntwo-phase time-domain streaming SNN framework -- the Dual-Path Spiking Neural\nNetwork (DPSNN). In the DPSNN, the first phase uses Spiking Convolutional\nNeural Networks (SCNNs) to capture global contextual information, while the\nsecond phase uses Spiking Recurrent Neural Networks (SRNNs) to focus on\nfrequency-related features. In addition, the regularizer suppresses activation\nto further enhance energy efficiency of our DPSNNs. Evaluating on the VCTK and\nIntel DNS Datasets, we demonstrate that our approach achieves the very low\nlatency (approximately 5ms) required for applications like hearing aids, while\ndemonstrating excellent signal-to-noise ratio (SNR), perceptual quality, and\nenergy efficiency.\n","authors":["Tao Sun","Sander Bohté"],"pdf_url":"https://arxiv.org/pdf/2408.07388v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.07386v1","updated":"2024-08-14T09:06:25Z","published":"2024-08-14T09:06:25Z","title":"Fading memory and the convolution theorem","summary":" Several topological and analytical notions of continuity and fading memory\nfor causal and time-invariant filters are introduced, and the relations between\nthem are analysed. A significant generalization of the convolution theorem that\nestablishes the equivalence between the fading memory property and the\navailability of convolution representations of linear filters is proved. This\nresult extends a previous such characterization to a complete array of weighted\nnorms in the definition of the fading memory property. 
Additionally, the main\ntheorem shows that the availability of convolution representations can be\ncharacterized, at least when the codomain is finite-dimensional, not only by\nthe fading memory property but also by the reunion of two purely topological\nnotions that are called minimal continuity and minimal fading memory property.\nFinally, when the input space and the codomain of a linear functional are\nHilbert spaces, it is shown that minimal continuity and the minimal fading\nmemory property guarantee the existence of interesting embeddings of the\nassociated reproducing kernel Hilbert spaces and approximation results of\nsolutions of kernel regressions in the presence of finite data sets.\n","authors":["Juan-Pablo Ortega","Florian Rossmannek"],"pdf_url":"https://arxiv.org/pdf/2408.07386v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.09171v2","updated":"2024-08-14T09:06:06Z","published":"2024-03-14T08:31:39Z","title":"ADEdgeDrop: Adversarial Edge Dropping for Robust Graph Neural Networks","summary":" Although Graph Neural Networks (GNNs) have exhibited the powerful ability to\ngather graph-structured information from neighborhood nodes via various\nmessage-passing mechanisms, the performance of GNNs is limited by poor\ngeneralization and fragile robustness caused by noisy and redundant graph data.\nAs a prominent solution, Graph Augmentation Learning (GAL) has recently\nreceived increasing attention. Among prior GAL approaches, edge-dropping\nmethods that randomly remove edges from a graph during training are effective\ntechniques to improve the robustness of GNNs. However, randomly dropping edges\noften results in bypassing critical edges, consequently weakening the\neffectiveness of message passing. In this paper, we propose a novel adversarial\nedge-dropping method (ADEdgeDrop) that leverages an adversarial edge predictor\nguiding the removal of edges, which can be flexibly incorporated into diverse\nGNN backbones. Employing an adversarial training framework, the edge predictor\nutilizes the line graph transformed from the original graph to estimate the\nedges to be dropped, which improves the interpretability of the edge-dropping\nmethod. The proposed ADEdgeDrop is optimized alternately by stochastic gradient\ndescent and projected gradient descent. Comprehensive experiments on six graph\nbenchmark datasets demonstrate that the proposed ADEdgeDrop outperforms\nstate-of-the-art baselines across various GNN backbones, demonstrating improved\ngeneralization and robustness.\n","authors":["Zhaoliang Chen","Zhihao Wu","Ylli Sadikaj","Claudia Plant","Hong-Ning Dai","Shiping Wang","Yiu-Ming Cheung","Wenzhong Guo"],"pdf_url":"https://arxiv.org/pdf/2403.09171v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.16414v3","updated":"2024-08-14T09:06:00Z","published":"2023-09-28T13:08:08Z","title":"AutoCLIP: Auto-tuning Zero-Shot Classifiers for Vision-Language Models","summary":" Classifiers built upon vision-language models such as CLIP have shown\nremarkable zero-shot performance across a broad range of image classification\ntasks. Prior work has studied different ways of automatically creating\ndescriptor sets for every class based on prompt templates, ranging from\nmanually engineered templates over templates obtained from a large language\nmodel to templates built from random words and characters. 
Up until now,\nderiving zero-shot classifiers from the respective encoded class descriptors\nhas remained nearly unchanged, i.e., classify to the class that maximizes\ncosine similarity between its averaged encoded class descriptors and the image\nencoding. However, weighing all class descriptors equally can be suboptimal\nwhen certain descriptors match visual clues on a given image better than\nothers. In this work, we propose AutoCLIP, a method for auto-tuning zero-shot\nclassifiers. AutoCLIP tunes per-image weights to each prompt template at\ninference time, based on statistics of class descriptor-image similarities.\nAutoCLIP is fully unsupervised, has only a minor additional computation\noverhead, and can be easily implemented in few lines of code. We show that\nAutoCLIP outperforms baselines across a broad range of vision-language models,\ndatasets, and prompt templates consistently and by up to 3 percent point\naccuracy.\n","authors":["Jan Hendrik Metzen","Piyapat Saranrittichai","Chaithanya Kumar Mummadi"],"pdf_url":"https://arxiv.org/pdf/2309.16414v3.pdf","comment":"accepted at TMLR, Camera Ready Version"},{"id":"http://arxiv.org/abs/2209.13232v4","updated":"2024-08-14T09:05:15Z","published":"2022-09-27T08:10:14Z","title":"A Survey on Graph Neural Networks and Graph Transformers in Computer\n Vision: A Task-Oriented Perspective","summary":" Graph Neural Networks (GNNs) have gained momentum in graph representation\nlearning and boosted the state of the art in a variety of areas, such as data\nmining (\\emph{e.g.,} social network analysis and recommender systems), computer\nvision (\\emph{e.g.,} object detection and point cloud learning), and natural\nlanguage processing (\\emph{e.g.,} relation extraction and sequence learning),\nto name a few. With the emergence of Transformers in natural language\nprocessing and computer vision, graph Transformers embed a graph structure into\nthe Transformer architecture to overcome the limitations of local neighborhood\naggregation while avoiding strict structural inductive biases. In this paper,\nwe present a comprehensive review of GNNs and graph Transformers in computer\nvision from a task-oriented perspective. Specifically, we divide their\napplications in computer vision into five categories according to the modality\nof input data, \\emph{i.e.,} 2D natural images, videos, 3D data, vision +\nlanguage, and medical images. In each category, we further divide the\napplications according to a set of vision tasks. Such a task-oriented taxonomy\nallows us to examine how each task is tackled by different GNN-based approaches\nand how well these approaches perform. Based on the necessary preliminaries, we\nprovide the definitions and challenges of the tasks, in-depth coverage of the\nrepresentative approaches, as well as discussions regarding insights,\nlimitations, and future directions.\n","authors":["Chaoqi Chen","Yushuang Wu","Qiyuan Dai","Hong-Yu Zhou","Mutian Xu","Sibei Yang","Xiaoguang Han","Yizhou Yu"],"pdf_url":"https://arxiv.org/pdf/2209.13232v4.pdf","comment":"Accepted by IEEE Transactions on Pattern Analysis and Machine\n Intelligence (T-PAMI)"},{"id":"http://arxiv.org/abs/2303.01070v2","updated":"2024-08-14T09:05:09Z","published":"2023-03-02T08:45:49Z","title":"GHQ: Grouped Hybrid Q Learning for Heterogeneous Cooperative Multi-agent\n Reinforcement Learning","summary":" Previous deep multi-agent reinforcement learning (MARL) algorithms have\nachieved impressive results, typically in homogeneous scenarios. 
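As a rough, hypothetical illustration of the descriptor-weighting idea summarized in the AutoCLIP abstract above (not the authors' implementation), the following NumPy sketch replaces the uniform average of encoded class descriptors with per-image, softmax-normalized template weights derived from descriptor-image similarities; the temperature value and the mean-over-classes statistic are assumptions made here for illustration.

```python
import numpy as np

def weighted_zero_shot_classify(image_emb, desc_embs, temperature=0.1):
    """Hypothetical sketch of per-image template weighting (not the paper's code).

    image_emb : (d,) L2-normalized image embedding
    desc_embs : (C, T, d) L2-normalized class-descriptor embeddings,
                one per class (C) and prompt template (T)
    """
    sims = desc_embs @ image_emb                       # (C, T) cosine similarities
    # Per-image template weights from similarity statistics (softmax over templates);
    # uniform weights 1/T would recover the standard zero-shot rule.
    logits = sims.mean(axis=0) / temperature           # (T,)
    weights = np.exp(logits - logits.max())
    weights /= weights.sum()
    class_embs = (desc_embs * weights[None, :, None]).sum(axis=1)   # (C, d)
    class_embs /= np.linalg.norm(class_embs, axis=-1, keepdims=True)
    return int(np.argmax(class_embs @ image_emb))      # predicted class index
```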
However,\nheterogeneous scenarios are also very common and usually harder to solve. In\nthis paper, we mainly discuss cooperative heterogeneous MARL problems in the\nStarCraft Multi-Agent Challenge (SMAC) environment. We first define and\ndescribe the heterogeneous problems in SMAC. In order to comprehensively reveal\nand study the problem, we add new maps to the original SMAC maps. We\nfind that baseline algorithms fail to perform well in those heterogeneous maps.\nTo address this issue, we propose the Grouped Individual-Global-Max Consistency\n(GIGM) and a novel MARL algorithm, Grouped Hybrid Q Learning (GHQ). GHQ\nseparates agents into several groups and keeps individual parameters for each\ngroup, along with a novel hybrid structure for factorization. To enhance\ncoordination between groups, we maximize the Inter-group Mutual Information\n(IGMI) between groups' trajectories. Experiments on original and new\nheterogeneous maps show the strong performance of GHQ compared to other\nstate-of-the-art algorithms.\n","authors":["Xiaoyang Yu","Youfang Lin","Xiangsen Wang","Sheng Han","Kai Lv"],"pdf_url":"https://arxiv.org/pdf/2303.01070v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.07379v1","updated":"2024-08-14T08:56:45Z","published":"2024-08-14T08:56:45Z","title":"Posterior Covariance Structures in Gaussian Processes","summary":" In this paper, we present a comprehensive analysis of the posterior\ncovariance field in Gaussian processes, with applications to the posterior\ncovariance matrix. The analysis is based on the Gaussian prior covariance but\nthe approach also applies to other covariance kernels. Our geometric analysis\nreveals how the Gaussian kernel's bandwidth parameter and the spatial\ndistribution of the observations influence the posterior covariance as well as\nthe corresponding covariance matrix, enabling straightforward identification of\nareas with high or low covariance in magnitude. Drawing inspiration from the a\nposteriori error estimation techniques in adaptive finite element methods, we\nalso propose several estimators to efficiently measure the absolute posterior\ncovariance field, which can be used for efficient covariance matrix\napproximation and preconditioning. We conduct a wide range of experiments to\nillustrate our theoretical findings and their practical applications.\n","authors":["Difeng Cai","Edmond Chow","Yuanzhe Xi"],"pdf_url":"https://arxiv.org/pdf/2408.07379v1.pdf","comment":"22 pages"},{"id":"http://arxiv.org/abs/2407.21034v2","updated":"2024-08-14T08:52:29Z","published":"2024-07-17T06:51:24Z","title":"Watermarking Recommender Systems","summary":" Recommender systems embody significant commercial value and represent crucial\nintellectual property. However, the integrity of these systems is constantly\nchallenged by malicious actors seeking to steal their underlying models.\nSafeguarding against such threats is paramount to upholding the rights and\ninterests of the model owner. While model watermarking has emerged as a potent\ndefense mechanism in various domains, its direct application to recommender\nsystems remains unexplored and non-trivial. In this paper, we address this gap\nby introducing Autoregressive Out-of-distribution Watermarking (AOW), a novel\ntechnique tailored specifically for recommender systems. Our approach entails\nselecting an initial item and querying it through the oracle model, followed by\nthe selection of subsequent items with small prediction scores. 
This iterative\nprocess generates a watermark sequence autoregressively, which is then\ningrained into the model's memory through training. To assess the efficacy of\nthe watermark, the model is tasked with predicting the subsequent item given a\ntruncated watermark sequence. Through extensive experimentation and analysis,\nwe demonstrate the superior performance and robust properties of AOW. Notably,\nour watermarking technique exhibits high-confidence extraction capabilities and\nmaintains effectiveness even in the face of distillation and fine-tuning\nprocesses.\n","authors":["Sixiao Zhang","Cheng Long","Wei Yuan","Hongxu Chen","Hongzhi Yin"],"pdf_url":"https://arxiv.org/pdf/2407.21034v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.07372v1","updated":"2024-08-14T08:38:41Z","published":"2024-08-14T08:38:41Z","title":"An Adaptive Importance Sampling for Locally Stable Point Processes","summary":" The problem of finding the expected value of a statistic of a locally stable\npoint process in a bounded region is addressed. We propose an adaptive\nimportance sampling scheme for solving the problem. In our proposal, we restrict the\nimportance point process to the family of homogeneous Poisson point processes,\nwhich enables us to quickly generate independent samples of the importance\npoint process. The optimal intensity of the importance point process is found\nby applying the cross-entropy minimization method. In the proposed scheme, the\nexpected value of the function and the optimal intensity are iteratively\nestimated in an adaptive manner. We show that the proposed estimator converges\nto the target value almost surely, and prove its asymptotic normality. We\nexplain how to apply the proposed scheme to the estimation of the intensity of\na stationary pairwise interaction point process. The performance of the\nproposed scheme is compared numerically with the Markov chain Monte Carlo\nsimulation and the perfect sampling.\n","authors":["Hee-Geon Kang","Sunggon Kim"],"pdf_url":"https://arxiv.org/pdf/2408.07372v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.12237v2","updated":"2024-08-14T08:37:37Z","published":"2024-01-19T17:07:05Z","title":"A distribution-guided Mapper algorithm","summary":" Motivation: The Mapper algorithm is an essential tool to explore the shape of\ndata in topological data analysis. With a dataset as an input, the Mapper\nalgorithm outputs a graph representing the topological features of the whole\ndataset. This graph is often regarded as an approximation of a Reeb graph of the\ndata. The classic Mapper algorithm uses fixed interval lengths and overlapping\nratios, which might fail to reveal subtle features of data, especially when the\nunderlying structure is complex.\n Results: In this work, we introduce a distribution-guided Mapper algorithm\nnamed D-Mapper that utilizes the property of the probability model and data\nintrinsic characteristics to generate density-guided covers and provide\nenhanced topological features. Our proposed algorithm is a probabilistic\nmodel-based approach, which could serve as an alternative to non-probabilistic\nones. Moreover, we introduce a metric accounting for both the quality of\noverlap clustering and extended persistence homology to measure the performance\nof Mapper-type algorithms. Our numerical experiments indicate that the D-Mapper\noutperforms the classical Mapper algorithm in various scenarios. 
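The Watermarking Recommender Systems (AOW) abstract above describes building a watermark by starting from an initial item and repeatedly choosing items that the oracle model scores lowly. The sketch below is a minimal, hypothetical rendering of that selection loop; the `score_fn` interface and the no-repeat rule are assumptions for illustration, not the paper's exact procedure.

```python
import numpy as np

def build_watermark_sequence(score_fn, length, start_item):
    """Illustrative sketch of autoregressively selecting low-score items as a
    watermark sequence; the actual AOW procedure may differ in its details.

    score_fn(seq) -> array of prediction scores over all items, given the
    sequence generated so far (a stand-in for the oracle recommender).
    """
    seq = [start_item]
    for _ in range(length - 1):
        scores = np.asarray(score_fn(seq), dtype=float)
        scores[seq] = np.inf                  # do not repeat items already chosen
        seq.append(int(np.argmin(scores)))    # next item = lowest predicted score
    return seq
```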
We also apply\nthe D-Mapper to a SARS-COV-2 coronavirus RNA sequences dataset to explore the\ntopological structure of different virus variants. The results indicate that\nthe D-Mapper algorithm can reveal both vertical and horizontal evolution\nprocesses of the viruses.\n Availability: Our package is available at\nhttps://github.com/ShufeiGe/D-Mapper.\n","authors":["Yuyang Tao","Shufei Ge"],"pdf_url":"https://arxiv.org/pdf/2401.12237v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.07364v1","updated":"2024-08-14T08:22:13Z","published":"2024-08-14T08:22:13Z","title":"Robust Active Learning (RoAL): Countering Dynamic Adversaries in Active\n Learning with Elastic Weight Consolidation","summary":" Despite significant advancements in active learning and adversarial attacks,\nthe intersection of these two fields remains underexplored, particularly in\ndeveloping robust active learning frameworks against dynamic adversarial\nthreats. The challenge of developing robust active learning frameworks under\ndynamic adversarial attacks is critical, as these attacks can lead to\ncatastrophic forgetting within the active learning cycle. This paper introduces\nRobust Active Learning (RoAL), a novel approach designed to address this issue\nby integrating Elastic Weight Consolidation (EWC) into the active learning\nprocess. Our contributions are threefold: First, we propose a new dynamic\nadversarial attack that poses significant threats to active learning\nframeworks. Second, we introduce a novel method that combines EWC with active\nlearning to mitigate catastrophic forgetting caused by dynamic adversarial\nattacks. Finally, we conduct extensive experimental evaluations to demonstrate\nthe efficacy of our approach. The results show that RoAL not only effectively\ncounters dynamic adversarial threats but also significantly reduces the impact\nof catastrophic forgetting, thereby enhancing the robustness and performance of\nactive learning systems in adversarial environments.\n","authors":["Ricky Maulana Fajri","Yulong Pei","Lu Yin","Mykola Pechenizkiy"],"pdf_url":"https://arxiv.org/pdf/2408.07364v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.07362v1","updated":"2024-08-14T08:19:23Z","published":"2024-08-14T08:19:23Z","title":"BadMerging: Backdoor Attacks Against Model Merging","summary":" Fine-tuning pre-trained models for downstream tasks has led to a\nproliferation of open-sourced task-specific models. Recently, Model Merging\n(MM) has emerged as an effective approach to facilitate knowledge transfer\namong these independently fine-tuned models. MM directly combines multiple\nfine-tuned task-specific models into a merged model without additional\ntraining, and the resulting model shows enhanced capabilities in multiple\ntasks. Although MM provides great utility, it may come with security risks\nbecause an adversary can exploit MM to affect multiple downstream tasks.\nHowever, the security risks of MM have barely been studied. In this paper, we\nfirst find that MM, as a new learning paradigm, introduces unique challenges\nfor existing backdoor attacks due to the merging process. To address these\nchallenges, we introduce BadMerging, the first backdoor attack specifically\ndesigned for MM. Notably, BadMerging allows an adversary to compromise the\nentire merged model by contributing as few as one backdoored task-specific\nmodel. 
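The RoAL abstract above integrates Elastic Weight Consolidation (EWC) into the active learning loop to mitigate catastrophic forgetting. As a generic sketch of the standard EWC quadratic penalty it builds on (not the paper's exact objective), assuming per-parameter Fisher estimates and a parameter snapshot from the previous round:

```python
import torch

def ewc_penalty(model, fisher, old_params, lam=1.0):
    """Standard EWC regularizer: (lam / 2) * sum_i F_i * (theta_i - theta_i*)^2.

    fisher and old_params are dicts keyed by parameter name, computed after the
    previous task; this is a generic sketch, not RoAL's specific formulation.
    """
    penalty = 0.0
    for name, param in model.named_parameters():
        if name in fisher:
            penalty = penalty + (fisher[name] * (param - old_params[name]) ** 2).sum()
    return 0.5 * lam * penalty
```

In use, this term would simply be added to the task loss before backpropagation.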
BadMerging comprises a two-stage attack mechanism and a novel\nfeature-interpolation-based loss to enhance the robustness of embedded\nbackdoors against the changes of different merging parameters. Considering that\na merged model may incorporate tasks from different domains, BadMerging can\njointly compromise the tasks provided by the adversary (on-task attack) and\nother contributors (off-task attack) and solve the corresponding unique\nchallenges with novel attack designs. Extensive experiments show that\nBadMerging achieves remarkable attacks against various MM algorithms. Our\nablation study demonstrates that the proposed attack designs can progressively\ncontribute to the attack performance. Finally, we show that prior defense\nmechanisms fail to defend against our attacks, highlighting the need for more\nadvanced defense.\n","authors":["Jinghuai Zhang","Jianfeng Chi","Zheng Li","Kunlin Cai","Yang Zhang","Yuan Tian"],"pdf_url":"https://arxiv.org/pdf/2408.07362v1.pdf","comment":"To appear in ACM Conference on Computer and Communications Security\n (CCS), 2024"},{"id":"http://arxiv.org/abs/2407.07311v2","updated":"2024-08-14T08:02:39Z","published":"2024-07-10T02:11:01Z","title":"ViTime: A Visual Intelligence-Based Foundation Model for Time Series\n Forecasting","summary":" The success of large pretrained models in natural language processing (NLP)\nand computer vision (CV) has opened new avenues for constructing foundation\nmodels for time series forecasting (TSF). Traditional TSF foundation models\nrely heavily on numerical data fitting. In contrast, the human brain is\ninherently skilled at processing visual information, prefer predicting future\ntrends by observing visualized sequences. From a biomimetic perspective,\nutilizing models to directly process numerical sequences might not be the most\neffective route to achieving Artificial General Intelligence (AGI). This paper\nproposes ViTime, a novel Visual Intelligence-based foundation model for TSF.\nViTime overcomes the limitations of numerical time series data fitting by\nutilizing visual data processing paradigms and employs a innovative data\nsynthesis method during training, called Real Time Series (RealTS). Experiments\non a diverse set of previously unseen forecasting datasets demonstrate that\nViTime achieves state-of-the-art zero-shot performance, even surpassing the\nbest individually trained supervised models in some situations. These findings\nsuggest that visual intelligence can significantly enhance time series analysis\nand forecasting, paving the way for more advanced and versatile models in the\nfield. The code for our framework is accessible at\nhttps://github.com/IkeYang/ViTime.\n","authors":["Luoxiao Yang","Yun Wang","Xinqi Fan","Israel Cohen","Jingdong Chen","Yue Zhao","Zijun Zhang"],"pdf_url":"https://arxiv.org/pdf/2407.07311v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.01138v2","updated":"2024-08-14T08:02:06Z","published":"2024-02-02T04:30:58Z","title":"Graph Neural Networks in EEG-based Emotion Recognition: A Survey","summary":" Compared to other modalities, EEG-based emotion recognition can intuitively\nrespond to the emotional patterns in the human brain and, therefore, has become\none of the most concerning tasks in the brain-computer interfaces field. Since\ndependencies within brain regions are closely related to emotion, a significant\ntrend is to develop Graph Neural Networks (GNNs) for EEG-based emotion\nrecognition. 
However, brain region dependencies in emotional EEG have\nphysiological bases that distinguish GNNs in this field from those in other\ntime series fields. Besides, there is neither a comprehensive review nor\nguidance for constructing GNNs in EEG-based emotion recognition. In the survey,\nour categorization reveals the commonalities and differences of existing\napproaches under a unified framework of graph construction. We analyze and\ncategorize methods from three stages in the framework to provide clear guidance\non constructing GNNs in EEG-based emotion recognition. In addition, we discuss\nseveral open challenges and future directions, such as Temporal full-connected\ngraph and Graph condensation.\n","authors":["Chenyu Liu","Xinliang Zhou","Yihao Wu","Ruizhi Yang","Zhongruo Wang","Liming Zhai","Ziyu Jia","Yang Liu"],"pdf_url":"https://arxiv.org/pdf/2402.01138v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.13983v3","updated":"2024-08-14T07:53:47Z","published":"2023-08-27T01:32:23Z","title":"Interpolation of mountain weather forecasts by machine learning","summary":" Recent advances in numerical simulation methods based on physical models and\ntheir combination with machine learning have improved the accuracy of weather\nforecasts. However, the accuracy decreases in complex terrains such as\nmountainous regions because these methods usually use grids of several\nkilometers square and simple machine learning models. While deep learning has\nalso made significant progress in recent years, its direct application is\ndifficult to utilize the physical knowledge used in the simulation. This paper\nproposes a method that uses machine learning to interpolate future weather in\nmountainous regions using forecast data from surrounding plains and past\nobserved data to improve weather forecasts in mountainous regions. We focus on\nmountainous regions in Japan and predict temperature and precipitation mainly\nusing LightGBM as a machine learning model. Despite the use of a small dataset,\nthrough feature engineering and model tuning, our method partially achieves\nimprovements in the RMSE with significantly less training time.\n","authors":["Kazuma Iwase","Tomoyuki Takenawa"],"pdf_url":"https://arxiv.org/pdf/2308.13983v3.pdf","comment":"9 pages"},{"id":"http://arxiv.org/abs/2306.07197v2","updated":"2024-08-14T07:49:55Z","published":"2023-06-12T15:54:52Z","title":"AROID: Improving Adversarial Robustness Through Online Instance-Wise\n Data Augmentation","summary":" Deep neural networks are vulnerable to adversarial examples. Adversarial\ntraining (AT) is an effective defense against adversarial examples. However, AT\nis prone to overfitting which degrades robustness substantially. Recently, data\naugmentation (DA) was shown to be effective in mitigating robust overfitting if\nappropriately designed and optimized for AT. This work proposes a new method to\nautomatically learn online, instance-wise, DA policies to improve robust\ngeneralization for AT. This is the first automated DA method specific for\nrobustness. A novel policy learning objective, consisting of Vulnerability,\nAffinity and Diversity, is proposed and shown to be sufficiently effective and\nefficient to be practical for automatic DA generation during AT. Importantly,\nour method dramatically reduces the cost of policy search from the 5000 hours\nof AutoAugment and the 412 hours of IDBH to 9 hours, making automated DA more\npractical to use for adversarial robustness. 
This allows our method to\nefficiently explore a large search space for a more effective DA policy and\nevolve the policy as training progresses. Empirically, our method is shown to\noutperform all competitive DA methods across various model architectures and\ndatasets. Our DA policy reinforced vanilla AT to surpass several\nstate-of-the-art AT methods regarding both accuracy and robustness. It can also\nbe combined with those advanced AT methods to further boost robustness. Code\nand pre-trained models are available at https://github.com/TreeLLi/AROID.\n","authors":["Lin Li","Jianing Qiu","Michael Spratling"],"pdf_url":"https://arxiv.org/pdf/2306.07197v2.pdf","comment":"published at the IJCV in press"},{"id":"http://arxiv.org/abs/2310.14483v2","updated":"2024-08-14T07:42:30Z","published":"2023-10-23T01:29:18Z","title":"Chain-of-Factors Paper-Reviewer Matching","summary":" With the rapid increase in paper submissions to academic conferences, the\nneed for automated and accurate paper-reviewer matching is more critical than\never. Previous efforts in this area have considered various factors to assess\nthe relevance of a reviewer's expertise to a paper, such as the semantic\nsimilarity, shared topics, and citation connections between the paper and the\nreviewer's previous works. However, most of these studies focus on only one\nfactor, resulting in an incomplete evaluation of the paper-reviewer relevance.\nTo address this issue, we propose a unified model for paper-reviewer matching\nthat jointly considers semantic, topic, and citation factors. To be specific,\nduring training, we instruction-tune a contextualized language model shared\nacross all factors to capture their commonalities and characteristics; during\ninference, we chain the three factors to enable step-by-step, coarse-to-fine\nsearch for qualified reviewers given a submission. Experiments on four datasets\n(one of which is newly contributed by us) spanning various fields such as\nmachine learning, computer vision, information retrieval, and data mining\nconsistently demonstrate the effectiveness of our proposed Chain-of-Factors\nmodel in comparison with state-of-the-art paper-reviewer matching methods and\nscientific pre-trained language models.\n","authors":["Yu Zhang","Yanzhen Shen","SeongKu Kang","Xiusi Chen","Bowen Jin","Jiawei Han"],"pdf_url":"https://arxiv.org/pdf/2310.14483v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.03337v2","updated":"2024-08-14T07:34:18Z","published":"2024-07-22T07:19:12Z","title":"PsyDI: Towards a Personalized and Progressively In-depth Chatbot for\n Psychological Measurements","summary":" In the field of psychology, traditional assessment methods, such as\nstandardized scales, are frequently critiqued for their static nature, lack of\npersonalization, and reduced participant engagement, while comprehensive\ncounseling evaluations are often inaccessible. The complexity of quantifying\npsychological traits further limits these methods. Despite advances with large\nlanguage models (LLMs), many still depend on single-round Question-and-Answer\ninteractions. To bridge this gap, we introduce PsyDI, a personalized and\nprogressively in-depth chatbot designed for psychological measurements,\nexemplified by its application in the Myers-Briggs Type Indicator (MBTI)\nframework. PsyDI leverages user-related multi-modal information and engages in\ncustomized, multi-turn interactions to provide personalized, easily accessible\nmeasurements, while ensuring precise MBTI type determination. 
To address the\nchallenge of unquantifiable psychological traits, we introduce a novel training\nparadigm that involves learning the ranking of proxy variables associated with\nthese traits, culminating in a robust score model for MBTI measurements. The\nscore model enables PsyDI to conduct comprehensive and precise measurements\nthrough multi-turn interactions within a unified estimation context. Through\nvarious experiments, we validate the efficacy of both the score model and the\nPsyDI pipeline, demonstrating its potential to serve as a general framework for\npsychological measurements. Furthermore, the online deployment of PsyDI has\ngarnered substantial user engagement, with over 3,000 visits, resulting in the\ncollection of numerous multi-turn dialogues annotated with MBTI types, which\nfacilitates further research.\n","authors":["Xueyan Li","Xinyan Chen","Yazhe Niu","Shuai Hu","Yu Liu"],"pdf_url":"https://arxiv.org/pdf/2408.03337v2.pdf","comment":"29 pages, 15 figures"},{"id":"http://arxiv.org/abs/2408.07340v1","updated":"2024-08-14T07:31:11Z","published":"2024-08-14T07:31:11Z","title":"Towards Few-shot Self-explaining Graph Neural Networks","summary":" Recent advancements in Graph Neural Networks (GNNs) have spurred an upsurge\nof research dedicated to enhancing the explainability of GNNs, particularly in\ncritical domains such as medicine. A promising approach is the self-explaining\nmethod, which outputs explanations along with predictions. However, existing\nself-explaining models require a large amount of training data, rendering them\nunavailable in few-shot scenarios. To address this challenge, in this paper, we\npropose a Meta-learned Self-Explaining GNN (MSE-GNN), a novel framework that\ngenerates explanations to support predictions in few-shot settings. MSE-GNN\nadopts a two-stage self-explaining structure, consisting of an explainer and a\npredictor. Specifically, the explainer first imitates the attention mechanism\nof humans to select the explanation subgraph, whereby attention is naturally\npaid to regions containing important characteristics. Subsequently, the\npredictor mimics the decision-making process, which makes predictions based on\nthe generated explanation. Moreover, with a novel meta-training process and a\ndesigned mechanism that exploits task information, MSE-GNN can achieve\nremarkable performance on new few-shot tasks. Extensive experimental results on\nfour datasets demonstrate that MSE-GNN can achieve superior performance on\nprediction tasks while generating high-quality explanations compared with\nexisting methods. The code is publicly available at\nhttps://github.com/jypeng28/MSE-GNN.\n","authors":["Jingyu Peng","Qi Liu","Linan Yue","Zaixi Zhang","Kai Zhang","Yunhao Sha"],"pdf_url":"https://arxiv.org/pdf/2408.07340v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.07331v1","updated":"2024-08-14T07:13:36Z","published":"2024-08-14T07:13:36Z","title":"RSEA-MVGNN: Multi-View Graph Neural Network with Reliable Structural\n Enhancement and Aggregation","summary":" Graph Neural Networks (GNNs) have exhibited remarkable efficacy in learning\nfrom multi-view graph data. In the framework of multi-view graph neural\nnetworks, a critical challenge lies in effectively combining diverse views,\nwhere each view has distinct graph structure features (GSFs). Existing\napproaches to this challenge primarily focus on two aspects: 1) prioritizing\nthe most important GSFs, 2) utilizing GNNs for feature aggregation. 
However,\nprioritizing the most important GSFs can lead to limited feature diversity, and\nexisting GNN-based aggregation strategies equally treat each view without\nconsidering view quality. To address these issues, we propose a novel\nMulti-View Graph Neural Network with Reliable Structural Enhancement and\nAggregation (RSEA-MVGNN). Firstly, we estimate view-specific uncertainty\nemploying subjective logic. Based on this uncertainty, we design reliable\nstructural enhancement by feature de-correlation algorithm. This approach\nenables each enhancement to focus on different GSFs, thereby achieving diverse\nfeature representation in the enhanced structure. Secondly, the model learns\nview-specific beliefs and uncertainty as opinions, which are utilized to\nevaluate view quality. Based on these opinions, the model enables high-quality\nviews to dominate GNN aggregation, thereby facilitating representation\nlearning. Experimental results conducted on five real-world datasets\ndemonstrate that RSEA-MVGNN outperforms several state-of-the-art GNN-based\nmethods.\n","authors":["Junyu Chen","Long Shi","Badong Chen"],"pdf_url":"https://arxiv.org/pdf/2408.07331v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.16877v3","updated":"2024-08-14T07:09:25Z","published":"2023-05-26T12:30:05Z","title":"Distributional Reinforcement Learning with Dual Expectile-Quantile\n Regression","summary":" Distributional reinforcement learning (RL) has proven useful in multiple\nbenchmarks as it enables approximating the full distribution of returns and\nmakes a better use of environment samples. The commonly used quantile\nregression approach to distributional RL -- based on asymmetric $L_1$ losses --\nprovides a flexible and effective way of learning arbitrary return\ndistributions. In practice, it is often improved by using a more efficient,\nhybrid asymmetric $L_1$-$L_2$ Huber loss for quantile regression. However, by\ndoing so, distributional estimation guarantees vanish, and we empirically\nobserve that the estimated distribution rapidly collapses to its mean. Indeed,\nasymmetric $L_2$ losses, corresponding to expectile regression, cannot be\nreadily used for distributional temporal difference learning. Motivated by the\nefficiency of $L_2$-based learning, we propose to jointly learn expectiles and\nquantiles of the return distribution in a way that allows efficient learning\nwhile keeping an estimate of the full distribution of returns. We prove that\nour approach approximately learns the correct return distribution, and we\nbenchmark a practical implementation on a toy example and at scale. On the\nAtari benchmark, our approach matches the performance of the Huber-based IQN-1\nbaseline after $200$M training frames but avoids distributional collapse and\nkeeps estimates of the full distribution of returns.\n","authors":["Sami Jullien","Romain Deffayet","Jean-Michel Renders","Paul Groth","Maarten de Rijke"],"pdf_url":"https://arxiv.org/pdf/2305.16877v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.07327v1","updated":"2024-08-14T06:57:58Z","published":"2024-08-14T06:57:58Z","title":"An Offline Meta Black-box Optimization Framework for Adaptive Design of\n Urban Traffic Light Management Systems","summary":" Complex urban road networks with high vehicle occupancy frequently face\nsevere traffic congestion. Designing an effective strategy for managing\nmultiple traffic lights plays a crucial role in managing congestion. 
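The dual expectile-quantile abstract above contrasts the asymmetric L1 (quantile, or pinball) loss with the asymmetric L2 (expectile) loss. A minimal sketch of the two standard losses, with u denoting the residual (target minus prediction); only the textbook definitions are shown, not the paper's joint learning scheme:

```python
import numpy as np

def quantile_loss(u, tau):
    """Asymmetric L1 (pinball) loss: rho_tau(u) = u * (tau - 1{u < 0})."""
    u = np.asarray(u, dtype=float)
    return u * (tau - (u < 0).astype(float))

def expectile_loss(u, tau):
    """Asymmetric L2 loss: |tau - 1{u < 0}| * u**2."""
    u = np.asarray(u, dtype=float)
    return np.abs(tau - (u < 0).astype(float)) * u ** 2
```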
However,\nmost current traffic light management systems rely on human-crafted decisions,\nwhich may not adapt well to diverse traffic patterns. In this paper, we delve\ninto two pivotal design components of the traffic light management system that\ncan be dynamically adjusted to various traffic conditions: phase combination\nand phase time allocation. While numerous studies have sought an efficient\nstrategy for managing traffic lights, most of these approaches consider a fixed\ntraffic pattern and are limited to relatively small road networks. To overcome\nthese limitations, we introduce a novel and practical framework to formulate\nthe optimization of such design components using an offline meta black-box\noptimization. We then present a simple yet effective method to efficiently find\na solution for the aforementioned problem. In our framework, we first collect\nan offline meta dataset consisting of pairs of design choices and corresponding\ncongestion measures from various traffic patterns. After collecting the\ndataset, we employ the Attentive Neural Process (ANP) to predict the impact of\nthe proposed design on congestion across various traffic patterns with\nwell-calibrated uncertainty. Finally, Bayesian optimization, with ANP as a\nsurrogate model, is utilized to find an optimal design for unseen traffic\npatterns through limited online simulations. Our experiment results show that\nour method outperforms state-of-the-art baselines on complex road networks in\nterms of the number of waiting vehicles. Surprisingly, the deployment of our\nmethod into a real-world traffic system was able to improve traffic throughput\nby 4.80\\% compared to the original strategy.\n","authors":["Taeyoung Yun","Kanghoon Lee","Sujin Yun","Ilmyung Kim","Won-Woo Jung","Min-Cheol Kwon","Kyujin Choi","Yoohyeon Lee","Jinkyoo Park"],"pdf_url":"https://arxiv.org/pdf/2408.07327v1.pdf","comment":"12 pages, 7 figures, 10 tables"},{"id":"http://arxiv.org/abs/2306.09977v2","updated":"2024-08-14T06:53:41Z","published":"2023-06-16T17:17:07Z","title":"Adversarially robust clustering with optimality guarantees","summary":" We consider the problem of clustering data points coming from sub-Gaussian\nmixtures. Existing methods that provably achieve the optimal mislabeling error,\nsuch as the Lloyd algorithm, are usually vulnerable to outliers. In contrast,\nclustering methods seemingly robust to adversarial perturbations are not known\nto satisfy the optimal statistical guarantees. We propose a simple robust\nalgorithm based on the coordinatewise median that obtains the optimal\nmislabeling rate even when we allow adversarial outliers to be present. Our\nalgorithm achieves the optimal error rate in constant iterations when a weak\ninitialization condition is satisfied. In the absence of outliers, in fixed\ndimensions, our theoretical guarantees are similar to that of the Lloyd\nalgorithm. Extensive experiments on various simulated and public datasets are\nconducted to support the theoretical guarantees of our method.\n","authors":["Soham Jana","Kun Yang","Sanjeev Kulkarni"],"pdf_url":"https://arxiv.org/pdf/2306.09977v2.pdf","comment":"38 pages, 9 figures. 
Updated with remarks, real data analysis, and\n typo corrections"},{"id":"http://arxiv.org/abs/2408.07318v1","updated":"2024-08-14T06:37:30Z","published":"2024-08-14T06:37:30Z","title":"A systematic dataset generation technique applied to data-driven\n automotive aerodynamics","summary":" A novel strategy for generating datasets is developed within the context of\ndrag prediction for automotive geometries using neural networks. A primary\nchallenge in this space is constructing a training database of sufficient size\nand diversity. Our method relies on a small number of starting data points, and\nprovides a recipe to interpolate systematically between them, generating an\narbitrary number of samples at the desired quality. We test this strategy using\na realistic automotive geometry, and demonstrate that convolutional neural\nnetworks perform exceedingly well at predicting drag coefficients and surface\npressures. Promising results are obtained in testing extrapolation performance.\nOur method can be applied to other problems of aerodynamic shape optimization.\n","authors":["Mark Benjamin","Gianluca Iaccarino"],"pdf_url":"https://arxiv.org/pdf/2408.07318v1.pdf","comment":"26 pages, 28 figures"},{"id":"http://arxiv.org/abs/2403.05066v2","updated":"2024-08-14T06:32:11Z","published":"2024-03-08T05:37:59Z","title":"Reset & Distill: A Recipe for Overcoming Negative Transfer in Continual\n Reinforcement Learning","summary":" We argue that the negative transfer problem occurring when a new task to\nlearn arrives is an important problem that should not be overlooked when\ndeveloping effective Continual Reinforcement Learning (CRL) algorithms. Through\ncomprehensive experimental validation, we demonstrate that such an issue\nfrequently exists in CRL and cannot be effectively addressed by several recent\nworks on mitigating the plasticity loss of RL agents. To that end, we develop Reset\n& Distill (R&D), a simple yet highly effective method, to overcome the negative\ntransfer problem in CRL. R&D combines a strategy of resetting the agent's\nonline actor and critic networks to learn a new task and an offline learning\nstep for distilling the knowledge from the online actor and the previous expert's\naction probabilities. We carried out extensive experiments on long sequences of\nMeta World tasks and show that our method consistently outperforms recent\nbaselines, achieving significantly higher success rates across a range of\ntasks. Our findings highlight the importance of considering negative transfer\nin CRL and emphasize the need for robust strategies like R&D to mitigate its\ndetrimental effects.\n","authors":["Hongjoon Ahn","Jinu Hyeon","Youngmin Oh","Bosun Hwang","Taesup Moon"],"pdf_url":"https://arxiv.org/pdf/2403.05066v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.19757v2","updated":"2024-08-14T06:26:27Z","published":"2024-05-30T07:06:02Z","title":"Improving SMOTE via Fusing Conditional VAE for Data-adaptive Noise\n Filtering","summary":" Recent advances in generative neural network models extend the development\nof data augmentation methods. However, augmentation methods based on\nmodern generative models fail to achieve notable performance for class\nimbalance data compared to the conventional model, Synthetic Minority\nOversampling Technique (SMOTE). We investigate the problem of the generative\nmodel for imbalanced classification and introduce a framework to enhance the\nSMOTE algorithm using Variational Autoencoders (VAE). 
Our approach\nsystematically quantifies the density of data points in a low-dimensional\nlatent space using the VAE, simultaneously incorporating information on class\nlabels and classification difficulty. Then, the data points potentially\ndegrading the augmentation are systematically excluded, and the neighboring\nobservations are directly augmented on the data space. Empirical studies on\nseveral imbalanced datasets represent that this simple process innovatively\nimproves the conventional SMOTE algorithm over the deep learning models.\nConsequently, we conclude that the selection of minority data and the\ninterpolation in the data space are beneficial for imbalanced classification\nproblems with a relatively small number of data points.\n","authors":["Sungchul Hong","Seunghwan An","Jong-June Jeon"],"pdf_url":"https://arxiv.org/pdf/2405.19757v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.03219v2","updated":"2024-08-14T06:25:50Z","published":"2024-08-06T14:25:23Z","title":"Learning to Learn without Forgetting using Attention","summary":" Continual learning (CL) refers to the ability to continually learn over time\nby accommodating new knowledge while retaining previously learned experience.\nWhile this concept is inherent in human learning, current machine learning\nmethods are highly prone to overwrite previously learned patterns and thus\nforget past experience. Instead, model parameters should be updated selectively\nand carefully, avoiding unnecessary forgetting while optimally leveraging\npreviously learned patterns to accelerate future learning. Since hand-crafting\neffective update mechanisms is difficult, we propose meta-learning a\ntransformer-based optimizer to enhance CL. This meta-learned optimizer uses\nattention to learn the complex relationships between model parameters across a\nstream of tasks, and is designed to generate effective weight updates for the\ncurrent task while preventing catastrophic forgetting on previously encountered\ntasks. Evaluations on benchmark datasets like SplitMNIST, RotatedMNIST, and\nSplitCIFAR-100 affirm the efficacy of the proposed approach in terms of both\nforward and backward transfer, even on small sets of labeled data, highlighting\nthe advantages of integrating a meta-learned optimizer within the continual\nlearning framework.\n","authors":["Anna Vettoruzzo","Joaquin Vanschoren","Mohamed-Rafik Bouguelia","Thorsteinn Rögnvaldsson"],"pdf_url":"https://arxiv.org/pdf/2408.03219v2.pdf","comment":"Published at the 3rd Conference on Lifelong Learning Agents (CoLLAs),\n 2024"},{"id":"http://arxiv.org/abs/2408.07314v1","updated":"2024-08-14T06:15:55Z","published":"2024-08-14T06:15:55Z","title":"Kolmogorov-Arnold Networks (KAN) for Time Series Classification and\n Robust Analysis","summary":" Kolmogorov-Arnold Networks (KAN) has recently attracted significant attention\nas a promising alternative to traditional Multi-Layer Perceptrons (MLP).\nDespite their theoretical appeal, KAN require validation on large-scale\nbenchmark datasets. Time series data, which has become increasingly prevalent\nin recent years, especially univariate time series are naturally suited for\nvalidating KAN. Therefore, we conducted a fair comparison among KAN, MLP, and\nmixed structures. The results indicate that KAN can achieve performance\ncomparable to, or even slightly better than, MLP across 128 time series\ndatasets. 
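The SMOTE/VAE abstract above ends by interpolating neighboring minority observations directly in the data space after filtering candidates with the VAE-based density estimate. A minimal sketch of that plain SMOTE-style interpolation step (the VAE filtering itself is omitted here, and the function is a placeholder, not the paper's code):

```python
import numpy as np

def smote_interpolate(x, neighbors, n_samples, seed=None):
    """Generate synthetic points on segments between a minority sample x and its
    minority-class neighbors; any density-based filtering is assumed to have
    happened beforehand.

    x         : (d,) minority sample
    neighbors : (k, d) nearby minority samples
    """
    rng = np.random.default_rng(seed)
    idx = rng.integers(0, len(neighbors), size=n_samples)   # pick a neighbor per sample
    gaps = rng.random((n_samples, 1))                        # interpolation coefficients in [0, 1)
    return x[None, :] + gaps * (neighbors[idx] - x[None, :])
```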
We also performed an ablation study on KAN, revealing that the output\nis primarily determined by the base component instead of b-spline function.\nFurthermore, we assessed the robustness of these models and found that KAN and\nthe hybrid structure MLP\\_KAN exhibit significant robustness advantages,\nattributed to their lower Lipschitz constants. This suggests that KAN and KAN\nlayers hold strong potential to be robust models or to improve the adversarial\nrobustness of other models.\n","authors":["Chang Dong","Liangwei Zheng","Weitong Chen"],"pdf_url":"https://arxiv.org/pdf/2408.07314v1.pdf","comment":"14 pages, 8 figs"},{"id":"http://arxiv.org/abs/2406.13668v2","updated":"2024-08-14T06:08:54Z","published":"2024-06-19T16:19:39Z","title":"Breaking the $T^{2/3}$ Barrier for Sequential Calibration","summary":" A set of probabilistic forecasts is calibrated if each prediction of the\nforecaster closely approximates the empirical distribution of outcomes on the\nsubset of timesteps where that prediction was made. We study the fundamental\nproblem of online calibrated forecasting of binary sequences, which was\ninitially studied by Foster & Vohra (1998). They derived an algorithm with\n$O(T^{2/3})$ calibration error after $T$ time steps, and showed a lower bound\nof $\\Omega(T^{1/2})$. These bounds remained stagnant for two decades, until\nQiao & Valiant (2021) improved the lower bound to $\\Omega(T^{0.528})$ by\nintroducing a combinatorial game called sign preservation and showing that\nlower bounds for this game imply lower bounds for calibration.\n In this paper, we give the first improvement to the $O(T^{2/3})$ upper bound\non calibration error of Foster & Vohra. We do this by introducing a variant of\nQiao & Valiant's game that we call sign preservation with reuse (SPR). We prove\nthat the relationship between SPR and calibrated forecasting is bidirectional:\nnot only do lower bounds for SPR translate into lower bounds for calibration,\nbut algorithms for SPR also translate into new algorithms for calibrated\nforecasting. We then give an improved \\emph{upper bound} for the SPR game,\nwhich implies, via our equivalence, a forecasting algorithm with calibration\nerror $O(T^{2/3 - \\varepsilon})$ for some $\\varepsilon > 0$, improving Foster &\nVohra's upper bound for the first time. Using similar ideas, we then prove a\nslightly stronger lower bound than that of Qiao & Valiant, namely\n$\\Omega(T^{0.54389})$. Our lower bound is obtained by an oblivious adversary,\nmarking the first $\\omega(T^{1/2})$ calibration lower bound for oblivious\nadversaries.\n","authors":["Yuval Dagan","Constantinos Daskalakis","Maxwell Fishelson","Noah Golowich","Robert Kleinberg","Princewill Okoroafor"],"pdf_url":"https://arxiv.org/pdf/2406.13668v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.06008v2","updated":"2024-08-14T06:06:56Z","published":"2024-05-09T18:00:00Z","title":"Wilsonian Renormalization of Neural Network Gaussian Processes","summary":" Separating relevant and irrelevant information is key to any modeling process\nor scientific inquiry. Theoretical physics offers a powerful tool for achieving\nthis in the form of the renormalization group (RG). Here we demonstrate a\npractical approach to performing Wilsonian RG in the context of Gaussian\nProcess (GP) Regression. We systematically integrate out the unlearnable modes\nof the GP kernel, thereby obtaining an RG flow of the GP in which the data sets\nthe IR scale. 
In simple cases, this results in a universal flow of the ridge\nparameter, which becomes input-dependent in the richer scenario in which\nnon-Gaussianities are included. In addition to being analytically tractable,\nthis approach goes beyond structural analogies between RG and neural networks\nby providing a natural connection between RG flow and learnable vs. unlearnable\nmodes. Studying such flows may improve our understanding of feature learning in\ndeep neural networks, and enable us to identify potential universality classes\nin these models.\n","authors":["Jessica N. Howard","Ro Jefferson","Anindita Maiti","Zohar Ringel"],"pdf_url":"https://arxiv.org/pdf/2405.06008v2.pdf","comment":"17 pages, 1 figure; rewrote introduction, added references, section\n IIIA, section IVA, and appendix C"},{"id":"http://arxiv.org/abs/2402.01995v5","updated":"2024-08-14T06:02:40Z","published":"2024-02-03T02:36:59Z","title":"Online Uniform Allocation:Randomized Learning-Augmented Approximation\n Algorithms with Application to Digital Health","summary":" Motivated by applications in digital health, this work studies the novel\nproblem of online uniform allocation (OUA), where the goal is to distribute a\nbudget uniformly across unknown decision times. In the OUA problem, the\nalgorithm is given a budget $b$ and a time horizon $T$, and an adversary then\nchooses a value $\\tau^* \\in [b,T]$, which is revealed to the algorithm online.\nAt each decision time $i \\in [\\tau^*]$, the algorithm must determine a\nprobability that maximizes the budget spent throughout the horizon, respecting\nbudget constraint $b$, while achieving as uniform a distribution as possible\nover $\\tau^*$. We present the first randomized algorithm designed for this\nproblem and subsequently extend it to incorporate learning augmentation. We\nprovide worst-case approximation guarantees for both algorithms, and illustrate\nthe utility of the algorithms through both synthetic experiments and a\nreal-world case study involving the HeartSteps mobile application. Our\nnumerical results show strong empirical average performance of our proposed\nrandomized algorithms against previously proposed heuristic solutions.\n","authors":["Xueqing Liu","Kyra Gan","Esmaeil Keyvanshokooh","Susan Murphy"],"pdf_url":"https://arxiv.org/pdf/2402.01995v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.07307v1","updated":"2024-08-14T05:57:56Z","published":"2024-08-14T05:57:56Z","title":"Nonlocal Attention Operator: Materializing Hidden Knowledge Towards\n Interpretable Physics Discovery","summary":" Despite the recent popularity of attention-based neural architectures in core\nAI fields like natural language processing (NLP) and computer vision (CV),\ntheir potential in modeling complex physical systems remains under-explored.\nLearning problems in physical systems are often characterized as discovering\noperators that map between function spaces based on a few instances of function\npairs. This task frequently presents a severely ill-posed PDE inverse problem.\nIn this work, we propose a novel neural operator architecture based on the\nattention mechanism, which we coin Nonlocal Attention Operator (NAO), and\nexplore its capability towards developing a foundation physical model. 
In\nparticular, we show that the attention mechanism is equivalent to a double\nintegral operator that enables nonlocal interactions among spatial tokens, with\na data-dependent kernel characterizing the inverse mapping from data to the\nhidden parameter field of the underlying operator. As such, the attention\nmechanism extracts global prior information from training data generated by\nmultiple systems, and suggests the exploratory space in the form of a nonlinear\nkernel map. Consequently, NAO can address ill-posedness and rank deficiency in\ninverse PDE problems by encoding regularization and achieving generalizability.\nWe empirically demonstrate the advantages of NAO over baseline neural models in\nterms of generalizability to unseen data resolutions and system states. Our\nwork not only suggests a novel neural operator architecture for learning\ninterpretable foundation models of physical systems, but also offers a new\nperspective towards understanding the attention mechanism.\n","authors":["Yue Yu","Ning Liu","Fei Lu","Tian Gao","Siavash Jafarzadeh","Stewart Silling"],"pdf_url":"https://arxiv.org/pdf/2408.07307v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.10927v3","updated":"2024-08-14T05:54:52Z","published":"2023-11-18T01:21:54Z","title":"Learning Payment-Free Resource Allocation Mechanisms","summary":" We consider the design of mechanisms that allocate limited resources among\nself-interested agents using neural networks. Unlike the recent works that\nleverage machine learning for revenue maximization in auctions, we consider\nwelfare maximization as the key objective in the payment-free setting. Without\npayment exchange, it is unclear how we can align agents' incentives to achieve\nthe desired objectives of truthfulness and social welfare simultaneously,\nwithout resorting to approximations. Our work makes novel contributions by\ndesigning an approximate mechanism that desirably trade-off social welfare with\ntruthfulness. Specifically, (i) we contribute a new end-to-end neural network\narchitecture, ExS-Net, that accommodates the idea of \"money-burning\" for\nmechanism design without payments; (ii)~we provide a generalization bound that\nguarantees the mechanism performance when trained under finite samples; and\n(iii) we provide an experimental demonstration of the merits of the proposed\nmechanism.\n","authors":["Sihan Zeng","Sujay Bhatt","Eleonora Kreacic","Parisa Hassanzadeh","Alec Koppel","Sumitra Ganesh"],"pdf_url":"https://arxiv.org/pdf/2311.10927v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.07305v1","updated":"2024-08-14T05:44:56Z","published":"2024-08-14T05:44:56Z","title":"Learning Decisions Offline from Censored Observations with\n ε-insensitive Operational Costs","summary":" Many important managerial decisions are made based on censored observations.\nMaking decisions without adequately handling the censoring leads to inferior\noutcomes. We investigate the data-driven decision-making problem with an\noffline dataset containing the feature data and the censored historical data of\nthe variable of interest without the censoring indicators. Without assuming the\nunderlying distribution, we design and leverage {\\epsilon}-insensitive\noperational costs to deal with the unobserved censoring in an offline\ndata-driven fashion. 
We demonstrate the customization of the\n{\\epsilon}-insensitive operational costs for a newsvendor problem and use such\ncosts to train two representative ML models, including linear regression (LR)\nmodels and neural networks (NNs). We derive tight generalization bounds for the\ncustom LR model without regularization (LR-{\\epsilon}NVC) and with\nregularization (LR-{\\epsilon}NVC-R), and a high-probability generalization\nbound for the custom NN (NN-{\\epsilon}NVC) trained by stochastic gradient\ndescent. The theoretical results reveal the stability and learnability of\nLR-{\\epsilon}NVC, LR-{\\epsilon}NVC-R and NN-{\\epsilon}NVC. We conduct extensive\nnumerical experiments to compare LR-{\\epsilon}NVC-R and NN-{\\epsilon}NVC with\ntwo existing approaches, estimate-as-solution (EAS) and integrated estimation\nand optimization (IEO). The results show that LR-{\\epsilon}NVC-R and\nNN-{\\epsilon}NVC outperform both EAS and IEO, with maximum cost savings up to\n14.40% and 12.21% compared to the lowest cost generated by the two existing\napproaches. In addition, LR-{\\epsilon}NVC-R's and NN-{\\epsilon}NVC's order\nquantities are statistically significantly closer to the optimal solutions\nshould the underlying distribution be known.\n","authors":["Minxia Chen","Ke Fu","Teng Huang","Miao Bai"],"pdf_url":"https://arxiv.org/pdf/2408.07305v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.06799v2","updated":"2024-08-14T05:44:37Z","published":"2024-08-13T10:42:32Z","title":"On a Scale-Invariant Approach to Bundle Recommendations in Candy Crush\n Saga","summary":" A good understanding of player preferences is crucial for increasing content\nrelevancy, especially in mobile games. This paper illustrates the use of\nattentive models for producing item recommendations in a mobile game scenario.\nThe methodology comprises a combination of supervised and unsupervised\napproaches to create user-level recommendations while introducing a novel\nscale-invariant approach to the prediction. The methodology is subsequently\napplied to a bundle recommendation in Candy Crush Saga. The strategy of\ndeployment, maintenance, and monitoring of ML models that are scaled up to\nserve millions of users is presented, along with the best practices and design\npatterns adopted to minimize technical debt typical of ML systems. The\nrecommendation approach is evaluated both offline and online, with a focus on\nunderstanding the increase in engagement, click- and take rates, novelty\neffects, recommendation diversity, and the impact of degenerate feedback loops.\nWe have demonstrated that the recommendation enhances user engagement by 30%\nconcerning click rate and by more than 40% concerning take rate. In addition,\nwe empirically quantify the diminishing effects of recommendation accuracy on\nuser engagement.\n","authors":["Styliani Katsarou","Francesca Carminati","Martin Dlask","Marta Braojos","Lavena Patra","Richard Perkins","Carlos Garcia Ling","Maria Paskevich"],"pdf_url":"https://arxiv.org/pdf/2408.06799v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.07303v1","updated":"2024-08-14T05:18:43Z","published":"2024-08-14T05:18:43Z","title":"Enhancing Visual Question Answering through Ranking-Based Hybrid\n Training and Multimodal Fusion","summary":" Visual Question Answering (VQA) is a challenging task that requires systems\nto provide accurate answers to questions based on image content. 
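The censored-decisions abstract above builds on ε-insensitive operational costs. As a reminder of the generic ε-insensitive loss that such costs are modeled on (the paper's newsvendor-specific construction is not reproduced here):

```python
import numpy as np

def eps_insensitive_loss(y_true, y_pred, eps=0.1):
    """Generic epsilon-insensitive loss, max(0, |y - y_hat| - eps): errors smaller
    than eps incur no cost. The paper customizes costs of this flavor for the
    newsvendor problem; this sketch only shows the generic form."""
    return np.maximum(0.0, np.abs(np.asarray(y_true) - np.asarray(y_pred)) - eps)
```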
Current VQA\nmodels struggle with complex questions due to limitations in capturing and\nintegrating multimodal information effectively. To address these challenges, we\npropose the Rank VQA model, which leverages a ranking-inspired hybrid training\nstrategy to enhance VQA performance. The Rank VQA model integrates high-quality\nvisual features extracted using the Faster R-CNN model and rich semantic text\nfeatures obtained from a pre-trained BERT model. These features are fused\nthrough a sophisticated multimodal fusion technique employing multi-head\nself-attention mechanisms. Additionally, a ranking learning module is\nincorporated to optimize the relative ranking of answers, thus improving answer\naccuracy. The hybrid training strategy combines classification and ranking\nlosses, enhancing the model's generalization ability and robustness across\ndiverse datasets. Experimental results demonstrate the effectiveness of the\nRank VQA model. Our model significantly outperforms existing state-of-the-art\nmodels on standard VQA datasets, including VQA v2.0 and COCO-QA, in terms of\nboth accuracy and Mean Reciprocal Rank (MRR). The superior performance of Rank\nVQA is evident in its ability to handle complex questions that require\nunderstanding nuanced details and making sophisticated inferences from the\nimage and text. This work highlights the effectiveness of a ranking-based\nhybrid training strategy in improving VQA performance and lays the groundwork\nfor further research in multimodal learning methods.\n","authors":["Peiyuan Chen","Zecheng Zhang","Yiping Dong","Li Zhou","Han Wang"],"pdf_url":"https://arxiv.org/pdf/2408.07303v1.pdf","comment":"Visual Question Answering, Rank VQA, Faster R-CNN, BERT, Multimodal\n Fusion, Ranking Learning, Hybrid Training Strategy"},{"id":"http://arxiv.org/abs/2408.06891v2","updated":"2024-08-14T05:16:46Z","published":"2024-08-13T13:38:32Z","title":"Automatic Feature Recognition and Dimensional Attributes Extraction From\n CAD Models for Hybrid Additive-Subtractive Manufacturing","summary":" The integration of Computer-Aided Design (CAD), Computer-Aided Process\nPlanning (CAPP), and Computer-Aided Manufacturing (CAM) plays a crucial role in\nmodern manufacturing, facilitating seamless transitions from digital designs to\nphysical products. However, a significant challenge within this integration is\nthe Automatic Feature Recognition (AFR) of CAD models, especially in the\ncontext of hybrid manufacturing that combines subtractive and additive\nmanufacturing processes. Traditional AFR methods, focused mainly on the\nidentification of subtractive (machined) features including holes, fillets,\nchamfers, pockets, and slots, fail to recognize features pertinent to additive\nmanufacturing. Furthermore, the traditional methods fall short in accurately\nextracting geometric dimensions and orientations, which are also key factors\nfor effective manufacturing process planning. This paper presents a novel\napproach for creating a synthetic CAD dataset that encompasses features\nrelevant to both additive and subtractive machining through Python Open\nCascade. The Hierarchical Graph Convolutional Neural Network (HGCNN) model is\nimplemented to accurately identify the composite additive-subtractive features\nwithin the synthetic CAD dataset. The key novelty and contribution of the\nproposed methodology lie in its ability to recognize a wide range of\nmanufacturing features, and precisely extracting their dimensions,\norientations, and stock sizes. 
The proposed model demonstrates remarkable\nfeature recognition accuracy exceeding 97% and a dimension extraction accuracy\nof 100% for identified features. Therefore, the proposed methodology enhances\nthe integration of CAD, CAPP, and CAM within hybrid manufacturing by providing\nprecise feature recognition and dimension extraction. It facilitates improved\nmanufacturing process planning, by enabling more informed decision-making.\n","authors":["Muhammad Tayyab Khan","Wenhe Feng","Lequn Chen","Ye Han Ng","Nicholas Yew Jin Tan","Seung Ki Moon"],"pdf_url":"https://arxiv.org/pdf/2408.06891v2.pdf","comment":"10 pages, 12 figures. This paper has been accepted for presentation\n at the ASME IDETC-CIE 2024 conference"},{"id":"http://arxiv.org/abs/2408.07292v1","updated":"2024-08-14T04:51:33Z","published":"2024-08-14T04:51:33Z","title":"LiPCoT: Linear Predictive Coding based Tokenizer for Self-supervised\n Learning of Time Series Data via Language Models","summary":" Language models have achieved remarkable success in various natural language\nprocessing tasks. However, their application to time series data, a crucial\ncomponent in many domains, remains limited. This paper proposes LiPCoT (Linear\nPredictive Coding based Tokenizer for time series), a novel tokenizer that\nencodes time series data into a sequence of tokens, enabling self-supervised\nlearning of time series using existing Language model architectures such as\nBERT. Unlike traditional time series tokenizers that rely heavily on CNN\nencoder for time series feature generation, LiPCoT employs stochastic modeling\nthrough linear predictive coding to create a latent space for time series\nproviding a compact yet rich representation of the inherent stochastic nature\nof the data. Furthermore, LiPCoT is computationally efficient and can\neffectively handle time series data with varying sampling rates and lengths,\novercoming common limitations of existing time series tokenizers. In this\nproof-of-concept work, we present the effectiveness of LiPCoT in classifying\nParkinson's disease (PD) using an EEG dataset from 46 participants. In\nparticular, we utilize LiPCoT to encode EEG data into a small vocabulary of\ntokens and then use BERT for self-supervised learning and the downstream task\nof PD classification. We benchmark our approach against several\nstate-of-the-art CNN-based deep learning architectures for PD detection. Our\nresults reveal that BERT models utilizing self-supervised learning outperformed\nthe best-performing existing method by 7.1% in precision, 2.3% in recall, 5.5%\nin accuracy, 4% in AUC, and 5% in F1-score highlighting the potential for\nself-supervised learning even on small datasets. Our work will inform future\nfoundational models for time series, particularly for self-supervised learning.\n","authors":["Md Fahim Anjum"],"pdf_url":"https://arxiv.org/pdf/2408.07292v1.pdf","comment":"17 pages, 5 figures"},{"id":"http://arxiv.org/abs/2407.02539v3","updated":"2024-08-14T04:49:22Z","published":"2024-07-02T00:44:06Z","title":"Research on Autonomous Robots Navigation based on Reinforcement Learning","summary":" Reinforcement learning continuously optimizes decision-making based on\nreal-time feedback reward signals through continuous interaction with the\nenvironment, demonstrating strong adaptive and self-learning capabilities. In\nrecent years, it has become one of the key methods to achieve autonomous\nnavigation of robots. 
In this work, an autonomous robot navigation method based\non reinforcement learning is introduced. We use the Deep Q Network (DQN) and\nProximal Policy Optimization (PPO) models to optimize the path planning and\ndecision-making process through the continuous interaction between the robot\nand the environment, and the reward signals with real-time feedback. By\ncombining the Q-value function with the deep neural network, deep Q network can\nhandle high-dimensional state space, so as to realize path planning in complex\nenvironments. Proximal policy optimization is a strategy gradient-based method,\nwhich enables robots to explore and utilize environmental information more\nefficiently by optimizing policy functions. These methods not only improve the\nrobot's navigation ability in the unknown environment, but also enhance its\nadaptive and self-learning capabilities. Through multiple training and\nsimulation experiments, we have verified the effectiveness and robustness of\nthese models in various complex scenarios.\n","authors":["Zixiang Wang","Hao Yan","Yining Wang","Zhengjia Xu","Zhuoyue Wang","Zhizhong Wu"],"pdf_url":"https://arxiv.org/pdf/2407.02539v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.17712v2","updated":"2024-08-14T04:26:38Z","published":"2023-10-26T18:16:23Z","title":"Community Detection Guarantees Using Embeddings Learned by Node2Vec","summary":" Embedding the nodes of a large network into an Euclidean space is a common\nobjective in modern machine learning, with a variety of tools available. These\nembeddings can then be used as features for tasks such as community\ndetection/node clustering or link prediction, where they achieve state of the\nart performance. With the exception of spectral clustering methods, there is\nlittle theoretical understanding for commonly used approaches to learning\nembeddings. In this work we examine the theoretical properties of the\nembeddings learned by node2vec. Our main result shows that the use of $k$-means\nclustering on the embedding vectors produced by node2vec gives weakly\nconsistent community recovery for the nodes in (degree corrected) stochastic\nblock models. We also discuss the use of these embeddings for node and link\nprediction tasks. We demonstrate this result empirically, and examine how this\nrelates to other embedding tools for network data.\n","authors":["Andrew Davison","S. Carlyle Morgan","Owen G. Ward"],"pdf_url":"https://arxiv.org/pdf/2310.17712v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.06909v3","updated":"2024-08-14T04:26:03Z","published":"2023-06-12T07:27:31Z","title":"Graph Agent Network: Empowering Nodes with Inference Capabilities for\n Adversarial Resilience","summary":" End-to-end training with global optimization have popularized graph neural\nnetworks (GNNs) for node classification, yet inadvertently introduced\nvulnerabilities to adversarial edge-perturbing attacks. Adversaries can exploit\nthe inherent opened interfaces of GNNs' input and output, perturbing critical\nedges and thus manipulating the classification results. Current defenses, due\nto their persistent utilization of global-optimization-based end-to-end\ntraining schemes, inherently encapsulate the vulnerabilities of GNNs. This is\nspecifically evidenced in their inability to defend against targeted secondary\nattacks. In this paper, we propose the Graph Agent Network (GAgN) to address\nthe aforementioned vulnerabilities of GNNs. 
GAgN is a graph-structured agent\nnetwork in which each node is designed as an 1-hop-view agent. Through the\ndecentralized interactions between agents, they can learn to infer global\nperceptions to perform tasks including inferring embeddings, degrees and\nneighbor relationships for given nodes. This empowers nodes to filtering\nadversarial edges while carrying out classification tasks. Furthermore, agents'\nlimited view prevents malicious messages from propagating globally in GAgN,\nthereby resisting global-optimization-based secondary attacks. We prove that\nsingle-hidden-layer multilayer perceptrons (MLPs) are theoretically sufficient\nto achieve these functionalities. Experimental results show that GAgN\neffectively implements all its intended capabilities and, compared to\nstate-of-the-art defenses, achieves optimal classification accuracy on the\nperturbed datasets.\n","authors":["Ao Liu","Wenshan Li","Tao Li","Beibei Li","Guangquan Xu","Pan Zhou","Wengang Ma","Hanyuan Huang"],"pdf_url":"https://arxiv.org/pdf/2306.06909v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.13764v2","updated":"2024-08-14T04:05:23Z","published":"2023-12-21T11:43:41Z","title":"A Semantic Space is Worth 256 Language Descriptions: Make Stronger\n Segmentation Models with Descriptive Properties","summary":" This paper introduces ProLab, a novel approach using property-level label\nspace for creating strong interpretable segmentation models. Instead of relying\nsolely on category-specific annotations, ProLab uses descriptive properties\ngrounded in common sense knowledge for supervising segmentation models. It is\nbased on two core designs. First, we employ Large Language Models (LLMs) and\ncarefully crafted prompts to generate descriptions of all involved categories\nthat carry meaningful common sense knowledge and follow a structured format.\nSecond, we introduce a description embedding model preserving semantic\ncorrelation across descriptions and then cluster them into a set of descriptive\nproperties (e.g., 256) using K-Means. These properties are based on\ninterpretable common sense knowledge consistent with theories of human\nrecognition. We empirically show that our approach makes segmentation models\nperform stronger on five classic benchmarks (e.g., ADE20K, COCO-Stuff, Pascal\nContext, Cityscapes, and BDD). Our method also shows better scalability with\nextended training steps than category-level supervision. Our interpretable\nsegmentation framework also emerges with the generalization ability to segment\nout-of-domain or unknown categories using only in-domain descriptive\nproperties. Code is available at https://github.com/lambert-x/ProLab.\n","authors":["Junfei Xiao","Ziqi Zhou","Wenxuan Li","Shiyi Lan","Jieru Mei","Zhiding Yu","Alan Yuille","Yuyin Zhou","Cihang Xie"],"pdf_url":"https://arxiv.org/pdf/2312.13764v2.pdf","comment":"Preprint. Code is available at https://github.com/lambert-x/ProLab"},{"id":"http://arxiv.org/abs/2312.06902v2","updated":"2024-08-14T03:54:57Z","published":"2023-12-12T00:16:18Z","title":"Perseus: Reducing Energy Bloat in Large Model Training","summary":" Training large AI models on numerous GPUs consumes a massive amount of\nenergy, making power delivery one of the largest limiting factors in building\nand operating datacenters for AI workloads. However, we observe that not all\nenergy consumed during training directly contributes to end-to-end throughput,\nand a significant portion can be removed without slowing down training, which\nwe call energy bloat. 
In this work, we identify two independent sources of\nenergy bloat in large model training and propose Perseus, a training system\nthat mitigates both. To do this, Perseus obtains the \"iteration time-energy\"\nPareto frontier of any large model training job using an efficient graph\ncut-based algorithm and schedules the energy consumption of computations across\ntime to remove both types of energy bloat. Evaluation on large models including\nGPT-3 and Bloom shows that Perseus reduces the energy consumption of large\nmodel training by up to 30% without any throughput loss or hardware\nmodification, enabling energy reduction -- and therefore cost savings --\notherwise unattainable before.\n","authors":["Jae-Won Chung","Yile Gu","Insu Jang","Luoxi Meng","Nikhil Bansal","Mosharaf Chowdhury"],"pdf_url":"https://arxiv.org/pdf/2312.06902v2.pdf","comment":"SOSP 24 | Open-source part of Zeus at\n https://ml.energy/zeus/research_overview/perseus/"},{"id":"http://arxiv.org/abs/2401.09949v2","updated":"2024-08-14T03:45:37Z","published":"2024-01-18T12:51:38Z","title":"SymbolNet: Neural Symbolic Regression with Adaptive Dynamic Pruning","summary":" Contrary to genetic programming, the neural network approach to symbolic\nregression can efficiently handle high-dimensional inputs and leverage gradient\nmethods for faster equation searching. Common ways of constraining expression\ncomplexity often involve multistage pruning with fine-tuning, which can result\nin significant performance loss. In this work, we propose $\\tt{SymbolNet}$, a\nneural network approach to symbolic regression in a novel framework that allows\ndynamic pruning of model weights, input features, and mathematical operators in\na single training process, where both training loss and expression complexity\nare optimized simultaneously. We introduce a sparsity regularization term for\neach pruning type, which can adaptively adjust its strength, leading to\nconvergence at a target sparsity ratio. Unlike most existing symbolic\nregression methods that struggle with datasets containing more than\n$\\mathcal{O}(10)$ inputs, we demonstrate the effectiveness of our model on the\nLHC jet tagging task (16 inputs), MNIST (784 inputs), and SVHN (3072 inputs).\nOur approach enables symbolic regression to achieve fast inference with\nnanosecond-scale latency on FPGAs for high-dimensional datasets in environments\nwith stringent computational resource constraints, such as the high-energy\nphysics experiments at the LHC.\n","authors":["Ho Fung Tsoi","Vladimir Loncar","Sridhara Dasu","Philip Harris"],"pdf_url":"https://arxiv.org/pdf/2401.09949v2.pdf","comment":"24 pages. Minor fixes and formatting, under review"},{"id":"http://arxiv.org/abs/2312.03690v3","updated":"2024-08-14T03:25:29Z","published":"2023-12-06T18:53:45Z","title":"AI-guided inverse design and discovery of recyclable vitrimeric polymers","summary":" Vitrimer is a new, exciting class of sustainable polymers with the ability to\nheal due to their dynamic covalent adaptive network that can go through\nassociative rearrangement reactions. However, a limited choice of constituent\nmolecules restricts their property space, prohibiting full realization of their\npotential applications. To overcome this challenge, we couple molecular\ndynamics (MD) simulations and a novel graph variational autoencoder (VAE)\nmachine learning model for inverse design of vitrimer chemistries with desired\nglass transition temperature (Tg) and synthesize a novel vitrimer polymer. 
We\nbuild the first vitrimer dataset of one million chemistries and calculate Tg on\n8,424 of them by high-throughput MD simulations calibrated by a Gaussian\nprocess model. The proposed novel VAE employs dual graph encoders and a latent\ndimension overlapping scheme which allows for individual representation of\nmulti-component vitrimers. By constructing a continuous latent space containing\nnecessary information of vitrimers, we demonstrate high accuracy and efficiency\nof our framework in discovering novel vitrimers with desirable Tg beyond the\ntraining regime. To validate the effectiveness of our framework in experiments,\nwe generate novel vitrimer chemistries with a target Tg = 323 K. By\nincorporating chemical intuition, we synthesize a vitrimer with Tg of 311-317\nK, and experimentally demonstrate healability and flowability. The proposed\nframework offers an exciting tool for polymer chemists to design and synthesize\nnovel, sustainable vitrimer polymers for a facet of applications.\n","authors":["Yiwen Zheng","Prakash Thakolkaran","Agni K. Biswal","Jake A. Smith","Ziheng Lu","Shuxin Zheng","Bichlien H. Nguyen","Siddhant Kumar","Aniruddh Vashisth"],"pdf_url":"https://arxiv.org/pdf/2312.03690v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.07262v1","updated":"2024-08-14T02:57:38Z","published":"2024-08-14T02:57:38Z","title":"Ensemble architecture in polyp segmentation","summary":" In this research, we revisit the architecture of semantic segmentation and\nevaluate the models excelling in polyp segmentation. We introduce an integrated\nframework that harnesses the advantages of different models to attain an\noptimal outcome. More specifically, we fuse the learned features from\nconvolutional and transformer models for prediction, and we view this approach\nas an ensemble technique to enhance model performance. Our experiments on polyp\nsegmentation reveal that the proposed architecture surpasses other top models,\nexhibiting improved learning capacity and resilience. The code is available at\nhttps://github.com/HuangDLab/EnFormer.\n","authors":["Hao-Yun Hsu","Yi-Ching Cheng","Guan-Hua Huang"],"pdf_url":"https://arxiv.org/pdf/2408.07262v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.13777v2","updated":"2024-08-14T02:51:54Z","published":"2023-08-26T06:03:06Z","title":"Self-Supervised Scalable Deep Compressed Sensing","summary":" Compressed sensing (CS) is a promising tool for reducing sampling costs.\nCurrent deep neural network (NN)-based CS methods face the challenges of\ncollecting labeled measurement-ground truth (GT) data and generalizing to real\napplications. This paper proposes a novel $\\mathbf{S}$elf-supervised\ns$\\mathbf{C}$alable deep CS method, comprising a deep $\\mathbf{L}$earning\nscheme called $\\mathbf{SCL}$ and a family of $\\mathbf{Net}$works named\n$\\mathbf{SCNet}$, which does not require GT and can handle arbitrary sampling\nratios and matrices once trained on a partial measurement set. Our SCL contains\na dual-domain loss and a four-stage recovery strategy. The former encourages a\ncross-consistency on two measurement parts and a sampling-reconstruction\ncycle-consistency regarding arbitrary ratios and matrices to maximize\ndata/information utilization. The latter can progressively leverage common\nsignal prior in external measurements and internal characteristics of test\nsamples and learned NNs to improve accuracy. 
SCNet combines both the explicit\nguidance from optimization algorithms with implicit regularization from\nadvanced NN blocks to learn a collaborative signal representation. Our\ntheoretical analyses and experiments on simulated and real captured data,\ncovering 1-/2-/3-D natural and scientific signals, demonstrate the\neffectiveness, superior performance, flexibility, and generalization ability of\nour method over existing self-supervised methods and its significant potential\nin competing against state-of-the-art supervised methods. Code is available at\nhttps://github.com/Guaishou74851/SCNet.\n","authors":["Bin Chen","Xuanyu Zhang","Shuai Liu","Yongbing Zhang","Jian Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.13777v2.pdf","comment":"Accepted by Internaltional Journal of Computer Vision"},{"id":"http://arxiv.org/abs/2402.06457v2","updated":"2024-08-14T02:41:48Z","published":"2024-02-09T15:02:56Z","title":"V-STaR: Training Verifiers for Self-Taught Reasoners","summary":" Common self-improvement approaches for large language models (LLMs), such as\nSTaR, iteratively fine-tune LLMs on self-generated solutions to improve their\nproblem-solving ability. However, these approaches discard the large amounts of\nincorrect solutions generated during this process, potentially neglecting\nvaluable information in such solutions. To address this shortcoming, we propose\nV-STaR that utilizes both the correct and incorrect solutions generated during\nthe self-improvement process to train a verifier using DPO that judges\ncorrectness of model-generated solutions. This verifier is used at inference\ntime to select one solution among many candidate solutions. Running V-STaR for\nmultiple iterations results in progressively better reasoners and verifiers,\ndelivering a 4% to 17% test accuracy improvement over existing self-improvement\nand verification approaches on common code generation and math reasoning\nbenchmarks with LLaMA2 models.\n","authors":["Arian Hosseini","Xingdi Yuan","Nikolay Malkin","Aaron Courville","Alessandro Sordoni","Rishabh Agarwal"],"pdf_url":"https://arxiv.org/pdf/2402.06457v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.07254v1","updated":"2024-08-14T02:13:35Z","published":"2024-08-14T02:13:35Z","title":"Learning Multi-Index Models with Neural Networks via Mean-Field Langevin\n Dynamics","summary":" We study the problem of learning multi-index models in high-dimensions using\na two-layer neural network trained with the mean-field Langevin algorithm.\nUnder mild distributional assumptions on the data, we characterize the\neffective dimension $d_{\\mathrm{eff}}$ that controls both sample and\ncomputational complexity by utilizing the adaptivity of neural networks to\nlatent low-dimensional structures. When the data exhibit such a structure,\n$d_{\\mathrm{eff}}$ can be significantly smaller than the ambient dimension. We\nprove that the sample complexity grows almost linearly with $d_{\\mathrm{eff}}$,\nbypassing the limitations of the information and generative exponents that\nappeared in recent analyses of gradient-based feature learning. On the other\nhand, the computational complexity may inevitably grow exponentially with\n$d_{\\mathrm{eff}}$ in the worst-case scenario. Motivated by improving\ncomputational complexity, we take the first steps towards polynomial time\nconvergence of the mean-field Langevin algorithm by investigating a setting\nwhere the weights are constrained to be on a compact manifold with positive\nRicci curvature, such as the hypersphere. 
There, we study assumptions under\nwhich polynomial time convergence is achievable, whereas similar assumptions in\nthe Euclidean setting lead to exponential time complexity.\n","authors":["Alireza Mousavi-Hosseini","Denny Wu","Murat A. Erdogdu"],"pdf_url":"https://arxiv.org/pdf/2408.07254v1.pdf","comment":"35 pages, 1 figure"},{"id":"http://arxiv.org/abs/2408.07253v1","updated":"2024-08-14T02:06:24Z","published":"2024-08-14T02:06:24Z","title":"All-around Neural Collapse for Imbalanced Classification","summary":" Neural Collapse (NC) presents an elegant geometric structure that enables\nindividual activations (features), class means and classifier (weights) vectors\nto reach \\textit{optimal} inter-class separability during the terminal phase of\ntraining on a \\textit{balanced} dataset. Once shifted to imbalanced\nclassification, such an optimal structure of NC can be readily destroyed by the\nnotorious \\textit{minority collapse}, where the classifier vectors\ncorresponding to the minority classes are squeezed. In response, existing works\nendeavor to recover NC typically by optimizing classifiers. However, we\ndiscover that this squeezing phenomenon is not only confined to classifier\nvectors but also occurs with class means.\n Consequently, reconstructing NC solely at the classifier aspect may be\nfutile, as the feature means remain compressed, leading to the violation of\ninherent \\textit{self-duality} in NC (\\textit{i.e.}, class means and classifier\nvectors converge mutually) and incidentally, resulting in an unsatisfactory\ncollapse of individual activations towards the corresponding class means. To\nshake off these dilemmas, we present a unified \\textbf{All}-around\n\\textbf{N}eural \\textbf{C}ollapse framework (AllNC), aiming to comprehensively\nrestore NC across multiple aspects including individual activations, class\nmeans and classifier vectors. We thoroughly analyze its effectiveness and\nverify on multiple benchmark datasets that it achieves state-of-the-art in both\nbalanced and imbalanced settings.\n","authors":["Enhao Zhang","Chaohua Li","Chuanxing Geng","Songcan Chen"],"pdf_url":"https://arxiv.org/pdf/2408.07253v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.08729v2","updated":"2024-08-14T01:46:33Z","published":"2024-07-11T17:58:10Z","title":"BiEquiFormer: Bi-Equivariant Representations for Global Point Cloud\n Registration","summary":" The goal of this paper is to address the problem of global point cloud\nregistration (PCR) i.e., finding the optimal alignment between point clouds\nirrespective of the initial poses of the scans. This problem is notoriously\nchallenging for classical optimization methods due to computational\nconstraints. First, we show that state-of-the-art deep learning methods suffer\nfrom huge performance degradation when the point clouds are arbitrarily placed\nin space. We propose that equivariant deep learning should be utilized for\nsolving this task and we characterize the specific type of bi-equivariance of\nPCR. Then, we design BiEquiformer a novel and scalable bi-equivariant pipeline\ni.e. equivariant to the independent transformations of the input point clouds.\nWhile a naive approach would process the point clouds independently we design\nexpressive bi-equivariant layers that fuse the information from both point\nclouds. This allows us to extract high-quality superpoint correspondences and\nin turn, robust point-cloud registration. 
Extensive comparisons against\nstate-of-the-art methods show that our method achieves comparable performance\nin the canonical setting and superior performance in the robust setting in both\nthe 3DMatch and the challenging low-overlap 3DLoMatch dataset.\n","authors":["Stefanos Pertigkiozoglou","Evangelos Chatzipantazis","Kostas Daniilidis"],"pdf_url":"https://arxiv.org/pdf/2407.08729v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.07247v1","updated":"2024-08-14T01:17:19Z","published":"2024-08-14T01:17:19Z","title":"BiLSTM and Attention-Based Modulation Classification of Realistic\n Wireless Signals","summary":" This work proposes a novel and efficient quadstream BiLSTM-Attention network,\nabbreviated as QSLA network, for robust automatic modulation classification\n(AMC) of wireless signals. The proposed model exploits multiple representations\nof the wireless signal as inputs to the network and the feature extraction\nprocess combines convolutional and BiLSTM layers for processing the spatial and\ntemporal features of the signal, respectively. An attention layer is used after\nthe BiLSTM layer to emphasize the important temporal features. The experimental\nresults on the recent and realistic RML22 dataset demonstrate the superior\nperformance of the proposed model with an accuracy up to around 99%. The model\nis compared with other benchmark models in the literature in terms of\nclassification accuracy, computational complexity, memory usage, and training\ntime to show the effectiveness of our proposed approach.\n","authors":["Rohit Udaiwal","Nayan Baishya","Yash Gupta","B. R. Manoj"],"pdf_url":"https://arxiv.org/pdf/2408.07247v1.pdf","comment":"Accepted at the IEEE International Conference on Signal Processing\n and Communications (SPCOM) 2024"},{"id":"http://arxiv.org/abs/2408.07246v1","updated":"2024-08-14T01:16:40Z","published":"2024-08-14T01:16:40Z","title":"Seeing and Understanding: Bridging Vision with Chemical Knowledge Via\n ChemVLM","summary":" In this technical report, we propose ChemVLM, the first open-source\nmultimodal large language model dedicated to the fields of chemistry, designed\nto address the incompatibility between chemical image understanding and text\nanalysis. Built upon the VIT-MLP-LLM architecture, we leverage ChemLLM-20B as\nthe foundational large model, endowing our model with robust capabilities in\nunderstanding and utilizing chemical text knowledge. Additionally, we employ\nInternVIT-6B as a powerful image encoder. We have curated high-quality data\nfrom the chemical domain, including molecules, reaction formulas, and chemistry\nexamination data, and compiled these into a bilingual multimodal\nquestion-answering dataset. We test the performance of our model on multiple\nopen-source benchmarks and three custom evaluation sets. Experimental results\ndemonstrate that our model achieves excellent performance, securing\nstate-of-the-art results in five out of six involved tasks. 
Our model can be\nfound at https://huggingface.co/AI4Chem/ChemVLM-26B.\n","authors":["Junxian Li","Di Zhang","Xunzhi Wang","Zeying Hao","Jingdi Lei","Qian Tan","Cai Zhou","Wei Liu","Weiyun Wang","Zhe Chen","Wenhai Wang","Wei Li","Shufei Zhang","Mao Su","Wanli Ouyang","Yuqiang Li","Dongzhan Zhou"],"pdf_url":"https://arxiv.org/pdf/2408.07246v1.pdf","comment":"Techical report"},{"id":"http://arxiv.org/abs/2408.07245v1","updated":"2024-08-14T01:01:35Z","published":"2024-08-14T01:01:35Z","title":"q-exponential family for policy optimization","summary":" Policy optimization methods benefit from a simple and tractable policy\nfunctional, usually the Gaussian for continuous action spaces. In this paper,\nwe consider a broader policy family that remains tractable: the $q$-exponential\nfamily. This family of policies is flexible, allowing the specification of both\nheavy-tailed policies ($q>1$) and light-tailed policies ($q<1$). This paper\nexamines the interplay between $q$-exponential policies for several\nactor-critic algorithms conducted on both online and offline problems. We find\nthat heavy-tailed policies are more effective in general and can consistently\nimprove on Gaussian. In particular, we find the Student's t-distribution to be\nmore stable than the Gaussian across settings and that a heavy-tailed\n$q$-Gaussian for Tsallis Advantage Weighted Actor-Critic consistently performs\nwell in offline benchmark problems. Our code is available at\n\\url{https://github.com/lingweizhu/qexp}.\n","authors":["Lingwei Zhu","Haseeb Shah","Han Wang","Martha White"],"pdf_url":"https://arxiv.org/pdf/2408.07245v1.pdf","comment":"27 pages, 12 pages main text, 15 pages appendix"},{"id":"http://arxiv.org/abs/2407.15462v2","updated":"2024-08-14T00:57:42Z","published":"2024-07-22T08:19:34Z","title":"Efficient Retrieval with Learned Similarities","summary":" Retrieval plays a fundamental role in recommendation systems, search, and\nnatural language processing by efficiently finding relevant items from a large\ncorpus given a query. Dot products have been widely used as the similarity\nfunction in such retrieval tasks, thanks to Maximum Inner Product Search (MIPS)\nthat enabled efficient retrieval based on dot products. However,\nstate-of-the-art retrieval algorithms have migrated to learned similarities.\nSuch algorithms vary in form; the queries can be represented with multiple\nembeddings, complex neural networks can be deployed, the item ids can be\ndecoded directly from queries using beam search, and multiple approaches can be\ncombined in hybrid solutions. Unfortunately, we lack efficient solutions for\nretrieval in these state-of-the-art setups. Our work investigates techniques\nfor approximate nearest neighbor search with learned similarity functions. We\nfirst prove that Mixture-of-Logits (MoL) is a universal approximator, and can\nexpress all learned similarity functions. We next propose techniques to\nretrieve the approximate top K results using MoL with a tight bound. 
We finally\ncompare our techniques with existing approaches, showing that MoL sets new\nstate-of-the-art results on recommendation retrieval tasks, and our approximate\ntop-k retrieval with learned similarities outperforms baselines by up to two\norders of magnitude in latency, while achieving > .99 recall rate of exact\nalgorithms.\n","authors":["Bailu Ding","Jiaqi Zhai"],"pdf_url":"https://arxiv.org/pdf/2407.15462v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2206.11834v4","updated":"2024-08-14T00:11:30Z","published":"2022-06-23T17:05:34Z","title":"Non-Determinism and the Lawlessness of Machine Learning Code","summary":" Legal literature on machine learning (ML) tends to focus on harms, and thus\ntends to reason about individual model outcomes and summary error rates. This\nfocus has masked important aspects of ML that are rooted in its reliance on\nrandomness -- namely, stochasticity and non-determinism. While some recent work\nhas begun to reason about the relationship between stochasticity and\narbitrariness in legal contexts, the role of non-determinism more broadly\nremains unexamined. In this paper, we clarify the overlap and differences\nbetween these two concepts, and show that the effects of non-determinism, and\nconsequently its implications for the law, become clearer from the perspective\nof reasoning about ML outputs as distributions over possible outcomes. This\ndistributional viewpoint accounts for randomness by emphasizing the possible\noutcomes of ML. Importantly, this type of reasoning is not exclusive with\ncurrent legal reasoning; it complements (and in fact can strengthen) analyses\nconcerning individual, concrete outcomes for specific automated decisions. By\nilluminating the important role of non-determinism, we demonstrate that ML code\nfalls outside of the cyberlaw frame of treating ``code as law,'' as this frame\nassumes that code is deterministic. We conclude with a brief discussion of what\nwork ML can do to constrain the potentially harm-inducing effects of\nnon-determinism, and we indicate where the law must do work to bridge the gap\nbetween its current individual-outcome focus and the distributional approach\nthat we recommend.\n","authors":["A. Feder Cooper","Jonathan Frankle","Christopher De Sa"],"pdf_url":"https://arxiv.org/pdf/2206.11834v4.pdf","comment":"Proceedings of the 2022 Symposium on Computer Science and Law (CSLAW\n '22)"},{"id":"http://arxiv.org/abs/2310.00290v6","updated":"2024-08-14T00:09:21Z","published":"2023-09-30T07:46:47Z","title":"Universality of periodic points in bounded discrete time series","summary":" We consider arbitrary bounded discrete time series originating from dynamical\nsystem. Without any use of the Fourier transform, we find periodic points which\nsuitably characterizes (i.e. independent of Lyapunov exponent) the\ncorresponding time series. In particular, bounded discrete time series\ngenerated by the autoregressive model (without the white noise) is equivalent\nto a quasi periodic function.\n","authors":["Chikara Nakayama","Tsuyoshi Yoneda"],"pdf_url":"https://arxiv.org/pdf/2310.00290v6.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.07239v1","updated":"2024-08-14T00:08:28Z","published":"2024-08-14T00:08:28Z","title":"Enhancing Autonomous Vehicle Perception in Adverse Weather through Image\n Augmentation during Semantic Segmentation Training","summary":" Robust perception is crucial in autonomous vehicle navigation and\nlocalization. 
Visual processing tasks, like semantic segmentation, should work\nin varying weather conditions and during different times of day. Semantic\nsegmentation is where each pixel is assigned a class, which is useful for\nlocating overall features (1). Training a segmentation model requires large\namounts of data, and the labeling process for segmentation data is especially\ntedious. Additionally, many large datasets include only images taken in clear\nweather. This is a problem because training a model exclusively on clear\nweather data hinders performance in adverse weather conditions like fog or\nrain. We hypothesize that given a dataset of only clear days images, applying\nimage augmentation (such as random rain, fog, and brightness) during training\nallows for domain adaptation to diverse weather conditions. We used CARLA, a 3D\nrealistic autonomous vehicle simulator, to collect 1200 images in clear weather\ncomposed of 29 classes from 10 different towns (2). We also collected 1200\nimages of random weather effects. We trained encoder-decoder UNet models to\nperform semantic segmentation. Applying augmentations significantly improved\nsegmentation under weathered night conditions (p < 0.001). However, models\ntrained on weather data have significantly lower losses than those trained on\naugmented data in all conditions except for clear days. This shows there is\nroom for improvement in the domain adaptation approach. Future work should test\nmore types of augmentations and also use real-life images instead of CARLA.\nIdeally, the augmented model meets or exceeds the performance of the weather\nmodel.\n","authors":["Ethan Kou","Noah Curran"],"pdf_url":"https://arxiv.org/pdf/2408.07239v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.07854v1","updated":"2024-08-14T23:45:21Z","published":"2024-08-14T23:45:21Z","title":"CON-FOLD -- Explainable Machine Learning with Confidence","summary":" FOLD-RM is an explainable machine learning classification algorithm that uses\ntraining data to create a set of classification rules. In this paper we\nintroduce CON-FOLD which extends FOLD-RM in several ways. CON-FOLD assigns\nprobability-based confidence scores to rules learned for a classification task.\nThis allows users to know how confident they should be in a prediction made by\nthe model. We present a confidence-based pruning algorithm that uses the unique\nstructure of FOLD-RM rules to efficiently prune rules and prevent overfitting.\nFurthermore, CON-FOLD enables the user to provide pre-existing knowledge in the\nform of logic program rules that are either (fixed) background knowledge or\n(modifiable) initial rule candidates. The paper describes our method in detail\nand reports on practical experiments. We demonstrate the performance of the\nalgorithm on benchmark datasets from the UCI Machine Learning Repository. For\nthat, we introduce a new metric, Inverse Brier Score, to evaluate the accuracy\nof the produced confidence scores. 
Finally we apply this extension to a real\nworld example that requires explainability: marking of student responses to a\nshort answer question from the Australian Physics Olympiad.\n","authors":["Lachlan McGinness","Peter Baumgartner"],"pdf_url":"https://arxiv.org/pdf/2408.07854v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.07852v1","updated":"2024-08-14T23:34:28Z","published":"2024-08-14T23:34:28Z","title":"Training Language Models on the Knowledge Graph: Insights on\n Hallucinations and Their Detectability","summary":" While many capabilities of language models (LMs) improve with increased\ntraining budget, the influence of scale on hallucinations is not yet fully\nunderstood. Hallucinations come in many forms, and there is no universally\naccepted definition. We thus focus on studying only those hallucinations where\na correct answer appears verbatim in the training set. To fully control the\ntraining data content, we construct a knowledge graph (KG)-based dataset, and\nuse it to train a set of increasingly large LMs. We find that for a fixed\ndataset, larger and longer-trained LMs hallucinate less. However, hallucinating\non $\\leq5$% of the training data requires an order of magnitude larger model,\nand thus an order of magnitude more compute, than Hoffmann et al. (2022)\nreported was optimal. Given this costliness, we study how hallucination\ndetectors depend on scale. While we see detector size improves performance on\nfixed LM's outputs, we find an inverse relationship between the scale of the LM\nand the detectability of its hallucinations.\n","authors":["Jiri Hron","Laura Culp","Gamaleldin Elsayed","Rosanne Liu","Ben Adlam","Maxwell Bileschi","Bernd Bohnet","JD Co-Reyes","Noah Fiedel","C. Daniel Freeman","Izzeddin Gur","Kathleen Kenealy","Jaehoon Lee","Peter J. Liu","Gaurav Mishra","Igor Mordatch","Azade Nova","Roman Novak","Aaron Parisi","Jeffrey Pennington","Alex Rizkowsky","Isabelle Simpson","Hanie Sedghi","Jascha Sohl-dickstein","Kevin Swersky","Sharad Vikram","Tris Warkentin","Lechao Xiao","Kelvin Xu","Jasper Snoek","Simon Kornblith"],"pdf_url":"https://arxiv.org/pdf/2408.07852v1.pdf","comment":"Published at COLM 2024. 16 pages, 11 figures"},{"id":"http://arxiv.org/abs/2408.07847v1","updated":"2024-08-14T23:09:01Z","published":"2024-08-14T23:09:01Z","title":"Time-inversion of spatiotemporal beam dynamics using uncertainty-aware\n latent evolution reversal","summary":" Charged particle dynamics under the influence of electromagnetic fields is a\nchallenging spatiotemporal problem. Many high performance physics-based\nsimulators for predicting behavior in a charged particle beam are\ncomputationally expensive, limiting their utility for solving inverse problems\nonline. The problem of estimating upstream six-dimensional phase space given\ndownstream measurements of charged particles in an accelerator is an inverse\nproblem of growing importance. This paper introduces a reverse Latent Evolution\nModel (rLEM) designed for temporal inversion of forward beam dynamics. In this\ntwo-step self-supervised deep learning framework, we utilize a Conditional\nVariational Autoencoder (CVAE) to project 6D phase space projections of a\ncharged particle beam into a lower-dimensional latent distribution.\nSubsequently, we autoregressively learn the inverse temporal dynamics in the\nlatent space using a Long Short-Term Memory (LSTM) network. 
The coupled\nCVAE-LSTM framework can predict 6D phase space projections across all upstream\naccelerating sections based on single or multiple downstream phase space\nmeasurements as inputs. The proposed model also captures the aleatoric\nuncertainty of the high-dimensional input data within the latent space. This\nuncertainty, which reflects potential uncertain measurements at a given module,\nis propagated through the LSTM to estimate uncertainty bounds for all upstream\npredictions, demonstrating the robustness of the LSTM against in-distribution\nvariations in the input data.\n","authors":["Mahindra Rautela","Alan Williams","Alexander Scheinker"],"pdf_url":"https://arxiv.org/pdf/2408.07847v1.pdf","comment":"arXiv admin note: text overlap with arXiv:2403.13858"},{"id":"http://arxiv.org/abs/2408.07845v1","updated":"2024-08-14T23:01:02Z","published":"2024-08-14T23:01:02Z","title":"Enhancing Equitable Access to AI in Housing and Homelessness System of\n Care through Federated Learning","summary":" The top priority of a Housing and Homelessness System of Care (HHSC) is to\nconnect people experiencing homelessness to supportive housing. An HHSC\ntypically consists of many agencies serving the same population. Information\ntechnology platforms differ in type and quality between agencies, so their data\nare usually isolated from one agency to another. Larger agencies may have\nsufficient data to train and test artificial intelligence (AI) tools but\nsmaller agencies typically do not. To address this gap, we introduce a\nFederated Learning (FL) approach enabling all agencies to train a predictive\nmodel collaboratively without sharing their sensitive data. We demonstrate how\nFL can be used within an HHSC to provide all agencies equitable access to\nquality AI and further assist human decision-makers in the allocation of\nresources within HHSC. This is achieved while preserving the privacy of the\npeople within the data by not sharing identifying information between agencies\nwithout their consent. Our experimental results using real-world HHSC data from\nCalgary, Alberta, demonstrate that our FL approach offers comparable\nperformance with the idealized scenario of training the predictive model with\ndata fully shared and linked between agencies.\n","authors":["Musa Taib","Jiajun Wu","Steve Drew","Geoffrey G. Messier"],"pdf_url":"https://arxiv.org/pdf/2408.07845v1.pdf","comment":"Accepted at the 2024 AAAI/ACM Conference on AI, Ethics, and Society\n (AIES)"},{"id":"http://arxiv.org/abs/2302.09712v3","updated":"2024-08-14T22:47:41Z","published":"2023-02-20T01:30:27Z","title":"Depth Degeneracy in Neural Networks: Vanishing Angles in Fully Connected\n ReLU Networks on Initialization","summary":" Despite remarkable performance on a variety of tasks, many properties of deep\nneural networks are not yet theoretically understood. One such mystery is the\ndepth degeneracy phenomenon: the deeper you make your network, the closer your\nnetwork is to a constant function on initialization. In this paper, we examine\nthe evolution of the angle between two inputs to a ReLU neural network as a\nfunction of the number of layers. By using combinatorial expansions, we find\nprecise formulas for how fast this angle goes to zero as depth increases. These\nformulas capture microscopic fluctuations that are not visible in the popular\nframework of infinite width limits, and leads to qualitatively different\npredictions. 
We validate our theoretical results with Monte Carlo experiments\nand show that our results accurately approximate finite network behaviour.\n\\review{We also empirically investigate how the depth degeneracy phenomenon can\nnegatively impact training of real networks.} The formulas are given in terms\nof the mixed moments of correlated Gaussians passed through the ReLU function.\nWe also find a surprising combinatorial connection between these mixed moments\nand the Bessel numbers that allows us to explicitly evaluate these moments.\n","authors":["Cameron Jakub","Mihai Nica"],"pdf_url":"https://arxiv.org/pdf/2302.09712v3.pdf","comment":"Minor updates and exposition improved. Added a section with more\n numerical experiments. 45 pages, comments welcome. To appear in Journal of\n Machine Learning research"},{"id":"http://arxiv.org/abs/2403.13784v4","updated":"2024-08-14T22:47:01Z","published":"2024-03-20T17:47:08Z","title":"The Model Openness Framework: Promoting Completeness and Openness for\n Reproducibility, Transparency, and Usability in Artificial Intelligence","summary":" Generative AI (GAI) offers unprecedented opportunities for research and\ninnovation, but its commercialization has raised concerns about transparency,\nreproducibility, and safety. Many open GAI models lack the necessary components\nfor full understanding and reproducibility, and some use restrictive licenses\nwhilst claiming to be ``open-source''. To address these concerns, we propose\nthe Model Openness Framework (MOF), a ranked classification system that rates\nmachine learning models based on their completeness and openness, following\nprinciples of open science, open source, open data, and open access. The MOF\nrequires specific components of the model development lifecycle to be included\nand released under appropriate open licenses. This framework aims to prevent\nmisrepresentation of models claiming to be open, guide researchers and\ndevelopers in providing all model components under permissive licenses, and\nhelp individuals and organizations identify models that can be safely adopted\nwithout restrictions. By promoting transparency and reproducibility, the MOF\ncombats ``openwashing'' practices and establishes completeness and openness as\nprimary criteria alongside the core tenets of responsible AI. Wide adoption of\nthe MOF will foster a more open AI ecosystem, benefiting research, innovation,\nand adoption of state-of-the-art models.\n","authors":["Matt White","Ibrahim Haddad","Cailean Osborne","Xiao-Yang Liu Yanglet","Ahmed Abdelmonsef","Sachin Varghese"],"pdf_url":"https://arxiv.org/pdf/2403.13784v4.pdf","comment":"28 pages"},{"id":"http://arxiv.org/abs/2408.07841v1","updated":"2024-08-14T22:43:52Z","published":"2024-08-14T22:43:52Z","title":"SustainDC -- Benchmarking for Sustainable Data Center Control","summary":" Machine learning has driven an exponential increase in computational demand,\nleading to massive data centers that consume significant amounts of energy and\ncontribute to climate change. This makes sustainable data center control a\npriority. In this paper, we introduce SustainDC, a set of Python environments\nfor benchmarking multi-agent reinforcement learning (MARL) algorithms for data\ncenters (DC). SustainDC supports custom DC configurations and tasks such as\nworkload scheduling, cooling optimization, and auxiliary battery management,\nwith multiple agents managing these operations while accounting for the effects\nof each other. 
We evaluate various MARL algorithms on SustainDC, showing their\nperformance across diverse DC designs, locations, weather conditions, grid\ncarbon intensity, and workload requirements. Our results highlight significant\nopportunities for improvement of data center operations using MARL algorithms.\nGiven the increasing use of DC due to AI, SustainDC provides a crucial platform\nfor the development and benchmarking of advanced algorithms essential for\nachieving sustainable computing and addressing other heterogeneous real-world\nchallenges.\n","authors":["Avisek Naug","Antonio Guillen","Ricardo Luna","Vineet Gundecha","Desik Rengarajan","Sahand Ghorbanpour","Sajad Mousavi","Ashwin Ramesh Babu","Dejan Markovikj","Lekhapriya D Kashyap","Soumyendu Sarkar"],"pdf_url":"https://arxiv.org/pdf/2408.07841v1.pdf","comment":"Under review at Advances in Neural Information Processing Systems\n 2024 (NeurIPS 2024)"},{"id":"http://arxiv.org/abs/2307.03288v3","updated":"2024-08-14T22:42:58Z","published":"2023-07-06T20:49:42Z","title":"Optimal Scalarizations for Sublinear Hypervolume Regret","summary":" Scalarization is a general, parallizable technique that can be deployed in\nany multiobjective setting to reduce multiple objectives into one, yet some\nhave dismissed this versatile approach because linear scalarizations cannot\nexplore concave regions of the Pareto frontier. To that end, we aim to find\nsimple non-linear scalarizations that provably explore a diverse set of $k$\nobjectives on the Pareto frontier, as measured by the dominated hypervolume. We\nshow that hypervolume scalarizations with uniformly random weights achieves an\noptimal sublinear hypervolume regret bound of $O(T^{-1/k})$, with matching\nlower bounds that preclude any algorithm from doing better asymptotically. For\nthe setting of multiobjective stochastic linear bandits, we utilize properties\nof hypervolume scalarizations to derive a novel non-Euclidean analysis to get\nregret bounds of $\\tilde{O}( d T^{-1/2} + T^{-1/k})$, removing unnecessary\n$\\text{poly}(k)$ dependencies. We support our theory with strong empirical\nperformance of using non-linear scalarizations that outperforms both their\nlinear counterparts and other standard multiobjective algorithms in a variety\nof natural settings.\n","authors":["Qiuyi Zhang"],"pdf_url":"https://arxiv.org/pdf/2307.03288v3.pdf","comment":"Neurips 2024"},{"id":"http://arxiv.org/abs/2301.08028v2","updated":"2024-08-14T22:15:57Z","published":"2023-01-19T12:01:41Z","title":"A Survey of Meta-Reinforcement Learning","summary":" While deep reinforcement learning (RL) has fueled multiple high-profile\nsuccesses in machine learning, it is held back from more widespread adoption by\nits often poor data efficiency and the limited generality of the policies it\nproduces. A promising approach for alleviating these limitations is to cast the\ndevelopment of better RL algorithms as a machine learning problem itself in a\nprocess called meta-RL. Meta-RL is most commonly studied in a problem setting\nwhere, given a distribution of tasks, the goal is to learn a policy that is\ncapable of adapting to any new task from the task distribution with as little\ndata as possible. In this survey, we describe the meta-RL problem setting in\ndetail as well as its major variations. We discuss how, at a high level,\nmeta-RL research can be clustered based on the presence of a task distribution\nand the learning budget available for each individual task. 
Using these\nclusters, we then survey meta-RL algorithms and applications. We conclude by\npresenting the open problems on the path to making meta-RL part of the standard\ntoolbox for a deep RL practitioner.\n","authors":["Jacob Beck","Risto Vuorio","Evan Zheran Liu","Zheng Xiong","Luisa Zintgraf","Chelsea Finn","Shimon Whiteson"],"pdf_url":"https://arxiv.org/pdf/2301.08028v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.07831v1","updated":"2024-08-14T22:08:06Z","published":"2024-08-14T22:08:06Z","title":"CarbonClipper: Optimal Algorithms for Carbon-Aware Spatiotemporal\n Workload Management","summary":" We study carbon-aware spatiotemporal workload management, which seeks to\naddress the growing environmental impact of data centers. We formalize this as\nan online problem called spatiotemporal online allocation with deadline\nconstraints ($\\mathsf{SOAD}$), in which an online player completes a workload\n(e.g., a batch compute job) by moving and scheduling the workload across a\nnetwork subject to a deadline $T$. At each time step, a service cost function\nis revealed, representing, e.g., the carbon intensity of servicing a workload\nat each location, and the player must irrevocably decide the current\nallocation. Furthermore, whenever the player moves the allocation, it incurs a\nmovement cost defined by a metric space $(X,d)$ that captures, e.g., the\noverhead of migrating a compute job. $\\mathsf{SOAD}$ formalizes the open\nproblem of combining general metrics and deadline constraints in the online\nalgorithms literature, unifying problems such as metrical task systems and\nonline search. We propose a competitive algorithm for $\\mathsf{SOAD}$ along\nwith a matching lower bound that proves it is optimal. Our main algorithm,\n${\\rm C{\\scriptsize ARBON}C{\\scriptsize LIPPER}}$, is a learning-augmented\nalgorithm that takes advantage of predictions (e.g., carbon intensity\nforecasts) and achieves an optimal consistency-robustness trade-off. We\nevaluate our proposed algorithms for carbon-aware spatiotemporal workload\nmanagement on a simulated global data center network, showing that ${\\rm\nC{\\scriptsize ARBON}C{\\scriptsize LIPPER}}$ significantly improves performance\ncompared to baseline methods and delivers meaningful carbon reductions.\n","authors":["Adam Lechowicz","Nicolas Christianson","Bo Sun","Noman Bashir","Mohammad Hajiesmaili","Adam Wierman","Prashant Shenoy"],"pdf_url":"https://arxiv.org/pdf/2408.07831v1.pdf","comment":"50 pages, 21 figures"},{"id":"http://arxiv.org/abs/2408.07812v1","updated":"2024-08-14T21:00:58Z","published":"2024-08-14T21:00:58Z","title":"Differentiating Policies for Non-Myopic Bayesian Optimization","summary":" Bayesian optimization (BO) methods choose sample points by optimizing an\nacquisition function derived from a statistical model of the objective. These\nacquisition functions are chosen to balance sampling regions with predicted\ngood objective values against exploring regions where the objective is\nuncertain. Standard acquisition functions are myopic, considering only the\nimpact of the next sample, but non-myopic acquisition functions may be more\neffective. In principle, one could model the sampling by a Markov decision\nprocess, and optimally choose the next sample by maximizing an expected reward\ncomputed by dynamic programming; however, this is infeasibly expensive. More\npractical approaches, such as rollout, consider a parametric family of sampling\npolicies. 
In this paper, we show how to efficiently estimate rollout\nacquisition functions and their gradients, enabling stochastic gradient-based\noptimization of sampling policies.\n","authors":["Darian Nwankwo","David Bindel"],"pdf_url":"https://arxiv.org/pdf/2408.07812v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.10845v2","updated":"2024-08-14T20:41:56Z","published":"2023-10-16T21:37:34Z","title":"CoTFormer: A Chain-of-Thought Driven Architecture with Budget-Adaptive\n Computation Cost at Inference","summary":" Scaling language models to larger and deeper sizes has led to significant\nboosts in performance. Even though the size of these models limits their\napplication in compute-constrained environments, the race to continually\ndevelop ever larger and deeper foundational models is underway. At the same\ntime -- regardless of the model size -- task-specific techniques continue to\nplay a pivotal role in achieving optimal downstream performance. One of these\ntechniques, called Chain-of-Thought (CoT), is particularly interesting since,\nas we point out in this work, it resembles employing a deeper transformer\nthrough re-applying the model multiple times. However, a key subtlety in\ncomputing the attention of past tokens differentiates CoT from simply applying\nthe model several times. Based on this insight, we propose CoTFormer, a novel\narchitecture which closely mimics CoT at the token level, allowing us to obtain\nsignificantly improved accuracies close to much larger models. While applying\nCoT introduces additional computation costs, we compensate for it by leveraging\nCoTFormer's special compatibility with token-wise variable depth. Through a\ncompute adaptive model -- which automatically allocates the compute to tokens\nthat need it most -- we show that it is possible to reduce the computation cost\nsignificantly without any reduction in accuracy, and with further compute cost\nreductions possible while maintaining a competitive accuracy.\n","authors":["Amirkeivan Mohtashami","Matteo Pagliardini","Martin Jaggi"],"pdf_url":"https://arxiv.org/pdf/2310.10845v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2212.02941v3","updated":"2024-08-14T20:40:17Z","published":"2022-12-06T12:54:08Z","title":"Safe Imitation Learning of Nonlinear Model Predictive Control for\n Flexible Robots","summary":" Flexible robots may overcome some of the industry's major challenges, such as\nenabling intrinsically safe human-robot collaboration and achieving a higher\npayload-to-mass ratio. However, controlling flexible robots is complicated due\nto their complex dynamics, which include oscillatory behavior and a\nhigh-dimensional state space. Nonlinear model predictive control (NMPC) offers\nan effective means to control such robots, but its significant computational\ndemand often limits its application in real-time scenarios. To enable fast\ncontrol of flexible robots, we propose a framework for a safe approximation of\nNMPC using imitation learning and a predictive safety filter. Our framework\nsignificantly reduces computation time while incurring a slight loss in\nperformance. Compared to NMPC, our framework shows more than an eightfold\nimprovement in computation time when controlling a three-dimensional flexible\nrobot arm in simulation, all while guaranteeing safety constraints. Notably,\nour approach outperforms state-of-the-art reinforcement learning methods. The\ndevelopment of fast and safe approximate NMPC holds the potential to accelerate\nthe adoption of flexible robots in industry. 
The project code is available at:\ntinyurl.com/anmpc4fr\n","authors":["Shamil Mamedov","Rudolf Reiter","Seyed Mahdi Basiri Azad","Ruan Viljoen","Joschka Boedecker","Moritz Diehl","Jan Swevers"],"pdf_url":"https://arxiv.org/pdf/2212.02941v3.pdf","comment":"Accepted to IROS 2024"},{"id":"http://arxiv.org/abs/2408.07802v1","updated":"2024-08-14T20:24:03Z","published":"2024-08-14T20:24:03Z","title":"Kraken: Inherently Parallel Transformers For Efficient Multi-Device\n Inference","summary":" Large Transformer networks are increasingly used in settings where low\ninference latency can improve the end-user experience and enable new\napplications. However, autoregressive inference is resource intensive and\nrequires parallelism for efficiency. Parallelism introduces collective\ncommunication that is both expensive and represents a phase when hardware\nresources are underutilized. Towards mitigating this, Kraken is an evolution of\nthe standard Transformer architecture that is designed to complement existing\ntensor parallelism schemes for efficient inference on multi-device systems. By\nintroducing a fixed degree of intra-layer model parallelism, the architecture\nallows collective operations to be overlapped with compute, decreasing latency\nand increasing hardware utilization. When trained on OpenWebText, Kraken models\nreach a similar perplexity as standard Transformers while also preserving their\nlanguage modeling capabilities when evaluated on the SuperGLUE benchmark.\nImportantly, when tested on multi-GPU systems using TensorRT-LLM engines,\nKraken speeds up Time To First Token by a mean of 35.6% across a range of model\nsizes, context lengths, and degrees of tensor parallelism.\n","authors":["Rohan Baskar Prabhakar","Hengrui Zhang","David Wentlzaff"],"pdf_url":"https://arxiv.org/pdf/2408.07802v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.19232v2","updated":"2024-08-14T20:15:55Z","published":"2024-02-29T15:05:59Z","title":"Trained Random Forests Completely Reveal your Dataset","summary":" We introduce an optimization-based reconstruction attack capable of\ncompletely or near-completely reconstructing a dataset utilized for training a\nrandom forest. Notably, our approach relies solely on information readily\navailable in commonly used libraries such as scikit-learn. To achieve this, we\nformulate the reconstruction problem as a combinatorial problem under a maximum\nlikelihood objective. We demonstrate that this problem is NP-hard, though\nsolvable at scale using constraint programming -- an approach rooted in\nconstraint propagation and solution-domain reduction. Through an extensive\ncomputational investigation, we demonstrate that random forests trained without\nbootstrap aggregation but with feature randomization are susceptible to a\ncomplete reconstruction. This holds true even with a small number of trees.\nEven with bootstrap aggregation, the majority of the data can also be\nreconstructed. These findings underscore a critical vulnerability inherent in\nwidely adopted ensemble methods, warranting attention and mitigation. 
Although\nthe potential for such reconstruction attacks has been discussed in privacy\nresearch, our study provides clear empirical evidence of their practicability.\n","authors":["Julien Ferry","Ricardo Fukasawa","Timothée Pascal","Thibaut Vidal"],"pdf_url":"https://arxiv.org/pdf/2402.19232v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.07796v1","updated":"2024-08-14T20:14:42Z","published":"2024-08-14T20:14:42Z","title":"Ranking and Combining Latent Structured Predictive Scores without\n Labeled Data","summary":" Combining multiple predictors obtained from distributed data sources to an\naccurate meta-learner is promising to achieve enhanced performance in lots of\nprediction problems. As the accuracy of each predictor is usually unknown,\nintegrating the predictors to achieve better performance is challenging.\nConventional ensemble learning methods assess the accuracy of predictors based\non extensive labeled data. In practical applications, however, the acquisition\nof such labeled data can prove to be an arduous task. Furthermore, the\npredictors under consideration may exhibit high degrees of correlation,\nparticularly when similar data sources or machine learning algorithms were\nemployed during their model training. In response to these challenges, this\npaper introduces a novel structured unsupervised ensemble learning model (SUEL)\nto exploit the dependency between a set of predictors with continuous\npredictive scores, rank the predictors without labeled data and combine them to\nan ensembled score with weights. Two novel correlation-based decomposition\nalgorithms are further proposed to estimate the SUEL model, constrained\nquadratic optimization (SUEL.CQO) and matrix-factorization-based (SUEL.MF)\napproaches. The efficacy of the proposed methods is rigorously assessed through\nboth simulation studies and real-world application of risk genes discovery. The\nresults compellingly demonstrate that the proposed methods can efficiently\nintegrate the dependent predictors to an ensemble model without the need of\nground truth data.\n","authors":["Shiva Afshar","Yinghan Chen","Shizhong Han","Ying Lin"],"pdf_url":"https://arxiv.org/pdf/2408.07796v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.07791v1","updated":"2024-08-14T20:03:53Z","published":"2024-08-14T20:03:53Z","title":"An Efficient and Explanatory Image and Text Clustering System with\n Multimodal Autoencoder Architecture","summary":" We demonstrate the efficiencies and explanatory abilities of extensions to\nthe common tools of Autoencoders and LLM interpreters, in the novel context of\ncomparing different cultural approaches to the same international news event.\nWe develop a new Convolutional-Recurrent Variational Autoencoder (CRVAE) model\nthat extends the modalities of previous CVAE models, by using fully-connected\nlatent layers to embed in parallel the CNN encodings of video frames, together\nwith the LSTM encodings of their related text derived from audio. We\nincorporate the model within a larger system that includes frame-caption\nalignment, latent space vector clustering, and a novel LLM-based cluster\ninterpreter. We measure, tune, and apply this system to the task of summarizing\na video into three to five thematic clusters, with each theme described by ten\nLLM-produced phrases. We apply this system to two news topics, COVID-19 and the\nWinter Olympics, and five other topics are in progress.\n","authors":["Tiancheng Shi","Yuanchen Wei","John R. 
Kender"],"pdf_url":"https://arxiv.org/pdf/2408.07791v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.19008v3","updated":"2024-08-14T19:39:16Z","published":"2023-05-30T13:06:26Z","title":"Bottleneck Structure in Learned Features: Low-Dimension vs Regularity\n Tradeoff","summary":" Previous work has shown that DNNs with large depth $L$ and\n$L_{2}$-regularization are biased towards learning low-dimensional\nrepresentations of the inputs, which can be interpreted as minimizing a notion\nof rank $R^{(0)}(f)$ of the learned function $f$, conjectured to be the\nBottleneck rank. We compute finite depth corrections to this result, revealing\na measure $R^{(1)}$ of regularity which bounds the pseudo-determinant of the\nJacobian $\\left|Jf(x)\\right|_{+}$ and is subadditive under composition and\naddition. This formalizes a balance between learning low-dimensional\nrepresentations and minimizing complexity/irregularity in the feature maps,\nallowing the network to learn the `right' inner dimension. Finally, we prove\nthe conjectured bottleneck structure in the learned features as $L\\to\\infty$:\nfor large depths, almost all hidden representations are approximately\n$R^{(0)}(f)$-dimensional, and almost all weight matrices $W_{\\ell}$ have\n$R^{(0)}(f)$ singular values close to 1 while the others are\n$O(L^{-\\frac{1}{2}})$. Interestingly, the use of large learning rates is\nrequired to guarantee an order $O(L)$ NTK which in turns guarantees infinite\ndepth convergence of the representations of almost all layers.\n","authors":["Arthur Jacot"],"pdf_url":"https://arxiv.org/pdf/2305.19008v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.17435v4","updated":"2024-08-14T19:23:43Z","published":"2024-01-30T20:49:47Z","title":"Can LLMs Replace Economic Choice Prediction Labs? The Case of\n Language-based Persuasion Games","summary":" Human choice prediction in economic contexts is crucial for applications in\nmarketing, finance, public policy, and more. This task, however, is often\nconstrained by the difficulties in acquiring human choice data. With most\nexperimental economics studies focusing on simple choice settings, the AI\ncommunity has explored whether LLMs can substitute for humans in these\npredictions and examined more complex experimental economics settings. However,\na key question remains: can LLMs generate training data for human choice\nprediction? We explore this in language-based persuasion games, a complex\neconomic setting involving natural language in strategic interactions. Our\nexperiments show that models trained on LLM-generated data can effectively\npredict human behavior in these games and even outperform models trained on\nactual human data.\n","authors":["Eilam Shapira","Omer Madmon","Roi Reichart","Moshe Tennenholtz"],"pdf_url":"https://arxiv.org/pdf/2401.17435v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.03138v2","updated":"2024-08-14T19:08:48Z","published":"2024-02-05T16:08:58Z","title":"Just Cluster It: An Approach for Exploration in High-Dimensions using\n Clustering and Pre-Trained Representations","summary":" In this paper we adopt a representation-centric perspective on exploration in\nreinforcement learning, viewing exploration fundamentally as a density\nestimation problem. 
We investigate the effectiveness of clustering\nrepresentations for exploration in 3-D environments, based on the observation\nthat the importance of pixel changes between transitions is less pronounced in\n3-D environments compared to 2-D environments, where pixel changes between\ntransitions are typically distinct and significant. We propose a method that\nperforms episodic and global clustering on random representations and on\npre-trained DINO representations to count states, i.e., estimate pseudo-counts.\nSurprisingly, even random features can be clustered effectively to count states\nin 3-D environments; however, when these become visually more complex,\npre-trained DINO representations are more effective thanks to the pre-trained\ninductive biases in the representations. Overall, this presents a pathway for\nintegrating pre-trained biases into exploration. We evaluate our approach on\nthe VizDoom and Habitat environments, demonstrating that our method surpasses\nother well-known exploration methods in these settings.\n","authors":["Stefan Sylvius Wagner","Stefan Harmeling"],"pdf_url":"https://arxiv.org/pdf/2402.03138v2.pdf","comment":"Accepted at the International Conference on Machine Learning (ICML)\n 2024"},{"id":"http://arxiv.org/abs/2408.07776v1","updated":"2024-08-14T19:07:28Z","published":"2024-08-14T19:07:28Z","title":"Knowledge-based Neural Ordinary Differential Equations for Cosserat\n Rod-based Soft Robots","summary":" Soft robots have many advantages over rigid robots thanks to their compliant\nand passive nature. However, it is generally challenging to model the dynamics\nof soft robots due to their high spatial dimensionality, making it difficult to\nuse model-based methods to accurately control soft robots. It often requires\ndirect numerical simulation of partial differential equations to simulate soft\nrobots. This not only requires an accurate numerical model, but also makes soft\nrobot modeling slow and expensive. Deep learning algorithms have shown promise\nin data-driven modeling of soft robots. However, these algorithms usually\nrequire a large amount of data, which are difficult to obtain in either\nsimulation or real-world experiments of soft robots. In this work, we propose\nKNODE-Cosserat, a framework that combines first-principle physics models and\nneural ordinary differential equations. We leverage the best from both worlds\n-- the generalization ability of physics-based models and the fast speed of\ndeep learning methods. We validate our framework in both simulation and\nreal-world experiments. In both cases, we show that the robot model\nsignificantly improves over the baseline models under different metrics.\n","authors":["Tom Z. Jiahao","Ryan Adolf","Cynthia Sung","M. Ani Hsieh"],"pdf_url":"https://arxiv.org/pdf/2408.07776v1.pdf","comment":"8 pages, 11 figures, 4 tables"},{"id":"http://arxiv.org/abs/2408.07773v1","updated":"2024-08-14T18:57:05Z","published":"2024-08-14T18:57:05Z","title":"MedTsLLM: Leveraging LLMs for Multimodal Medical Time Series Analysis","summary":" The complexity and heterogeneity of data in many real-world applications pose\nsignificant challenges for traditional machine learning and signal processing\ntechniques. For instance, in medicine, effective analysis of diverse\nphysiological signals is crucial for patient monitoring and clinical\ndecision-making and yet highly challenging. 
We introduce MedTsLLM, a general\nmultimodal large language model (LLM) framework that effectively integrates\ntime series data and rich contextual information in the form of text to analyze\nphysiological signals, performing three tasks with clinical relevance: semantic\nsegmentation, boundary detection, and anomaly detection in time series. These\ncritical tasks enable deeper analysis of physiological signals and can provide\nactionable insights for clinicians. We utilize a reprogramming layer to align\nembeddings of time series patches with a pretrained LLM's embedding space and\nmake effective use of raw time series, in conjunction with textual context.\nGiven the multivariate nature of medical datasets, we develop methods to handle\nmultiple covariates. We additionally tailor the text prompt to include\npatient-specific information. Our model outperforms state-of-the-art baselines,\nincluding deep learning models, other LLMs, and clinical methods across\nmultiple medical domains, specifically electrocardiograms and respiratory\nwaveforms. MedTsLLM presents a promising step towards harnessing the power of\nLLMs for medical time series analysis that can elevate data-driven tools for\nclinicians and improve patient outcomes.\n","authors":["Nimeesha Chan","Felix Parker","William Bennett","Tianyi Wu","Mung Yao Jia","James Fackler","Kimia Ghobadi"],"pdf_url":"https://arxiv.org/pdf/2408.07773v1.pdf","comment":"published in Proceedings of Machine Learning Research, MLHC 2024"},{"id":"http://arxiv.org/abs/2402.13812v2","updated":"2024-08-14T18:55:22Z","published":"2024-02-21T13:50:46Z","title":"Voice-Driven Mortality Prediction in Hospitalized Heart Failure\n Patients: A Machine Learning Approach Enhanced with Diagnostic Biomarkers","summary":" Addressing heart failure (HF) as a prevalent global health concern poses\ndifficulties in implementing innovative approaches for enhanced patient care.\nPredicting mortality rates in HF patients, in particular, is difficult yet\ncritical, necessitating individualized care, proactive management, and enabling\neducated decision-making to enhance outcomes. Recently, the significance of\nvoice biomarkers coupled with Machine Learning (ML) has surged, demonstrating\nremarkable efficacy, particularly in predicting heart failure. The synergy of\nvoice analysis and ML algorithms provides a non-invasive and easily accessible\nmeans to evaluate patients' health. However, there is a lack of voice\nbiomarkers for predicting mortality rates among heart failure patients with\nstandardized speech protocols. Here, we demonstrate a powerful and effective ML\nmodel for predicting mortality rates in hospitalized HF patients through the\nutilization of voice biomarkers. By seamlessly integrating voice biomarkers\ninto routine patient monitoring, this strategy has the potential to improve\npatient outcomes, optimize resource allocation, and advance patient-centered HF\nmanagement. In this study, a Machine Learning system, specifically a logistic\nregression model, is trained to predict patients' 5-year mortality rates using\ntheir speech as input. 
The model performs admirably and consistently, as\ndemonstrated by cross-validation and statistical approaches (p-value < 0.001).\nFurthermore, integrating NT-proBNP, a diagnostic biomarker in HF, improves the\nmodel's predictive accuracy substantially.\n","authors":["Nihat Ahmadli","Mehmet Ali Sarsil","Berk Mizrak","Kurtulus Karauzum","Ata Shaker","Erol Tulumen","Didar Mirzamidinov","Dilek Ural","Onur Ergen"],"pdf_url":"https://arxiv.org/pdf/2402.13812v2.pdf","comment":"11 pages, 6 figures, 5 tables. The first 2 authors have contributed\n equally"},{"id":"http://arxiv.org/abs/2408.07772v1","updated":"2024-08-14T18:49:27Z","published":"2024-08-14T18:49:27Z","title":"Out-of-Distribution Learning with Human Feedback","summary":" Out-of-distribution (OOD) learning often relies heavily on statistical\napproaches or predefined assumptions about OOD data distributions, hindering\ntheir efficacy in addressing multifaceted challenges of OOD generalization and\nOOD detection in real-world deployment environments. This paper presents a\nnovel framework for OOD learning with human feedback, which can provide\ninvaluable insights into the nature of OOD shifts and guide effective model\nadaptation. Our framework capitalizes on the freely available unlabeled data in\nthe wild that captures the environmental test-time OOD distributions under both\ncovariate and semantic shifts. To harness such data, our key idea is to\nselectively provide human feedback and label a small number of informative\nsamples from the wild data distribution, which are then used to train a\nmulti-class classifier and an OOD detector. By exploiting human feedback, we\nenhance the robustness and reliability of machine learning models, equipping\nthem with the capability to handle OOD scenarios with greater precision. We\nprovide theoretical insights on the generalization error bounds to justify our\nalgorithm. Extensive experiments show the superiority of our method,\noutperforming the current state-of-the-art by a significant margin.\n","authors":["Haoyue Bai","Xuefeng Du","Katie Rainey","Shibin Parameswaran","Yixuan Li"],"pdf_url":"https://arxiv.org/pdf/2408.07772v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.14058v2","updated":"2024-08-14T18:45:17Z","published":"2024-05-22T23:06:34Z","title":"Formally Verifying Deep Reinforcement Learning Controllers with Lyapunov\n Barrier Certificates","summary":" Deep reinforcement learning (DRL) is a powerful machine learning paradigm for\ngenerating agents that control autonomous systems. However, the ``black box''\nnature of DRL agents limits their deployment in real-world safety-critical\napplications. A promising approach for providing strong guarantees on an\nagent's behavior is to use Neural Lyapunov Barrier (NLB) certificates, which\nare learned functions over the system whose properties indirectly imply that an\nagent behaves as desired. However, NLB-based certificates are typically\ndifficult to learn and even more difficult to verify, especially for complex\nsystems. In this work, we present a novel method for training and verifying\nNLB-based certificates for discrete-time systems. Specifically, we introduce a\ntechnique for certificate composition, which simplifies the verification of\nhighly-complex systems by strategically designing a sequence of certificates.\nWhen jointly verified with neural network verification engines, these\ncertificates provide a formal guarantee that a DRL agent both achieves its\ngoals and avoids unsafe behavior. 
Furthermore, we introduce a technique for\ncertificate filtering, which significantly simplifies the process of producing\nformally verified certificates. We demonstrate the merits of our approach with\na case study on providing safety and liveness guarantees for a DRL-controlled\nspacecraft.\n","authors":["Udayan Mandal","Guy Amir","Haoze Wu","Ieva Daukantas","Fletcher Lee Newell","Umberto J. Ravaioli","Baoluo Meng","Michael Durling","Milan Ganai","Tobey Shim","Guy Katz","Clark Barrett"],"pdf_url":"https://arxiv.org/pdf/2405.14058v2.pdf","comment":"To appear in FMCAD 2024"},{"id":"http://arxiv.org/abs/2408.07763v1","updated":"2024-08-14T18:30:57Z","published":"2024-08-14T18:30:57Z","title":"Data Clustering and Visualization with Recursive Goemans-Williamson\n MaxCut Algorithm","summary":" In this article, we introduce a novel recursive modification to the classical\nGoemans-Williamson MaxCut algorithm, offering improved performance in\nvectorized data clustering tasks. Focusing on the clustering of medical\npublications, we employ recursive iterations in conjunction with a dimension\nrelaxation method to significantly enhance the density of clustering results.\nFurthermore, we propose a unique vectorization technique for articles,\nleveraging conditional probabilities for more effective clustering. Our methods\nprovide advantages in both computational efficiency and clustering accuracy,\nsubstantiated through comprehensive experiments.\n","authors":["An Ly","Raj Sawhney","Marina Chugunova"],"pdf_url":"https://arxiv.org/pdf/2408.07763v1.pdf","comment":"Published in the IEEE Conference, CSCI 2023 (Winter Session)"},{"id":"http://arxiv.org/abs/2312.13511v2","updated":"2024-08-14T18:27:46Z","published":"2023-12-21T01:12:44Z","title":"Symmetry-enforcing neural networks with applications to constitutive\n modeling","summary":" The use of machine learning techniques to homogenize the effective behavior\nof arbitrary microstructures has been shown to be not only efficient but also\naccurate. In a recent work, we demonstrated how to combine state-of-the-art\nmicromechanical modeling and advanced machine learning techniques to homogenize\ncomplex microstructures exhibiting non-linear and history-dependent behaviors\n(Logarzo et al., 2021). The resulting homogenized model, termed smart\nconstitutive law (SCL), enables the adoption of microstructurally informed\nconstitutive laws into finite element solvers at a fraction of the\ncomputational cost required by traditional concurrent multiscale approaches. In\nthis work, the capabilities of SCLs are expanded via the introduction of a\nnovel methodology that enforces material symmetries at the neuron level,\napplicable across various neural network architectures. This approach utilizes\ntensor-based features in neural networks, facilitating the concise and accurate\nrepresentation of symmetry-preserving operations, and is general enough to be\nextended to problems beyond constitutive modeling. Details on the construction of\nthese tensor-based neural networks and their application in learning\nconstitutive laws are presented for both elastic and inelastic materials. The\nsuperiority of this approach over traditional neural networks is demonstrated\nin scenarios with limited data and strong symmetries, through comprehensive\ntesting on various materials, including isotropic neo-Hookean materials and\ntensegrity lattice metamaterials. 
This work is concluded by a discussion on the\npotential of this methodology to discover symmetry bases in materials and by an\noutline of future research directions.\n","authors":["Kévin Garanger","Julie Kraus","Julian J. Rimoli"],"pdf_url":"https://arxiv.org/pdf/2312.13511v2.pdf","comment":null}],"Multimedia":[{"id":"http://arxiv.org/abs/2408.07694v1","updated":"2024-08-14T17:50:27Z","published":"2024-08-14T17:50:27Z","title":"End-to-end Semantic-centric Video-based Multimodal Affective Computing","summary":" In the pathway toward Artificial General Intelligence (AGI), understanding\nhuman affection is essential to enhancing machines' cognitive abilities. For\nachieving more natural human-AI interaction, Multimodal Affective Computing\n(MAC) in human-spoken videos has attracted increasing attention. However,\nprevious methods are mainly devoted to designing multimodal fusion algorithms,\nsuffering from two issues: semantic imbalance caused by diverse pre-processing\noperations and semantic mismatch raised by inconsistent affective content\ncontained in different modalities compared with the multimodal ground truth.\nBesides, the use of manual feature extractors prevents them from building an\nend-to-end pipeline for multiple MAC downstream tasks. To address the above\nchallenges, we propose a novel end-to-end framework named SemanticMAC to\ncompute multimodal semantic-centric affection for human-spoken videos. We\nfirst employ a pre-trained Transformer model in multimodal data pre-processing\nand design an Affective Perceiver module to capture unimodal affective\ninformation. Moreover, we present a semantic-centric approach to unify\nmultimodal representation learning in three ways, including gated feature\ninteraction, multi-task pseudo label generation, and intra-/inter-sample\ncontrastive learning. Finally, SemanticMAC effectively learns specific- and\nshared-semantic representations under the guidance of semantic-centric labels.\nExtensive experimental results demonstrate that our approach surpasses the\nstate-of-the-art methods on 7 public datasets in four MAC downstream tasks.\n","authors":["Ronghao Lin","Ying Zeng","Sijie Mai","Haifeng Hu"],"pdf_url":"https://arxiv.org/pdf/2408.07694v1.pdf","comment":"Under Review"},{"id":"http://arxiv.org/abs/2408.07540v1","updated":"2024-08-14T13:17:42Z","published":"2024-08-14T13:17:42Z","title":"3D Gaussian Editing with A Single Image","summary":" The modeling and manipulation of 3D scenes captured from the real world are\npivotal in various applications, attracting growing research interest. While\nprevious works on editing have achieved interesting results through\nmanipulating 3D meshes, they often require accurately reconstructed meshes to\nperform editing, which limits their application in 3D content generation. To\naddress this gap, we introduce a novel single-image-driven 3D scene editing\napproach based on 3D Gaussian Splatting, enabling intuitive manipulation via\ndirectly editing the content on a 2D image plane. Our method learns to optimize\nthe 3D Gaussians to align with an edited version of the image rendered from a\nuser-specified viewpoint of the original scene. 
To capture long-range object\ndeformation, we introduce positional loss into the optimization process of 3D\nGaussian Splatting and enable gradient propagation through reparameterization.\nTo handle occluded 3D Gaussians when rendering from the specified viewpoint, we\nbuild an anchor-based structure and employ a coarse-to-fine optimization\nstrategy capable of handling long-range deformation while maintaining\nstructural stability. Furthermore, we design a novel masking strategy to\nadaptively identify non-rigid deformation regions for fine-scale modeling.\nExtensive experiments show the effectiveness of our method in handling\ngeometric details, long-range, and non-rigid deformation, demonstrating\nsuperior editing flexibility and quality compared to previous approaches.\n","authors":["Guan Luo","Tian-Xing Xu","Ying-Tian Liu","Xiao-Xiong Fan","Fang-Lue Zhang","Song-Hai Zhang"],"pdf_url":"https://arxiv.org/pdf/2408.07540v1.pdf","comment":"10 pages, 12 figures"},{"id":"http://arxiv.org/abs/2403.10020v2","updated":"2024-08-14T13:15:00Z","published":"2024-03-15T05:06:21Z","title":"Lost in Overlap: Exploring Watermark Collision in LLMs","summary":" The proliferation of large language models (LLMs) in generating content\nraises concerns about text copyright. Watermarking methods, particularly\nlogit-based approaches, embed imperceptible identifiers into text to address\nthese challenges. However, the widespread usage of watermarking across diverse\nLLMs has led to an inevitable issue known as watermark collision during common\ntasks, such as paraphrasing or translation. In this paper, we introduce\nwatermark collision as a novel and general philosophy for watermark attacks,\naimed at enhancing attack performance on top of any other attacking methods. We\nalso provide a comprehensive demonstration that watermark collision poses a\nthreat to all logit-based watermark algorithms, impacting not only specific\nattack scenarios but also downstream applications.\n","authors":["Yiyang Luo","Ke Lin","Chao Gu"],"pdf_url":"https://arxiv.org/pdf/2403.10020v2.pdf","comment":"Long Paper, 7 pages"},{"id":"http://arxiv.org/abs/2408.06753v2","updated":"2024-08-14T10:53:34Z","published":"2024-08-13T09:19:59Z","title":"Detecting Audio-Visual Deepfakes with Fine-Grained Inconsistencies","summary":" Existing methods on audio-visual deepfake detection mainly focus on\nhigh-level features for modeling inconsistencies between audio and visual data.\nAs a result, these approaches usually overlook finer audio-visual artifacts,\nwhich are inherent to deepfakes. Herein, we propose the introduction of\nfine-grained mechanisms for detecting subtle artifacts in both spatial and\ntemporal domains. First, we introduce a local audio-visual model capable of\ncapturing small spatial regions that are prone to inconsistencies with audio.\nFor that purpose, a fine-grained mechanism based on a spatially-local distance\ncoupled with an attention module is adopted. Second, we introduce a\ntemporally-local pseudo-fake augmentation to include samples incorporating\nsubtle temporal inconsistencies in our training set. 
Experiments on the DFDC\nand the FakeAVCeleb datasets demonstrate the superiority of the proposed method\nin terms of generalization as compared to the state-of-the-art under both\nin-dataset and cross-dataset settings.\n","authors":["Marcella Astrid","Enjie Ghorbel","Djamila Aouada"],"pdf_url":"https://arxiv.org/pdf/2408.06753v2.pdf","comment":"Accepted in BMVC 2024"},{"id":"http://arxiv.org/abs/2304.02970v7","updated":"2024-08-14T09:21:44Z","published":"2023-04-06T09:54:06Z","title":"Unraveling Instance Associations: A Closer Look for Audio-Visual\n Segmentation","summary":" Audio-visual segmentation (AVS) is a challenging task that involves\naccurately segmenting sounding objects based on audio-visual cues. The\neffectiveness of audio-visual learning critically depends on achieving accurate\ncross-modal alignment between sound and visual objects. Successful audio-visual\nlearning requires two essential components: 1) a challenging dataset with\nhigh-quality pixel-level multi-class annotated images associated with audio\nfiles, and 2) a model that can establish strong links between audio information\nand its corresponding visual object. However, these requirements are only\npartially addressed by current methods, with training sets containing biased\naudio-visual data, and models that generalise poorly beyond this biased\ntraining set. In this work, we propose a new cost-effective strategy to build\nchallenging and relatively unbiased high-quality audio-visual segmentation\nbenchmarks. We also propose a new informative sample mining method for\naudio-visual supervised contrastive learning to leverage discriminative\ncontrastive samples to enforce cross-modal understanding. We show empirical\nresults that demonstrate the effectiveness of our benchmark. Furthermore,\nexperiments conducted on existing AVS datasets and on our new benchmark show\nthat our method achieves state-of-the-art (SOTA) segmentation accuracy.\n","authors":["Yuanhong Chen","Yuyuan Liu","Hu Wang","Fengbei Liu","Chong Wang","Helen Frazer","Gustavo Carneiro"],"pdf_url":"https://arxiv.org/pdf/2304.02970v7.pdf","comment":"Code is available at https://github.com/cyh-0/CAVP"},{"id":"http://arxiv.org/abs/2308.13273v2","updated":"2024-08-14T08:48:40Z","published":"2023-08-25T09:51:03Z","title":"Bridging the Gap: Sketch-Aware Interpolation Network for High-Quality\n Animation Sketch Inbetweening","summary":" Hand-drawn 2D animation workflow is typically initiated with the creation of\nsketch keyframes. Subsequent manual inbetweens are crafted for smoothness,\nwhich is a labor-intensive process and the prospect of automatic animation\nsketch interpolation has become highly appealing. Yet, common frame\ninterpolation methods are generally hindered by two key issues: 1) limited\ntexture and colour details in sketches, and 2) exaggerated alterations between\ntwo sketch keyframes. To overcome these issues, we propose a novel deep\nlearning method - Sketch-Aware Interpolation Network (SAIN). This approach\nincorporates multi-level guidance that formulates region-level correspondence,\nstroke-level correspondence and pixel-level dynamics. A multi-stream\nU-Transformer is then devised to characterize sketch inbetweening patterns\nusing these multi-level guides through the integration of self /\ncross-attention mechanisms. Additionally, to facilitate future research on\nanimation sketch inbetweening, we constructed a large-scale dataset - STD-12K,\ncomprising 30 sketch animation series in diverse artistic styles. 
Comprehensive\nexperiments on this dataset convincingly show that our proposed SAIN surpasses\nthe state-of-the-art interpolation methods.\n","authors":["Jiaming Shen","Kun Hu","Wei Bao","Chang Wen Chen","Zhiyong Wang"],"pdf_url":"https://arxiv.org/pdf/2308.13273v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.07349v1","updated":"2024-08-14T07:47:25Z","published":"2024-08-14T07:47:25Z","title":"Automated Retinal Image Analysis and Medical Report Generation through\n Deep Learning","summary":" The increasing prevalence of retinal diseases poses a significant challenge\nto the healthcare system, as the demand for ophthalmologists surpasses the\navailable workforce. This imbalance creates a bottleneck in diagnosis and\ntreatment, potentially delaying critical care. Traditional methods of\ngenerating medical reports from retinal images rely on manual interpretation,\nwhich is time-consuming and prone to errors, further straining\nophthalmologists' limited resources. This thesis investigates the potential of\nArtificial Intelligence (AI) to automate medical report generation for retinal\nimages. AI can quickly analyze large volumes of image data, identifying subtle\npatterns essential for accurate diagnosis. By automating this process, AI\nsystems can greatly enhance the efficiency of retinal disease diagnosis,\nreducing doctors' workloads and enabling them to focus on more complex cases.\nThe proposed AI-based methods address key challenges in automated report\ngeneration: (1) Improved methods for medical keyword representation enhance the\nsystem's ability to capture nuances in medical terminology; (2) A multi-modal\ndeep learning approach captures interactions between textual keywords and\nretinal images, resulting in more comprehensive medical reports; (3) Techniques\nto enhance the interpretability of the AI-based report generation system,\nfostering trust and acceptance in clinical practice. These methods are\nrigorously evaluated using various metrics and achieve state-of-the-art\nperformance. This thesis demonstrates AI's potential to revolutionize retinal\ndisease diagnosis by automating medical report generation, ultimately improving\nclinical efficiency, diagnostic accuracy, and patient care.\n[https://github.com/Jhhuangkay/DeepOpht-Medical-Report-Generation-for-Retinal-Images-via-Deep-Models-and-Visual-Explanation]\n","authors":["Jia-Hong Huang"],"pdf_url":"https://arxiv.org/pdf/2408.07349v1.pdf","comment":"Ph.D. thesis, 124 pages"},{"id":"http://arxiv.org/abs/2407.04416v3","updated":"2024-08-14T21:30:52Z","published":"2024-07-05T11:07:13Z","title":"Sound-VECaps: Improving Audio Generation with Visual Enhanced Captions","summary":" Generative models have shown significant achievements in audio generation\ntasks. However, existing models struggle with complex and detailed prompts,\nleading to potential performance degradation. We hypothesize that this problem\nstems from the simplicity and scarcity of the training data. This work aims to\ncreate a large-scale audio dataset with rich captions for improving audio\ngeneration models. We first develop an automated pipeline to generate detailed\ncaptions by transforming predicted visual captions, audio captions, and tagging\nlabels into comprehensive descriptions using a Large Language Model (LLM). The\nresulting dataset, Sound-VECaps, comprises 1.66M high-quality audio-caption\npairs with enriched details including audio event orders, occurred places and\nenvironment information. 
We then demonstrate that training the text-to-audio\ngeneration models with Sound-VECaps significantly improves the performance on\ncomplex prompts. Furthermore, we conduct ablation studies of the models on\nseveral downstream audio-language tasks, showing the potential of Sound-VECaps\nin advancing audio-text representation learning. Our dataset and models are\navailable online.\n","authors":["Yi Yuan","Dongya Jia","Xiaobin Zhuang","Yuanzhe Chen","Zhengxi Liu","Zhuo Chen","Yuping Wang","Yuxuan Wang","Xubo Liu","Xiyuan Kang","Mark D. Plumbley","Wenwu Wang"],"pdf_url":"https://arxiv.org/pdf/2407.04416v3.pdf","comment":"5 pages with 1 appendix"},{"id":"http://arxiv.org/abs/2408.07791v1","updated":"2024-08-14T20:03:53Z","published":"2024-08-14T20:03:53Z","title":"An Efficient and Explanatory Image and Text Clustering System with\n Multimodal Autoencoder Architecture","summary":" We demonstrate the efficiencies and explanatory abilities of extensions to\nthe common tools of Autoencoders and LLM interpreters, in the novel context of\ncomparing different cultural approaches to the same international news event.\nWe develop a new Convolutional-Recurrent Variational Autoencoder (CRVAE) model\nthat extends the modalities of previous CVAE models, by using fully-connected\nlatent layers to embed in parallel the CNN encodings of video frames, together\nwith the LSTM encodings of their related text derived from audio. We\nincorporate the model within a larger system that includes frame-caption\nalignment, latent space vector clustering, and a novel LLM-based cluster\ninterpreter. We measure, tune, and apply this system to the task of summarizing\na video into three to five thematic clusters, with each theme described by ten\nLLM-produced phrases. We apply this system to two news topics, COVID-19 and the\nWinter Olympics, and five other topics are in progress.\n","authors":["Tiancheng Shi","Yuanchen Wei","John R. Kender"],"pdf_url":"https://arxiv.org/pdf/2408.07791v1.pdf","comment":null}]},"2024-08-15T00:00:00Z":{"Computation and Language":[{"id":"http://arxiv.org/abs/2408.08313v1","updated":"2024-08-15T17:59:57Z","published":"2024-08-15T17:59:57Z","title":"Can Large Language Models Understand Symbolic Graphics Programs?","summary":" Assessing the capabilities of large language models (LLMs) is often\nchallenging, in part, because it is hard to find tasks to which they have not\nbeen exposed during training. We take one step to address this challenge by\nturning to a new task: focusing on symbolic graphics programs, which are a\npopular representation for graphics content that procedurally generates visual\ndata. LLMs have shown exciting promise towards program synthesis, but do they\nunderstand symbolic graphics programs? Unlike conventional programs, symbolic\ngraphics programs can be translated to graphics content. Here, we characterize\nan LLM's understanding of symbolic programs in terms of their ability to answer\nquestions related to the graphics content. This task is challenging as the\nquestions are difficult to answer from the symbolic programs alone -- yet, they\nwould be easy to answer from the corresponding graphics content as we verify\nthrough a human experiment. To understand symbolic programs, LLMs may need to\npossess the ability to imagine how the corresponding graphics content would\nlook without directly accessing the rendered visual content. We use this task\nto evaluate LLMs by creating a large benchmark for the semantic understanding\nof symbolic graphics programs. 
This benchmark is built via program-graphics\ncorrespondence, hence requiring minimal human effort. We evaluate current LLMs\non our benchmark to provide a preliminary assessment of their ability to\nreason about visual scenes from programs. We find that this task distinguishes\nexisting LLMs, and models considered good at reasoning perform better. Lastly,\nwe introduce Symbolic Instruction Tuning (SIT) to improve this ability.\nSpecifically, we query GPT4-o with questions and images generated by symbolic\nprograms. Such data are then used to finetune an LLM. We also find that SIT\ndata can improve the general instruction-following ability of LLMs.\n","authors":["Zeju Qiu","Weiyang Liu","Haiwen Feng","Zhen Liu","Tim Z. Xiao","Katherine M. Collins","Joshua B. Tenenbaum","Adrian Weller","Michael J. Black","Bernhard Schölkopf"],"pdf_url":"https://arxiv.org/pdf/2408.08313v1.pdf","comment":"Technical Report v1 (44 pages, 23 figures, project page:\n https://sgp-bench.github.io/)"},{"id":"http://arxiv.org/abs/2408.08310v1","updated":"2024-08-15T17:59:30Z","published":"2024-08-15T17:59:30Z","title":"ScalingFilter: Assessing Data Quality through Inverse Utilization of\n Scaling Laws","summary":" High-quality data is crucial for the pre-training performance of large\nlanguage models. Unfortunately, existing quality filtering methods rely on a\nknown high-quality dataset as reference, which can introduce potential bias and\ncompromise diversity. In this paper, we propose ScalingFilter, a novel approach\nthat evaluates text quality based on the perplexity difference between two\nlanguage models trained on the same data, thereby eliminating the influence of\nthe reference dataset in the filtering process. A theoretical analysis shows\nthat ScalingFilter is equivalent to an inverse utilization of scaling laws.\nThrough training models with 1.3B parameters on the same data source processed\nby various quality filters, we find that ScalingFilter can improve the zero-shot\nperformance of pre-trained models in downstream tasks. To assess the bias\nintroduced by quality filtering, we introduce semantic diversity, a metric that\nutilizes text embedding models for semantic representations. Extensive\nexperiments reveal that semantic diversity is a reliable indicator of dataset\ndiversity, and ScalingFilter achieves an optimal balance between downstream\nperformance and semantic diversity.\n","authors":["Ruihang Li","Yixuan Wei","Miaosen Zhang","Nenghai Yu","Han Hu","Houwen Peng"],"pdf_url":"https://arxiv.org/pdf/2408.08310v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08302v1","updated":"2024-08-15T17:55:45Z","published":"2024-08-15T17:55:45Z","title":"Benchmarking the Capabilities of Large Language Models in Transportation\n System Engineering: Accuracy, Consistency, and Reasoning Behaviors","summary":" In this paper, we explore the capabilities of state-of-the-art large language\nmodels (LLMs) such as GPT-4, GPT-4o, Claude 3.5 Sonnet, Claude 3 Opus, Gemini\n1.5 Pro, Llama 3, and Llama 3.1 in solving some selected undergraduate-level\ntransportation engineering problems. We introduce TransportBench, a benchmark\ndataset that includes a sample of transportation engineering problems on a wide\nrange of subjects in the context of planning, design, management, and control\nof transportation systems. 
This dataset is used by human experts to evaluate\nthe capabilities of various commercial and open-sourced LLMs, especially their\naccuracy, consistency, and reasoning behaviors, in solving transportation\nengineering problems. Our comprehensive analysis uncovers the unique strengths\nand limitations of each LLM, e.g. our analysis shows the impressive accuracy\nand some unexpected inconsistent behaviors of Claude 3.5 Sonnet in solving\nTransportBench problems. Our study marks a thrilling first step toward\nharnessing artificial general intelligence for complex transportation\nchallenges.\n","authors":["Usman Syed","Ethan Light","Xingang Guo","Huan Zhang","Lianhui Qin","Yanfeng Ouyang","Bin Hu"],"pdf_url":"https://arxiv.org/pdf/2408.08302v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08291v1","updated":"2024-08-15T17:46:54Z","published":"2024-08-15T17:46:54Z","title":"The ShareLM Collection and Plugin: Contributing Human-Model Chats for\n the Benefit of the Community","summary":" Human-model conversations provide a window into users' real-world scenarios,\nbehavior, and needs, and thus are a valuable resource for model development and\nresearch. While for-profit companies collect user data through the APIs of\ntheir models, using it internally to improve their own models, the open source\nand research community lags behind.\n We introduce the ShareLM collection, a unified set of human conversations\nwith large language models, and its accompanying plugin, a Web extension for\nvoluntarily contributing user-model conversations. Where few platforms share\ntheir chats, the ShareLM plugin adds this functionality, thus, allowing users\nto share conversations from most platforms. The plugin allows the user to rate\ntheir conversations, both at the conversation and the response levels, and\ndelete conversations they prefer to keep private before they ever leave the\nuser's local storage. We release the plugin conversations as part of the\nShareLM collection, and call for more community effort in the field of open\nhuman-model data.\n The code, plugin, and data are available.\n","authors":["Shachar Don-Yehiya","Leshem Choshen","Omri Abend"],"pdf_url":"https://arxiv.org/pdf/2408.08291v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.11907v2","updated":"2024-08-15T17:37:36Z","published":"2024-02-19T07:46:40Z","title":"Direct Large Language Model Alignment Through Self-Rewarding Contrastive\n Prompt Distillation","summary":" Aligning large language models (LLMs) with human expectations without\nhuman-annotated preference data is an important problem. In this paper, we\npropose a method to evaluate the response preference by using the output\nprobabilities of response pairs under contrastive prompt pairs, which could\nachieve better performance on LLaMA2-7B and LLaMA2-13B compared to RLAIF. Based\non this, we propose an automatic alignment method, Direct Large Model Alignment\n(DLMA). First, we use contrastive prompt pairs to automatically generate\npreference data. Then, we continue to evaluate the generated preference data\nusing contrastive prompt pairs and calculate a self-rewarding score. Finally,\nwe use the DPO algorithm to effectively align LLMs by combining this\nself-rewarding score. 
In the experimental stage, our DLMA method could surpass\nthe \\texttt{RLHF} method without relying on human-annotated preference data.\n","authors":["Aiwei Liu","Haoping Bai","Zhiyun Lu","Xiang Kong","Simon Wang","Jiulong Shan","Meng Cao","Lijie Wen"],"pdf_url":"https://arxiv.org/pdf/2402.11907v2.pdf","comment":"24 pages, 5 pages"},{"id":"http://arxiv.org/abs/2408.08261v1","updated":"2024-08-15T17:01:57Z","published":"2024-08-15T17:01:57Z","title":"mhGPT: A Lightweight Generative Pre-Trained Transformer for Mental\n Health Text Analysis","summary":" This paper introduces mhGPT, a lightweight generative pre-trained transformer\ntrained on mental health-related social media and PubMed articles. Fine-tuned\nfor specific mental health tasks, mhGPT was evaluated under limited hardware\nconstraints and compared with state-of-the-art models like MentaLLaMA and\nGemma. Despite having only 1.98 billion parameters and using just 5% of the\ndataset, mhGPT outperformed larger models and matched the performance of models\ntrained on significantly more data. The key contributions include integrating\ndiverse mental health data, creating a custom tokenizer, and optimizing a\nsmaller architecture for low-resource settings. This research could advance\nAI-driven mental health care, especially in areas with limited computing power.\n","authors":["Dae-young Kim","Rebecca Hwa","Muhammad Mahbubur Rahman"],"pdf_url":"https://arxiv.org/pdf/2408.08261v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.06292v2","updated":"2024-08-15T15:42:50Z","published":"2024-08-12T16:58:11Z","title":"The AI Scientist: Towards Fully Automated Open-Ended Scientific\n Discovery","summary":" One of the grand challenges of artificial general intelligence is developing\nagents capable of conducting scientific research and discovering new knowledge.\nWhile frontier models have already been used as aides to human scientists, e.g.\nfor brainstorming ideas, writing code, or prediction tasks, they still conduct\nonly a small part of the scientific process. This paper presents the first\ncomprehensive framework for fully automatic scientific discovery, enabling\nfrontier large language models to perform research independently and\ncommunicate their findings. We introduce The AI Scientist, which generates\nnovel research ideas, writes code, executes experiments, visualizes results,\ndescribes its findings by writing a full scientific paper, and then runs a\nsimulated review process for evaluation. In principle, this process can be\nrepeated to iteratively develop ideas in an open-ended fashion, acting like the\nhuman scientific community. We demonstrate its versatility by applying it to\nthree distinct subfields of machine learning: diffusion modeling,\ntransformer-based language modeling, and learning dynamics. Each idea is\nimplemented and developed into a full paper at a cost of less than $15 per\npaper. To evaluate the generated papers, we design and validate an automated\nreviewer, which we show achieves near-human performance in evaluating paper\nscores. The AI Scientist can produce papers that exceed the acceptance\nthreshold at a top machine learning conference as judged by our automated\nreviewer. 
This approach signifies the beginning of a new era in scientific\ndiscovery in machine learning: bringing the transformative benefits of AI\nagents to the entire research process of AI itself, and taking us closer to a\nworld where endless affordable creativity and innovation can be unleashed on\nthe world's most challenging problems. Our code is open-sourced at\nhttps://github.com/SakanaAI/AI-Scientist\n","authors":["Chris Lu","Cong Lu","Robert Tjarko Lange","Jakob Foerster","Jeff Clune","David Ha"],"pdf_url":"https://arxiv.org/pdf/2408.06292v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.17143v2","updated":"2024-08-15T15:40:15Z","published":"2024-04-26T04:12:08Z","title":"Quantifying Memorization and Detecting Training Data of Pre-trained\n Language Models using Japanese Newspaper","summary":" Dominant pre-trained language models (PLMs) have demonstrated the potential\nrisk of memorizing and outputting the training data. While this concern has\nbeen discussed mainly in English, it is also practically important to focus on\ndomain-specific PLMs. In this study, we pre-trained domain-specific GPT-2\nmodels using a limited corpus of Japanese newspaper articles and evaluated\ntheir behavior. Experiments replicated the empirical finding that memorization\nof PLMs is related to the duplication in the training data, model size, and\nprompt length, in Japanese the same as in previous English studies.\nFurthermore, we attempted membership inference attacks, demonstrating that the\ntraining data can be detected even in Japanese, which is the same trend as in\nEnglish. The study warns that domain-specific PLMs, sometimes trained with\nvaluable private data, can ''copy and paste'' on a large scale.\n","authors":["Shotaro Ishihara","Hiromu Takahashi"],"pdf_url":"https://arxiv.org/pdf/2404.17143v2.pdf","comment":"The 17th International Natural Language Generation Conference"},{"id":"http://arxiv.org/abs/2408.06583v3","updated":"2024-08-15T15:24:10Z","published":"2024-08-13T02:43:19Z","title":"An Event Structure-aware Generative Model for Biomedical Event\n Extraction","summary":" Biomedical Event Extraction (BEE) is a challenging task that involves\nmodeling complex relationships between fine-grained entities in biomedical\ntext. BEE has traditionally been formulated as a classification problem. With\nthe recent technological advancements in large language models (LLMs),\ngeneration-based models that cast event extraction as a sequence generation\nproblem have attracted much attention from the NLP research communities.\nHowever, current generative models often overlook the importance of\ncross-instance information from complex event structures such as nested events\nand overlapping events, which contribute quite significantly in the benchmark\ndatasets. In this paper, we propose an event structure-aware generative model\ncalled GenBEE, which can capture complex event structures in biomedical text\nfor biomedical event extraction. In particular, GenBEE constructs event prompts\nthat distill knowledge from LLMs for incorporating both label semantics and\nargument dependency relationships into the proposed model. In addition, GenBEE\nalso generates prefixes with event structural prompts to incorporate structural\nfeatures for improving the model's overall performance. We have evaluated the\nproposed GenBEE model on three widely used biomedical event extraction\nbenchmark datasets, namely MLEE, GE11, and PHEE. 
Experimental results show that\nGenBEE has achieved state-of-the-art performance on the MLEE and GE11 datasets,\nand achieved competitive results when compared to the state-of-the-art\nclassification-based models on the PHEE dataset.\n","authors":["Haohan Yuan","Siu Cheung Hui","Haopeng Zhang"],"pdf_url":"https://arxiv.org/pdf/2408.06583v3.pdf","comment":"8 pages, 4 figures, 6 tables"},{"id":"http://arxiv.org/abs/2408.08212v1","updated":"2024-08-15T15:23:00Z","published":"2024-08-15T15:23:00Z","title":"Covert Bias: The Severity of Social Views' Unalignment Towards Implicit\n and Explicit Opinion","summary":" While various approaches have recently been studied for bias identification,\nlittle is known about how implicit language that does not explicitly convey a\nviewpoint affects bias amplification in large language models.To examine the\nseverity of bias toward a view, we evaluated the performance of two downstream\ntasks where the implicit and explicit knowledge of social groups were used.\nFirst, we present a stress test evaluation by using a biased model in edge\ncases of excessive bias scenarios. Then, we evaluate how LLMs calibrate\nlinguistically in response to both implicit and explicit opinions when they are\naligned with conflicting viewpoints. Our findings reveal a discrepancy in LLM\nperformance in identifying implicit and explicit opinions, with a general\ntendency of bias toward explicit opinions of opposing stances. Moreover, the\nbias-aligned models generate more cautious responses using uncertainty phrases\ncompared to the unaligned (zero-shot) base models. The direct, incautious\nresponses of the unaligned models suggest a need for further refinement of\ndecisiveness by incorporating uncertainty markers to enhance their reliability,\nespecially on socially nuanced topics with high subjectivity.\n","authors":["Abeer Aldayel","Areej Alokaili","Rehab Alahmadi"],"pdf_url":"https://arxiv.org/pdf/2408.08212v1.pdf","comment":"This work is under-review"},{"id":"http://arxiv.org/abs/2407.15508v2","updated":"2024-08-15T15:22:57Z","published":"2024-07-22T09:45:16Z","title":"Compensate Quantization Errors+: Quantized Models Are Inquisitive\n Learners","summary":" Large Language Models (LLMs) showcase remarkable performance and robust\ndeductive capabilities, yet their expansive size complicates deployment and\nraises environmental concerns due to substantial resource consumption. The\nrecent development of a quantization technique known as Learnable\nSingular-value Increment (LSI) has addressed some of these quantization\nchallenges. Leveraging insights from LSI and our extensive research, we have\ndeveloped innovative methods that enhance the performance of quantized LLMs,\nparticularly in low-bit settings. Our methods consistently deliver\nstate-of-the-art results across various quantization scenarios and offer deep\ntheoretical insights into the quantization process, elucidating the potential\nof quantized models for widespread application.\n","authors":["Yifei Gao","Jie Ou","Lei Wang","Fanhua Shang","Jaji Wu","Jun Cheng"],"pdf_url":"https://arxiv.org/pdf/2407.15508v2.pdf","comment":"Effecient Quantization Methods for LLMs"},{"id":"http://arxiv.org/abs/2311.09552v2","updated":"2024-08-15T15:00:12Z","published":"2023-11-16T04:17:47Z","title":"Large Language Models are Few-Shot Training Example Generators: A Case\n Study in Fallacy Recognition","summary":" Recognizing fallacies is crucial for ensuring the quality and validity of\narguments across various domains. 
However, computational fallacy recognition\nfaces challenges due to the diverse genres, domains, and types of fallacies\nfound in datasets. This leads to a highly multi-class, and even multi-label,\nsetup with substantial class imbalance. In this study, we aim to enhance\nexisting models for fallacy recognition by incorporating additional context and\nby leveraging large language models to generate synthetic data, thus increasing\nthe representation of the infrequent classes. We experiment with GPT3.5 to\ngenerate synthetic examples and we examine the impact of prompt settings for\nthis. Moreover, we explore zero-shot and few-shot scenarios to evaluate the\neffectiveness of using the generated examples for training smaller models\nwithin a unified fallacy recognition framework. Furthermore, we analyze the\noverlap between the synthetic data and existing fallacy datasets. Finally, we\ninvestigate the usefulness of providing supplementary context for detecting\nfallacy types that need such context, e.g., diversion fallacies. Our evaluation\nresults demonstrate consistent improvements across fallacy types, datasets, and\ngenerators. The code and the synthetic datasets are all publicly available.\n","authors":["Tariq Alhindi","Smaranda Muresan","Preslav Nakov"],"pdf_url":"https://arxiv.org/pdf/2311.09552v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.15588v2","updated":"2024-08-15T14:52:44Z","published":"2024-07-22T12:25:48Z","title":"Unsupervised Robust Cross-Lingual Entity Alignment via Neighbor Triple\n Matching with Entity and Relation Texts","summary":" Cross-lingual entity alignment (EA) enables the integration of multiple\nknowledge graphs (KGs) across different languages, providing users with\nseamless access to diverse and comprehensive knowledge. Existing methods,\nmostly supervised, face challenges in obtaining labeled entity pairs. To\naddress this, recent studies have shifted towards self-supervised and\nunsupervised frameworks. Despite their effectiveness, these approaches have\nlimitations: (1) Relation passing: mainly focusing on the entity while\nneglecting the semantic information of relations, (2) Isomorphic assumption:\nassuming isomorphism between source and target graphs, which leads to noise and\nreduced alignment accuracy, and (3) Noise vulnerability: susceptible to noise\nin the textual features, especially when encountering inconsistent translations\nor Out-Of-Vocabulary (OOV) problems. In this paper, we propose ERAlign, an\nunsupervised and robust cross-lingual EA pipeline that jointly performs\nEntity-level and Relation-level Alignment by neighbor triple matching strategy\nusing semantic textual features of relations and entities. Its refinement step\niteratively enhances results by fusing entity-level and relation-level\nalignments based on neighbor triple matching. The additional verification step\nexamines the entities' neighbor triples as the linearized text. This\nAlign-then-Verify pipeline rigorously assesses alignment results, achieving\nnear-perfect alignment even in the presence of noisy textual features of\nentities. 
Our extensive experiments demonstrate that the robustness and general\napplicability of ERAlign improved the accuracy and effectiveness of EA tasks,\ncontributing significantly to knowledge-oriented applications.\n","authors":["Soojin Yoon","Sungho Ko","Tongyoung Kim","SeongKu Kang","Jinyoung Yeo","Dongha Lee"],"pdf_url":"https://arxiv.org/pdf/2407.15588v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06162v3","updated":"2024-08-15T13:59:08Z","published":"2024-04-09T09:34:25Z","title":"Characterizing Multimodal Long-form Summarization: A Case Study on\n Financial Reports","summary":" As large language models (LLMs) expand the power of natural language\nprocessing to handle long inputs, rigorous and systematic analyses are\nnecessary to understand their abilities and behavior. A salient application is\nsummarization, due to its ubiquity and controversy (e.g., researchers have\ndeclared the death of summarization). In this paper, we use financial report\nsummarization as a case study because financial reports are not only long but\nalso use numbers and tables extensively. We propose a computational framework\nfor characterizing multimodal long-form summarization and investigate the\nbehavior of Claude 2.0/2.1, GPT-4/3.5, and Cohere. We find that GPT-3.5 and\nCohere fail to perform this summarization task meaningfully. For Claude 2 and\nGPT-4, we analyze the extractiveness of the summary and identify a position\nbias in LLMs. This position bias disappears after shuffling the input for\nClaude, which suggests that Claude seems to recognize important information. We\nalso conduct a comprehensive investigation on the use of numeric data in\nLLM-generated summaries and offer a taxonomy of numeric hallucination. We\nemploy prompt engineering to improve GPT-4's use of numbers with limited\nsuccess. Overall, our analyses highlight the strong capability of Claude 2 in\nhandling long multimodal inputs compared to GPT-4. The generated summaries and\nevaluation code are available at\nhttps://github.com/ChicagoHAI/characterizing-multimodal-long-form-summarization.\n","authors":["Tianyu Cao","Natraj Raman","Danial Dervovic","Chenhao Tan"],"pdf_url":"https://arxiv.org/pdf/2404.06162v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.21783v2","updated":"2024-08-15T13:57:20Z","published":"2024-07-31T17:54:27Z","title":"The Llama 3 Herd of Models","summary":" Modern artificial intelligence (AI) systems are powered by foundation models.\nThis paper presents a new set of foundation models, called Llama 3. It is a\nherd of language models that natively support multilinguality, coding,\nreasoning, and tool usage. Our largest model is a dense Transformer with 405B\nparameters and a context window of up to 128K tokens. This paper presents an\nextensive empirical evaluation of Llama 3. We find that Llama 3 delivers\ncomparable quality to leading language models such as GPT-4 on a plethora of\ntasks. We publicly release Llama 3, including pre-trained and post-trained\nversions of the 405B parameter language model and our Llama Guard 3 model for\ninput and output safety. The paper also presents the results of experiments in\nwhich we integrate image, video, and speech capabilities into Llama 3 via a\ncompositional approach. We observe this approach performs competitively with\nthe state-of-the-art on image, video, and speech recognition tasks. 
The\nresulting models are not yet being broadly released as they are still under\ndevelopment.\n","authors":["Abhimanyu Dubey","Abhinav Jauhri","Abhinav Pandey","Abhishek Kadian","Ahmad Al-Dahle","Aiesha Letman","Akhil Mathur","Alan Schelten","Amy Yang","Angela Fan","Anirudh Goyal","Anthony Hartshorn","Aobo Yang","Archi Mitra","Archie Sravankumar","Artem Korenev","Arthur Hinsvark","Arun Rao","Aston Zhang","Aurelien Rodriguez","Austen Gregerson","Ava Spataru","Baptiste Roziere","Bethany Biron","Binh Tang","Bobbie Chern","Charlotte Caucheteux","Chaya Nayak","Chloe Bi","Chris Marra","Chris McConnell","Christian Keller","Christophe Touret","Chunyang Wu","Corinne Wong","Cristian Canton Ferrer","Cyrus Nikolaidis","Damien Allonsius","Daniel Song","Danielle Pintz","Danny Livshits","David Esiobu","Dhruv Choudhary","Dhruv Mahajan","Diego Garcia-Olano","Diego Perino","Dieuwke Hupkes","Egor Lakomkin","Ehab AlBadawy","Elina Lobanova","Emily Dinan","Eric Michael Smith","Filip Radenovic","Frank Zhang","Gabriel Synnaeve","Gabrielle Lee","Georgia Lewis Anderson","Graeme Nail","Gregoire Mialon","Guan Pang","Guillem Cucurell","Hailey Nguyen","Hannah Korevaar","Hu Xu","Hugo Touvron","Iliyan Zarov","Imanol Arrieta Ibarra","Isabel Kloumann","Ishan Misra","Ivan Evtimov","Jade Copet","Jaewon Lee","Jan Geffert","Jana Vranes","Jason Park","Jay Mahadeokar","Jeet Shah","Jelmer van der Linde","Jennifer Billock","Jenny Hong","Jenya Lee","Jeremy Fu","Jianfeng Chi","Jianyu Huang","Jiawen Liu","Jie Wang","Jiecao Yu","Joanna Bitton","Joe Spisak","Jongsoo Park","Joseph Rocca","Joshua Johnstun","Joshua Saxe","Junteng Jia","Kalyan Vasuden Alwala","Kartikeya Upasani","Kate Plawiak","Ke Li","Kenneth Heafield","Kevin Stone","Khalid El-Arini","Krithika Iyer","Kshitiz Malik","Kuenley Chiu","Kunal Bhalla","Lauren Rantala-Yeary","Laurens van der Maaten","Lawrence Chen","Liang Tan","Liz Jenkins","Louis Martin","Lovish Madaan","Lubo Malo","Lukas Blecher","Lukas Landzaat","Luke de Oliveira","Madeline Muzzi","Mahesh Pasupuleti","Mannat Singh","Manohar Paluri","Marcin Kardas","Mathew Oldham","Mathieu Rita","Maya Pavlova","Melanie Kambadur","Mike Lewis","Min Si","Mitesh Kumar Singh","Mona Hassan","Naman Goyal","Narjes Torabi","Nikolay Bashlykov","Nikolay Bogoychev","Niladri Chatterji","Olivier Duchenne","Onur Çelebi","Patrick Alrassy","Pengchuan Zhang","Pengwei Li","Petar Vasic","Peter Weng","Prajjwal Bhargava","Pratik Dubal","Praveen Krishnan","Punit Singh Koura","Puxin Xu","Qing He","Qingxiao Dong","Ragavan Srinivasan","Raj Ganapathy","Ramon Calderer","Ricardo Silveira Cabral","Robert Stojnic","Roberta Raileanu","Rohit Girdhar","Rohit Patel","Romain Sauvestre","Ronnie Polidoro","Roshan Sumbaly","Ross Taylor","Ruan Silva","Rui Hou","Rui Wang","Saghar Hosseini","Sahana Chennabasappa","Sanjay Singh","Sean Bell","Seohyun Sonia Kim","Sergey Edunov","Shaoliang Nie","Sharan Narang","Sharath Raparthy","Sheng Shen","Shengye Wan","Shruti Bhosale","Shun Zhang","Simon Vandenhende","Soumya Batra","Spencer Whitman","Sten Sootla","Stephane Collot","Suchin Gururangan","Sydney Borodinsky","Tamar Herman","Tara Fowler","Tarek Sheasha","Thomas Georgiou","Thomas Scialom","Tobias Speckbacher","Todor Mihaylov","Tong Xiao","Ujjwal Karn","Vedanuj Goswami","Vibhor Gupta","Vignesh Ramanathan","Viktor Kerkez","Vincent Gonguet","Virginie Do","Vish Vogeti","Vladan Petrovic","Weiwei Chu","Wenhan Xiong","Wenyin Fu","Whitney Meers","Xavier Martinet","Xiaodong Wang","Xiaoqing Ellen Tan","Xinfeng Xie","Xuchao Jia","Xuewei Wang","Yaelle Goldschlag","Yashesh Gaur","Yasmine 
Babaei","Yi Wen","Yiwen Song","Yuchen Zhang","Yue Li","Yuning Mao","Zacharie Delpierre Coudert","Zheng Yan","Zhengxing Chen","Zoe Papakipos","Aaditya Singh","Aaron Grattafiori","Abha Jain","Adam Kelsey","Adam Shajnfeld","Adithya Gangidi","Adolfo Victoria","Ahuva Goldstand","Ajay Menon","Ajay Sharma","Alex Boesenberg","Alex Vaughan","Alexei Baevski","Allie Feinstein","Amanda Kallet","Amit Sangani","Anam Yunus","Andrei Lupu","Andres Alvarado","Andrew Caples","Andrew Gu","Andrew Ho","Andrew Poulton","Andrew Ryan","Ankit Ramchandani","Annie Franco","Aparajita Saraf","Arkabandhu Chowdhury","Ashley Gabriel","Ashwin Bharambe","Assaf Eisenman","Azadeh Yazdan","Beau James","Ben Maurer","Benjamin Leonhardi","Bernie Huang","Beth Loyd","Beto De Paola","Bhargavi Paranjape","Bing Liu","Bo Wu","Boyu Ni","Braden Hancock","Bram Wasti","Brandon Spence","Brani Stojkovic","Brian Gamido","Britt Montalvo","Carl Parker","Carly Burton","Catalina Mejia","Changhan Wang","Changkyu Kim","Chao Zhou","Chester Hu","Ching-Hsiang Chu","Chris Cai","Chris Tindal","Christoph Feichtenhofer","Damon Civin","Dana Beaty","Daniel Kreymer","Daniel Li","Danny Wyatt","David Adkins","David Xu","Davide Testuggine","Delia David","Devi Parikh","Diana Liskovich","Didem Foss","Dingkang Wang","Duc Le","Dustin Holland","Edward Dowling","Eissa Jamil","Elaine Montgomery","Eleonora Presani","Emily Hahn","Emily Wood","Erik Brinkman","Esteban Arcaute","Evan Dunbar","Evan Smothers","Fei Sun","Felix Kreuk","Feng Tian","Firat Ozgenel","Francesco Caggioni","Francisco Guzmán","Frank Kanayet","Frank Seide","Gabriela Medina Florez","Gabriella Schwarz","Gada Badeer","Georgia Swee","Gil Halpern","Govind Thattai","Grant Herman","Grigory Sizov"," Guangyi"," Zhang","Guna Lakshminarayanan","Hamid Shojanazeri","Han Zou","Hannah Wang","Hanwen Zha","Haroun Habeeb","Harrison Rudolph","Helen Suk","Henry Aspegren","Hunter Goldman","Ibrahim Damlaj","Igor Molybog","Igor Tufanov","Irina-Elena Veliche","Itai Gat","Jake Weissman","James Geboski","James Kohli","Japhet Asher","Jean-Baptiste Gaya","Jeff Marcus","Jeff Tang","Jennifer Chan","Jenny Zhen","Jeremy Reizenstein","Jeremy Teboul","Jessica Zhong","Jian Jin","Jingyi Yang","Joe Cummings","Jon Carvill","Jon Shepard","Jonathan McPhie","Jonathan Torres","Josh Ginsburg","Junjie Wang","Kai Wu","Kam Hou U","Karan Saxena","Karthik Prasad","Kartikay Khandelwal","Katayoun Zand","Kathy Matosich","Kaushik Veeraraghavan","Kelly Michelena","Keqian Li","Kun Huang","Kunal Chawla","Kushal Lakhotia","Kyle Huang","Lailin Chen","Lakshya Garg","Lavender A","Leandro Silva","Lee Bell","Lei Zhang","Liangpeng Guo","Licheng Yu","Liron Moshkovich","Luca Wehrstedt","Madian Khabsa","Manav Avalani","Manish Bhatt","Maria Tsimpoukelli","Martynas Mankus","Matan Hasson","Matthew Lennie","Matthias Reso","Maxim Groshev","Maxim Naumov","Maya Lathi","Meghan Keneally","Michael L. 
Seltzer","Michal Valko","Michelle Restrepo","Mihir Patel","Mik Vyatskov","Mikayel Samvelyan","Mike Clark","Mike Macey","Mike Wang","Miquel Jubert Hermoso","Mo Metanat","Mohammad Rastegari","Munish Bansal","Nandhini Santhanam","Natascha Parks","Natasha White","Navyata Bawa","Nayan Singhal","Nick Egebo","Nicolas Usunier","Nikolay Pavlovich Laptev","Ning Dong","Ning Zhang","Norman Cheng","Oleg Chernoguz","Olivia Hart","Omkar Salpekar","Ozlem Kalinli","Parkin Kent","Parth Parekh","Paul Saab","Pavan Balaji","Pedro Rittner","Philip Bontrager","Pierre Roux","Piotr Dollar","Polina Zvyagina","Prashant Ratanchandani","Pritish Yuvraj","Qian Liang","Rachad Alao","Rachel Rodriguez","Rafi Ayub","Raghotham Murthy","Raghu Nayani","Rahul Mitra","Raymond Li","Rebekkah Hogan","Robin Battey","Rocky Wang","Rohan Maheswari","Russ Howes","Ruty Rinott","Sai Jayesh Bondu","Samyak Datta","Sara Chugh","Sara Hunt","Sargun Dhillon","Sasha Sidorov","Satadru Pan","Saurabh Verma","Seiji Yamamoto","Sharadh Ramaswamy","Shaun Lindsay","Shaun Lindsay","Sheng Feng","Shenghao Lin","Shengxin Cindy Zha","Shiva Shankar","Shuqiang Zhang","Shuqiang Zhang","Sinong Wang","Sneha Agarwal","Soji Sajuyigbe","Soumith Chintala","Stephanie Max","Stephen Chen","Steve Kehoe","Steve Satterfield","Sudarshan Govindaprasad","Sumit Gupta","Sungmin Cho","Sunny Virk","Suraj Subramanian","Sy Choudhury","Sydney Goldman","Tal Remez","Tamar Glaser","Tamara Best","Thilo Kohler","Thomas Robinson","Tianhe Li","Tianjun Zhang","Tim Matthews","Timothy Chou","Tzook Shaked","Varun Vontimitta","Victoria Ajayi","Victoria Montanez","Vijai Mohan","Vinay Satish Kumar","Vishal Mangla","Vítor Albiero","Vlad Ionescu","Vlad Poenaru","Vlad Tiberiu Mihailescu","Vladimir Ivanov","Wei Li","Wenchen Wang","Wenwen Jiang","Wes Bouaziz","Will Constable","Xiaocheng Tang","Xiaofang Wang","Xiaojian Wu","Xiaolan Wang","Xide Xia","Xilun Wu","Xinbo Gao","Yanjun Chen","Ye Hu","Ye Jia","Ye Qi","Yenda Li","Yilin Zhang","Ying Zhang","Yossi Adi","Youngjin Nam"," Yu"," Wang","Yuchen Hao","Yundi Qian","Yuzi He","Zach Rait","Zachary DeVito","Zef Rosnbrick","Zhaoduo Wen","Zhenyu Yang","Zhiwei Zhao"],"pdf_url":"https://arxiv.org/pdf/2407.21783v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08152v1","updated":"2024-08-15T13:40:03Z","published":"2024-08-15T13:40:03Z","title":"DeepSeek-Prover-V1.5: Harnessing Proof Assistant Feedback for\n Reinforcement Learning and Monte-Carlo Tree Search","summary":" We introduce DeepSeek-Prover-V1.5, an open-source language model designed for\ntheorem proving in Lean 4, which enhances DeepSeek-Prover-V1 by optimizing both\ntraining and inference processes. Pre-trained on DeepSeekMath-Base with\nspecialization in formal mathematical languages, the model undergoes supervised\nfine-tuning using an enhanced formal theorem proving dataset derived from\nDeepSeek-Prover-V1. Further refinement is achieved through reinforcement\nlearning from proof assistant feedback (RLPAF). Beyond the single-pass\nwhole-proof generation approach of DeepSeek-Prover-V1, we propose RMaxTS, a\nvariant of Monte-Carlo tree search that employs an intrinsic-reward-driven\nexploration strategy to generate diverse proof paths. DeepSeek-Prover-V1.5\ndemonstrates significant improvements over DeepSeek-Prover-V1, achieving new\nstate-of-the-art results on the test set of the high school level miniF2F\nbenchmark ($63.5\\%$) and the undergraduate level ProofNet benchmark ($25.3\\%$).\n","authors":["Huajian Xin","Z. Z. 
Ren","Junxiao Song","Zhihong Shao","Wanjia Zhao","Haocheng Wang","Bo Liu","Liyue Zhang","Xuan Lu","Qiushi Du","Wenjun Gao","Qihao Zhu","Dejian Yang","Zhibin Gou","Z. F. Wu","Fuli Luo","Chong Ruan"],"pdf_url":"https://arxiv.org/pdf/2408.08152v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.01040v2","updated":"2024-08-15T13:36:43Z","published":"2024-05-02T06:52:49Z","title":"Few Shot Class Incremental Learning using Vision-Language models","summary":" Recent advancements in deep learning have demonstrated remarkable performance\ncomparable to human capabilities across various supervised computer vision\ntasks. However, the prevalent assumption of having an extensive pool of\ntraining data encompassing all classes prior to model training often diverges\nfrom real-world scenarios, where limited data availability for novel classes is\nthe norm. The challenge emerges in seamlessly integrating new classes with few\nsamples into the training data, demanding the model to adeptly accommodate\nthese additions without compromising its performance on base classes. To\naddress this exigency, the research community has introduced several solutions\nunder the realm of few-shot class incremental learning (FSCIL).\n In this study, we introduce an innovative FSCIL framework that utilizes\nlanguage regularizer and subspace regularizer. During base training, the\nlanguage regularizer helps incorporate semantic information extracted from a\nVision-Language model. The subspace regularizer helps in facilitating the\nmodel's acquisition of nuanced connections between image and text semantics\ninherent to base classes during incremental training. Our proposed framework\nnot only empowers the model to embrace novel classes with limited data, but\nalso ensures the preservation of performance on base classes. To substantiate\nthe efficacy of our approach, we conduct comprehensive experiments on three\ndistinct FSCIL benchmarks, where our framework attains state-of-the-art\nperformance.\n","authors":["Anurag Kumar","Chinmay Bharti","Saikat Dutta","Srikrishna Karanam","Biplab Banerjee"],"pdf_url":"https://arxiv.org/pdf/2405.01040v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08147v1","updated":"2024-08-15T13:32:25Z","published":"2024-08-15T13:32:25Z","title":"P/D-Serve: Serving Disaggregated Large Language Model at Scale","summary":" Serving disaggregated large language models (LLMs) over tens of thousands of\nxPU devices (GPUs or NPUs) with reliable performance faces multiple challenges.\n1) Ignoring the diversity (various prefixes and tidal requests), treating all\nthe prompts in a mixed pool is inadequate. To facilitate the similarity per\nscenario and minimize the inner mismatch on P/D (prefill and decoding)\nprocessing, fine-grained organization is required, dynamically adjusting P/D\nratios for better performance. 2) Due to inaccurate estimation on workload\n(queue status or maintained connections), the global scheduler easily incurs\nunnecessary timeouts in prefill. 3) Block-fixed device-to-device (D2D) KVCache\ntransfer over cluster-level RDMA (remote direct memory access) fails to achieve\ndesired D2D utilization as expected. 
To overcome previous problems, this paper\nproposes an end-to-end system P/D-Serve, complying with the paradigm of MLOps\n(machine learning operations), which models end-to-end (E2E) P/D performance\nand enables: 1) fine-grained P/D organization, mapping the service with RoCE\n(RDMA over converged ethernet) as needed, to facilitate similar processing and\ndynamic adjustments on P/D ratios; 2) on-demand forwarding upon rejections for\nidle prefill, decoupling the scheduler from regular inaccurate reports and\nlocal queues, to avoid timeouts in prefill; and 3) efficient KVCache transfer\nvia optimized D2D access. P/D-Serve is implemented upon Ascend and MindSpore,\nhas been deployed over tens of thousands of NPUs for more than eight months in\ncommercial use, and further achieves 60\\%, 42\\% and 46\\% improvements on E2E\nthroughput, time-to-first-token (TTFT) SLO (service level objective) and D2D\ntransfer time. As the E2E system with optimizations, P/D-Serve achieves 6.7x\nincrease on throughput, compared with aggregated LLMs.\n","authors":["Yibo Jin","Tao Wang","Huimin Lin","Mingyang Song","Peiyang Li","Yipeng Ma","Yicheng Shan","Zhengfan Yuan","Cailong Li","Yajing Sun","Tiandeng Wu","Xing Chu","Ruizhi Huan","Li Ma","Xiao You","Wenting Zhou","Yunpeng Ye","Wen Liu","Xiangkun Xu","Yongsheng Zhang","Tiantian Dong","Jiawei Zhu","Zhe Wang","Xijian Ju","Jianxun Song","Haoliang Cheng","Xiaojing Li","Jiandong Ding","Hefei Guo","Zhengyong Zhang"],"pdf_url":"https://arxiv.org/pdf/2408.08147v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08146v1","updated":"2024-08-15T13:29:48Z","published":"2024-08-15T13:29:48Z","title":"KOALA: Enhancing Speculative Decoding for LLM via Multi-Layer Draft\n Heads with Adversarial Learning","summary":" Large Language Models (LLMs) exhibit high inference latency due to their\nautoregressive decoding nature. While the draft head in speculative decoding\nmitigates this issue, its full potential remains unexplored. In this paper, we\nintroduce KOALA (K-layer Optimized Adversarial Learning Architecture), an\northogonal approach to the draft head. By transforming the conventional\nsingle-layer draft head into a multi-layer architecture and incorporating\nadversarial learning into the traditional supervised training, KOALA\nsignificantly improves the accuracy of the draft head in predicting subsequent\ntokens, thus more closely mirroring the functionality of LLMs. Although this\nimprovement comes at the cost of slightly increased drafting overhead, KOALA\nsubstantially unlocks the draft head's potential, greatly enhancing speculative\ndecoding. We conducted comprehensive evaluations of KOALA, including both\nautoregressive and non-autoregressive draft heads across various tasks,\ndemonstrating a latency speedup ratio improvement of 0.24x-0.41x, which is\n10.57%-14.09% faster than the original draft heads.\n","authors":["Kaiqi Zhang","Jing Zhao","Rui Chen"],"pdf_url":"https://arxiv.org/pdf/2408.08146v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08144v1","updated":"2024-08-15T13:28:18Z","published":"2024-08-15T13:28:18Z","title":"MIDAS: Multi-level Intent, Domain, And Slot Knowledge Distillation for\n Multi-turn NLU","summary":" Although Large Language Models(LLMs) can generate coherent and contextually\nrelevant text, they often struggle to recognise the intent behind the human\nuser's query. Natural Language Understanding (NLU) models, however, interpret\nthe purpose and key information of user's input to enable responsive\ninteractions. 
Existing NLU models generally map individual utterances to a\ndual-level semantic frame, involving sentence-level intent and word-level slot\nlabels. However, real-life conversations primarily consist of multi-turn\nconversations, involving the interpretation of complex and extended dialogues.\nResearchers encounter challenges addressing all facets of multi-turn dialogue\nconversations using a unified single NLU model. This paper introduces a novel\napproach, MIDAS, leveraging a multi-level intent, domain, and slot knowledge\ndistillation for multi-turn NLU. To achieve this, we construct distinct\nteachers for varying levels of conversation knowledge, namely, sentence-level\nintent detection, word-level slot filling, and conversation-level domain\nclassification. These teachers are then fine-tuned to acquire specific\nknowledge of their designated levels. A multi-teacher loss is proposed to\nfacilitate the combination of these multi-level teachers, guiding a student\nmodel in multi-turn dialogue tasks. The experimental results demonstrate the\nefficacy of our model in improving the overall multi-turn conversation\nunderstanding, showcasing the potential for advancements in NLU models through\nthe incorporation of multi-level dialogue knowledge distillation techniques.\n","authors":["Yan Li","So-Eon Kim","Seong-Bae Park","Soyeon Caren Han"],"pdf_url":"https://arxiv.org/pdf/2408.08144v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.06930v2","updated":"2024-08-15T12:42:12Z","published":"2024-08-13T14:33:32Z","title":"Diagnosis extraction from unstructured Dutch echocardiogram reports\n using span- and document-level characteristic classification","summary":" Clinical machine learning research and AI driven clinical decision support\nmodels rely on clinically accurate labels. Manually extracting these labels\nwith the help of clinical specialists is often time-consuming and expensive.\nThis study tests the feasibility of automatic span- and document-level\ndiagnosis extraction from unstructured Dutch echocardiogram reports. We\nincluded 115,692 unstructured echocardiogram reports from the UMCU a large\nuniversity hospital in the Netherlands. A randomly selected subset was manually\nannotated for the occurrence and severity of eleven commonly described cardiac\ncharacteristics. We developed and tested several automatic labelling techniques\nat both span and document levels, using weighted and macro F1-score, precision,\nand recall for performance evaluation. We compared the performance of span\nlabelling against document labelling methods, which included both direct\ndocument classifiers and indirect document classifiers that rely on span\nclassification results. The SpanCategorizer and MedRoBERTa$.$nl models\noutperformed all other span and document classifiers, respectively. The\nweighted F1-score varied between characteristics, ranging from 0.60 to 0.93 in\nSpanCategorizer and 0.96 to 0.98 in MedRoBERTa$.$nl. Direct document\nclassification was superior to indirect document classification using span\nclassifiers. SetFit achieved competitive document classification performance\nusing only 10% of the training data. Utilizing a reduced label set yielded\nnear-perfect document classification results. We recommend using our published\nSpanCategorizer and MedRoBERTa$.$nl models for span- and document-level\ndiagnosis extraction from Dutch echocardiography reports. 
For settings with\nlimited training data, SetFit may be a promising alternative for document\nclassification.\n","authors":["Bauke Arends","Melle Vessies","Dirk van Osch","Arco Teske","Pim van der Harst","René van Es","Bram van Es"],"pdf_url":"https://arxiv.org/pdf/2408.06930v2.pdf","comment":"28 pages, 5 figures"},{"id":"http://arxiv.org/abs/2406.12614v2","updated":"2024-08-15T11:43:23Z","published":"2024-06-18T13:43:22Z","title":"EUvsDisinfo: a Dataset for Multilingual Detection of Pro-Kremlin\n Disinformation in News Articles","summary":" This work introduces EUvsDisinfo, a multilingual dataset of trustworthy and\ndisinformation articles related to pro-Kremlin themes. It is sourced directly\nfrom the debunk articles written by experts leading the EUvsDisinfo project.\nOur dataset is the largest to-date resource in terms of the overall number of\narticles and distinct languages. It also provides the largest topical and\ntemporal coverage. Using this dataset, we investigate the dissemination of\npro-Kremlin disinformation across different languages, uncovering\nlanguage-specific patterns targeting specific disinformation topics. We further\nanalyse the evolution of topic distribution over an eight-year period, noting a\nsignificant surge in disinformation content before the full-scale invasion of\nUkraine in 2022. Lastly, we demonstrate the dataset's applicability in training\nmodels to effectively distinguish between disinformation and trustworthy\ncontent in multilingual settings.\n","authors":["João A. Leite","Olesya Razuvayevskaya","Kalina Bontcheva","Carolina Scarton"],"pdf_url":"https://arxiv.org/pdf/2406.12614v2.pdf","comment":"Published at CIKM 2024"},{"id":"http://arxiv.org/abs/2408.08089v1","updated":"2024-08-15T11:33:20Z","published":"2024-08-15T11:33:20Z","title":"AgentCourt: Simulating Court with Adversarial Evolvable Lawyer Agents","summary":" In this paper, we present a simulation system called AgentCourt that\nsimulates the entire courtroom process. The judge, plaintiff's lawyer, defense\nlawyer, and other participants are autonomous agents driven by large language\nmodels (LLMs). Our core goal is to enable lawyer agents to learn how to argue a\ncase, as well as improving their overall legal skills, through courtroom\nprocess simulation. To achieve this goal, we propose an adversarial\nevolutionary approach for the lawyer-agent. Since AgentCourt can simulate the\noccurrence and development of court hearings based on a knowledge base and LLM,\nthe lawyer agents can continuously learn and accumulate experience from real\ncourt cases. The simulation experiments show that after two lawyer-agents have\nengaged in a thousand adversarial legal cases in AgentCourt (which can take a\ndecade for real-world lawyers), compared to their pre-evolutionary state, the\nevolved lawyer agents exhibit consistent improvement in their ability to handle\nlegal tasks. To enhance the credibility of our experimental results, we\nenlisted a panel of professional lawyers to evaluate our simulations. The\nevaluation indicates that the evolved lawyer agents exhibit notable\nadvancements in responsiveness, as well as expertise and logical rigor. This\nwork paves the way for advancing LLM-driven agent technology in legal\nscenarios. 
Code is available at https://github.com/relic-yuexi/AgentCourt.\n","authors":["Guhong Chen","Liyang Fan","Zihan Gong","Nan Xie","Zixuan Li","Ziqiang Liu","Chengming Li","Qiang Qu","Shiwen Ni","Min Yang"],"pdf_url":"https://arxiv.org/pdf/2408.08089v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08073v1","updated":"2024-08-15T10:54:55Z","published":"2024-08-15T10:54:55Z","title":"Extracting Sentence Embeddings from Pretrained Transformer Models","summary":" Background/introduction: Pre-trained transformer models shine in many natural\nlanguage processing tasks and therefore are expected to bear the representation\nof the input sentence or text meaning. These sentence-level embeddings are also\nimportant in retrieval-augmented generation. But do commonly used plain\naveraging or prompt templates surface it enough?\n Methods: Given 110M parameters BERT's hidden representations from multiple\nlayers and multiple tokens we tried various ways to extract optimal sentence\nrepresentations. We tested various token aggregation and representation\npost-processing techniques. We also tested multiple ways of using a general\nWikitext dataset to complement BERTs sentence representations. All methods were\ntested on 8 Semantic Textual Similarity (STS), 6 short text clustering, and 12\nclassification tasks. We also evaluated our representation-shaping techniques\non other static models, including random token representations.\n Results: Proposed representation extraction methods improved the performance\non STS and clustering tasks for all models considered. Very high improvements\nfor static token-based models, especially random embeddings for STS tasks\nalmost reach the performance of BERT-derived representations.\n Conclusions: Our work shows that for multiple tasks simple baselines with\nrepresentation shaping techniques reach or even outperform more complex\nBERT-based models or are able to contribute to their performance.\n","authors":["Lukas Stankevičius","Mantas Lukoševičius"],"pdf_url":"https://arxiv.org/pdf/2408.08073v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08072v1","updated":"2024-08-15T10:44:38Z","published":"2024-08-15T10:44:38Z","title":"I-SHEEP: Self-Alignment of LLM from Scratch through an Iterative\n Self-Enhancement Paradigm","summary":" Large Language Models (LLMs) have achieved significant advancements, however,\nthe common learning paradigm treats LLMs as passive information repositories,\nneglecting their potential for active learning and alignment. Some approaches\ntrain LLMs using their own generated synthetic data, exploring the possibility\nof active alignment. However, there is still a huge gap between these one-time\nalignment methods and the continuous automatic alignment of humans. In this\npaper, we introduce \\textbf{I-SHEEP}, an \\textbf{I}terative\n\\textbf{S}elf-En\\textbf{H}anc\\textbf{E}m\\textbf{E}nt \\textbf{P}aradigm.This\nhuman-like paradigm enables LLMs to \\textbf{continuously self-align from\nscratch with nothing}. Compared to the one-time alignment method Dromedary\n\\cite{sun2023principledriven}, which refers to the first iteration in this\npaper, I-SHEEP can significantly enhance capacities on both Qwen and Llama\nmodels. I-SHEEP achieves a maximum relative improvement of 78.2\\% in the Alpaca\nEval, 24.0\\% in the MT Bench, and an absolute increase of 8.88\\% in the IFEval\naccuracy over subsequent iterations in Qwen-1.5 72B model. 
Additionally,\nI-SHEEP surpasses the base model in various standard benchmark generation\ntasks, achieving an average improvement of 24.77\\% in code generation tasks,\n12.04\\% in TrivialQA, and 20.29\\% in SQuAD. We also provide new insights based\non the experiment results. Our codes, datasets, and models are available at\n\\textbf{https://anonymous.4open.science/r/I-SHEEP}.\n","authors":["Yiming Liang","Ge Zhang","Xingwei Qu","Tianyu Zheng","Jiawei Guo","Xinrun Du","Zhenzhu Yang","Jiaheng Liu","Chenghua Lin","Lei Ma","Wenhao Huang","Jiajun Zhang"],"pdf_url":"https://arxiv.org/pdf/2408.08072v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.05661v2","updated":"2024-08-15T10:27:45Z","published":"2024-06-09T06:30:28Z","title":"MS-HuBERT: Mitigating Pre-training and Inference Mismatch in Masked\n Language Modelling methods for learning Speech Representations","summary":" In recent years, self-supervised pre-training methods have gained significant\ntraction in learning high-level information from raw speech. Among these\nmethods, HuBERT has demonstrated SOTA performance in automatic speech\nrecognition (ASR). However, HuBERT's performance lags behind data2vec due to\ndisparities in pre-training strategies. In this paper, we propose (i) a Swap\nmethod to address pre-training and inference mismatch observed in HuBERT and\n(ii) incorporates Multicluster masked prediction loss for more effective\nutilization of the models capacity. The resulting method is, MS-HuBERT, an\nend-to-end self-supervised pre-training method for learning robust speech\nrepresentations. It beats vanilla HuBERT on the ASR Librispeech benchmark on\naverage by a 5% margin when evaluated on different finetuning splits.\nAdditionally, we demonstrate that the learned embeddings obtained during\npre-training encode essential information for improving performance of content\nbased tasks such as ASR.\n","authors":["Hemant Yadav","Sunayana Sitaram","Rajiv Ratn Shah"],"pdf_url":"https://arxiv.org/pdf/2406.05661v2.pdf","comment":"4 pages, submitted to interspeech2024"},{"id":"http://arxiv.org/abs/2408.08067v1","updated":"2024-08-15T10:20:54Z","published":"2024-08-15T10:20:54Z","title":"RAGChecker: A Fine-grained Framework for Diagnosing Retrieval-Augmented\n Generation","summary":" Despite Retrieval-Augmented Generation (RAG) has shown promising capability\nin leveraging external knowledge, a comprehensive evaluation of RAG systems is\nstill challenging due to the modular nature of RAG, evaluation of long-form\nresponses and reliability of measurements. In this paper, we propose a\nfine-grained evaluation framework, RAGChecker, that incorporates a suite of\ndiagnostic metrics for both the retrieval and generation modules. Meta\nevaluation verifies that RAGChecker has significantly better correlations with\nhuman judgments than other evaluation metrics. 
Using RAGChecker, we evaluate 8\nRAG systems and conduct an in-depth analysis of their performance, revealing\ninsightful patterns and trade-offs in the design choices of RAG architectures.\nThe metrics of RAGChecker can guide researchers and practitioners in developing\nmore effective RAG systems.\n","authors":["Dongyu Ru","Lin Qiu","Xiangkun Hu","Tianhang Zhang","Peng Shi","Shuaichen Chang","Jiayang Cheng","Cunxiang Wang","Shichao Sun","Huanyu Li","Zizhao Zhang","Binjie Wang","Jiarong Jiang","Tong He","Zhiguo Wang","Pengfei Liu","Yue Zhang","Zheng Zhang"],"pdf_url":"https://arxiv.org/pdf/2408.08067v1.pdf","comment":"Under Review"},{"id":"http://arxiv.org/abs/2404.09170v4","updated":"2024-08-15T10:12:54Z","published":"2024-04-14T07:19:27Z","title":"Distilling Reasoning Ability from Large Language Models with Adaptive\n Thinking","summary":" Chain of thought finetuning (cot-finetuning) aims to endow small language\nmodels (SLM) with reasoning ability to improve their performance towards\nspecific tasks by allowing them to imitate the reasoning procedure of large\nlanguage models (LLM) beyond simply predicting the answers. Most existing\ncot-finetuning methods adopt a pre-thinking mechanism, allowing the SLM to\ngenerate a rationale before providing an answer. This mechanism enables SLM to\nanalyze and think about complex questions, but it also makes answer correctness\nhighly sensitive to minor errors in rationale. Therefore, we propose a robust\npost-thinking mechanism to generate answers before rationale. Thanks to this\nanswer-first setting, 1) the answer can escape from the adverse effects caused\nby minor errors in the rationale; 2) the rationale serves as an error amplifier\nto the answer, which makes the SLM focus on learning hard samples; 3) the\ninferring efficiency can also benefit from the setting since users can stop the\ngeneration right after answers are outputted when inference is conducted.\nHowever, although the post-thinking mechanism brings many advantages and\nimproves the overall performance of SLM on specific tasks, it may lose the\nability to think about the questions and decompose complex questions into\nsimple sub-questions compared to pre-thinking mechanism. Therefore, a\nplug-and-play adaptive-thinking mechanism is proposed with the aid of the soft\nprompt tuning to integrate the merits of the pre-thinking mechanism and\npost-thinking mechanism, in which a perception module is introduced to\nadaptively prompt SLM answer or think first based on perceiving the complexity\nof the questions. Extensive experiments are conducted across 12 reasoning tasks\nand 2 representative language models to demonstrate the effectiveness of the\nproposed mechanism.\n","authors":["Xiaoshu Chen","Sihang Zhou","Ke Liang","Xinwang Liu"],"pdf_url":"https://arxiv.org/pdf/2404.09170v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.20770v2","updated":"2024-08-15T10:03:40Z","published":"2024-05-24T07:23:56Z","title":"Large Language Model Sentinel: LLM Agent for Adversarial Purification","summary":" Over the past two years, the use of large language models (LLMs) has advanced\nrapidly. While these LLMs offer considerable convenience, they also raise\nsecurity concerns, as LLMs are vulnerable to adversarial attacks by some\nwell-designed textual perturbations. 
In this paper, we introduce a novel\ndefense technique named Large LAnguage MOdel Sentinel (LLAMOS), which is\ndesigned to enhance the adversarial robustness of LLMs by purifying the\nadversarial textual examples before feeding them into the target LLM. Our\nmethod comprises two main components: a) Agent instruction, which can simulate\na new agent for adversarial defense, altering minimal characters to maintain\nthe original meaning of the sentence while defending against attacks; b)\nDefense guidance, which provides strategies for modifying clean or adversarial\nexamples to ensure effective defense and accurate outputs from the target LLMs.\nRemarkably, the defense agent demonstrates robust defensive capabilities even\nwithout learning from adversarial examples. Additionally, we conduct an\nintriguing adversarial experiment where we develop two agents, one for defense\nand one for attack, and engage them in mutual confrontation. During the\nadversarial interactions, neither agent completely beat the other. Extensive\nexperiments on both open-source and closed-source LLMs demonstrate that our\nmethod effectively defends against adversarial attacks, thereby enhancing\nadversarial robustness.\n","authors":["Guang Lin","Qibin Zhao"],"pdf_url":"https://arxiv.org/pdf/2405.20770v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.13764v3","updated":"2024-08-15T10:03:37Z","published":"2023-12-21T11:43:41Z","title":"A Semantic Space is Worth 256 Language Descriptions: Make Stronger\n Segmentation Models with Descriptive Properties","summary":" This paper introduces ProLab, a novel approach using property-level label\nspace for creating strong interpretable segmentation models. Instead of relying\nsolely on category-specific annotations, ProLab uses descriptive properties\ngrounded in common sense knowledge for supervising segmentation models. It is\nbased on two core designs. First, we employ Large Language Models (LLMs) and\ncarefully crafted prompts to generate descriptions of all involved categories\nthat carry meaningful common sense knowledge and follow a structured format.\nSecond, we introduce a description embedding model preserving semantic\ncorrelation across descriptions and then cluster them into a set of descriptive\nproperties (e.g., 256) using K-Means. These properties are based on\ninterpretable common sense knowledge consistent with theories of human\nrecognition. We empirically show that our approach makes segmentation models\nperform stronger on five classic benchmarks (e.g., ADE20K, COCO-Stuff, Pascal\nContext, Cityscapes, and BDD). Our method also shows better scalability with\nextended training steps than category-level supervision. Our interpretable\nsegmentation framework also emerges with the generalization ability to segment\nout-of-domain or unknown categories using only in-domain descriptive\nproperties. Code is available at https://github.com/lambert-x/ProLab.\n","authors":["Junfei Xiao","Ziqi Zhou","Wenxuan Li","Shiyi Lan","Jieru Mei","Zhiding Yu","Alan Yuille","Yuyin Zhou","Cihang Xie"],"pdf_url":"https://arxiv.org/pdf/2312.13764v3.pdf","comment":"Accepted to ECCV 2024. 
Code is available at\n https://github.com/lambert-x/ProLab"},{"id":"http://arxiv.org/abs/2408.08054v1","updated":"2024-08-15T09:48:45Z","published":"2024-08-15T09:48:45Z","title":"Text2BIM: Generating Building Models Using a Large Language Model-based\n Multi-Agent Framework","summary":" The conventional BIM authoring process typically requires designers to master\ncomplex and tedious modeling commands in order to materialize their design\nintentions within BIM authoring tools. This additional cognitive burden\ncomplicates the design process and hinders the adoption of BIM and model-based\ndesign in the AEC (Architecture, Engineering, and Construction) industry. To\nfacilitate the expression of design intentions more intuitively, we propose\nText2BIM, an LLM-based multi-agent framework that can generate 3D building\nmodels from natural language instructions. This framework orchestrates multiple\nLLM agents to collaborate and reason, transforming textual user input into\nimperative code that invokes the BIM authoring tool's APIs, thereby generating\neditable BIM models with internal layouts, external envelopes, and semantic\ninformation directly in the software. Furthermore, a rule-based model checker\nis introduced into the agentic workflow, utilizing predefined domain knowledge\nto guide the LLM agents in resolving issues within the generated models and\niteratively improving model quality. Extensive experiments were conducted to\ncompare and analyze the performance of three different LLMs under the proposed\nframework. The evaluation results demonstrate that our approach can effectively\ngenerate high-quality, structurally rational building models that are aligned\nwith the abstract concepts specified by user input. Finally, an interactive\nsoftware prototype was developed to integrate the framework into the BIM\nauthoring software Vectorworks, showcasing the potential of modeling by\nchatting.\n","authors":["Changyu Du","Sebastian Esser","Stavros Nousias","André Borrmann"],"pdf_url":"https://arxiv.org/pdf/2408.08054v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08027v1","updated":"2024-08-15T08:50:58Z","published":"2024-08-15T08:50:58Z","title":"Enhancing Large Language Model-based Speech Recognition by\n Contextualization for Rare and Ambiguous Words","summary":" We develop a large language model (LLM) based automatic speech recognition\n(ASR) system that can be contextualized by providing keywords as prior\ninformation in text prompts. We adopt decoder-only architecture and use our\nin-house LLM, PLaMo-100B, pre-trained from scratch using datasets dominated by\nJapanese and English texts as the decoder. We adopt a pre-trained Whisper\nencoder as an audio encoder, and the audio embeddings from the audio encoder\nare projected to the text embedding space by an adapter layer and concatenated\nwith text embeddings converted from text prompts to form inputs to the decoder.\nBy providing keywords as prior information in the text prompts, we can\ncontextualize our LLM-based ASR system without modifying the model architecture\nto transcribe ambiguous words in the input audio accurately. 
Experimental\nresults demonstrate that providing keywords to the decoder can significantly\nimprove the recognition performance of rare and ambiguous words.\n","authors":["Kento Nozawa","Takashi Masuko","Toru Taniguchi"],"pdf_url":"https://arxiv.org/pdf/2408.08027v1.pdf","comment":"13 pages, 1 figure, and 7 tables"},{"id":"http://arxiv.org/abs/2408.08003v1","updated":"2024-08-15T08:12:52Z","published":"2024-08-15T08:12:52Z","title":"Leveraging Web-Crawled Data for High-Quality Fine-Tuning","summary":" Most large language models are fine-tuned using either expensive\nhuman-annotated data or GPT-4 generated data which cannot guarantee performance\nin certain domains. We argue that although the web-crawled data often has\nformatting errors causing semantic inaccuracies, it can still serve as a\nvaluable source for high-quality supervised fine-tuning in specific domains\nwithout relying on advanced models like GPT-4. To this end, we create a paired\ntraining dataset automatically by aligning web-crawled data with a smaller set\nof high-quality data. By training a language model on this dataset, we can\nconvert web data with irregular formats into high-quality ones. Our experiments\nshow that training with the model-transformed data yields better results,\nsurpassing training with only high-quality data by an average score of 9.4% in\nChinese math problems. Additionally, our 7B model outperforms several\nopen-source models larger than 32B and surpasses well-known closed-source\nmodels such as GPT-3.5, highlighting the efficacy of our approach.\n","authors":["Jing Zhou","Chenglin Jiang","Wei Shen","Xiao Zhou","Xiaonan He"],"pdf_url":"https://arxiv.org/pdf/2408.08003v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.07990v1","updated":"2024-08-15T07:37:24Z","published":"2024-08-15T07:37:24Z","title":"FuseChat: Knowledge Fusion of Chat Models","summary":" While training large language models (LLMs) from scratch can indeed lead to\nmodels with distinct capabilities and strengths, it incurs substantial costs\nand may lead to redundancy in competencies. Knowledge fusion aims to integrate\nexisting LLMs of diverse architectures and capabilities into a more potent LLM\nthrough lightweight continual training, thereby reducing the need for costly\nLLM development. In this work, we propose a new framework for the knowledge\nfusion of chat LLMs through two main stages, resulting in FuseChat. Firstly, we\nconduct pairwise knowledge fusion on source chat LLMs of varying structures and\nscales to create multiple target LLMs with identical structure and size via\nlightweight fine-tuning. During this process, a statistics-based token\nalignment approach is introduced as the cornerstone for fusing LLMs with\ndifferent structures. Secondly, we merge these target LLMs within the parameter\nspace, where we propose a novel method for determining the merging coefficients\nbased on the magnitude of parameter updates before and after fine-tuning. We\nimplement and validate FuseChat using six prominent chat LLMs with diverse\narchitectures and scales, including OpenChat-3.5-7B, Starling-LM-7B-alpha,\nNH2-SOLAR-10.7B, InternLM2-Chat-20B, Mixtral-8x7B-Instruct, and\nQwen-1.5-Chat-72B. Experimental results on two instruction-following\nbenchmarks, AlpacaEval 2.0 and MT-Bench, demonstrate the superiority of\nFuseChat-7B over baselines of various sizes. 
Our model is even comparable to\nthe larger Mixtral-8x7B-Instruct and approaches GPT-3.5-Turbo-1106 on MT-Bench.\nOur code, model weights, and data are public at\n\\url{https://github.com/fanqiwan/FuseAI}.\n","authors":["Fanqi Wan","Longguang Zhong","Ziyi Yang","Ruijun Chen","Xiaojun Quan"],"pdf_url":"https://arxiv.org/pdf/2408.07990v1.pdf","comment":"Work in progress"},{"id":"http://arxiv.org/abs/2407.19726v3","updated":"2024-08-15T07:26:26Z","published":"2024-07-29T06:13:28Z","title":"Do Text-to-Vis Benchmarks Test Real Use of Visualisations?","summary":" Large language models are able to generate code for visualisations in\nresponse to user requests. This is a useful application, and an appealing one\nfor NLP research because plots of data provide grounding for language. However,\nthere are relatively few benchmarks, and it is unknown whether those that exist\nare representative of what people do in practice. This paper aims to answer\nthat question through an empirical study comparing benchmark datasets and code\nfrom public repositories. Our findings reveal a substantial gap in datasets,\nwith evaluations not testing the same distribution of chart types, attributes,\nand the number of actions. The only representative dataset requires\nmodification to become an end-to-end and practical benchmark. This shows that\nnew, more benchmarks are needed to support the development of systems that\ntruly address users' visualisation needs. These observations will guide future\ndata creation, highlighting which features hold genuine significance for users.\n","authors":["Hy Nguyen","Xuefei He","Andrew Reeson","Cecile Paris","Josiah Poon","Jonathan K. Kummerfeld"],"pdf_url":"https://arxiv.org/pdf/2407.19726v3.pdf","comment":"ARR AE score of 4"},{"id":"http://arxiv.org/abs/2407.12017v2","updated":"2024-08-15T07:12:33Z","published":"2024-06-27T07:16:46Z","title":"Follow-Up Questions Improve Documents Generated by Large Language Models","summary":" This study investigates the impact of Large Language Models (LLMs) generating\nfollow-up questions in response to user requests for short (1-page) text\ndocuments. Users interacted with a novel web-based AI system designed to ask\nfollow-up questions. Users requested documents they would like the AI to\nproduce. The AI then generated follow-up questions to clarify the user's needs\nor offer additional insights before generating the requested documents. After\nanswering the questions, users were shown a document generated using both the\ninitial request and the questions and answers, and a document generated using\nonly the initial request. Users indicated which document they preferred and\ngave feedback about their experience with the question-answering process. The\nfindings of this study show clear benefits to question-asking both in document\npreference and in the qualitative user experience. 
This study further shows\nthat users found more value in questions which were thought-provoking,\nopen-ended, or offered unique insights into the user's request as opposed to\nsimple information-gathering questions.\n","authors":["Bernadette J Tix"],"pdf_url":"https://arxiv.org/pdf/2407.12017v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.07983v1","updated":"2024-08-15T07:09:51Z","published":"2024-08-15T07:09:51Z","title":"ArabLegalEval: A Multitask Benchmark for Assessing Arabic Legal\n Knowledge in Large Language Models","summary":" The rapid advancements in Large Language Models (LLMs) have led to\nsignificant improvements in various natural language processing tasks. However,\nthe evaluation of LLMs' legal knowledge, particularly in non-English languages\nsuch as Arabic, remains under-explored. To address this gap, we introduce\nArabLegalEval, a multitask benchmark dataset for assessing the Arabic legal\nknowledge of LLMs. Inspired by the MMLU and LegalBench datasets, ArabLegalEval\nconsists of multiple tasks sourced from Saudi legal documents and synthesized\nquestions. In this work, we aim to analyze the capabilities required to solve\nlegal problems in Arabic and benchmark the performance of state-of-the-art\nLLMs. We explore the impact of in-context learning and investigate various\nevaluation methods. Additionally, we explore workflows for generating questions\nwith automatic validation to enhance the dataset's quality. We benchmark\nmultilingual and Arabic-centric LLMs, such as GPT-4 and Jais, respectively. We\nalso share our methodology for creating the dataset and validation, which can\nbe generalized to other domains. We hope to accelerate AI research in the\nArabic Legal domain by releasing the ArabLegalEval dataset and code:\nhttps://github.com/Thiqah/ArabLegalEval\n","authors":["Faris Hijazi","Somayah AlHarbi","Abdulaziz AlHussein","Harethah Abu Shairah","Reem AlZahrani","Hebah AlShamlan","Omar Knio","George Turkiyyah"],"pdf_url":"https://arxiv.org/pdf/2408.07983v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.07978v1","updated":"2024-08-15T06:52:24Z","published":"2024-08-15T06:52:24Z","title":"Coupling without Communication and Drafter-Invariant Speculative\n Decoding","summary":" Suppose Alice has a distribution $P$ and Bob has a distribution $Q$. Alice\nwants to generate a sample $a\\sim P$ and Bob a sample $b \\sim Q$ such that $a =\nb$ with has as high of probability as possible. It is well-known that, by\nsampling from an optimal coupling between the distributions, Alice and Bob can\nachieve $Pr[a = b] = 1 - D_{TV}(P,Q)$, where $D_{TV}(P,Q)$ is the total\nvariation distance. What if Alice and Bob must solve this same problem without\ncommunicating at all? Perhaps surprisingly, with access to public randomness,\nthey can still achieve $Pr[a = b] \\geq \\frac{1 - D_{TV}(P,Q)}{1 + D_{TV}(P,Q)}\n\\geq 1-2D_{TV}(P,Q)$. In fact, this bound can be obtained using a simple\nprotocol based on the Weighted MinHash algorithm. In this work, we explore the\ncommunication-free coupling in greater depth. First, we show that an equally\nsimple protocol based on Gumbel sampling matches the worst-case guarantees of\nthe Weighted MinHash approach, but tends to perform better in practice.\nConversely, we prove that both approaches are actually sharp: no\ncommunication-free protocol can achieve $Pr[a=b]>\\frac{1 - D_{TV}(P,Q)}{1 +\nD_{TV}(P,Q)}$ in the worst-case. 
Finally, we prove that, for distributions over\n$n$ items, there exists a scheme that uses just $O(\\log(n/\\epsilon))$ bits of\ncommunication to achieve $Pr[a = b] = 1 - D_{TV}(P,Q) - \\epsilon$, i.e. to\nessentially match optimal coupling. Beyond our theoretical results, we\ndemonstrate an application of communication-free coupling to speculative\ndecoding, a recent method for accelerating autoregressive large language models\n[Leviathan, Kalman, Matias, ICML 2023]. We show that communication-free\nprotocols yield a variant of speculative decoding that we call\nDrafter-Invariant Speculative Decoding, which has the desirable property that\nthe output of the method is fixed given a fixed random seed, regardless of what\ndrafter is used for speculation.\n","authors":["Majid Daliri","Christopher Musco","Ananda Theertha Suresh"],"pdf_url":"https://arxiv.org/pdf/2408.07978v1.pdf","comment":"16 pages"},{"id":"http://arxiv.org/abs/2102.00621v3","updated":"2024-08-15T06:51:57Z","published":"2021-02-01T03:47:59Z","title":"Polyphone Disambiguation in Mandarin Chinese with Semi-Supervised\n Learning","summary":" The majority of Chinese characters are monophonic, while a special group of\ncharacters, called polyphonic characters, have multiple pronunciations. As a\nprerequisite of performing speech-related generative tasks, the correct\npronunciation must be identified among several candidates. This process is\ncalled Polyphone Disambiguation. Although the problem has been well explored\nwith both knowledge-based and learning-based approaches, it remains challenging\ndue to the lack of publicly available labeled datasets and the irregular nature\nof polyphone in Mandarin Chinese. In this paper, we propose a novel\nsemi-supervised learning (SSL) framework for Mandarin Chinese polyphone\ndisambiguation that can potentially leverage unlimited unlabeled text data. We\nexplore the effect of various proxy labeling strategies including\nentropy-thresholding and lexicon-based labeling. Qualitative and quantitative\nexperiments demonstrate that our method achieves state-of-the-art performance.\nIn addition, we publish a novel dataset specifically for the polyphone\ndisambiguation task to promote further research.\n","authors":["Yi Shi","Congyi Wang","Yu Chen","Bin Wang"],"pdf_url":"https://arxiv.org/pdf/2102.00621v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.07975v1","updated":"2024-08-15T06:40:38Z","published":"2024-08-15T06:40:38Z","title":"Polaris: Open-ended Interactive Robotic Manipulation via Syn2Real Visual\n Grounding and Large Language Models","summary":" This paper investigates the task of the open-ended interactive robotic\nmanipulation on table-top scenarios. While recent Large Language Models (LLMs)\nenhance robots' comprehension of user instructions, their lack of visual\ngrounding constrains their ability to physically interact with the environment.\nThis is because the robot needs to locate the target object for manipulation\nwithin the physical workspace. To this end, we introduce an interactive robotic\nmanipulation framework called Polaris, which integrates perception and\ninteraction by utilizing GPT-4 alongside grounded vision models. For precise\nmanipulation, it is essential that such grounded vision models produce detailed\nobject pose for the target object, rather than merely identifying pixels\nbelonging to them in the image. Consequently, we propose a novel\nSynthetic-to-Real (Syn2Real) pose estimation pipeline. 
This pipeline utilizes\nrendered synthetic data for training and is then transferred to real-world\nmanipulation tasks. The real-world performance demonstrates the efficacy of our\nproposed pipeline and underscores its potential for extension to more general\ncategories. Moreover, real-robot experiments have showcased the impressive\nperformance of our framework in grasping and executing multiple manipulation\ntasks. This indicates its potential to generalize to scenarios beyond the\ntabletop. More information and video results are available here:\nhttps://star-uu-wang.github.io/Polaris/\n","authors":["Tianyu Wang","Haitao Lin","Junqiu Yu","Yanwei Fu"],"pdf_url":"https://arxiv.org/pdf/2408.07975v1.pdf","comment":"Accepted by IROS 2024. 8 pages, 5 figures. See\n https://star-uu-wang.github.io/Polaris/"},{"id":"http://arxiv.org/abs/2408.07971v1","updated":"2024-08-15T06:36:27Z","published":"2024-08-15T06:36:27Z","title":"Predicting Lung Cancer Patient Prognosis with Large Language Models","summary":" Prognosis prediction is crucial for determining optimal treatment plans for\nlung cancer patients. Traditionally, such predictions relied on models\ndeveloped from retrospective patient data. Recently, large language models\n(LLMs) have gained attention for their ability to process and generate text\nbased on extensive learned knowledge. In this study, we evaluate the potential\nof GPT-4o mini and GPT-3.5 in predicting the prognosis of lung cancer patients.\nWe collected two prognosis datasets, i.e., survival and post-operative\ncomplication datasets, and designed multiple tasks to assess the models'\nperformance comprehensively. Logistic regression models were also developed as\nbaselines for comparison. The experimental results demonstrate that LLMs can\nachieve competitive, and in some tasks superior, performance in lung cancer\nprognosis prediction compared to data-driven logistic regression models despite\nnot using additional patient data. These findings suggest that LLMs can be\neffective tools for prognosis prediction in lung cancer, particularly when\npatient data is limited or unavailable.\n","authors":["Danqing Hu","Bing Liu","Xiang Li","Xiaofeng Zhu","Nan Wu"],"pdf_url":"https://arxiv.org/pdf/2408.07971v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.01609v2","updated":"2024-08-15T06:11:27Z","published":"2024-05-28T04:22:28Z","title":"Judgement Citation Retrieval using Contextual Similarity","summary":" Traditionally in the domain of legal research, the retrieval of pertinent\ncitations from intricate case descriptions has demanded manual effort and\nkeyword-based search applications that mandate expertise in understanding legal\njargon. Legal case descriptions hold pivotal information for legal\nprofessionals and researchers, necessitating more efficient and automated\napproaches. We propose a methodology that combines natural language processing\n(NLP) and machine learning techniques to enhance the organization and\nutilization of legal case descriptions. This approach revolves around the\ncreation of textual embeddings with the help of state-of-art embedding models.\nOur methodology addresses two primary objectives: unsupervised clustering and\nsupervised citation retrieval, both designed to automate the citation\nextraction process. Although the proposed methodology can be used for any\ndataset, we employed the Supreme Court of The United States (SCOTUS) dataset,\nyielding remarkable results. Our methodology achieved an impressive accuracy\nrate of 90.9%. 
By automating labor-intensive processes, we pave the way for a\nmore efficient, time-saving, and accessible landscape in legal research,\nbenefiting legal professionals, academics, and researchers.\n","authors":["Akshat Mohan Dasula","Hrushitha Tigulla","Preethika Bhukya"],"pdf_url":"https://arxiv.org/pdf/2406.01609v2.pdf","comment":"14 pages, 16 images"},{"id":"http://arxiv.org/abs/2408.07955v1","updated":"2024-08-15T06:08:53Z","published":"2024-08-15T06:08:53Z","title":"GERestaurant: A German Dataset of Annotated Restaurant Reviews for\n Aspect-Based Sentiment Analysis","summary":" We present GERestaurant, a novel dataset consisting of 3,078 German language\nrestaurant reviews manually annotated for Aspect-Based Sentiment Analysis\n(ABSA). All reviews were collected from Tripadvisor, covering a diverse\nselection of restaurants, including regional and international cuisine with\nvarious culinary styles. The annotations encompass both implicit and explicit\naspects, including all aspect terms, their corresponding aspect categories, and\nthe sentiments expressed towards them. Furthermore, we provide baseline scores\nfor the four ABSA tasks: Aspect Category Detection, Aspect Category Sentiment\nAnalysis, End-to-End ABSA, and Target Aspect Sentiment Detection, as a reference\npoint for future advances. The dataset fills a gap in German language resources\nand facilitates exploration of ABSA in the restaurant domain.\n","authors":["Nils Constantin Hellwig","Jakob Fehle","Markus Bink","Christian Wolff"],"pdf_url":"https://arxiv.org/pdf/2408.07955v1.pdf","comment":"Accepted in KONVENS 2024. Camera Ready submission"},{"id":"http://arxiv.org/abs/2408.07377v2","updated":"2024-08-15T05:15:55Z","published":"2024-08-14T08:53:00Z","title":"Do GPT Language Models Suffer From Split Personality Disorder? The\n Advent Of Substrate-Free Psychometrics","summary":" Previous research on emergence in large language models shows that these models\ndisplay apparent human-like abilities and psychological latent traits. However, the results\nare partly contradictory in the expression and magnitude of these latent traits,\nyet agree on the worrisome tendencies to score high on the Dark Triad of\nnarcissism, psychopathy, and Machiavellianism, which, together with a track\nrecord of derailments, demands more rigorous research on the safety of these\nmodels. We provided a state-of-the-art language model with the same personality\nquestionnaire in nine languages, and performed a Bayesian analysis with a Gaussian\nMixture Model, finding evidence for a deeper-rooted issue. Our results suggest\nboth interlingual and intralingual instabilities, which indicate that current\nlanguage models do not develop a consistent core personality. This can lead to\nunsafe behaviour of artificial intelligence systems that are based on these\nfoundation models and are increasingly integrated into human life. 
We\nsubsequently discuss the shortcomings of modern psychometrics, abstract it, and\nprovide a framework for its species-neutral, substrate-free formulation.\n","authors":["Peter Romero","Stephen Fitz","Teruo Nakatsuma"],"pdf_url":"https://arxiv.org/pdf/2408.07377v2.pdf","comment":"37 pages, 7 figures, 3 tables, date v1: Mar 26 2023; replaced with\n new version; reason: removed journal logo from older version of article that\n is no longer valid"},{"id":"http://arxiv.org/abs/2407.18552v2","updated":"2024-08-15T05:14:38Z","published":"2024-07-26T07:05:04Z","title":"Multimodal Emotion Recognition using Audio-Video Transformer Fusion with\n Cross Attention","summary":" Understanding emotions is a fundamental aspect of human communication.\nIntegrating audio and video signals offers a more comprehensive understanding\nof emotional states compared to traditional methods that rely on a single data\nsource, such as speech or facial expressions. Despite its potential, multimodal\nemotion recognition faces significant challenges, particularly in\nsynchronization, feature extraction, and fusion of diverse data sources. To\naddress these issues, this paper introduces a novel transformer-based model\nnamed Audio-Video Transformer Fusion with Cross Attention (AVT-CA). The AVT-CA\nmodel employs a transformer fusion approach to effectively capture and\nsynchronize interlinked features from both audio and video inputs, thereby\nresolving synchronization problems. Additionally, the Cross Attention mechanism\nwithin AVT-CA selectively extracts and emphasizes critical features while\ndiscarding irrelevant ones from both modalities, addressing feature extraction\nand fusion challenges. Extensive experimental analysis conducted on the\nCMU-MOSEI, RAVDESS and CREMA-D datasets demonstrates the efficacy of the\nproposed model. The results underscore the importance of AVT-CA in developing\nprecise and reliable multimodal emotion recognition systems for practical\napplications.\n","authors":["Joe Dhanith P R","Shravan Venkatraman","Modigari Narendra","Vigya Sharma","Santhosh Malarvannan","Amir H. Gandomi"],"pdf_url":"https://arxiv.org/pdf/2407.18552v2.pdf","comment":"38 Pages, 9 Tables, 12 Figures"},{"id":"http://arxiv.org/abs/2408.07930v1","updated":"2024-08-15T04:57:55Z","published":"2024-08-15T04:57:55Z","title":"MAG-SQL: Multi-Agent Generative Approach with Soft Schema Linking and\n Iterative Sub-SQL Refinement for Text-to-SQL","summary":" Recent In-Context Learning based methods have achieved remarkable success in\nText-to-SQL task. However, there is still a large gap between the performance\nof these models and human performance on datasets with complex database schema\nand difficult questions, such as BIRD. Besides, existing work has neglected to\nsupervise intermediate steps when solving questions iteratively with question\ndecomposition methods, and the schema linking methods used in these works are\nvery rudimentary. To address these issues, we propose MAG-SQL, a multi-agent\ngenerative approach with soft schema linking and iterative Sub-SQL refinement.\nIn our framework, an entity-based method with tables' summary is used to select\nthe columns in database, and a novel targets-conditions decomposition method is\nintroduced to decompose those complex questions. Additionally, we build a\niterative generating module which includes a Sub-SQL Generator and Sub-SQL\nRefiner, introducing external oversight for each step of generation. 
Through a\nseries of ablation studies, the effectiveness of each agent in our framework\nhas been demonstrated. When evaluated on the BIRD benchmark with GPT-4, MAG-SQL\nachieves an execution accuracy of 61.08\\%, compared to the baseline accuracy of\n46.35\\% for vanilla GPT-4 and the baseline accuracy of 57.56\\% for MAC-SQL.\nBesides, our approach makes similar progress on Spider.\n","authors":["Wenxuan Xie","Gaochen Wu","Bowen Zhou"],"pdf_url":"https://arxiv.org/pdf/2408.07930v1.pdf","comment":"22 pages, 14 figures"},{"id":"http://arxiv.org/abs/2408.07081v2","updated":"2024-08-15T04:51:43Z","published":"2024-08-07T18:07:15Z","title":"MathBridge: A Large Corpus Dataset for Translating Spoken Mathematical\n Expressions into $LaTeX$ Formulas for Improved Readability","summary":" Understanding sentences that contain mathematical expressions in text form\nposes significant challenges. To address this, the importance of converting\nthese expressions into a compiled formula is highlighted. For instance, the\nexpression ``x equals minus b plus or minus the square root of b squared minus\nfour a c, all over two a'' from automatic speech recognition (ASR) is more\nreadily comprehensible when displayed as a compiled formula $x = \\frac{-b \\pm\n\\sqrt{b^2 - 4ac}}{2a}$. To develop a text-to-formula conversion system, we can\nbreak down the process into text-to-LaTeX and LaTeX-to-formula conversions,\nwith the latter managed by various existing LaTeX engines. However, the former\napproach has been notably hindered by the severe scarcity of text-to-LaTeX\npaired data, which presents a significant challenge in this field. In this\ncontext, we introduce MathBridge, the first extensive dataset for translating\nmathematical spoken expressions into LaTeX, to establish a robust baseline for\nfuture research on text-to-LaTeX translation. MathBridge comprises\napproximately 23 million LaTeX formulas paired with the corresponding spoken\nEnglish expressions. Through comprehensive evaluations, including fine-tuning\nand testing with data, we discovered that MathBridge significantly enhances the\ncapabilities of pretrained language models for text-to-LaTeX translation.\nSpecifically, for the T5-large model, the sacreBLEU score increased from 4.77\nto 46.8, demonstrating substantial enhancement. Our findings indicate the need\nfor a new metric, specifically for text-to-LaTeX conversion evaluations.\n","authors":["Kyudan Jung","Sieun Hyeon","Jeong Youn Kwon","Nam-Joon Kim","Hyun Gon Ryu","Hyuk-Jae Lee","Jaeyoung Do"],"pdf_url":"https://arxiv.org/pdf/2408.07081v2.pdf","comment":"9page, 6 figures"},{"id":"http://arxiv.org/abs/2406.09095v2","updated":"2024-08-15T04:47:29Z","published":"2024-06-13T13:25:50Z","title":"Modeling Comparative Logical Relation with Contrastive Learning for Text\n Generation","summary":" Data-to-Text Generation (D2T), a classic natural language generation problem,\naims at producing fluent descriptions for structured input data, such as a\ntable. Existing D2T works mainly focus on describing the superficial\nassociative relations among entities, while ignoring the deep comparative\nlogical relations, such as A is better than B in a certain aspect with a\ncorresponding opinion, which is quite common in our daily life. In this paper,\nwe introduce a new D2T task named comparative logical relation generation\n(CLRG). 
Additionally, we propose a Comparative Logic (CoLo) based text\ngeneration method, which generates texts following specific comparative logical\nrelations with contrastive learning. Specifically, we first construct various\npositive and negative samples by fine-grained perturbations in entities,\naspects and opinions. Then, we perform contrastive learning in the encoder\nlayer to have a better understanding of the comparative logical relations, and\nintegrate it in the decoder layer to guide the model to correctly generate the\nrelations. Noting the data scarcity problem, we construct a Chinese Comparative\nLogical Relation Dataset (CLRD), which is a high-quality human-annotated\ndataset and challenging for text generation with descriptions of multiple\nentities and annotations on their comparative logical relations. Extensive\nexperiments show that our method achieves impressive performance in both\nautomatic and human evaluations.\n","authors":["Yuhao Dan","Junfeng Tian","Jie Zhou","Ming Yan","Ji Zhang","Qin Chen","Liang He"],"pdf_url":"https://arxiv.org/pdf/2406.09095v2.pdf","comment":"NLPCC 2024"},{"id":"http://arxiv.org/abs/2408.07543v2","updated":"2024-08-15T04:01:53Z","published":"2024-08-14T13:23:43Z","title":"MathScape: Evaluating MLLMs in multimodal Math Scenarios through a\n Hierarchical Benchmark","summary":" With the development of Multimodal Large Language Models (MLLMs), the\nevaluation of multimodal models in the context of mathematical problems has\nbecome a valuable research field. Multimodal visual-textual mathematical\nreasoning serves as a critical indicator for evaluating the comprehension and\ncomplex multi-step quantitative reasoning abilities of MLLMs. However, previous\nmultimodal math benchmarks have not sufficiently integrated visual and textual\ninformation. To address this gap, we proposed MathScape, a new benchmark that\nemphasizes the understanding and application of combined visual and textual\ninformation. MathScape is designed to evaluate photo-based math problem\nscenarios, assessing the theoretical understanding and application ability of\nMLLMs through a categorical hierarchical approach. We conduct a\nmulti-dimensional evaluation on 11 advanced MLLMs, revealing that our benchmark\nis challenging even for the most sophisticated models. By analyzing the\nevaluation results, we identify the limitations of MLLMs, offering valuable\ninsights for enhancing model performance.\n","authors":["Minxuan Zhou","Hao Liang","Tianpeng Li","Zhiyu Wu","Mingan Lin","Linzhuang Sun","Yaqi Zhou","Yan Zhang","Xiaoqin Huang","Yicong Chen","Yujing Qiao","Weipeng Chen","Bin Cui","Wentao Zhang","Zenan Zhou"],"pdf_url":"https://arxiv.org/pdf/2408.07543v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.07910v1","updated":"2024-08-15T03:34:02Z","published":"2024-08-15T03:34:02Z","title":"DM2RM: Dual-Mode Multimodal Ranking for Target Objects and Receptacles\n Based on Open-Vocabulary Instructions","summary":" In this study, we aim to develop a domestic service robot (DSR) that, guided\nby open-vocabulary instructions, can carry everyday objects to the specified\npieces of furniture. Few existing methods handle mobile manipulation tasks with\nopen-vocabulary instructions in the image retrieval setting, and most do not\nidentify both the target objects and the receptacles. We propose the Dual-Mode\nMultimodal Ranking model (DM2RM), which enables images of both the target\nobjects and receptacles to be retrieved using a single model based on\nmultimodal foundation models. 
We introduce a switching mechanism that leverages\na mode token and phrase identification via a large language model to switch the\nembedding space based on the prediction target. To evaluate the DM2RM, we\nconstruct a novel dataset including real-world images collected from hundreds\nof building-scale environments and crowd-sourced instructions with referring\nexpressions. The evaluation results show that the proposed DM2RM outperforms\nprevious approaches in terms of standard metrics in image retrieval settings.\nFurthermore, we demonstrate the application of the DM2RM on a standardized\nreal-world DSR platform including fetch-and-carry actions, where it achieves a\ntask success rate of 82% despite the zero-shot transfer setting. Demonstration\nvideos, code, and more materials are available at\nhttps://kkrr10.github.io/dm2rm/.\n","authors":["Ryosuke Korekata","Kanta Kaneda","Shunya Nagashima","Yuto Imai","Komei Sugiura"],"pdf_url":"https://arxiv.org/pdf/2408.07910v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.07904v1","updated":"2024-08-15T03:19:41Z","published":"2024-08-15T03:19:41Z","title":"Assessing Language Models' Worldview for Fiction Generation","summary":" The use of Large Language Models (LLMs) has become ubiquitous, with abundant\napplications in computational creativity. One such application is fictional\nstory generation. Fiction is a narrative that occurs in a story world that is\nslightly different than ours. With LLMs becoming writing partners, we question\nhow suitable they are to generate fiction. This study investigates the ability\nof LLMs to maintain a state of world essential to generate fiction. Through a\nseries of questions to nine LLMs, we find that only two models exhibit\nconsistent worldview, while the rest are self-conflicting. Subsequent analysis\nof stories generated by four models revealed a strikingly uniform narrative\npattern. This uniformity across models further suggests a lack of `state'\nnecessary for fiction. We highlight the limitations of current LLMs in fiction\nwriting and advocate for future research to test and create story worlds for\nLLMs to reside in. All code, dataset, and the generated responses can be found\nin https://github.com/tanny411/llm-reliability-and-consistency-evaluation.\n","authors":["Aisha Khatun","Daniel G. Brown"],"pdf_url":"https://arxiv.org/pdf/2408.07904v1.pdf","comment":"Short paper"},{"id":"http://arxiv.org/abs/2407.17900v5","updated":"2024-08-15T02:33:22Z","published":"2024-07-25T09:42:24Z","title":"The Power of Combining Data and Knowledge: GPT-4o is an Effective\n Interpreter of Machine Learning Models in Predicting Lymph Node Metastasis of\n Lung Cancer","summary":" Lymph node metastasis (LNM) is a crucial factor in determining the initial\ntreatment for patients with lung cancer, yet accurate preoperative diagnosis of\nLNM remains challenging. Recently, large language models (LLMs) have garnered\nsignificant attention due to their remarkable text generation capabilities.\nLeveraging the extensive medical knowledge learned from vast corpora, LLMs can\nestimate probabilities for clinical problems, though their performance has\nhistorically been inferior to data-driven machine learning models. In this\npaper, we propose a novel ensemble method that combines the medical knowledge\nacquired by LLMs with the latent patterns identified by machine learning models\nto enhance LNM prediction performance. Initially, we developed machine learning\nmodels using patient data. 
We then designed a prompt template to integrate the\npatient data with the predicted probability from the machine learning model.\nSubsequently, we instructed GPT-4o, the most advanced LLM developed by OpenAI,\nto estimate the likelihood of LNM based on patient data and then adjust the\nestimate using the machine learning output. Finally, we collected three outputs\nfrom the GPT-4o using the same prompt and ensembled these results as the final\nprediction. Using the proposed method, our models achieved an AUC value of\n0.778 and an AP value of 0.426 for LNM prediction, significantly improving\npredictive performance compared to baseline machine learning models. The\nexperimental results indicate that GPT-4o can effectively leverage its medical\nknowledge and the probabilities predicted by machine learning models to achieve\nmore accurate LNM predictions. These findings demonstrate that LLMs can perform\nwell in clinical risk prediction tasks, offering a new paradigm for integrating\nmedical knowledge and patient data in clinical predictions.\n","authors":["Danqing Hu","Bing Liu","Xiaofeng Zhu","Nan Wu"],"pdf_url":"https://arxiv.org/pdf/2407.17900v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.07888v1","updated":"2024-08-15T02:22:48Z","published":"2024-08-15T02:22:48Z","title":"Fine-tuning Large Language Models with Human-inspired Learning\n Strategies in Medical Question Answering","summary":" Training Large Language Models (LLMs) incurs substantial data-related costs,\nmotivating the development of data-efficient training methods through optimised\ndata ordering and selection. Human-inspired learning strategies, such as\ncurriculum learning, offer possibilities for efficient training by organising\ndata according to common human learning practices. Despite evidence that\nfine-tuning with curriculum learning improves the performance of LLMs for\nnatural language understanding tasks, its effectiveness is typically assessed\nusing a single model. In this work, we extend previous research by evaluating\nboth curriculum-based and non-curriculum-based learning strategies across\nmultiple LLMs, using human-defined and automated data labels for medical\nquestion answering. Our results indicate a moderate impact of using\nhuman-inspired learning strategies for fine-tuning LLMs, with maximum accuracy\ngains of 1.77% per model and 1.81% per dataset. Crucially, we demonstrate that\nthe effectiveness of these strategies varies significantly across different\nmodel-dataset combinations, emphasising that the benefits of a specific\nhuman-inspired strategy for fine-tuning LLMs do not generalise. Additionally,\nwe find evidence that curriculum learning using LLM-defined question difficulty\noutperforms human-defined difficulty, highlighting the potential of using\nmodel-generated measures for optimal curriculum design.\n","authors":["Yushi Yang","Andrew M. Bean","Robert McCraith","Adam Mahdi"],"pdf_url":"https://arxiv.org/pdf/2408.07888v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.07955v2","updated":"2024-08-15T02:18:08Z","published":"2024-01-15T20:42:16Z","title":"A Study on Large Language Models' Limitations in Multiple-Choice\n Question Answering","summary":" The widespread adoption of Large Language Models (LLMs) has become\ncommonplace, particularly with the emergence of open-source models. More\nimportantly, smaller models are well-suited for integration into consumer\ndevices and are frequently employed either as standalone solutions or as\nsubroutines in various AI tasks. 
Despite their ubiquitous use, there is no\nsystematic analysis of their specific capabilities and limitations. In this\nstudy, we tackle one of the most widely used tasks: answering Multiple Choice\nQuestions (MCQs). We analyze 26 small open-source models and find that 65% of the\nmodels do not understand the task, only 4 models properly select an answer from\nthe given choices, and only 5 of these models are choice order independent.\nThese results are rather alarming given the extensive use of MCQ tests with\nthese models. We recommend exercising caution and testing task understanding\nbefore using MCQs to evaluate LLMs in any field whatsoever.\n","authors":["Aisha Khatun","Daniel G. Brown"],"pdf_url":"https://arxiv.org/pdf/2401.07955v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.10957v2","updated":"2024-08-15T02:12:52Z","published":"2024-06-16T14:24:30Z","title":"Eliminating Biased Length Reliance of Direct Preference Optimization via\n Down-Sampled KL Divergence","summary":" Direct Preference Optimization (DPO) has emerged as a prominent algorithm for\nthe direct and robust alignment of Large Language Models (LLMs) with human\npreferences, offering a more straightforward alternative to the complex\nReinforcement Learning from Human Feedback (RLHF). Despite its promising\nefficacy, DPO faces a notable drawback: \"verbosity\", a common over-optimization\nphenomenon also observed in RLHF. While previous studies mainly attributed\nverbosity to biased labels within the data, we propose that the issue also\nstems from an inherent algorithmic length reliance in DPO. Specifically, we\nsuggest that the discrepancy in sequence-level Kullback-Leibler (KL)\ndivergences between chosen and rejected sequences, used in DPO, results in\noverestimated or underestimated rewards due to varying token lengths.\nEmpirically, we utilize datasets with different label lengths to demonstrate\nthe presence of biased rewards. We then introduce an effective downsampling\napproach, named SamPO, to eliminate potential length reliance. Our experimental\nevaluations, conducted across three LLMs of varying scales and a diverse array\nof conditional and open-ended benchmarks, highlight the efficacy of SamPO in\nmitigating verbosity, achieving improvements of 5% to 12% over DPO through\ndebiased rewards. Our code can be accessed at:\nhttps://github.com/LuJunru/SamPO/.\n","authors":["Junru Lu","Jiazheng Li","Siyu An","Meng Zhao","Yulan He","Di Yin","Xing Sun"],"pdf_url":"https://arxiv.org/pdf/2406.10957v2.pdf","comment":"We thank Shiyue Xu for pointing out the error in Equation 5 in the\n previous draft: https://github.com/LuJunru/SamPO/issues/1"},{"id":"http://arxiv.org/abs/2408.07884v1","updated":"2024-08-15T02:07:11Z","published":"2024-08-15T02:07:11Z","title":"Instruct Large Language Models to Generate Scientific Literature Survey\n Step by Step","summary":" Automatically generating scientific literature surveys is a\nvaluable task that can significantly enhance research efficiency. However, the\ndiverse and complex nature of information within a literature survey poses\nsubstantial challenges for generative models. In this paper, we design a series\nof prompts to systematically leverage large language models (LLMs), enabling\nthe creation of comprehensive literature surveys through a step-by-step\napproach. Specifically, we design prompts to guide LLMs to sequentially\ngenerate the title, abstract, hierarchical headings, and the main content of\nthe literature survey. 
We argue that this design enables the generation of the\nheadings from a high-level perspective. During the content generation process,\nthis design effectively harnesses relevant information while minimizing costs\nby restricting the length of both input and output content in LLM queries. Our\nimplementation with Qwen-long achieved third place in the NLPCC 2024 Scientific\nLiterature Survey Generation evaluation task, with an overall score only 0.03%\nlower than the second-place team. Additionally, our soft heading recall is\n95.84%, the second best among the submissions. Thanks to the efficient prompt\ndesign and the low cost of the Qwen-long API, our method reduces the expense\nfor generating each literature survey to 0.1 RMB, enhancing the practical value\nof our method.\n","authors":["Yuxuan Lai","Yupeng Wu","Yidan Wang","Wenpeng Hu","Chen Zheng"],"pdf_url":"https://arxiv.org/pdf/2408.07884v1.pdf","comment":"NLPCC 2024"},{"id":"http://arxiv.org/abs/2408.07666v2","updated":"2024-08-15T01:49:29Z","published":"2024-08-14T16:58:48Z","title":"Model Merging in LLMs, MLLMs, and Beyond: Methods, Theories,\n Applications and Opportunities","summary":" Model merging is an efficient empowerment technique in the machine learning\ncommunity that does not require the collection of raw training data and does\nnot require expensive computation. As model merging becomes increasingly\nprevalent across various fields, it is crucial to understand the available\nmodel merging techniques comprehensively. However, there is a significant gap\nin the literature regarding a systematic and thorough review of these\ntechniques. This survey provides a comprehensive overview of model merging\nmethods and theories, their applications in various domains and settings, and\nfuture research directions. Specifically, we first propose a new taxonomic\napproach that exhaustively discusses existing model merging methods. Secondly,\nwe discuss the application of model merging techniques in large language\nmodels, multimodal large language models, and 10+ machine learning subfields,\nincluding continual learning, multi-task learning, few-shot learning, etc.\nFinally, we highlight the remaining challenges of model merging and discuss\nfuture research directions. A comprehensive list of papers about model merging\nis available at\n\\url{https://github.com/EnnengYang/Awesome-Model-Merging-Methods-Theories-Applications}.\n","authors":["Enneng Yang","Li Shen","Guibing Guo","Xingwei Wang","Xiaochun Cao","Jie Zhang","Dacheng Tao"],"pdf_url":"https://arxiv.org/pdf/2408.07666v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.07873v1","updated":"2024-08-15T01:00:28Z","published":"2024-08-15T01:00:28Z","title":"Words Matter: Reducing Stigma in Online Conversations about Substance\n Use with Large Language Models","summary":" Stigma is a barrier to treatment for individuals struggling with substance\nuse disorders (SUD), which leads to significantly lower treatment engagement\nrates. With only 7% of those affected receiving any form of help, societal\nstigma not only discourages individuals with SUD from seeking help but isolates\nthem, hindering their recovery journey and perpetuating a cycle of shame and\nself-doubt. This study investigates how stigma manifests on social media,\nparticularly Reddit, where anonymity can exacerbate discriminatory behaviors.\nWe analyzed over 1.2 million posts, identifying 3,207 that exhibited\nstigmatizing language towards people who use substances (PWUS). 
Using Informed\nand Stylized LLMs, we develop a model for de-stigmatization of these\nexpressions into empathetic language, resulting in 1,649 reformed phrase pairs.\nOur paper contributes to the field by proposing a computational framework for\nanalyzing stigma and destigmatizing online content, and delving into the\nlinguistic features that propagate stigma towards PWUS. Our work not only\nenhances understanding of stigma's manifestations online but also provides\npractical tools for fostering a more supportive digital environment for those\naffected by SUD. Code and data will be made publicly available upon acceptance.\n","authors":["Layla Bouzoubaa","Elham Aghakhani","Shadi Rezapour"],"pdf_url":"https://arxiv.org/pdf/2408.07873v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.13405v4","updated":"2024-08-15T00:24:03Z","published":"2024-02-20T22:19:56Z","title":"A Unified Taxonomy-Guided Instruction Tuning Framework for Entity Set\n Expansion and Taxonomy Expansion","summary":" Entity set expansion, taxonomy expansion, and seed-guided taxonomy\nconstruction are three representative tasks that can be applied to\nautomatically populate an existing taxonomy with emerging concepts. Previous\nstudies view them as three separate tasks. Therefore, their proposed techniques\nusually work for one specific task only, lacking generalizability and a\nholistic perspective. In this paper, we aim at a unified solution to the three\ntasks. To be specific, we identify two common skills needed for entity set\nexpansion, taxonomy expansion, and seed-guided taxonomy construction: finding\n\"siblings\" and finding \"parents\". We propose a taxonomy-guided instruction\ntuning framework to teach a large language model to generate siblings and\nparents for query entities, where the joint pre-training process facilitates\nthe mutual enhancement of the two skills. Extensive experiments on multiple\nbenchmark datasets demonstrate the efficacy of our proposed TaxoInstruct\nframework, which outperforms task-specific baselines across all three tasks.\n","authors":["Yanzhen Shen","Yu Zhang","Yunyi Zhang","Jiawei Han"],"pdf_url":"https://arxiv.org/pdf/2402.13405v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.07873v1","updated":"2024-08-15T01:00:28Z","published":"2024-08-15T01:00:28Z","title":"Words Matter: Reducing Stigma in Online Conversations about Substance\n Use with Large Language Models","summary":" Stigma is a barrier to treatment for individuals struggling with substance\nuse disorders (SUD), which leads to significantly lower treatment engagement\nrates. With only 7% of those affected receiving any form of help, societal\nstigma not only discourages individuals with SUD from seeking help but isolates\nthem, hindering their recovery journey and perpetuating a cycle of shame and\nself-doubt. This study investigates how stigma manifests on social media,\nparticularly Reddit, where anonymity can exacerbate discriminatory behaviors.\nWe analyzed over 1.2 million posts, identifying 3,207 that exhibited\nstigmatizing language towards people who use substances (PWUS). Using Informed\nand Stylized LLMs, we develop a model for de-stigmatization of these\nexpressions into empathetic language, resulting in 1,649 reformed phrase pairs.\nOur paper contributes to the field by proposing a computational framework for\nanalyzing stigma and destigmatizing online content, and delving into the\nlinguistic features that propagate stigma towards PWUS. 
Our work not only\nenhances understanding of stigma's manifestations online but also provides\npractical tools for fostering a more supportive digital environment for those\naffected by SUD. Code and data will be made publicly available upon acceptance.\n","authors":["Layla Bouzoubaa","Elham Aghakhani","Rezvaneh Rezapour"],"pdf_url":"https://arxiv.org/pdf/2408.07873v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08459v1","updated":"2024-08-15T23:57:02Z","published":"2024-08-15T23:57:02Z","title":"JPEG-LM: LLMs as Image Generators with Canonical Codec Representations","summary":" Recent work in image and video generation has been adopting the\nautoregressive LLM architecture due to its generality and potentially easy\nintegration into multi-modal systems. The crux of applying autoregressive\ntraining in language generation to visual generation is discretization --\nrepresenting continuous data like images and videos as discrete tokens. Common\nmethods of discretizing images and videos include modeling raw pixel values,\nwhich are prohibitively lengthy, or vector quantization, which requires\nconvoluted pre-hoc training. In this work, we propose to directly model images\nand videos as compressed files saved on computers via canonical codecs (e.g.,\nJPEG, AVC/H.264). Using the default Llama architecture without any\nvision-specific modifications, we pretrain JPEG-LM from scratch to generate\nimages (and AVC-LM to generate videos as a proof of concept), by directly\noutputting compressed file bytes in JPEG and AVC formats. Evaluation of image\ngeneration shows that this simple and straightforward approach is more\neffective than pixel-based modeling and sophisticated vector quantization\nbaselines (on which our method yields a 31% reduction in FID). Our analysis\nshows that JPEG-LM has an especial advantage over vector quantization models in\ngenerating long-tail visual elements. Overall, we show that using canonical\ncodec representations can help lower the barriers between language generation\nand visual generation, facilitating future research on multi-modal\nlanguage/image/video LLMs.\n","authors":["Xiaochuang Han","Marjan Ghazvininejad","Pang Wei Koh","Yulia Tsvetkov"],"pdf_url":"https://arxiv.org/pdf/2408.08459v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.11087v3","updated":"2024-08-15T22:57:08Z","published":"2024-06-16T22:11:41Z","title":"DP-MemArc: Differential Privacy Transfer Learning for Memory Efficient\n Language Models","summary":" Large language models have repeatedly shown outstanding performance across\ndiverse applications. However, deploying these models can inadvertently risk\nuser privacy. The significant memory demands during training pose a major\nchallenge in terms of resource consumption. This substantial size places a\nheavy load on memory resources, raising considerable practical concerns. In\nthis paper, we introduce DP-MemArc, a novel training framework aimed at\nreducing the memory costs of large language models while emphasizing the\nprotection of user data privacy. DP-MemArc incorporates side network or\nreversible network designs to support a variety of differential privacy\nmemory-efficient fine-tuning schemes. Our approach not only achieves in memory\noptimization but also ensures robust privacy protection, keeping user data\nsecure and confidential. 
Extensive experiments have demonstrated that DP-MemArc\neffectively provides differential privacy-efficient fine-tuning across\ndifferent task scenarios.\n","authors":["Yanming Liu","Xinyue Peng","Yuwei Zhang","Xiaolan Ke","Songhang Deng","Jiannan Cao","Chen Ma","Mengchen Fu","Xuhong Zhang","Sheng Cheng","Xun Wang","Jianwei Yin","Tianyu Du"],"pdf_url":"https://arxiv.org/pdf/2406.11087v3.pdf","comment":"9 pages second version"},{"id":"http://arxiv.org/abs/2408.08444v1","updated":"2024-08-15T22:34:44Z","published":"2024-08-15T22:34:44Z","title":"W-RAG: Weakly Supervised Dense Retrieval in RAG for Open-domain Question\n Answering","summary":" In knowledge-intensive tasks such as open-domain question answering (OpenQA),\nLarge Language Models (LLMs) often struggle to generate factual answers relying\nsolely on their internal (parametric) knowledge. To address this limitation,\nRetrieval-Augmented Generation (RAG) systems enhance LLMs by retrieving\nrelevant information from external sources, thereby positioning the retriever\nas a pivotal component. Although dense retrieval demonstrates state-of-the-art\nperformance, its training poses challenges due to the scarcity of ground-truth\nevidence, largely attributed to the high costs of human annotation. In this\npaper, we propose W-RAG by utilizing the ranking capabilities of LLMs to create\nweakly labeled data for training dense retrievers. Specifically, we rerank the\ntop-$K$ passages retrieved via BM25 by assessing the probability that LLMs will\ngenerate the correct answer based on the question and each passage. The\nhighest-ranking passages are then used as positive training examples for dense\nretrieval. Our comprehensive experiments across four publicly available OpenQA\ndatasets demonstrate that our approach enhances both retrieval and OpenQA\nperformance compared to baseline models.\n","authors":["Jinming Nian","Zhiyuan Peng","Qifan Wang","Yi Fang"],"pdf_url":"https://arxiv.org/pdf/2408.08444v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.17587v2","updated":"2024-08-15T22:28:55Z","published":"2024-05-27T18:40:49Z","title":"RAGSys: Item-Cold-Start Recommender as RAG System","summary":" Large Language Models (LLM) hold immense promise for real-world applications,\nbut their generic knowledge often falls short of domain-specific needs.\nFine-tuning, a common approach, can suffer from catastrophic forgetting and\nhinder generalizability. In-Context Learning (ICL) offers an alternative, which\ncan leverage Retrieval-Augmented Generation (RAG) to provide LLMs with relevant\ndemonstrations for few-shot learning tasks. This paper explores the desired\nqualities of a demonstration retrieval system for ICL. 
We argue that ICL\nretrieval in this context resembles item-cold-start recommender systems,\nprioritizing discovery and maximizing information gain over strict relevance.\nWe propose a novel evaluation method that measures the LLM's subsequent\nperformance on NLP tasks, eliminating the need for subjective diversity scores.\nOur findings demonstrate the critical role of diversity and quality bias in\nretrieved demonstrations for effective ICL, and highlight the potential of\nrecommender system techniques in this domain.\n","authors":["Emile Contal","Garrin McGoldrick"],"pdf_url":"https://arxiv.org/pdf/2405.17587v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.19232v6","updated":"2024-08-15T21:56:49Z","published":"2024-04-30T03:29:30Z","title":"GRAMMAR: Grounded and Modular Methodology for Assessment of\n Closed-Domain Retrieval-Augmented Language Model","summary":" Retrieval-Augmented Generation (RAG) systems are widely used across various\nindustries for querying closed-domain and in-house knowledge bases. However,\nevaluating these systems presents significant challenges due to the private\nnature of closed-domain data and a scarcity of queries with verifiable ground\ntruths. Moreover, there is a lack of analytical methods to diagnose problematic\nmodules and identify types of failure, such as those caused by knowledge\ndeficits or issues with robustness. To address these challenges, we introduce\nGRAMMAR (GRounded And Modular Methodology for Assessment of RAG), an evaluation\nframework comprising a grounded data generation process and an evaluation\nprotocol that effectively pinpoints defective modules. Our validation\nexperiments reveal that traditional reference-free evaluation methods often\ninaccurately assess false generations, tending toward optimism. In contrast,\nGRAMMAR provides a reliable approach for identifying vulnerable modules and\nsupports hypothesis testing for textual form vulnerabilities.\nAn open-source tool accompanying this framework is available in our GitHub\nrepository \\url{https://github.com/xinzhel/grammar}, allowing for easy\nreproduction of our results and enabling reliable and modular evaluation in\nclosed-domain settings.\n","authors":["Xinzhe Li","Ming Liu","Shang Gao"],"pdf_url":"https://arxiv.org/pdf/2404.19232v6.pdf","comment":"Under Review"},{"id":"http://arxiv.org/abs/2408.08411v1","updated":"2024-08-15T20:37:36Z","published":"2024-08-15T20:37:36Z","title":"Rater Cohesion and Quality from a Vicarious Perspective","summary":" Human feedback is essential for building human-centered AI systems across\ndomains where disagreement is prevalent, such as AI safety, content moderation,\nor sentiment analysis. Many disagreements, particularly in politically charged\nsettings, arise because raters have opposing values or beliefs. Vicarious\nannotation is a method for breaking down disagreement by asking raters how they\nthink others would annotate the data. In this paper, we explore the use of\nvicarious annotation with analytical methods for moderating rater disagreement.\nWe employ rater cohesion metrics to study the potential influence of political\naffiliations and demographic backgrounds on raters' perceptions of offense.\nAdditionally, we utilize CrowdTruth's rater quality metrics, which consider the\ndemographics of the raters, to score the raters and their annotations. 
We study\nhow the rater quality metrics influence the in-group and cross-group rater\ncohesion across the personal and vicarious levels.\n","authors":["Deepak Pandita","Tharindu Cyril Weerasooriya","Sujan Dutta","Sarah K. Luger","Tharindu Ranasinghe","Ashiqur R. KhudaBukhsh","Marcos Zampieri","Christopher M. Homan"],"pdf_url":"https://arxiv.org/pdf/2408.08411v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08400v1","updated":"2024-08-15T19:57:42Z","published":"2024-08-15T19:57:42Z","title":"Zero-Shot Learning and Key Points Are All You Need for Automated\n Fact-Checking","summary":" Automated fact-checking is an important task because determining the accurate\nstatus of a proposed claim within the vast amount of information available\nonline is a critical challenge. This challenge requires robust evaluation to\nprevent the spread of false information. Modern large language models (LLMs)\nhave demonstrated high capability in performing a diverse range of Natural\nLanguage Processing (NLP) tasks. By utilizing proper prompting strategies,\ntheir versatility due to their understanding of large context sizes and\nzero-shot learning ability enables them to simulate human problem-solving\nintuition and move towards being an alternative to humans for solving problems.\nIn this work, we introduce a straightforward framework based on Zero-Shot\nLearning and Key Points (ZSL-KeP) for automated fact-checking, which despite\nits simplicity, performed well on the AVeriTeC shared task dataset by robustly\nimproving the baseline and achieving 10th place.\n","authors":["Mohammad Ghiasvand Mohammadkhani","Ali Ghiasvand Mohammadkhani","Hamid Beigy"],"pdf_url":"https://arxiv.org/pdf/2408.08400v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.09433v3","updated":"2024-08-15T19:51:07Z","published":"2023-11-15T23:07:40Z","title":"Trojan Activation Attack: Red-Teaming Large Language Models using\n Activation Steering for Safety-Alignment","summary":" To ensure AI safety, instruction-tuned Large Language Models (LLMs) are\nspecifically trained to ensure alignment, which refers to making models behave\nin accordance with human intentions. While these models have demonstrated\ncommendable results on various safety benchmarks, the vulnerability of their\nsafety alignment has not been extensively studied. This is particularly\ntroubling given the potential harm that LLMs can inflict. Existing attack\nmethods on LLMs often rely on poisoned training data or the injection of\nmalicious prompts. These approaches compromise the stealthiness and\ngeneralizability of the attacks, making them susceptible to detection.\nAdditionally, these models often demand substantial computational resources for\nimplementation, making them less practical for real-world applications. In this\nwork, we study a different attack scenario, called Trojan Activation Attack\n(TA^2), which injects trojan steering vectors into the activation layers of\nLLMs. These malicious steering vectors can be triggered at inference time to\nsteer the models toward attacker-desired behaviors by manipulating their\nactivations. 
Our experiment results on four primary alignment tasks show that\nTA^2 is highly effective and adds little or no overhead to attack efficiency.\nAdditionally, we discuss potential countermeasures against such activation\nattacks.\n","authors":["Haoran Wang","Kai Shu"],"pdf_url":"https://arxiv.org/pdf/2311.09433v3.pdf","comment":"ACM International Conference on Information and Knowledge Management\n (CIKM'24)"},{"id":"http://arxiv.org/abs/2408.08396v1","updated":"2024-08-15T19:46:21Z","published":"2024-08-15T19:46:21Z","title":"Level Up Your Tutorials: VLMs for Game Tutorials Quality Assessment","summary":" Designing effective game tutorials is crucial for a smooth learning curve for\nnew players, especially in games with many rules and complex core mechanics.\nEvaluating the effectiveness of these tutorials usually requires multiple\niterations with testers who have no prior knowledge of the game. Recent\nVision-Language Models (VLMs) have demonstrated significant capabilities in\nunderstanding and interpreting visual content. VLMs can analyze images, provide\ndetailed insights, and answer questions about their content. They can recognize\nobjects, actions, and contexts in visual data, making them valuable tools for\nvarious applications, including automated game testing. In this work, we\npropose an automated game-testing solution to evaluate the quality of game\ntutorials. Our approach leverages VLMs to analyze frames from video game\ntutorials, answer relevant questions to simulate human perception, and provide\nfeedback. This feedback is compared with expected results to identify confusing\nor problematic scenes and highlight potential errors for developers. In\naddition, we publish complete tutorial videos and annotated frames from\ndifferent game versions used in our tests. This solution reduces the need for\nextensive manual testing, especially by speeding up and simplifying the initial\ndevelopment stages of the tutorial to improve the final game experience.\n","authors":["Daniele Rege Cambrin","Gabriele Scaffidi Militone","Luca Colomba","Giovanni Malnati","Daniele Apiletti","Paolo Garza"],"pdf_url":"https://arxiv.org/pdf/2408.08396v1.pdf","comment":"Accepted at ECCV 2024 CV2 Workshop"},{"id":"http://arxiv.org/abs/2406.18221v2","updated":"2024-08-15T19:30:09Z","published":"2024-06-26T10:08:47Z","title":"Enhancing Data Privacy in Large Language Models through Private\n Association Editing","summary":" Large Language Models (LLMs) are powerful tools with extensive applications,\nbut their tendency to memorize private information raises significant concerns\nas private data leakage can easily happen. In this paper, we introduce Private\nAssociation Editing (PAE), a novel defense approach for private data leakage.\nPAE is designed to effectively remove Personally Identifiable Information (PII)\nwithout retraining the model. Our approach consists of a four-step procedure:\ndetecting memorized PII, applying PAE cards to mitigate memorization of private\ndata, verifying resilience to targeted data extraction (TDE) attacks, and\nensuring consistency in the post-edit LLMs. The versatility and efficiency of\nPAE, which allows for batch modifications, significantly enhance data privacy\nin LLMs. Experimental results demonstrate the effectiveness of PAE in\nmitigating private data leakage. 
We believe PAE will serve as a critical tool\nin the ongoing effort to protect data privacy in LLMs, encouraging the\ndevelopment of safer models for real-world applications.\n","authors":["Davide Venditti","Elena Sofia Ruzzetti","Giancarlo A. Xompero","Cristina Giannone","Andrea Favalli","Raniero Romagnoli","Fabio Massimo Zanzotto"],"pdf_url":"https://arxiv.org/pdf/2406.18221v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08379v1","updated":"2024-08-15T18:43:50Z","published":"2024-08-15T18:43:50Z","title":"Towards Realistic Synthetic User-Generated Content: A Scaffolding\n Approach to Generating Online Discussions","summary":" The emergence of synthetic data represents a pivotal shift in modern machine\nlearning, offering a solution to satisfy the need for large volumes of data in\ndomains where real data is scarce, highly private, or difficult to obtain. We\ninvestigate the feasibility of creating realistic, large-scale synthetic\ndatasets of user-generated content, noting that such content is increasingly\nprevalent and a source of frequently sought information. Large language models\n(LLMs) offer a starting point for generating synthetic social media discussion\nthreads, due to their ability to produce diverse responses that typify online\ninteractions. However, as we demonstrate, straightforward application of LLMs\nyields limited success in capturing the complex structure of online\ndiscussions, and standard prompting mechanisms lack sufficient control. We\ntherefore propose a multi-step generation process, predicated on the idea of\ncreating compact representations of discussion threads, referred to as\nscaffolds. Our framework is generic yet adaptable to the unique characteristics\nof specific social media platforms. We demonstrate its feasibility using data\nfrom two distinct online discussion platforms. To address the fundamental\nchallenge of ensuring the representativeness and realism of synthetic data, we\npropose a portfolio of evaluation measures to compare various instantiations of\nour framework.\n","authors":["Krisztian Balog","John Palowitch","Barbara Ikica","Filip Radlinski","Hamidreza Alvari","Mehdi Manshadi"],"pdf_url":"https://arxiv.org/pdf/2408.08379v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08374v1","updated":"2024-08-15T18:33:54Z","published":"2024-08-15T18:33:54Z","title":"Evaluating Text Classification Robustness to Part-of-Speech Adversarial\n Examples","summary":" As machine learning systems become more widely used, especially for safety\ncritical applications, there is a growing need to ensure that these systems\nbehave as intended, even in the face of adversarial examples. Adversarial\nexamples are inputs that are designed to trick the decision making process, and\nare intended to be imperceptible to humans. However, for text-based\nclassification systems, changes to the input, a string of text, are always\nperceptible. Therefore, text-based adversarial examples instead focus on trying\nto preserve semantics. Unfortunately, recent work has shown this goal is often\nnot met. To improve the quality of text-based adversarial examples, we need to\nknow what elements of the input text are worth focusing on. To address this, in\nthis paper, we explore what parts of speech have the highest impact of\ntext-based classifiers. Our experiments highlight a distinct bias in CNN\nalgorithms against certain parts of speech tokens within review datasets. 
This\nfinding underscores a critical vulnerability in the linguistic processing\ncapabilities of CNNs.\n","authors":["Anahita Samadi","Allison Sullivan"],"pdf_url":"https://arxiv.org/pdf/2408.08374v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08335v1","updated":"2024-08-15T04:29:33Z","published":"2024-08-15T04:29:33Z","title":"Plan with Code: Comparing approaches for robust NL to DSL generation","summary":" Planning in code is considered a more reliable approach for many\norchestration tasks. This is because code is more tractable than steps\ngenerated via Natural Language and make it easy to support more complex\nsequences by abstracting deterministic logic into functions. It also allows\nspotting issues with incorrect function names with the help of parsing checks\nthat can be run on code. Progress in Code Generation methodologies, however,\nremains limited to general-purpose languages like C, C++, and Python. LLMs\ncontinue to face challenges with custom function names in Domain Specific\nLanguages or DSLs, leading to higher hallucination rates and syntax errors.\nThis is more common for custom function names, that are typically part of the\nplan. Moreover, keeping LLMs up-to-date with newer function names is an issue.\nThis poses a challenge for scenarios like task planning over a large number of\nAPIs, since the plan is represented as a DSL having custom API names. In this\npaper, we focus on workflow automation in RPA (Robotic Process Automation)\ndomain as a special case of task planning. We present optimizations for using\nRetrieval Augmented Generation (or RAG) with LLMs for DSL generation along with\nan ablation study comparing these strategies with a fine-tuned model. Our\nresults showed that the fine-tuned model scored the best on code similarity\nmetric. However, with our optimizations, RAG approach is able to match the\nquality for in-domain API names in the test set. Additionally, it offers\nsignificant advantage for out-of-domain or unseen API names, outperforming\nFine-Tuned model on similarity metric by 7 pts.\n","authors":["Nastaran Bassamzadeh","Chhaya Methani"],"pdf_url":"https://arxiv.org/pdf/2408.08335v1.pdf","comment":"9 pages, 1 figure, 5 tables. arXiv admin note: substantial text\n overlap with arXiv:2407.02742"}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2408.08313v1","updated":"2024-08-15T17:59:57Z","published":"2024-08-15T17:59:57Z","title":"Can Large Language Models Understand Symbolic Graphics Programs?","summary":" Assessing the capabilities of large language models (LLMs) is often\nchallenging, in part, because it is hard to find tasks to which they have not\nbeen exposed during training. We take one step to address this challenge by\nturning to a new task: focusing on symbolic graphics programs, which are a\npopular representation for graphics content that procedurally generates visual\ndata. LLMs have shown exciting promise towards program synthesis, but do they\nunderstand symbolic graphics programs? Unlike conventional programs, symbolic\ngraphics programs can be translated to graphics content. Here, we characterize\nan LLM's understanding of symbolic programs in terms of their ability to answer\nquestions related to the graphics content. This task is challenging as the\nquestions are difficult to answer from the symbolic programs alone -- yet, they\nwould be easy to answer from the corresponding graphics content as we verify\nthrough a human experiment. 
To understand symbolic programs, LLMs may need to\npossess the ability to imagine how the corresponding graphics content would\nlook without directly accessing the rendered visual content. We use this task\nto evaluate LLMs by creating a large benchmark for the semantic understanding\nof symbolic graphics programs. This benchmark is built via program-graphics\ncorrespondence, hence requiring minimal human efforts. We evaluate current LLMs\non our benchmark to elucidate a preliminary assessment of their ability to\nreason about visual scenes from programs. We find that this task distinguishes\nexisting LLMs and models considered good at reasoning perform better. Lastly,\nwe introduce Symbolic Instruction Tuning (SIT) to improve this ability.\nSpecifically, we query GPT4-o with questions and images generated by symbolic\nprograms. Such data are then used to finetune an LLM. We also find that SIT\ndata can improve the general instruction following ability of LLMs.\n","authors":["Zeju Qiu","Weiyang Liu","Haiwen Feng","Zhen Liu","Tim Z. Xiao","Katherine M. Collins","Joshua B. Tenenbaum","Adrian Weller","Michael J. Black","Bernhard Schölkopf"],"pdf_url":"https://arxiv.org/pdf/2408.08313v1.pdf","comment":"Technical Report v1 (44 pages, 23 figures, project page:\n https://sgp-bench.github.io/)"},{"id":"http://arxiv.org/abs/2408.08307v1","updated":"2024-08-15T17:59:06Z","published":"2024-08-15T17:59:06Z","title":"Understanding the Local Geometry of Generative Model Manifolds","summary":" Deep generative models learn continuous representations of complex data\nmanifolds using a finite number of samples during training. For a pre-trained\ngenerative model, the common way to evaluate the quality of the manifold\nrepresentation learned, is by computing global metrics like Fr\\'echet Inception\nDistance using a large number of generated and real samples. However,\ngenerative model performance is not uniform across the learned manifold, e.g.,\nfor \\textit{foundation models} like Stable Diffusion generation performance can\nvary significantly based on the conditioning or initial noise vector being\ndenoised. In this paper we study the relationship between the \\textit{local\ngeometry of the learned manifold} and downstream generation. Based on the\ntheory of continuous piecewise-linear (CPWL) generators, we use three geometric\ndescriptors - scaling ($\\psi$), rank ($\\nu$), and complexity ($\\delta$) - to\ncharacterize a pre-trained generative model manifold locally. We provide\nquantitative and qualitative evidence showing that for a given latent, the\nlocal descriptors are correlated with generation aesthetics, artifacts,\nuncertainty, and even memorization. Finally we demonstrate that training a\n\\textit{reward model} on the local geometry can allow controlling the\nlikelihood of a generated sample under the learned distribution.\n","authors":["Ahmed Imtiaz Humayun","Ibtihel Amara","Candice Schumann","Golnoosh Farnadi","Negar Rostamzadeh","Mohammad Havaei"],"pdf_url":"https://arxiv.org/pdf/2408.08307v1.pdf","comment":"Pre-print. 11 pages main, 8 pages app., 28 figures"},{"id":"http://arxiv.org/abs/2408.08305v1","updated":"2024-08-15T17:57:38Z","published":"2024-08-15T17:57:38Z","title":"Towards Flexible Visual Relationship Segmentation","summary":" Visual relationship understanding has been studied separately in human-object\ninteraction(HOI) detection, scene graph generation(SGG), and referring\nrelationships(RR) tasks. 
Given the complexity and interconnectedness of these\ntasks, it is crucial to have a flexible framework that can effectively address\nthese tasks in a cohesive manner. In this work, we propose FleVRS, a single\nmodel that seamlessly integrates the above three aspects in standard and\npromptable visual relationship segmentation, and further possesses the\ncapability for open-vocabulary segmentation to adapt to novel scenarios. FleVRS\nleverages the synergy between text and image modalities, to ground various\ntypes of relationships from images and use textual features from\nvision-language models to visual conceptual understanding. Empirical validation\nacross various datasets demonstrates that our framework outperforms existing\nmodels in standard, promptable, and open-vocabulary tasks, e.g., +1.9 $mAP$ on\nHICO-DET, +11.4 $Acc$ on VRD, +4.7 $mAP$ on unseen HICO-DET. Our FleVRS\nrepresents a significant step towards a more intuitive, comprehensive, and\nscalable understanding of visual relationships.\n","authors":["Fangrui Zhu","Jianwei Yang","Huaizu Jiang"],"pdf_url":"https://arxiv.org/pdf/2408.08305v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08295v1","updated":"2024-08-15T17:50:07Z","published":"2024-08-15T17:50:07Z","title":"SLCA++: Unleash the Power of Sequential Fine-tuning for Continual\n Learning with Pre-training","summary":" In recent years, continual learning with pre-training (CLPT) has received\nwidespread interest, instead of its traditional focus of training from scratch.\nThe use of strong pre-trained models (PTMs) can greatly facilitate knowledge\ntransfer and alleviate catastrophic forgetting, but also suffers from\nprogressive overfitting of pre-trained knowledge into specific downstream\ntasks. A majority of current efforts often keep the PTMs frozen and incorporate\ntask-specific prompts to instruct representation learning, coupled with a\nprompt selection process for inference. However, due to the limited capacity of\nprompt parameters, this strategy demonstrates only sub-optimal performance in\ncontinual learning. In comparison, tuning all parameters of PTMs often provides\nthe greatest potential for representation learning, making sequential\nfine-tuning (Seq FT) a fundamental baseline that has been overlooked in CLPT.\nTo this end, we present an in-depth analysis of the progressive overfitting\nproblem from the lens of Seq FT. Considering that the overly fast\nrepresentation learning and the biased classification layer constitute this\nparticular problem, we introduce the advanced Slow Learner with Classifier\nAlignment (SLCA++) framework to unleash the power of Seq FT, serving as a\nstrong baseline approach for CLPT. Our approach involves a Slow Learner to\nselectively reduce the learning rate of backbone parameters, and a Classifier\nAlignment to align the disjoint classification layers in a post-hoc fashion. We\nfurther enhance the efficacy of SL with a symmetric cross-entropy loss, as well\nas employ a parameter-efficient strategy to implement Seq FT with SLCA++.\nAcross a variety of continual learning scenarios on image classification\nbenchmarks, our approach provides substantial improvements and outperforms\nstate-of-the-art methods by a large margin. 
Code:\nhttps://github.com/GengDavid/SLCA.\n","authors":["Gengwei Zhang","Liyuan Wang","Guoliang Kang","Ling Chen","Yunchao Wei"],"pdf_url":"https://arxiv.org/pdf/2408.08295v1.pdf","comment":"This paper is an extension of our ICCV 23 paper (arXiv:2303.05118)"},{"id":"http://arxiv.org/abs/2403.19611v2","updated":"2024-08-15T17:41:45Z","published":"2024-03-28T17:31:23Z","title":"Nearest Neighbor Classification for Classical Image Upsampling","summary":" Given a set of ordered pixel data in the form of an image, our goal is to\nperform upsampling on the data such that: the resulting resolution is improved\nby some factor, the final result passes the human test, having added new,\nbelievable, and realistic information and detail to the image, the time\ncomplexity for upscaling is relatively close to that of lossy upscaling\nimplementations.\n","authors":["Evan Matthews","Nicolas Prate"],"pdf_url":"https://arxiv.org/pdf/2403.19611v2.pdf","comment":"6 pages"},{"id":"http://arxiv.org/abs/2408.07278v2","updated":"2024-08-15T17:40:33Z","published":"2024-08-03T13:03:31Z","title":"Scene-wise Adaptive Network for Dynamic Cold-start Scenes Optimization\n in CTR Prediction","summary":" In the realm of modern mobile E-commerce, providing users with nearby\ncommercial service recommendations through location-based online services has\nbecome increasingly vital. While machine learning approaches have shown promise\nin multi-scene recommendation, existing methodologies often struggle to address\ncold-start problems in unprecedented scenes: the increasing diversity of\ncommercial choices, along with the short online lifespan of scenes, give rise\nto the complexity of effective recommendations in online and dynamic scenes. In\nthis work, we propose Scene-wise Adaptive Network (SwAN), a novel approach that\nemphasizes high-performance cold-start online recommendations for new scenes.\nOur approach introduces several crucial capabilities, including scene\nsimilarity learning, user-specific scene transition cognition, scene-specific\ninformation construction for the new scene, and enhancing the diverged logical\ninformation between scenes. We demonstrate SwAN's potential to optimize dynamic\nmulti-scene recommendation problems by effectively online handling cold-start\nrecommendations for any newly arrived scenes. More encouragingly, SwAN has been\nsuccessfully deployed in Meituan's online catering recommendation service,\nwhich serves millions of customers per day, and SwAN has achieved a 5.64% CTR\nindex improvement relative to the baselines and a 5.19% increase in daily order\nvolume proportion.\n","authors":["Wenhao Li","Jie Zhou","Chuan Luo","Chao Tang","Kun Zhang","Shixiong Zhao"],"pdf_url":"https://arxiv.org/pdf/2408.07278v2.pdf","comment":"10 pages, 6 figures, accepted by Recsys 2024"},{"id":"http://arxiv.org/abs/2408.08270v1","updated":"2024-08-15T17:14:57Z","published":"2024-08-15T17:14:57Z","title":"HeightLane: BEV Heightmap guided 3D Lane Detection","summary":" Accurate 3D lane detection from monocular images presents significant\nchallenges due to depth ambiguity and imperfect ground modeling. Previous\nattempts to model the ground have often used a planar ground assumption with\nlimited degrees of freedom, making them unsuitable for complex road\nenvironments with varying slopes. Our study introduces HeightLane, an\ninnovative method that predicts a height map from monocular images by creating\nanchors based on a multi-slope assumption. 
This approach provides a detailed\nand accurate representation of the ground. HeightLane employs the predicted\nheightmap along with a deformable attention-based spatial feature transform\nframework to efficiently convert 2D image features into 3D bird's eye view\n(BEV) features, enhancing spatial understanding and lane structure recognition.\nAdditionally, the heightmap is used for the positional encoding of BEV\nfeatures, further improving their spatial accuracy. This explicit view\ntransformation bridges the gap between front-view perceptions and spatially\naccurate BEV representations, significantly improving detection performance. To\naddress the lack of the necessary ground truth (GT) height map in the original\nOpenLane dataset, we leverage the Waymo dataset and accumulate its LiDAR data\nto generate a height map for the drivable area of each scene. The GT heightmaps\nare used to train the heightmap extraction module from monocular images.\nExtensive experiments on the OpenLane validation set show that HeightLane\nachieves state-of-the-art performance in terms of F-score, highlighting its\npotential in real-world applications.\n","authors":["Chaesong Park","Eunbin Seo","Jongwoo Lim"],"pdf_url":"https://arxiv.org/pdf/2408.08270v1.pdf","comment":"10 pages, 6 figures, 5 tables"},{"id":"http://arxiv.org/abs/2312.10237v3","updated":"2024-08-15T17:10:19Z","published":"2023-12-15T22:09:04Z","title":"A Distributed Privacy Preserving Model for the Detection of Alzheimer's\n Disease","summary":" BACKGROUND: Segmentation of medical data, concerns about personal health\ninformation (PHI) breaches, and the direct and indirect costs of consolidating\nand managing such segmented date should motivate diagnostic machine learning\n(DML) researchers to identify privacy-preserving machine learning algorithms\nthat can train on distributed or decentralized datasets of different\nmodalities. Federated learning models provide such a decentralized machine\nlearning framework in which multiple investigators in possession of disparate\ndatasets and working on different devices or servers can train collaboratively\na global machine learning models without ever having to exchange local data and\nthus can meet statutory PHI protections. To this end, a vertical federate\nlearning model is devised and tested for efficacy in the detection of\nAlzheimer's Disease (AD).\n METHODS: The second version of Open Access Series of Imaging Studies -- with\nits panoply of demographic, imaging, and clinical assessment datasets -- was\nused to test a multimodal vertical federated learning (VFL) model for AD\ndetection.\n RESULTS: By training and validating this VFL model on the demographic,\nclinical, and MRI data in OASIS-2, an 82.9\\% accuracy rate is achieved,\nconsistent with previously reported results.\n CONCLUSIONS: The VFL architecture proposed herein offers a novel distributed\narchitecture, enabling collaborative learning across diverse sources of medical\ndata while respecting statutory privacy constraints. By leveraging multiple\nmodalities of data, the robustness and accuracy of AD detection can be\nenhanced. This model not only contributes to the advancement of federated\nlearning techniques but also holds promise for overcoming the hurdles posed by\ndata segmentation in medical research.\n","authors":["Paul K. 
Mandal"],"pdf_url":"https://arxiv.org/pdf/2312.10237v3.pdf","comment":"17 pages, 7 figures, 2 tables"},{"id":"http://arxiv.org/abs/2403.18080v2","updated":"2024-08-15T17:08:49Z","published":"2024-03-26T20:02:48Z","title":"EgoPoseFormer: A Simple Baseline for Stereo Egocentric 3D Human Pose\n Estimation","summary":" We present EgoPoseFormer, a simple yet effective transformer-based model for\nstereo egocentric human pose estimation. The main challenge in egocentric pose\nestimation is overcoming joint invisibility, which is caused by self-occlusion\nor a limited field of view (FOV) of head-mounted cameras. Our approach\novercomes this challenge by incorporating a two-stage pose estimation paradigm:\nin the first stage, our model leverages the global information to estimate each\njoint's coarse location, then in the second stage, it employs a DETR style\ntransformer to refine the coarse locations by exploiting fine-grained stereo\nvisual features. In addition, we present a Deformable Stereo Attention\noperation to enable our transformer to effectively process multi-view features,\nwhich enables it to accurately localize each joint in the 3D world. We evaluate\nour method on the stereo UnrealEgo dataset and show it significantly\noutperforms previous approaches while being computationally efficient: it\nimproves MPJPE by 27.4mm (45% improvement) with only 7.9% model parameters and\n13.1% FLOPs compared to the state-of-the-art. Surprisingly, with proper\ntraining settings, we find that even our first-stage pose proposal network can\nachieve superior performance compared to previous arts. We also show that our\nmethod can be seamlessly extended to monocular settings, which achieves\nstate-of-the-art performance on the SceneEgo dataset, improving MPJPE by 25.5mm\n(21% improvement) compared to the best existing method with only 60.7% model\nparameters and 36.4% FLOPs. Code is available at:\nhttps://github.com/ChenhongyiYang/egoposeformer .\n","authors":["Chenhongyi Yang","Anastasia Tkach","Shreyas Hampali","Linguang Zhang","Elliot J. Crowley","Cem Keskin"],"pdf_url":"https://arxiv.org/pdf/2403.18080v2.pdf","comment":"Accepted to ECCV 2024"},{"id":"http://arxiv.org/abs/2408.08258v1","updated":"2024-08-15T16:59:15Z","published":"2024-08-15T16:59:15Z","title":"Snuffy: Efficient Whole Slide Image Classifier","summary":" Whole Slide Image (WSI) classification with multiple instance learning (MIL)\nin digital pathology faces significant computational challenges. Current\nmethods mostly rely on extensive self-supervised learning (SSL) for\nsatisfactory performance, requiring long training periods and considerable\ncomputational resources. At the same time, no pre-training affects performance\ndue to domain shifts from natural images to WSIs. We introduce\n\\textbf{\\textit{Snuffy}} architecture, a novel MIL-pooling method based on\nsparse transformers that mitigates performance loss with limited pre-training\nand enables continual few-shot pre-training as a competitive option. Our\nsparsity pattern is tailored for pathology and is theoretically proven to be a\nuniversal approximator with the tightest probabilistic sharp bound on the\nnumber of layers for sparse transformers, to date. We demonstrate Snuffy's\neffectiveness on CAMELYON16 and TCGA Lung cancer datasets, achieving superior\nWSI and patch-level accuracies. 
The code is available on\n\\url{https://github.com/jafarinia/snuffy}.\n","authors":["Hossein Jafarinia","Alireza Alipanah","Danial Hamdi","Saeed Razavi","Nahal Mirzaie","Mohammad Hossein Rohban"],"pdf_url":"https://arxiv.org/pdf/2408.08258v1.pdf","comment":"Accepted for ECCV 2024"},{"id":"http://arxiv.org/abs/2308.10015v3","updated":"2024-08-15T16:44:06Z","published":"2023-08-19T13:46:49Z","title":"DyFFPAD: Dynamic Fusion of Convolutional and Handcrafted Features for\n Fingerprint Presentation Attack Detection","summary":" Automatic fingerprint recognition systems suffer from the threat of\npresentation attacks due to their wide range of deployment in areas including\nnational borders and commercial applications. A presentation attack can be\nperformed by creating a spoof of a user's fingerprint with or without their\nconsent. This paper presents a dynamic ensemble of deep CNN and handcrafted\nfeatures to detect presentation attacks in known-material and unknown-material\nprotocols of the livness detection competition. The proposed presentation\nattack detection model, in this way, utilizes the capabilities of both deep CNN\nand handcrafted features techniques and exhibits better performance than their\nindividual performances. We have validated our proposed method on benchmark\ndatabases from the Liveness Detection Competition in 2015, 2017, and 2019,\nyielding overall accuracy of 96.10\\%, 96.49\\%, and 94.99\\% on them,\nrespectively. The proposed method outperforms state-of-the-art methods in terms\nof classification accuracy.\n","authors":["Anuj Rai","Parsheel Kumar Tiwari","Jyotishna Baishya","Ram Prakash Sharma","Somnath Dey"],"pdf_url":"https://arxiv.org/pdf/2308.10015v3.pdf","comment":"arXiv admin note:"},{"id":"http://arxiv.org/abs/2408.08250v1","updated":"2024-08-15T16:41:55Z","published":"2024-08-15T16:41:55Z","title":"Computer Vision Model Compression Techniques for Embedded Systems: A\n Survey","summary":" Deep neural networks have consistently represented the state of the art in\nmost computer vision problems. In these scenarios, larger and more complex\nmodels have demonstrated superior performance to smaller architectures,\nespecially when trained with plenty of representative data. With the recent\nadoption of Vision Transformer (ViT) based architectures and advanced\nConvolutional Neural Networks (CNNs), the total number of parameters of leading\nbackbone architectures increased from 62M parameters in 2012 with AlexNet to 7B\nparameters in 2024 with AIM-7B. Consequently, deploying such deep architectures\nfaces challenges in environments with processing and runtime constraints,\nparticularly in embedded systems. This paper covers the main model compression\ntechniques applied for computer vision tasks, enabling modern models to be used\nin embedded systems. We present the characteristics of compression subareas,\ncompare different approaches, and discuss how to choose the best technique and\nexpected variations when analyzing it on various embedded devices. We also\nshare codes to assist researchers and new practitioners in overcoming initial\nimplementation challenges for each subarea and present trends for Model\nCompression. 
Case studies for compression models are available at\n\\href{https://github.com/venturusbr/cv-model-compression}{https://github.com/venturusbr/cv-model-compression}.\n","authors":["Alexandre Lopes","Fernando Pereira dos Santos","Diulhio de Oliveira","Mauricio Schiezaro","Helio Pedrini"],"pdf_url":"https://arxiv.org/pdf/2408.08250v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.03298v3","updated":"2024-08-15T16:32:04Z","published":"2023-12-06T05:39:00Z","title":"DiffPMAE: Diffusion Masked Autoencoders for Point Cloud Reconstruction","summary":" Point cloud streaming is increasingly getting popular, evolving into the norm\nfor interactive service delivery and the future Metaverse. However, the\nsubstantial volume of data associated with point clouds presents numerous\nchallenges, particularly in terms of high bandwidth consumption and large\nstorage capacity. Despite various solutions proposed thus far, with a focus on\npoint cloud compression, upsampling, and completion, these\nreconstruction-related methods continue to fall short in delivering high\nfidelity point cloud output. As a solution, in DiffPMAE, we propose an\neffective point cloud reconstruction architecture. Inspired by self-supervised\nlearning concepts, we combine Masked Auto-Encoding and Diffusion Model\nmechanism to remotely reconstruct point cloud data. By the nature of this\nreconstruction process, DiffPMAE can be extended to many related downstream\ntasks including point cloud compression, upsampling and completion. Leveraging\nShapeNet-55 and ModelNet datasets with over 60000 objects, we validate the\nperformance of DiffPMAE exceeding many state-of-the-art methods in-terms of\nauto-encoding and downstream tasks considered.\n","authors":["Yanlong Li","Chamara Madarasingha","Kanchana Thilakarathna"],"pdf_url":"https://arxiv.org/pdf/2312.03298v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08234v1","updated":"2024-08-15T15:58:11Z","published":"2024-08-15T15:58:11Z","title":"Comparative Evaluation of 3D Reconstruction Methods for Object Pose\n Estimation","summary":" Object pose estimation is essential to many industrial applications involving\nrobotic manipulation, navigation, and augmented reality. Current generalizable\nobject pose estimators, i.e., approaches that do not need to be trained per\nobject, rely on accurate 3D models. Predominantly, CAD models are used, which\ncan be hard to obtain in practice. At the same time, it is often possible to\nacquire images of an object. Naturally, this leads to the question whether 3D\nmodels reconstructed from images are sufficient to facilitate accurate object\npose estimation. We aim to answer this question by proposing a novel benchmark\nfor measuring the impact of 3D reconstruction quality on pose estimation\naccuracy. Our benchmark provides calibrated images for object reconstruction\nregistered with the test images of the YCB-V dataset for pose evaluation under\nthe BOP benchmark format. Detailed experiments with multiple state-of-the-art\n3D reconstruction and object pose estimation approaches show that the geometry\nproduced by modern reconstruction methods is often sufficient for accurate pose\nestimation. Our experiments lead to interesting observations: (1) Standard\nmetrics for measuring 3D reconstruction quality are not necessarily indicative\nof pose estimation accuracy, which shows the need for dedicated benchmarks such\nas ours. 
(2) Classical, non-learning-based approaches can perform on par with\nmodern learning-based reconstruction techniques and can even offer a better\nreconstruction time-pose accuracy tradeoff. (3) There is still a sizable gap\nbetween performance with reconstructed and with CAD models. To foster research\non closing this gap, our benchmark is publicly available at\nhttps://github.com/VarunBurde/reconstruction_pose_benchmark}.\n","authors":["Varun Burde","Assia Benbihi","Pavel Burget","Torsten Sattler"],"pdf_url":"https://arxiv.org/pdf/2408.08234v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08228v1","updated":"2024-08-15T15:55:07Z","published":"2024-08-15T15:55:07Z","title":"Rethinking Medical Anomaly Detection in Brain MRI: An Image Quality\n Assessment Perspective","summary":" Reconstruction-based methods, particularly those leveraging autoencoders,\nhave been widely adopted to perform anomaly detection in brain MRI. While most\nexisting works try to improve detection accuracy by proposing new model\nstructures or algorithms, we tackle the problem through image quality\nassessment, an underexplored perspective in the field. We propose a fusion\nquality loss function that combines Structural Similarity Index Measure loss\nwith l1 loss, offering a more comprehensive evaluation of reconstruction\nquality. Additionally, we introduce a data pre-processing strategy that\nenhances the average intensity ratio (AIR) between normal and abnormal regions,\nfurther improving the distinction of anomalies. By fusing the aforementioned\ntwo methods, we devise the image quality assessment (IQA) approach. The\nproposed IQA approach achieves significant improvements (>10%) in terms of Dice\ncoefficient (DICE) and Area Under the Precision-Recall Curve (AUPRC) on the\nBraTS21 (T2, FLAIR) and MSULB datasets when compared with state-of-the-art\nmethods. These results highlight the importance of invoking the comprehensive\nimage quality assessment in medical anomaly detection and provide a new\nperspective for future research in this field.\n","authors":["Zixuan Pan","Jun Xia","Zheyu Yan","Guoyue Xu","Yawen Wu","Zhenge Jia","Jianxu Chen","Yiyu Shi"],"pdf_url":"https://arxiv.org/pdf/2408.08228v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.05133v2","updated":"2024-08-15T15:40:48Z","published":"2023-12-08T16:05:15Z","title":"GIR: 3D Gaussian Inverse Rendering for Relightable Scene Factorization","summary":" This paper presents a 3D Gaussian Inverse Rendering (GIR) method, employing\n3D Gaussian representations to effectively factorize the scene into material\nproperties, light, and geometry. The key contributions lie in three-fold. We\ncompute the normal of each 3D Gaussian using the shortest eigenvector, with a\ndirectional masking scheme forcing accurate normal estimation without external\nsupervision. We adopt an efficient voxel-based indirect illumination tracing\nscheme that stores direction-aware outgoing radiance in each 3D Gaussian to\ndisentangle secondary illumination for approximating multi-bounce light\ntransport. To further enhance the illumination disentanglement, we represent a\nhigh-resolution environmental map with a learnable low-resolution map and a\nlightweight, fully convolutional network. Our method achieves state-of-the-art\nperformance in both relighting and novel view synthesis tasks among the\nrecently proposed inverse rendering methods while achieving real-time\nrendering. 
This substantiates our proposed method's efficacy and broad\napplicability, highlighting its potential as an influential tool in various\nreal-time interactive graphics applications such as material editing and\nrelighting. The code will be released at https://github.com/guduxiaolang/GIR.\n","authors":["Yahao Shi","Yanmin Wu","Chenming Wu","Xing Liu","Chen Zhao","Haocheng Feng","Jian Zhang","Bin Zhou","Errui Ding","Jingdong Wang"],"pdf_url":"https://arxiv.org/pdf/2312.05133v2.pdf","comment":"technical report"},{"id":"http://arxiv.org/abs/2408.08216v1","updated":"2024-08-15T15:26:12Z","published":"2024-08-15T15:26:12Z","title":"The Dawn of KAN in Image-to-Image (I2I) Translation: Integrating\n Kolmogorov-Arnold Networks with GANs for Unpaired I2I Translation","summary":" Image-to-Image translation in Generative Artificial Intelligence (Generative\nAI) has been a central focus of research, with applications spanning\nhealthcare, remote sensing, physics, chemistry, photography, and more. Among\nthe numerous methodologies, Generative Adversarial Networks (GANs) with\ncontrastive learning have been particularly successful. This study aims to\ndemonstrate that the Kolmogorov-Arnold Network (KAN) can effectively replace\nthe Multi-layer Perceptron (MLP) method in generative AI, particularly in the\nsubdomain of image-to-image translation, to achieve better generative quality.\nOur novel approach replaces the two-layer MLP with a two-layer KAN in the\nexisting Contrastive Unpaired Image-to-Image Translation (CUT) model,\ndeveloping the KAN-CUT model. This substitution favors the generation of more\ninformative features in low-dimensional vector representations, which\ncontrastive learning can utilize more effectively to produce high-quality\nimages in the target domain. Extensive experiments, detailed in the results\nsection, demonstrate the applicability of KAN in conjunction with contrastive\nlearning and GANs in Generative AI, particularly for image-to-image\ntranslation. This work suggests that KAN could be a valuable component in the\nbroader generative AI domain.\n","authors":["Arpan Mahara","Naphtali D. Rishe","Liangdong Deng"],"pdf_url":"https://arxiv.org/pdf/2408.08216v1.pdf","comment":"10 pages, 6 Figures, 1 Table"},{"id":"http://arxiv.org/abs/2405.18299v2","updated":"2024-08-15T15:25:13Z","published":"2024-05-28T15:51:18Z","title":"Deep Learning Innovations for Underwater Waste Detection: An In-Depth\n Analysis","summary":" Addressing the issue of submerged underwater trash is crucial for\nsafeguarding aquatic ecosystems and preserving marine life. While identifying\ndebris present on the surface of water bodies is straightforward, assessing the\nunderwater submerged waste is a challenge due to the image distortions caused\nby factors such as light refraction, absorption, suspended particles, color\nshifts, and occlusion. This paper conducts a comprehensive review of\nstate-of-the-art architectures and on the existing datasets to establish a\nbaseline for submerged waste and trash detection. The primary goal remains to\nestablish the benchmark of the object localization techniques to be leveraged\nby advanced underwater sensors and autonomous underwater vehicles. The ultimate\nobjective is to explore the underwater environment, to identify, and remove\nunderwater debris. The absence of benchmarks (dataset or algorithm) in many\nresearches emphasizes the need for a more robust algorithmic solution. 
Through\nthis research, we aim to give performance comparative analysis of various\nunderwater trash detection algorithms.\n","authors":["Jaskaran Singh Walia","Pavithra L K"],"pdf_url":"https://arxiv.org/pdf/2405.18299v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08215v1","updated":"2024-08-15T15:23:37Z","published":"2024-08-15T15:23:37Z","title":"Moving Healthcare AI-Support Systems for Visually Detectable Diseases\n onto Constrained Devices","summary":" Image classification usually requires connectivity and access to the cloud\nwhich is often limited in many parts of the world, including hard to reach\nrural areas. TinyML aims to solve this problem by hosting AI assistants on\nconstrained devices, eliminating connectivity issues by processing data within\nthe device itself, without internet or cloud access. This pilot study explores\nthe use of tinyML to provide healthcare support with low spec devices in low\nconnectivity environments, focusing on diagnosis of skin diseases and the\nethical use of AI assistants in a healthcare setting. To investigate this,\n10,000 images of skin lesions were used to train a model for classifying\nvisually detectable diseases (VDDs). The model weights were then offloaded to a\nRaspberry Pi with a webcam attached, to be used for the classification of skin\nlesions without internet access. It was found that the developed prototype\nachieved a test accuracy of 78% and a test loss of 1.08.\n","authors":["Tess Watt","Christos Chrysoulas","Peter J Barclay"],"pdf_url":"https://arxiv.org/pdf/2408.08215v1.pdf","comment":"6 pages, 5 figures"},{"id":"http://arxiv.org/abs/2408.08211v1","updated":"2024-08-15T15:20:55Z","published":"2024-08-15T15:20:55Z","title":"Learned Multimodal Compression for Autonomous Driving","summary":" Autonomous driving sensors generate an enormous amount of data. In this\npaper, we explore learned multimodal compression for autonomous driving,\nspecifically targeted at 3D object detection. We focus on camera and LiDAR\nmodalities and explore several coding approaches. One approach involves joint\ncoding of fused modalities, while others involve coding one modality first,\nfollowed by conditional coding of the other modality. We evaluate the\nperformance of these coding schemes on the nuScenes dataset. Our experimental\nresults indicate that joint coding of fused modalities yields better results\ncompared to the alternatives.\n","authors":["Hadi Hadizadeh","Ivan V. Bajić"],"pdf_url":"https://arxiv.org/pdf/2408.08211v1.pdf","comment":"6 pages, 5 figures, IEEE MMSP 2024"},{"id":"http://arxiv.org/abs/2311.04698v5","updated":"2024-08-15T15:19:01Z","published":"2023-11-08T14:10:19Z","title":"Examining Common Paradigms in Multi-Task Learning","summary":" While multi-task learning (MTL) has gained significant attention in recent\nyears, its underlying mechanisms remain poorly understood. Recent methods did\nnot yield consistent performance improvements over single task learning (STL)\nbaselines, underscoring the importance of gaining more profound insights about\nchallenges specific to MTL. In our study, we investigate paradigms in MTL in\nthe context of STL: First, the impact of the choice of optimizer has only been\nmildly investigated in MTL. We show the pivotal role of common STL tools such\nas the Adam optimizer in MTL empirically in various experiments. To further\ninvestigate Adam's effectiveness, we theoretical derive a partial loss-scale\ninvariance under mild assumptions. 
Second, the notion of gradient conflicts has\noften been phrased as a specific problem in MTL. We delve into the role of\ngradient conflicts in MTL and compare it to STL. For angular gradient alignment\nwe find no evidence that this is a unique problem in MTL. We emphasize\ndifferences in gradient magnitude as the main distinguishing factor. Overall,\nwe find surprising similarities between STL and MTL suggesting to consider\nmethods from both fields in a broader context.\n","authors":["Cathrin Elich","Lukas Kirchdorfer","Jan M. Köhler","Lukas Schott"],"pdf_url":"https://arxiv.org/pdf/2311.04698v5.pdf","comment":"Accepted for publication in German Conference for Pattern Recognition\n (GCPR), 2024"},{"id":"http://arxiv.org/abs/2408.08206v1","updated":"2024-08-15T15:16:49Z","published":"2024-08-15T15:16:49Z","title":"WaterSplatting: Fast Underwater 3D Scene Reconstruction Using Gaussian\n Splatting","summary":" The underwater 3D scene reconstruction is a challenging, yet interesting\nproblem with applications ranging from naval robots to VR experiences. The\nproblem was successfully tackled by fully volumetric NeRF-based methods which\ncan model both the geometry and the medium (water). Unfortunately, these\nmethods are slow to train and do not offer real-time rendering. More recently,\n3D Gaussian Splatting (3DGS) method offered a fast alternative to NeRFs.\nHowever, because it is an explicit method that renders only the geometry, it\ncannot render the medium and is therefore unsuited for underwater\nreconstruction. Therefore, we propose a novel approach that fuses volumetric\nrendering with 3DGS to handle underwater data effectively. Our method employs\n3DGS for explicit geometry representation and a separate volumetric field\n(queried once per pixel) for capturing the scattering medium. This dual\nrepresentation further allows the restoration of the scenes by removing the\nscattering medium. Our method outperforms state-of-the-art NeRF-based methods\nin rendering quality on the underwater SeaThru-NeRF dataset. Furthermore, it\ndoes so while offering real-time rendering performance, addressing the\nefficiency limitations of existing methods. Web:\nhttps://water-splatting.github.io\n","authors":["Huapeng Li","Wenxuan Song","Tianao Xu","Alexandre Elsig","Jonas Kulhanek"],"pdf_url":"https://arxiv.org/pdf/2408.08206v1.pdf","comment":"Web: https://water-splatting.github.io"},{"id":"http://arxiv.org/abs/2408.08205v1","updated":"2024-08-15T15:13:22Z","published":"2024-08-15T15:13:22Z","title":"A Multi-task Adversarial Attack Against Face Authentication","summary":" Deep-learning-based identity management systems, such as face authentication\nsystems, are vulnerable to adversarial attacks. However, existing attacks are\ntypically designed for single-task purposes, which means they are tailored to\nexploit vulnerabilities unique to the individual target rather than being\nadaptable for multiple users or systems. This limitation makes them unsuitable\nfor certain attack scenarios, such as morphing, universal, transferable, and\ncounter attacks. In this paper, we propose a multi-task adversarial attack\nalgorithm called MTADV that are adaptable for multiple users or systems. By\ninterpreting these scenarios as multi-task attacks, MTADV is applicable to both\nsingle- and multi-task attacks, and feasible in the white- and gray-box\nsettings. 
Furthermore, MTADV is effective against various face datasets,\nincluding LFW, CelebA, and CelebA-HQ, and can work with different deep learning\nmodels, such as FaceNet, InsightFace, and CurricularFace. Importantly, MTADV\nretains its feasibility as a single-task attack targeting a single user/system.\nTo the best of our knowledge, MTADV is the first adversarial attack method that\ncan target all of the aforementioned scenarios in one algorithm.\n","authors":["Hanrui Wang","Shuo Wang","Cunjian Chen","Massimo Tistarelli","Zhe Jin"],"pdf_url":"https://arxiv.org/pdf/2408.08205v1.pdf","comment":"Accepted by ACM Transactions on Multimedia Computing, Communications,\n and Applications"},{"id":"http://arxiv.org/abs/2408.08202v1","updated":"2024-08-15T15:10:01Z","published":"2024-08-15T15:10:01Z","title":"Towards Practical Human Motion Prediction with LiDAR Point Clouds","summary":" Human motion prediction is crucial for human-centric multimedia understanding\nand interacting. Current methods typically rely on ground truth human poses as\nobserved input, which is not practical for real-world scenarios where only raw\nvisual sensor data is available. To implement these methods in practice, a\npre-phrase of pose estimation is essential. However, such two-stage approaches\noften lead to performance degradation due to the accumulation of errors.\nMoreover, reducing raw visual data to sparse keypoint representations\nsignificantly diminishes the density of information, resulting in the loss of\nfine-grained features. In this paper, we propose \\textit{LiDAR-HMP}, the first\nsingle-LiDAR-based 3D human motion prediction approach, which receives the raw\nLiDAR point cloud as input and forecasts future 3D human poses directly.\nBuilding upon our novel structure-aware body feature descriptor, LiDAR-HMP\nadaptively maps the observed motion manifold to future poses and effectively\nmodels the spatial-temporal correlations of human motions for further\nrefinement of prediction results. Extensive experiments show that our method\nachieves state-of-the-art performance on two public benchmarks and demonstrates\nremarkable robustness and efficacy in real-world deployments.\n","authors":["Xiao Han","Yiming Ren","Yichen Yao","Yujing Sun","Yuexin Ma"],"pdf_url":"https://arxiv.org/pdf/2408.08202v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08201v1","updated":"2024-08-15T15:08:58Z","published":"2024-08-15T15:08:58Z","title":"Heavy Labels Out! Dataset Distillation with Label Space Lightening","summary":" Dataset distillation or condensation aims to condense a large-scale training\ndataset into a much smaller synthetic one such that the training performance of\ndistilled and original sets on neural networks are similar. Although the number\nof training samples can be reduced substantially, current state-of-the-art\nmethods heavily rely on enormous soft labels to achieve satisfactory\nperformance. As a result, the required storage can be comparable even to\noriginal datasets, especially for large-scale ones. 
To solve this problem,\ninstead of storing these heavy labels, we propose a novel label-lightening\nframework termed HeLlO aiming at effective image-to-label projectors, with\nwhich synthetic labels can be directly generated online from synthetic images.\nSpecifically, to construct such projectors, we leverage prior knowledge in\nopen-source foundation models, e.g., CLIP, and introduce a LoRA-like\nfine-tuning strategy to mitigate the gap between pre-trained and target\ndistributions, so that original models for soft-label generation can be\ndistilled into a group of low-rank matrices. Moreover, an effective image\noptimization method is proposed to further mitigate the potential error between\nthe original and distilled label generators. Extensive experiments demonstrate\nthat with only about 0.003% of the original storage required for a complete set\nof soft labels, we achieve comparable performance to current state-of-the-art\ndataset distillation methods on large-scale datasets. Our code will be\navailable.\n","authors":["Ruonan Yu","Songhua Liu","Zigeng Chen","Jingwen Ye","Xinchao Wang"],"pdf_url":"https://arxiv.org/pdf/2408.08201v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.07433v2","updated":"2024-08-15T15:00:39Z","published":"2024-08-14T10:08:46Z","title":"MagicFace: Training-free Universal-Style Human Image Customized\n Synthesis","summary":" Existing human image personalized generation methods often require tedious\ntraining: either fine-tuning with a few images or retraining on large-scale\ndatasets. In such cases, these methods are prone to overfitting and encounter\ndifficulties when personalizing individuals of diverse styles. Moreover, these\ntraining-based approaches also struggle with multi-concept human image\ncustomizing. To this end, we propose MagicFace, the first method for\nuniversal-style human image personalized synthesis that enables\nsingle/multi-concept customization for humans of any style in a training-free\nmanner. MagicFace introduces a coarse-to-fine generation pipeline, involving\ntwo sequential stages: semantic scene construction and concept feature\ninjection. This is achieved by our Reference-aware Self-Attention (RSA) and\nRegion-grouped Blend Attention (RBA) mechanisms. Specifically, in the first\nstage, RSA enables the latent image to query features from reference concepts\nsimultaneously, extracting the coarse-grained overall semantic understanding to\nfacilitate the initial semantic layout establishment. In the second stage, we\nemploy an attention-based semantic segmentation method to pinpoint the\ngenerated regions of all concepts in the latent image at each step. Following\nthis, RBA divides the pixels of the latent image into semantic groups, with\neach group querying fine-grained features from its reference concept, which\nensures precise attribute alignment and feature injection. Throughout the\ntwo-stage process, a weight mask strategy is employed to ensure the model\nfocuses more on the reference concepts. Extensive experiments demonstrate our\nsuperiority in both human-centric subject-to-image synthesis and multi-concept\nhuman image customization. 
Our approach also can be applied to texture\ntransformation, further enhancing its versatility and applicability.\n","authors":["Yibin Wang","Weizhong Zhang","Cheng Jin"],"pdf_url":"https://arxiv.org/pdf/2408.07433v2.pdf","comment":"project page: https://codegoat24.github.io/MagicFace"},{"id":"http://arxiv.org/abs/2408.08191v1","updated":"2024-08-15T14:49:12Z","published":"2024-08-15T14:49:12Z","title":"Beyond Full Label: Single-Point Prompt for Infrared Small Target Label\n Generation","summary":" In this work, we make the first attempt to construct a learning-based\nsingle-point annotation paradigm for infrared small target label generation\n(IRSTLG). Our intuition is that label generation requires just one more point\nprompt than target detection: IRSTLG can be regarded as an infrared small\ntarget detection (IRSTD) task with the target location hint. Based on this\ninsight, we introduce an energy double guided single-point prompt (EDGSP)\nframework, which adeptly transforms the target detection network into a refined\nlabel generation method. Specifically, the proposed EDGSP includes: 1) target\nenergy initialization (TEI) to create a foundational outline for sufficient\nshape evolution of pseudo label, 2) double prompt embedding (DPE) for rapid\nlocalization of interested regions and reinforcement of individual differences\nto avoid label adhesion, and 3) bounding box-based matching (BBM) to eliminate\nfalse alarms. Experimental results show that pseudo labels generated by three\nbaselines equipped with EDGSP achieve 100% object-level probability of\ndetection (Pd) and 0% false-alarm rate (Fa) on SIRST, NUDT-SIRST, and IRSTD-1k\ndatasets, with a pixel-level intersection over union (IoU) improvement of\n13.28% over state-of-the-art label generation methods. Additionally, the\ndownstream detection task reveals that our centroid-annotated pseudo labels\nsurpass full labels, even with coarse single-point annotations, it still\nachieves 99.5% performance of full labeling.\n","authors":["Shuai Yuan","Hanlin Qin","Renke Kou","Xiang Yan","Zechuan Li","Chenxu Peng","Abd-Krim Seghouane"],"pdf_url":"https://arxiv.org/pdf/2408.08191v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08189v1","updated":"2024-08-15T14:47:44Z","published":"2024-08-15T14:47:44Z","title":"FancyVideo: Towards Dynamic and Consistent Video Generation via\n Cross-frame Textual Guidance","summary":" Synthesizing motion-rich and temporally consistent videos remains a challenge\nin artificial intelligence, especially when dealing with extended durations.\nExisting text-to-video (T2V) models commonly employ spatial cross-attention for\ntext control, equivalently guiding different frame generations without\nframe-specific textual guidance. Thus, the model's capacity to comprehend the\ntemporal logic conveyed in prompts and generate videos with coherent motion is\nrestricted. To tackle this limitation, we introduce FancyVideo, an innovative\nvideo generator that improves the existing text-control mechanism with the\nwell-designed Cross-frame Textual Guidance Module (CTGM). Specifically, CTGM\nincorporates the Temporal Information Injector (TII), Temporal Affinity Refiner\n(TAR), and Temporal Feature Booster (TFB) at the beginning, middle, and end of\ncross-attention, respectively, to achieve frame-specific textual guidance.\nFirstly, TII injects frame-specific information from latent features into text\nconditions, thereby obtaining cross-frame textual conditions. 
Then, TAR refines\nthe correlation matrix between cross-frame textual conditions and latent\nfeatures along the time dimension. Lastly, TFB boosts the temporal consistency\nof latent features. Extensive experiments comprising both quantitative and\nqualitative evaluations demonstrate the effectiveness of FancyVideo. Our\napproach achieves state-of-the-art T2V generation results on the EvalCrafter\nbenchmark and facilitates the synthesis of dynamic and consistent videos. The\nvideo show results can be available at https://fancyvideo.github.io/, and we\nwill make our code and model weights publicly available.\n","authors":["Jiasong Feng","Ao Ma","Jing Wang","Bo Cheng","Xiaodan Liang","Dawei Leng","Yuhui Yin"],"pdf_url":"https://arxiv.org/pdf/2408.08189v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08184v1","updated":"2024-08-15T14:42:02Z","published":"2024-08-15T14:42:02Z","title":"Not Every Image is Worth a Thousand Words: Quantifying Originality in\n Stable Diffusion","summary":" This work addresses the challenge of quantifying originality in text-to-image\n(T2I) generative diffusion models, with a focus on copyright originality. We\nbegin by evaluating T2I models' ability to innovate and generalize through\ncontrolled experiments, revealing that stable diffusion models can effectively\nrecreate unseen elements with sufficiently diverse training data. Then, our key\ninsight is that concepts and combinations of image elements the model is\nfamiliar with, and saw more during training, are more concisly represented in\nthe model's latent space. We hence propose a method that leverages textual\ninversion to measure the originality of an image based on the number of tokens\nrequired for its reconstruction by the model. Our approach is inspired by legal\ndefinitions of originality and aims to assess whether a model can produce\noriginal content without relying on specific prompts or having the training\ndata of the model. We demonstrate our method using both a pre-trained stable\ndiffusion model and a synthetic dataset, showing a correlation between the\nnumber of tokens and image originality. This work contributes to the\nunderstanding of originality in generative models and has implications for\ncopyright infringement cases.\n","authors":["Adi Haviv","Shahar Sarfaty","Uri Hacohen","Niva Elkin-Koren","Roi Livni","Amit H Bermano"],"pdf_url":"https://arxiv.org/pdf/2408.08184v1.pdf","comment":"GenLaw ICML 2024"},{"id":"http://arxiv.org/abs/2408.08182v1","updated":"2024-08-15T14:36:07Z","published":"2024-08-15T14:36:07Z","title":"Your Turn: Real-World Turning Angle Estimation for Parkinson's Disease\n Severity Assessment","summary":" People with Parkinson's Disease (PD) often experience progressively worsening\ngait, including changes in how they turn around, as the disease progresses.\nExisting clinical rating tools are not capable of capturing hour-by-hour\nvariations of PD symptoms, as they are confined to brief assessments within\nclinic settings. Measuring real-world gait turning angles continuously and\npassively is a component step towards using gait characteristics as sensitive\nindicators of disease progression in PD. 
This paper presents a deep\nlearning-based approach to automatically quantify turning angles by extracting\n3D skeletons from videos and calculating the rotation of hip and knee joints.\nWe utilise state-of-the-art human pose estimation models, Fastpose and Strided\nTransformer, on a total of 1386 turning video clips from 24 subjects (12 people\nwith PD and 12 healthy control volunteers), trimmed from a PD dataset of\nunscripted free-living videos in a home-like setting (Turn-REMAP). We also\ncurate a turning video dataset, Turn-H3.6M, from the public Human3.6M human\npose benchmark with 3D ground truth, to further validate our method. Previous\ngait research has primarily taken place in clinics or laboratories evaluating\nscripted gait outcomes, but this work focuses on real-world settings where\ncomplexities exist, such as baggy clothing and poor lighting. Due to\ndifficulties in obtaining accurate ground truth data in a free-living setting,\nwe quantise the angle into the nearest bin $45^\\circ$ based on the manual\nlabelling of expert clinicians. Our method achieves a turning calculation\naccuracy of 41.6%, a Mean Absolute Error (MAE) of 34.7{\\deg}, and a weighted\nprecision WPrec of 68.3% for Turn-REMAP. This is the first work to explore the\nuse of single monocular camera data to quantify turns by PD patients in a home\nsetting.\n","authors":["Qiushuo Cheng","Catherine Morgan","Arindam Sikdar","Alessandro Masullo","Alan Whone","Majid Mirmehdi"],"pdf_url":"https://arxiv.org/pdf/2408.08182v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.17695v2","updated":"2024-08-15T14:30:02Z","published":"2024-03-26T13:35:10Z","title":"PlainMamba: Improving Non-Hierarchical Mamba in Visual Recognition","summary":" We present PlainMamba: a simple non-hierarchical state space model (SSM)\ndesigned for general visual recognition. The recent Mamba model has shown how\nSSMs can be highly competitive with other architectures on sequential data and\ninitial attempts have been made to apply it to images. In this paper, we\nfurther adapt the selective scanning process of Mamba to the visual domain,\nenhancing its ability to learn features from two-dimensional images by (i) a\ncontinuous 2D scanning process that improves spatial continuity by ensuring\nadjacency of tokens in the scanning sequence, and (ii) direction-aware updating\nwhich enables the model to discern the spatial relations of tokens by encoding\ndirectional information. Our architecture is designed to be easy to use and\neasy to scale, formed by stacking identical PlainMamba blocks, resulting in a\nmodel with constant width throughout all layers. The architecture is further\nsimplified by removing the need for special tokens. We evaluate PlainMamba on a\nvariety of visual recognition tasks, achieving performance gains over previous\nnon-hierarchical models and is competitive with hierarchical alternatives. For\ntasks requiring high-resolution inputs, in particular, PlainMamba requires much\nless computing while maintaining high performance. Code and models are\navailable at: https://github.com/ChenhongyiYang/PlainMamba .\n","authors":["Chenhongyi Yang","Zehui Chen","Miguel Espinosa","Linus Ericsson","Zhenyu Wang","Jiaming Liu","Elliot J. 
Crowley"],"pdf_url":"https://arxiv.org/pdf/2403.17695v2.pdf","comment":"Accepted to BMVC 2024"},{"id":"http://arxiv.org/abs/2407.13322v3","updated":"2024-08-15T14:27:51Z","published":"2024-07-18T09:22:40Z","title":"Fully Test-Time rPPG Estimation via Synthetic Signal-Guided Feature\n Learning","summary":" Many remote photoplethysmography (rPPG) estimation models have achieved\npromising performance in the training domain but often fail to accurately\nestimate physiological signals or heart rates (HR) in the target domains.\nDomain generalization (DG) or domain adaptation (DA) techniques are therefore\nadopted during the offline training stage to adapt the model to either\nunobserved or observed target domains by utilizing all available source domain\ndata. However, in rPPG estimation problems, the adapted model usually\nencounters challenges in estimating target data with significant domain\nvariation. In contrast, Test-Time Adaptation (TTA) enables the model to\nadaptively estimate rPPG signals in various unseen domains by online adapting\nto unlabeled target data without referring to any source data. In this paper,\nwe first establish a new TTA-rPPG benchmark that encompasses various domain\ninformation and HR distributions to simulate the challenges encountered in\nreal-world rPPG estimation. Next, we propose a novel synthetic signal-guided\nrPPG estimation framework to address the forgetting issue during the TTA stage\nand to enhance the adaptation capability of the pre-trained rPPG model. To this\nend, we develop a synthetic signal-guided feature learning method by\nsynthesizing pseudo rPPG signals as pseudo ground truths to guide a conditional\ngenerator in generating latent rPPG features. In addition, we design an\neffective spectral-based entropy minimization technique to encourage the rPPG\nmodel to learn new target domain information. Both the generated rPPG features\nand synthesized rPPG signals prevent the rPPG model from overfitting to target\ndata and forgetting previously acquired knowledge, while also broadly covering\nvarious heart rate (HR) distributions. Our extensive experiments on the\nTTA-rPPG benchmark show that the proposed method achieves superior performance.\n","authors":["Pei-Kai Huang","Tzu-Hsien Chen","Ya-Ting Chan","Kuan-Wen Chen","Chiou-Ting Hsu"],"pdf_url":"https://arxiv.org/pdf/2407.13322v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08172v1","updated":"2024-08-15T14:19:13Z","published":"2024-08-15T14:19:13Z","title":"Towards flexible perception with visual memory","summary":" Training a neural network is a monolithic endeavor, akin to carving knowledge\ninto stone: once the process is completed, editing the knowledge in a network\nis nearly impossible, since all information is distributed across the network's\nweights. We here explore a simple, compelling alternative by marrying the\nrepresentational power of deep neural networks with the flexibility of a\ndatabase. Decomposing the task of image classification into image similarity\n(from a pre-trained embedding) and search (via fast nearest neighbor retrieval\nfrom a knowledge database), we build a simple and flexible visual memory that\nhas the following key capabilities: (1.) The ability to flexibly add data\nacross scales: from individual samples all the way to entire classes and\nbillion-scale data; (2.) The ability to remove data through unlearning and\nmemory pruning; (3.) An interpretable decision-mechanism on which we can\nintervene to control its behavior. 
Taken together, these capabilities\ncomprehensively demonstrate the benefits of an explicit visual memory. We hope\nthat it might contribute to a conversation on how knowledge should be\nrepresented in deep vision models -- beyond carving it in ``stone'' weights.\n","authors":["Robert Geirhos","Priyank Jaini","Austin Stone","Sourabh Medapati","Xi Yi","George Toderici","Abhijit Ogale","Jonathon Shlens"],"pdf_url":"https://arxiv.org/pdf/2408.08172v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.06542v3","updated":"2024-08-15T14:07:04Z","published":"2024-01-12T12:35:45Z","title":"Robustness-Aware 3D Object Detection in Autonomous Driving: A Review and\n Outlook","summary":" In the realm of modern autonomous driving, the perception system is\nindispensable for accurately assessing the state of the surrounding\nenvironment, thereby enabling informed prediction and planning. The key step to\nthis system is related to 3D object detection that utilizes vehicle-mounted\nsensors such as LiDAR and cameras to identify the size, the category, and the\nlocation of nearby objects. Despite the surge in 3D object detection methods\naimed at enhancing detection precision and efficiency, there is a gap in the\nliterature that systematically examines their resilience against environmental\nvariations, noise, and weather changes. This study emphasizes the importance of\nrobustness, alongside accuracy and latency, in evaluating perception systems\nunder practical scenarios. Our work presents an extensive survey of\ncamera-only, LiDAR-only, and multi-modal 3D object detection algorithms,\nthoroughly evaluating their trade-off between accuracy, latency, and\nrobustness, particularly on datasets like KITTI-C and nuScenes-C to ensure fair\ncomparisons. Among these, multi-modal 3D detection approaches exhibit superior\nrobustness, and a novel taxonomy is introduced to reorganize the literature for\nenhanced clarity. This survey aims to offer a more practical perspective on the\ncurrent capabilities and the constraints of 3D object detection algorithms in\nreal-world applications, thus steering future research towards\nrobustness-centric advancements.\n","authors":["Ziying Song","Lin Liu","Feiyang Jia","Yadan Luo","Guoxin Zhang","Lei Yang","Li Wang","Caiyan Jia"],"pdf_url":"https://arxiv.org/pdf/2401.06542v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.21783v2","updated":"2024-08-15T13:57:20Z","published":"2024-07-31T17:54:27Z","title":"The Llama 3 Herd of Models","summary":" Modern artificial intelligence (AI) systems are powered by foundation models.\nThis paper presents a new set of foundation models, called Llama 3. It is a\nherd of language models that natively support multilinguality, coding,\nreasoning, and tool usage. Our largest model is a dense Transformer with 405B\nparameters and a context window of up to 128K tokens. This paper presents an\nextensive empirical evaluation of Llama 3. We find that Llama 3 delivers\ncomparable quality to leading language models such as GPT-4 on a plethora of\ntasks. We publicly release Llama 3, including pre-trained and post-trained\nversions of the 405B parameter language model and our Llama Guard 3 model for\ninput and output safety. The paper also presents the results of experiments in\nwhich we integrate image, video, and speech capabilities into Llama 3 via a\ncompositional approach. We observe this approach performs competitively with\nthe state-of-the-art on image, video, and speech recognition tasks. 
The\nresulting models are not yet being broadly released as they are still under\ndevelopment.\n","authors":["Abhimanyu Dubey","Abhinav Jauhri","Abhinav Pandey","Abhishek Kadian","Ahmad Al-Dahle","Aiesha Letman","Akhil Mathur","Alan Schelten","Amy Yang","Angela Fan","Anirudh Goyal","Anthony Hartshorn","Aobo Yang","Archi Mitra","Archie Sravankumar","Artem Korenev","Arthur Hinsvark","Arun Rao","Aston Zhang","Aurelien Rodriguez","Austen Gregerson","Ava Spataru","Baptiste Roziere","Bethany Biron","Binh Tang","Bobbie Chern","Charlotte Caucheteux","Chaya Nayak","Chloe Bi","Chris Marra","Chris McConnell","Christian Keller","Christophe Touret","Chunyang Wu","Corinne Wong","Cristian Canton Ferrer","Cyrus Nikolaidis","Damien Allonsius","Daniel Song","Danielle Pintz","Danny Livshits","David Esiobu","Dhruv Choudhary","Dhruv Mahajan","Diego Garcia-Olano","Diego Perino","Dieuwke Hupkes","Egor Lakomkin","Ehab AlBadawy","Elina Lobanova","Emily Dinan","Eric Michael Smith","Filip Radenovic","Frank Zhang","Gabriel Synnaeve","Gabrielle Lee","Georgia Lewis Anderson","Graeme Nail","Gregoire Mialon","Guan Pang","Guillem Cucurell","Hailey Nguyen","Hannah Korevaar","Hu Xu","Hugo Touvron","Iliyan Zarov","Imanol Arrieta Ibarra","Isabel Kloumann","Ishan Misra","Ivan Evtimov","Jade Copet","Jaewon Lee","Jan Geffert","Jana Vranes","Jason Park","Jay Mahadeokar","Jeet Shah","Jelmer van der Linde","Jennifer Billock","Jenny Hong","Jenya Lee","Jeremy Fu","Jianfeng Chi","Jianyu Huang","Jiawen Liu","Jie Wang","Jiecao Yu","Joanna Bitton","Joe Spisak","Jongsoo Park","Joseph Rocca","Joshua Johnstun","Joshua Saxe","Junteng Jia","Kalyan Vasuden Alwala","Kartikeya Upasani","Kate Plawiak","Ke Li","Kenneth Heafield","Kevin Stone","Khalid El-Arini","Krithika Iyer","Kshitiz Malik","Kuenley Chiu","Kunal Bhalla","Lauren Rantala-Yeary","Laurens van der Maaten","Lawrence Chen","Liang Tan","Liz Jenkins","Louis Martin","Lovish Madaan","Lubo Malo","Lukas Blecher","Lukas Landzaat","Luke de Oliveira","Madeline Muzzi","Mahesh Pasupuleti","Mannat Singh","Manohar Paluri","Marcin Kardas","Mathew Oldham","Mathieu Rita","Maya Pavlova","Melanie Kambadur","Mike Lewis","Min Si","Mitesh Kumar Singh","Mona Hassan","Naman Goyal","Narjes Torabi","Nikolay Bashlykov","Nikolay Bogoychev","Niladri Chatterji","Olivier Duchenne","Onur Çelebi","Patrick Alrassy","Pengchuan Zhang","Pengwei Li","Petar Vasic","Peter Weng","Prajjwal Bhargava","Pratik Dubal","Praveen Krishnan","Punit Singh Koura","Puxin Xu","Qing He","Qingxiao Dong","Ragavan Srinivasan","Raj Ganapathy","Ramon Calderer","Ricardo Silveira Cabral","Robert Stojnic","Roberta Raileanu","Rohit Girdhar","Rohit Patel","Romain Sauvestre","Ronnie Polidoro","Roshan Sumbaly","Ross Taylor","Ruan Silva","Rui Hou","Rui Wang","Saghar Hosseini","Sahana Chennabasappa","Sanjay Singh","Sean Bell","Seohyun Sonia Kim","Sergey Edunov","Shaoliang Nie","Sharan Narang","Sharath Raparthy","Sheng Shen","Shengye Wan","Shruti Bhosale","Shun Zhang","Simon Vandenhende","Soumya Batra","Spencer Whitman","Sten Sootla","Stephane Collot","Suchin Gururangan","Sydney Borodinsky","Tamar Herman","Tara Fowler","Tarek Sheasha","Thomas Georgiou","Thomas Scialom","Tobias Speckbacher","Todor Mihaylov","Tong Xiao","Ujjwal Karn","Vedanuj Goswami","Vibhor Gupta","Vignesh Ramanathan","Viktor Kerkez","Vincent Gonguet","Virginie Do","Vish Vogeti","Vladan Petrovic","Weiwei Chu","Wenhan Xiong","Wenyin Fu","Whitney Meers","Xavier Martinet","Xiaodong Wang","Xiaoqing Ellen Tan","Xinfeng Xie","Xuchao Jia","Xuewei Wang","Yaelle Goldschlag","Yashesh Gaur","Yasmine 
Babaei","Yi Wen","Yiwen Song","Yuchen Zhang","Yue Li","Yuning Mao","Zacharie Delpierre Coudert","Zheng Yan","Zhengxing Chen","Zoe Papakipos","Aaditya Singh","Aaron Grattafiori","Abha Jain","Adam Kelsey","Adam Shajnfeld","Adithya Gangidi","Adolfo Victoria","Ahuva Goldstand","Ajay Menon","Ajay Sharma","Alex Boesenberg","Alex Vaughan","Alexei Baevski","Allie Feinstein","Amanda Kallet","Amit Sangani","Anam Yunus","Andrei Lupu","Andres Alvarado","Andrew Caples","Andrew Gu","Andrew Ho","Andrew Poulton","Andrew Ryan","Ankit Ramchandani","Annie Franco","Aparajita Saraf","Arkabandhu Chowdhury","Ashley Gabriel","Ashwin Bharambe","Assaf Eisenman","Azadeh Yazdan","Beau James","Ben Maurer","Benjamin Leonhardi","Bernie Huang","Beth Loyd","Beto De Paola","Bhargavi Paranjape","Bing Liu","Bo Wu","Boyu Ni","Braden Hancock","Bram Wasti","Brandon Spence","Brani Stojkovic","Brian Gamido","Britt Montalvo","Carl Parker","Carly Burton","Catalina Mejia","Changhan Wang","Changkyu Kim","Chao Zhou","Chester Hu","Ching-Hsiang Chu","Chris Cai","Chris Tindal","Christoph Feichtenhofer","Damon Civin","Dana Beaty","Daniel Kreymer","Daniel Li","Danny Wyatt","David Adkins","David Xu","Davide Testuggine","Delia David","Devi Parikh","Diana Liskovich","Didem Foss","Dingkang Wang","Duc Le","Dustin Holland","Edward Dowling","Eissa Jamil","Elaine Montgomery","Eleonora Presani","Emily Hahn","Emily Wood","Erik Brinkman","Esteban Arcaute","Evan Dunbar","Evan Smothers","Fei Sun","Felix Kreuk","Feng Tian","Firat Ozgenel","Francesco Caggioni","Francisco Guzmán","Frank Kanayet","Frank Seide","Gabriela Medina Florez","Gabriella Schwarz","Gada Badeer","Georgia Swee","Gil Halpern","Govind Thattai","Grant Herman","Grigory Sizov"," Guangyi"," Zhang","Guna Lakshminarayanan","Hamid Shojanazeri","Han Zou","Hannah Wang","Hanwen Zha","Haroun Habeeb","Harrison Rudolph","Helen Suk","Henry Aspegren","Hunter Goldman","Ibrahim Damlaj","Igor Molybog","Igor Tufanov","Irina-Elena Veliche","Itai Gat","Jake Weissman","James Geboski","James Kohli","Japhet Asher","Jean-Baptiste Gaya","Jeff Marcus","Jeff Tang","Jennifer Chan","Jenny Zhen","Jeremy Reizenstein","Jeremy Teboul","Jessica Zhong","Jian Jin","Jingyi Yang","Joe Cummings","Jon Carvill","Jon Shepard","Jonathan McPhie","Jonathan Torres","Josh Ginsburg","Junjie Wang","Kai Wu","Kam Hou U","Karan Saxena","Karthik Prasad","Kartikay Khandelwal","Katayoun Zand","Kathy Matosich","Kaushik Veeraraghavan","Kelly Michelena","Keqian Li","Kun Huang","Kunal Chawla","Kushal Lakhotia","Kyle Huang","Lailin Chen","Lakshya Garg","Lavender A","Leandro Silva","Lee Bell","Lei Zhang","Liangpeng Guo","Licheng Yu","Liron Moshkovich","Luca Wehrstedt","Madian Khabsa","Manav Avalani","Manish Bhatt","Maria Tsimpoukelli","Martynas Mankus","Matan Hasson","Matthew Lennie","Matthias Reso","Maxim Groshev","Maxim Naumov","Maya Lathi","Meghan Keneally","Michael L. 
Seltzer","Michal Valko","Michelle Restrepo","Mihir Patel","Mik Vyatskov","Mikayel Samvelyan","Mike Clark","Mike Macey","Mike Wang","Miquel Jubert Hermoso","Mo Metanat","Mohammad Rastegari","Munish Bansal","Nandhini Santhanam","Natascha Parks","Natasha White","Navyata Bawa","Nayan Singhal","Nick Egebo","Nicolas Usunier","Nikolay Pavlovich Laptev","Ning Dong","Ning Zhang","Norman Cheng","Oleg Chernoguz","Olivia Hart","Omkar Salpekar","Ozlem Kalinli","Parkin Kent","Parth Parekh","Paul Saab","Pavan Balaji","Pedro Rittner","Philip Bontrager","Pierre Roux","Piotr Dollar","Polina Zvyagina","Prashant Ratanchandani","Pritish Yuvraj","Qian Liang","Rachad Alao","Rachel Rodriguez","Rafi Ayub","Raghotham Murthy","Raghu Nayani","Rahul Mitra","Raymond Li","Rebekkah Hogan","Robin Battey","Rocky Wang","Rohan Maheswari","Russ Howes","Ruty Rinott","Sai Jayesh Bondu","Samyak Datta","Sara Chugh","Sara Hunt","Sargun Dhillon","Sasha Sidorov","Satadru Pan","Saurabh Verma","Seiji Yamamoto","Sharadh Ramaswamy","Shaun Lindsay","Shaun Lindsay","Sheng Feng","Shenghao Lin","Shengxin Cindy Zha","Shiva Shankar","Shuqiang Zhang","Shuqiang Zhang","Sinong Wang","Sneha Agarwal","Soji Sajuyigbe","Soumith Chintala","Stephanie Max","Stephen Chen","Steve Kehoe","Steve Satterfield","Sudarshan Govindaprasad","Sumit Gupta","Sungmin Cho","Sunny Virk","Suraj Subramanian","Sy Choudhury","Sydney Goldman","Tal Remez","Tamar Glaser","Tamara Best","Thilo Kohler","Thomas Robinson","Tianhe Li","Tianjun Zhang","Tim Matthews","Timothy Chou","Tzook Shaked","Varun Vontimitta","Victoria Ajayi","Victoria Montanez","Vijai Mohan","Vinay Satish Kumar","Vishal Mangla","Vítor Albiero","Vlad Ionescu","Vlad Poenaru","Vlad Tiberiu Mihailescu","Vladimir Ivanov","Wei Li","Wenchen Wang","Wenwen Jiang","Wes Bouaziz","Will Constable","Xiaocheng Tang","Xiaofang Wang","Xiaojian Wu","Xiaolan Wang","Xide Xia","Xilun Wu","Xinbo Gao","Yanjun Chen","Ye Hu","Ye Jia","Ye Qi","Yenda Li","Yilin Zhang","Ying Zhang","Yossi Adi","Youngjin Nam"," Yu"," Wang","Yuchen Hao","Yundi Qian","Yuzi He","Zach Rait","Zachary DeVito","Zef Rosnbrick","Zhaoduo Wen","Zhenyu Yang","Zhiwei Zhao"],"pdf_url":"https://arxiv.org/pdf/2407.21783v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.16927v3","updated":"2024-08-15T13:55:30Z","published":"2023-06-29T14:17:24Z","title":"End-to-end Autonomous Driving: Challenges and Frontiers","summary":" The autonomous driving community has witnessed a rapid growth in approaches\nthat embrace an end-to-end algorithm framework, utilizing raw sensor input to\ngenerate vehicle motion plans, instead of concentrating on individual tasks\nsuch as detection and motion prediction. End-to-end systems, in comparison to\nmodular pipelines, benefit from joint feature optimization for perception and\nplanning. This field has flourished due to the availability of large-scale\ndatasets, closed-loop evaluation, and the increasing need for autonomous\ndriving algorithms to perform effectively in challenging scenarios. In this\nsurvey, we provide a comprehensive analysis of more than 270 papers, covering\nthe motivation, roadmap, methodology, challenges, and future trends in\nend-to-end autonomous driving. We delve into several critical challenges,\nincluding multi-modality, interpretability, causal confusion, robustness, and\nworld models, amongst others. Additionally, we discuss current advancements in\nfoundation models and visual pre-training, as well as how to incorporate these\ntechniques within the end-to-end driving framework. 
We maintain an active\nrepository that contains up-to-date literature and open-source projects at\nhttps://github.com/OpenDriveLab/End-to-end-Autonomous-Driving.\n","authors":["Li Chen","Penghao Wu","Kashyap Chitta","Bernhard Jaeger","Andreas Geiger","Hongyang Li"],"pdf_url":"https://arxiv.org/pdf/2306.16927v3.pdf","comment":"Accepted by IEEE TPAMI"},{"id":"http://arxiv.org/abs/2408.06970v2","updated":"2024-08-15T13:43:10Z","published":"2024-08-13T15:27:43Z","title":"Prompt-Based Segmentation at Multiple Resolutions and Lighting\n Conditions using Segment Anything Model 2","summary":" This paper provides insight into the effectiveness of zero-shot,\nprompt-based, Segment Anything Model (SAM), and its updated version, SAM 2, and\nthe non-promptable, conventional convolutional network (CNN), in segmenting\nsolar panels, in RGB aerial imagery, across lighting conditions, spatial\nresolutions, and prompt strategies. SAM 2 demonstrates improvements over SAM,\nparticularly in sub-optimal lighting conditions when prompted by points. Both\nSAMs, prompted by user-box, outperformed CNN in all scenarios. Additionally,\nYOLOv9 prompting outperformed user points prompting. In high-resolution\nimagery, both in optimal and sub-optimal lighting conditions, Eff-UNet\noutperformed both SAM models prompted by YOLOv9 boxes, positioning Eff-UNet as\nthe appropriate model for automatic segmentation in high-resolution data. In\nlow-resolution data, user box prompts were found crucial to achieve a\nreasonable performance. This paper provides details on strengths and\nlimitations of each model and outlines robustness of user prompted image\nsegmentation models in inconsistent resolution and lighting conditions of\nremotely sensed data.\n","authors":["Osher Rafaeli","Tal Svoray","Roni Blushtein-Livnon","Ariel Nahlieli"],"pdf_url":"https://arxiv.org/pdf/2408.06970v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.08206v3","updated":"2024-08-15T13:38:05Z","published":"2023-10-12T10:51:23Z","title":"Long-Tailed Classification Based on Coarse-Grained Leading Forest and\n Multi-Center Loss","summary":" Long-tailed (LT) classification is an unavoidable and challenging problem in\nthe real world. Most existing long-tailed classification methods focus only on\nsolving the class-wise imbalance while ignoring the attribute-wise imbalance.\nThe deviation of a classification model is caused by both class-wise and\nattribute-wise imbalance. Due to the fact that attributes are implicit in most\ndatasets and the combination of attributes is complex, attribute-wise imbalance\nis more difficult to handle. For this purpose, we propose a novel long-tailed\nclassification framework, aiming to build a multi-granularity classification\nmodel by means of invariant feature learning. This method first unsupervisedly\nconstructs a Coarse-Grained Leading Forest (CLF) to better characterize the distribution\nof attributes within a class. Depending on the distribution of attributes, one\ncan customize suitable sampling strategies to construct different imbalanced\ndatasets. We then introduce multi-center loss (MCL) that aims to gradually\neliminate confusing attributes during the feature learning process. The proposed\nframework does not necessarily couple to a specific LT classification model\nstructure and can be integrated with any existing LT method as an independent\ncomponent. 
Extensive experiments show that our approach achieves\nstate-of-the-art performance on both existing benchmarks ImageNet-GLT and\nMSCOCO-GLT and can improve the performance of existing LT methods. Our codes\nare available on GitHub: \\url{https://github.com/jinyery/cognisance}\n","authors":["Jinye Yang","Ji Xu","Di Wu","Jianhang Tang","Shaobo Li","Guoyin Wang"],"pdf_url":"https://arxiv.org/pdf/2310.08206v3.pdf","comment":"This is another research work to apply leading tree structure along\n with deep learning architecture, aiming to deal with attribute-wise long-tail\n distribution within class"},{"id":"http://arxiv.org/abs/2405.01040v2","updated":"2024-08-15T13:36:43Z","published":"2024-05-02T06:52:49Z","title":"Few Shot Class Incremental Learning using Vision-Language models","summary":" Recent advancements in deep learning have demonstrated remarkable performance\ncomparable to human capabilities across various supervised computer vision\ntasks. However, the prevalent assumption of having an extensive pool of\ntraining data encompassing all classes prior to model training often diverges\nfrom real-world scenarios, where limited data availability for novel classes is\nthe norm. The challenge emerges in seamlessly integrating new classes with few\nsamples into the training data, demanding the model to adeptly accommodate\nthese additions without compromising its performance on base classes. To\naddress this exigency, the research community has introduced several solutions\nunder the realm of few-shot class incremental learning (FSCIL).\n In this study, we introduce an innovative FSCIL framework that utilizes a\nlanguage regularizer and a subspace regularizer. During base training, the\nlanguage regularizer helps incorporate semantic information extracted from a\nVision-Language model. The subspace regularizer helps in facilitating the\nmodel's acquisition of nuanced connections between image and text semantics\ninherent to base classes during incremental training. Our proposed framework\nnot only empowers the model to embrace novel classes with limited data, but\nalso ensures the preservation of performance on base classes. To substantiate\nthe efficacy of our approach, we conduct comprehensive experiments on three\ndistinct FSCIL benchmarks, where our framework attains state-of-the-art\nperformance.\n","authors":["Anurag Kumar","Chinmay Bharti","Saikat Dutta","Srikrishna Karanam","Biplab Banerjee"],"pdf_url":"https://arxiv.org/pdf/2405.01040v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08149v1","updated":"2024-08-15T13:35:59Z","published":"2024-08-15T13:35:59Z","title":"Unsupervised Variational Translator for Bridging Image Restoration and\n High-Level Vision Tasks","summary":" Recent research tries to extend image restoration capabilities from human\nperception to machine perception, thereby enhancing the performance of\nhigh-level vision tasks in degraded environments. These methods, primarily\nbased on supervised learning, typically involve the retraining of restoration\nnetworks or high-level vision networks. However, collecting paired data in\nreal-world scenarios and retraining large-scale models are challenging. To this\nend, we propose an unsupervised learning method called \\textbf{Va}riational\n\\textbf{T}ranslator (VaT), which does not require retraining existing\nrestoration and high-level vision networks. Instead, it establishes a\nlightweight network that serves as an intermediate bridge between them. 
By\nvariational inference, VaT approximates the joint distribution of restoration\noutput and high-level vision input, dividing the optimization objective into\npreserving content and maximizing marginal likelihood associated with\nhigh-level vision tasks. By cleverly leveraging self-training paradigms, VaT\nachieves the above optimization objective without requiring labels. As a\nresult, the translated images maintain a close resemblance to their original\ncontent while also demonstrating exceptional performance on high-level vision\ntasks. Extensive experiments in dehazing and low-light enhancement for\ndetection and classification show the superiority of our method over other\nstate-of-the-art unsupervised counterparts, even significantly surpassing\nsupervised methods in some complex real-world scenarios.\n","authors":["Jiawei Wu","Zhi Jin"],"pdf_url":"https://arxiv.org/pdf/2408.08149v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08143v1","updated":"2024-08-15T13:26:13Z","published":"2024-08-15T13:26:13Z","title":"Unlearnable Examples Detection via Iterative Filtering","summary":" Deep neural networks are proven to be vulnerable to data poisoning attacks.\nRecently, a specific type of data poisoning attack known as availability\nattacks has led to the failure of data utilization for model learning by adding\nimperceptible perturbations to images. Consequently, it is quite beneficial and\nchallenging to detect poisoned samples, also known as Unlearnable Examples\n(UEs), from a mixed dataset. In response, we propose an Iterative Filtering\napproach for UEs identification. This method leverages the distinction between\nthe inherent semantic mapping rules and shortcuts, without the need for any\nadditional information. We verify that when training a classifier on a mixed\ndataset containing both UEs and clean data, the model tends to quickly adapt to\nthe UEs compared to the clean data. Due to the accuracy gaps between training\nwith clean/poisoned samples, we employ a model to misclassify clean samples\nwhile correctly identifying the poisoned ones. The incorporation of additional\nclasses and iterative refinement enhances the model's ability to differentiate\nbetween clean and poisoned samples. Extensive experiments demonstrate the\nsuperiority of our method over state-of-the-art detection approaches across\nvarious attacks, datasets, and poison ratios, significantly reducing the Half\nTotal Error Rate (HTER) compared to existing methods.\n","authors":["Yi Yu","Qichen Zheng","Siyuan Yang","Wenhan Yang","Jun Liu","Shijian Lu","Yap-Peng Tan","Kwok-Yan Lam","Alex Kot"],"pdf_url":"https://arxiv.org/pdf/2408.08143v1.pdf","comment":"Accepted by ICANN 2024"},{"id":"http://arxiv.org/abs/2311.17460v4","updated":"2024-08-15T13:16:20Z","published":"2023-11-29T09:02:07Z","title":"Capturing Human Motion from Monocular Images in World Space with\n Weak-supervised Calibration","summary":" Previous methods for 3D human motion recovery from monocular images often\nfall short due to reliance on camera coordinates, leading to inaccuracies in\nreal-world applications where complex shooting conditions are prevalent. The\nlimited availability and diversity of focal length labels further exacerbate\nmisalignment issues in reconstructed 3D human bodies. To address these\nchallenges, we introduce W-HMR, a weak-supervised calibration method that\npredicts \"reasonable\" focal lengths based on body distortion information,\neliminating the need for precise focal length labels. 
Our approach enhances 2D\nsupervision precision and recovery accuracy. Additionally, we present the\nOrientCorrect module, which corrects body orientation for plausible\nreconstructions in world space, avoiding the error accumulation associated with\ninaccurate camera rotation predictions. Our contributions include a novel\nweak-supervised camera calibration technique, an effective orientation\ncorrection module, and a decoupling strategy that significantly improves the\ngeneralizability and accuracy of human motion recovery in both camera and world\ncoordinates. The robustness of W-HMR is validated through extensive experiments\non various datasets, showcasing its superiority over existing methods. Codes\nand demos have been released on the project page\nhttps://yw0208.github.io/w-hmr/.\n","authors":["Wei Yao","Hongwen Zhang","Yunlian Sun","Jinhui Tang"],"pdf_url":"https://arxiv.org/pdf/2311.17460v4.pdf","comment":"Project Page: https://yw0208.github.io/w-hmr/"},{"id":"http://arxiv.org/abs/2408.08134v1","updated":"2024-08-15T13:09:37Z","published":"2024-08-15T13:09:37Z","title":"CorrAdaptor: Adaptive Local Context Learning for Correspondence Pruning","summary":" In the fields of computer vision and robotics, accurate pixel-level\ncorrespondences are essential for enabling advanced tasks such as\nstructure-from-motion and simultaneous localization and mapping. Recent\ncorrespondence pruning methods usually focus on learning local consistency\nthrough k-nearest neighbors, which makes it difficult to capture robust context\nfor each correspondence. We propose CorrAdaptor, a novel architecture that\nintroduces a dual-branch structure capable of adaptively adjusting local\ncontexts through both explicit and implicit local graph learning. Specifically,\nthe explicit branch uses KNN-based graphs tailored for initial neighborhood\nidentification, while the implicit branch leverages a learnable matrix to\nsoftly assign neighbors and adaptively expand the local context scope,\nsignificantly enhancing the model's robustness and adaptability to complex\nimage variations. Moreover, we design a motion injection module to integrate\nmotion consistency into the network to suppress the impact of outliers and\nrefine local context learning, resulting in substantial performance\nimprovements. The experimental results on extensive correspondence-based tasks\nindicate that our CorrAdaptor achieves state-of-the-art performance both\nqualitatively and quantitatively. The code and pre-trained models are available\nat https://github.com/TaoWangzj/CorrAdaptor.\n","authors":["Wei Zhu","Yicheng Liu","Yuping He","Tangfei Liao","Kang Zheng","Xiaoqiu Xu","Tao Wang","Tong Lu"],"pdf_url":"https://arxiv.org/pdf/2408.08134v1.pdf","comment":"8 pages, 4 figures, accepted by ECAI"},{"id":"http://arxiv.org/abs/2404.11357v2","updated":"2024-08-15T13:02:41Z","published":"2024-04-17T13:12:14Z","title":"Detector Collapse: Physical-World Backdooring Object Detection to\n Catastrophic Overload or Blindness in Autonomous Driving","summary":" Object detection tasks, crucial in safety-critical systems like autonomous\ndriving, focus on pinpointing object locations. These detectors are known to be\nsusceptible to backdoor attacks. However, existing backdoor techniques have\nprimarily been adapted from classification tasks, overlooking deeper\nvulnerabilities specific to object detection. 
This paper is dedicated to\nbridging this gap by introducing Detector Collapse (DC), a brand-new backdoor\nattack paradigm tailored for object detection. DC is designed to instantly\nincapacitate detectors (i.e., severely impairing the detector's performance and\nculminating in a denial-of-service). To this end, we develop two innovative\nattack schemes: Sponge for triggering widespread misidentifications and\nBlinding for rendering objects invisible. Remarkably, we introduce a novel\npoisoning strategy exploiting natural objects, enabling DC to act as a\npractical backdoor in real-world environments. Our experiments on different\ndetectors across several benchmarks show a significant improvement\n($\\sim$10\\%-60\\% absolute and $\\sim$2-7$\\times$ relative) in attack efficacy\nover state-of-the-art attacks.\n","authors":["Hangtao Zhang","Shengshan Hu","Yichen Wang","Leo Yu Zhang","Ziqi Zhou","Xianlong Wang","Yanjun Zhang","Chao Chen"],"pdf_url":"https://arxiv.org/pdf/2404.11357v2.pdf","comment":"Accepted to IJCAI 2024"},{"id":"http://arxiv.org/abs/2403.11192v2","updated":"2024-08-15T12:52:13Z","published":"2024-03-17T12:38:58Z","title":"Self-Supervised Video Desmoking for Laparoscopic Surgery","summary":" Due to the difficulty of collecting real paired data, most existing desmoking\nmethods train the models by synthesizing smoke, generalizing poorly to real\nsurgical scenarios. Although a few works have explored single-image real-world\ndesmoking in unpaired learning manners, they still encounter challenges in\nhandling dense smoke. In this work, we address these issues together by\nintroducing the self-supervised surgery video desmoking (SelfSVD). On the one\nhand, we observe that the frame captured before the activation of high-energy\ndevices is generally clear (named pre-smoke frame, PS frame), thus it can serve\nas supervision for other smoky frames, making real-world self-supervised video\ndesmoking practically feasible. On the other hand, in order to enhance the\ndesmoking performance, we further feed the valuable information from PS frame\ninto models, where a masking strategy and a regularization term are presented\nto avoid trivial solutions. In addition, we construct a real surgery video\ndataset for desmoking, which covers a variety of smoky scenes. Extensive\nexperiments on the dataset show that our SelfSVD can remove smoke more\neffectively and efficiently while recovering more photo-realistic details than\nthe state-of-the-art methods. The dataset, codes, and pre-trained models are\navailable at \\url{https://github.com/ZcsrenlongZ/SelfSVD}.\n","authors":["Renlong Wu","Zhilu Zhang","Shuohao Zhang","Longfei Gou","Haobin Chen","Lei Zhang","Hao Chen","Wangmeng Zuo"],"pdf_url":"https://arxiv.org/pdf/2403.11192v2.pdf","comment":"27 pages"},{"id":"http://arxiv.org/abs/2403.16481v2","updated":"2024-08-15T12:52:11Z","published":"2024-03-25T07:07:50Z","title":"REFRAME: Reflective Surface Real-Time Rendering for Mobile Devices","summary":" This work tackles the challenging task of achieving real-time novel view\nsynthesis for reflective surfaces across various scenes. Existing real-time\nrendering methods, especially those based on meshes, often have subpar\nperformance in modeling surfaces with rich view-dependent appearances. Our key\nidea lies in leveraging meshes for rendering acceleration while incorporating a\nnovel approach to parameterize view-dependent information. 
We decompose the\ncolor into diffuse and specular, and model the specular color in the reflected\ndirection based on a neural environment map. Our experiments demonstrate that\nour method achieves comparable reconstruction quality for highly reflective\nsurfaces compared to state-of-the-art offline methods, while also efficiently\nenabling real-time rendering on edge devices such as smartphones.\n","authors":["Chaojie Ji","Yufeng Li","Yiyi Liao"],"pdf_url":"https://arxiv.org/pdf/2403.16481v2.pdf","comment":"ECCV 2024 accepted. Project Page: https://xdimlab.github.io/REFRAME/"},{"id":"http://arxiv.org/abs/2408.08125v1","updated":"2024-08-15T12:51:57Z","published":"2024-08-15T12:51:57Z","title":"Category-Prompt Refined Feature Learning for Long-Tailed Multi-Label\n Image Classification","summary":" Real-world data consistently exhibits a long-tailed distribution, often\nspanning multiple categories. This complexity underscores the challenge of\ncontent comprehension, particularly in scenarios requiring Long-Tailed\nMulti-Label image Classification (LTMLC). In such contexts, imbalanced data\ndistribution and multi-object recognition pose significant hurdles. To address\nthis issue, we propose a novel and effective approach for LTMLC, termed\nCategory-Prompt Refined Feature Learning (CPRFL), utilizing semantic\ncorrelations between different categories and decoupling category-specific\nvisual representations for each category. Specifically, CPRFL initializes\ncategory-prompts from the pretrained CLIP's embeddings and decouples\ncategory-specific visual representations through interaction with visual\nfeatures, thereby facilitating the establishment of semantic correlations\nbetween the head and tail classes. To mitigate the visual-semantic domain bias,\nwe design a progressive Dual-Path Back-Propagation mechanism to refine the\nprompts by progressively incorporating context-related visual information into\nprompts. Simultaneously, the refinement process facilitates the progressive\npurification of the category-specific visual representations under the guidance\nof the refined prompts. Furthermore, taking into account the negative-positive\nsample imbalance, we adopt the Asymmetric Loss as our optimization objective to\nsuppress negative samples across all classes and potentially enhance the\nhead-to-tail recognition performance. We validate the effectiveness of our\nmethod on two LTMLC benchmarks and extensive experiments demonstrate the\nsuperiority of our work over baselines.\n The code is available at https://github.com/jiexuanyan/CPRFL.\n","authors":["Jiexuan Yan","Sheng Huang","Nankun Mu","Luwen Huangfu","Bo Liu"],"pdf_url":"https://arxiv.org/pdf/2408.08125v1.pdf","comment":"Accepted by ACM MM 2024"},{"id":"http://arxiv.org/abs/2405.00514v2","updated":"2024-08-15T12:48:12Z","published":"2024-05-01T13:49:09Z","title":"Get Your Embedding Space in Order: Domain-Adaptive Regression for Forest\n Monitoring","summary":" Image-level regression is an important task in Earth observation, where\nvisual domain and label shifts are a core challenge hampering generalization.\nHowever, cross-domain regression within remote sensing data remains\nunderstudied due to the absence of suited datasets. We introduce a new dataset\nwith aerial and satellite imagery in five countries with three forest-related\nregression tasks. 
To match real-world applicative interests, we compare methods\nthrough a restrictive setup where no prior on the target domain is available\nduring training, and models are adapted with limited information during\ntesting. Building on the assumption that ordered relationships generalize\nbetter, we propose manifold diffusion for regression as a strong baseline for\ntransduction in low-data regimes. Our comparison highlights the comparative\nadvantages of inductive and transductive methods in cross-domain regression.\n","authors":["Sizhuo Li","Dimitri Gominski","Martin Brandt","Xiaoye Tong","Philippe Ciais"],"pdf_url":"https://arxiv.org/pdf/2405.00514v2.pdf","comment":"Updated with review comments"},{"id":"http://arxiv.org/abs/2404.04908v2","updated":"2024-08-15T12:32:04Z","published":"2024-04-07T10:28:01Z","title":"Dual-Camera Smooth Zoom on Mobile Phones","summary":" When zooming between dual cameras on a mobile, noticeable jumps in geometric\ncontent and image color occur in the preview, inevitably affecting the user's\nzoom experience. In this work, we introduce a new task, i.e., dual-camera smooth\nzoom (DCSZ), to achieve a smooth zoom preview. The frame interpolation (FI)\ntechnique is a potential solution but struggles with ground-truth collection.\nTo address the issue, we suggest a data factory solution where continuous\nvirtual cameras are assembled to generate DCSZ data by rendering reconstructed\n3D models of the scene. In particular, we propose a novel dual-camera smooth\nzoom Gaussian Splatting (ZoomGS), where a camera-specific encoding is\nintroduced to construct a specific 3D model for each virtual camera. With the\nproposed data factory, we construct a synthetic dataset for DCSZ, and we\nutilize it to fine-tune FI models. In addition, we collect real-world dual-zoom\nimages without ground-truth for evaluation. Extensive experiments are conducted\nwith multiple FI methods. The results show that the fine-tuned FI models\nachieve a significant performance improvement over the original ones on the DCSZ\ntask. The datasets, codes, and pre-trained models are available at\nhttps://github.com/ZcsrenlongZ/ZoomGS.\n","authors":["Renlong Wu","Zhilu Zhang","Yu Yang","Wangmeng Zuo"],"pdf_url":"https://arxiv.org/pdf/2404.04908v2.pdf","comment":"24 pages"},{"id":"http://arxiv.org/abs/2407.15706v6","updated":"2024-08-15T12:25:39Z","published":"2024-07-22T15:16:47Z","title":"Multi-Modality Co-Learning for Efficient Skeleton-based Action\n Recognition","summary":" Skeleton-based action recognition has garnered significant attention due to\nthe utilization of concise and resilient skeletons. Nevertheless, the absence\nof detailed body information in skeletons restricts performance, while other\nmultimodal methods require substantial inference resources and are inefficient\nwhen using multimodal data during both training and inference stages. To\naddress this and fully harness the complementary multimodal features, we\npropose a novel multi-modality co-learning (MMCL) framework by leveraging the\nmultimodal large language models (LLMs) as auxiliary networks for efficient\nskeleton-based action recognition, which engages in multi-modality co-learning\nduring the training stage and keeps efficiency by employing only concise\nskeletons in inference. 
Our MMCL framework primarily consists of two modules.\nFirst, the Feature Alignment Module (FAM) extracts rich RGB features from video\nframes and aligns them with global skeleton features via contrastive learning.\nSecond, the Feature Refinement Module (FRM) uses RGB images with temporal\ninformation and text instruction to generate instructive features based on the\npowerful generalization of multimodal LLMs. These instructive text features\nwill further refine the classification scores and the refined scores will\nenhance the model's robustness and generalization in a manner similar to soft\nlabels. Extensive experiments on NTU RGB+D, NTU RGB+D 120 and Northwestern-UCLA\nbenchmarks consistently verify the effectiveness of our MMCL, which outperforms\nthe existing skeleton-based action recognition methods. Meanwhile, experiments\non UTD-MHAD and SYSU-Action datasets demonstrate the commendable generalization\nof our MMCL in zero-shot and domain-adaptive action recognition. Our code is\npublicly available at: https://github.com/liujf69/MMCL-Action.\n","authors":["Jinfu Liu","Chen Chen","Mengyuan Liu"],"pdf_url":"https://arxiv.org/pdf/2407.15706v6.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.03785v3","updated":"2024-08-15T12:17:11Z","published":"2024-01-08T10:06:52Z","title":"Identifying Important Group of Pixels using Interactions","summary":" To better understand the behavior of image classifiers, it is useful to\nvisualize the contribution of individual pixels to the model prediction. In\nthis study, we propose a method, MoXI ($\\textbf{Mo}$del e$\\textbf{X}$planation\nby $\\textbf{I}$nteractions), that efficiently and accurately identifies a group\nof pixels with high prediction confidence. The proposed method employs\ngame-theoretic concepts, Shapley values and interactions, taking into account\nthe effects of individual pixels and the cooperative influence of pixels on\nmodel confidence. Theoretical analysis and experiments demonstrate that our\nmethod better identifies the pixels that are highly contributing to the model\noutputs than widely-used visualization by Grad-CAM, Attention rollout, and\nShapley value. While prior studies have suffered from the exponential\ncomputational cost in the computation of Shapley value and interactions, we\nshow that this can be reduced to quadratic cost for our task. The code is\navailable at https://github.com/KosukeSumiyasu/MoXI.\n","authors":["Kosuke Sumiyasu","Kazuhiko Kawamoto","Hiroshi Kera"],"pdf_url":"https://arxiv.org/pdf/2401.03785v3.pdf","comment":"CVPR 2024 (update: minor typos, new references, Eqs. (12) and (13))"},{"id":"http://arxiv.org/abs/2408.08108v1","updated":"2024-08-15T12:11:20Z","published":"2024-08-15T12:11:20Z","title":"Unsupervised Part Discovery via Dual Representation Alignment","summary":" Object parts serve as crucial intermediate representations in various\ndownstream tasks, but part-level representation learning still has not received\nas much attention as other vision tasks. Previous research has established that\nVision Transformer can learn instance-level attention without labels,\nextracting high-quality instance-level representations for boosting downstream\ntasks. In this paper, we achieve unsupervised part-specific attention learning\nusing a novel paradigm and further employ the part representations to improve\npart discovery performance. 
Specifically, paired images are generated from the\nsame image with different geometric transformations, and multiple part\nrepresentations are extracted from these paired images using a novel module,\nnamed PartFormer. These part representations from the paired images are then\nexchanged to improve geometric transformation invariance. Subsequently, the\npart representations are aligned with the feature map extracted by a feature\nmap encoder, achieving high similarity with the pixel representations of the\ncorresponding part regions and low similarity in irrelevant regions. Finally,\nthe geometric and semantic constraints are applied to the part representations\nthrough the intermediate results in alignment for part-specific attention\nlearning, encouraging the PartFormer to focus locally and the part\nrepresentations to explicitly include the information of the corresponding\nparts. Moreover, the aligned part representations can further serve as a series\nof reliable detectors in the testing phase, predicting pixel masks for part\ndiscovery. Extensive experiments are carried out on four widely used datasets,\nand our results demonstrate that the proposed method achieves competitive\nperformance and robustness due to its part-specific attention.\n","authors":["Jiahao Xia","Wenjian Huang","Min Xu","Jianguo Zhang","Haimin Zhang","Ziyu Sheng","Dong Xu"],"pdf_url":"https://arxiv.org/pdf/2408.08108v1.pdf","comment":"Accepted by TPAMI-2024"},{"id":"http://arxiv.org/abs/2408.07680v2","updated":"2024-08-15T12:07:00Z","published":"2024-08-14T17:28:58Z","title":"A Spitting Image: Modular Superpixel Tokenization in Vision Transformers","summary":" Vision Transformer (ViT) architectures traditionally employ a grid-based\napproach to tokenization independent of the semantic content of an image. We\npropose a modular superpixel tokenization strategy which decouples tokenization\nand feature extraction; a shift from contemporary approaches where these are\ntreated as an undifferentiated whole. Using on-line content-aware tokenization\nand scale- and shape-invariant positional embeddings, we perform experiments\nand ablations that contrast our approach with patch-based tokenization and\nrandomized partitions as baselines. We show that our method significantly\nimproves the faithfulness of attributions, gives pixel-level granularity on\nzero-shot unsupervised dense prediction tasks, while maintaining predictive\nperformance in classification tasks. Our approach provides a modular\ntokenization framework commensurable with standard architectures, extending the\nspace of ViTs to a larger class of semantically-rich models.\n","authors":["Marius Aasan","Odd Kolbjørnsen","Anne Schistad Solberg","Adín Ramirez Rivera"],"pdf_url":"https://arxiv.org/pdf/2408.07680v2.pdf","comment":"To appear in ECCV (MELEX) 2024 Workshop Proceedings"},{"id":"http://arxiv.org/abs/2408.08105v1","updated":"2024-08-15T12:04:32Z","published":"2024-08-15T12:04:32Z","title":"Multimodal Causal Reasoning Benchmark: Challenging Vision Large Language\n Models to Infer Causal Links Between Siamese Images","summary":" Large Language Models (LLMs) have showcased exceptional ability in causal\nreasoning from textual information. However, will these causalities remain\nstraightforward for Vision Large Language Models (VLLMs) when only visual hints\nare provided? 
Motivated by this, we propose a novel Multimodal Causal Reasoning\nbenchmark, namely MuCR, to challenge VLLMs to infer semantic cause-and-effect\nrelationship when solely relying on visual cues such as action, appearance,\nclothing, and environment. Specifically, we introduce a prompt-driven image\nsynthesis approach to create siamese images with embedded semantic causality\nand visual cues, which can effectively evaluate VLLMs' causal reasoning\ncapabilities. Additionally, we develop tailored metrics from multiple\nperspectives, including image-level match, phrase-level understanding, and\nsentence-level explanation, to comprehensively assess VLLMs' comprehension\nabilities. Our extensive experiments reveal that the current state-of-the-art\nVLLMs are not as skilled at multimodal causal reasoning as we might have hoped.\nFurthermore, we perform a comprehensive analysis to understand these models'\nshortcomings from different views and suggest directions for future research.\nWe hope MuCR can serve as a valuable resource and foundational benchmark in\nmultimodal causal reasoning research. The project is available at:\nhttps://github.com/Zhiyuan-Li-John/MuCR\n","authors":["Zhiyuan Li","Heng Wang","Dongnan Liu","Chaoyi Zhang","Ao Ma","Jieting Long","Weidong Cai"],"pdf_url":"https://arxiv.org/pdf/2408.08105v1.pdf","comment":"20 pages"},{"id":"http://arxiv.org/abs/2406.16633v4","updated":"2024-08-15T11:59:15Z","published":"2024-06-24T13:30:55Z","title":"MLAAN: Scaling Supervised Local Learning with Multilaminar Leap\n Augmented Auxiliary Network","summary":" Deep neural networks (DNNs) typically employ an end-to-end (E2E) training\nparadigm which presents several challenges, including high GPU memory\nconsumption, inefficiency, and difficulties in model parallelization during\ntraining. Recent research has sought to address these issues, with one\npromising approach being local learning. This method involves partitioning the\nbackbone network into gradient-isolated modules and manually designing\nauxiliary networks to train these local modules. Existing methods often neglect\nthe interaction of information between local modules, leading to myopic issues\nand a performance gap compared to E2E training. To address these limitations,\nwe propose the Multilaminar Leap Augmented Auxiliary Network (MLAAN).\nSpecifically, MLAAN comprises Multilaminar Local Modules (MLM) and Leap\nAugmented Modules (LAM). MLM captures both local and global features through\nindependent and cascaded auxiliary networks, alleviating performance issues\ncaused by insufficient global features. However, overly simplistic auxiliary\nnetworks can impede MLM's ability to capture global information. To address\nthis, we further design LAM, an enhanced auxiliary network that uses the\nExponential Moving Average (EMA) method to facilitate information exchange\nbetween local modules, thereby mitigating the shortsightedness resulting from\ninadequate interaction. The synergy between MLM and LAM has demonstrated\nexcellent performance. 
Our experiments on the CIFAR-10, STL-10, SVHN, and\nImageNet datasets show that MLAAN can be seamlessly integrated into existing\nlocal learning frameworks, significantly enhancing their performance and even\nsurpassing end-to-end (E2E) training methods, while also reducing GPU memory\nconsumption.\n","authors":["Yuming Zhang","Shouxin Zhang","Peizhe Wang","Feiyu Zhu","Dongzhi Guan","Junhao Su","Jiabin Liu","Changpeng Cai"],"pdf_url":"https://arxiv.org/pdf/2406.16633v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.07818v2","updated":"2024-08-15T11:51:57Z","published":"2024-03-12T16:57:56Z","title":"Label Dropout: Improved Deep Learning Echocardiography Segmentation\n Using Multiple Datasets With Domain Shift and Partial Labelling","summary":" Echocardiography (echo) is the first imaging modality used when assessing\ncardiac function. The measurement of functional biomarkers from echo relies\nupon the segmentation of cardiac structures and deep learning models have been\nproposed to automate the segmentation process. However, in order to translate\nthese tools to widespread clinical use it is important that the segmentation\nmodels are robust to a wide variety of images (e.g. acquired from different\nscanners, by operators with different levels of expertise etc.). To achieve\nthis level of robustness it is necessary that the models are trained with\nmultiple diverse datasets. A significant challenge faced when training with\nmultiple diverse datasets is the variation in label presence, i.e. the combined\ndata are often partially-labelled. Adaptations of the cross entropy loss\nfunction have been proposed to deal with partially labelled data. In this paper\nwe show that training naively with such a loss function and multiple diverse\ndatasets can lead to a form of shortcut learning, where the model associates\nlabel presence with domain characteristics, leading to a drop in performance.\nTo address this problem, we propose a novel label dropout scheme to break the\nlink between domain characteristics and the presence or absence of labels. We\ndemonstrate that label dropout improves echo segmentation Dice score by 62% and\n25% on two cardiac structures when training using multiple diverse partially\nlabelled datasets.\n","authors":["Iman Islam","Esther Puyol-Antón","Bram Ruijsink","Andrew J. Reader","Andrew P. King"],"pdf_url":"https://arxiv.org/pdf/2403.07818v2.pdf","comment":"10 pages, 5 figures, ASMUS 2024, Held in Conjunction with MICCAI 2024"},{"id":"http://arxiv.org/abs/2408.08093v1","updated":"2024-08-15T11:36:18Z","published":"2024-08-15T11:36:18Z","title":"When Video Coding Meets Multimodal Large Language Models: A Unified\n Paradigm for Video Coding","summary":" Existing codecs are designed to eliminate intrinsic redundancies to create a\ncompact representation for compression. However, strong external priors from\nMultimodal Large Language Models (MLLMs) have not been explicitly explored in\nvideo compression. Herein, we introduce a unified paradigm for Cross-Modality\nVideo Coding (CMVC), which is a pioneering approach to explore multimodality\nrepresentation and video generative models in video coding. Specifically, on\nthe encoder side, we disentangle a video into spatial content and motion\ncomponents, which are subsequently transformed into distinct modalities to\nachieve very compact representation by leveraging MLLMs. 
During decoding,\npreviously encoded components and video generation models are leveraged to\ncreate multiple encoding-decoding modes that optimize video reconstruction\nquality for specific decoding requirements, including Text-Text-to-Video (TT2V)\nmode to ensure high-quality semantic information and Image-Text-to-Video (IT2V)\nmode to achieve superb perceptual consistency. In addition, we propose an\nefficient frame interpolation model for IT2V mode via Low-Rank Adaptation (LoRA)\ntuning to guarantee perceptual quality, which allows the generated motion cues\nto behave smoothly. Experiments on benchmarks indicate that TT2V achieves\neffective semantic reconstruction, while IT2V exhibits competitive perceptual\nconsistency. These results highlight potential directions for future research\nin video coding.\n","authors":["Pingping Zhang","Jinlong Li","Meng Wang","Nicu Sebe","Sam Kwong","Shiqi Wang"],"pdf_url":"https://arxiv.org/pdf/2408.08093v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08092v1","updated":"2024-08-15T11:34:53Z","published":"2024-08-15T11:34:53Z","title":"OC3D: Weakly Supervised Outdoor 3D Object Detection with Only Coarse\n Click Annotation","summary":" LiDAR-based outdoor 3D object detection has received widespread attention.\nHowever, training 3D detectors from the LiDAR point cloud typically relies on\nexpensive bounding box annotations. This paper presents OC3D, an innovative\nweakly supervised method requiring only coarse clicks on the bird's eye view\nof the 3D point cloud. A key challenge here is the absence of complete\ngeometric descriptions of the target objects from such simple click\nannotations. To address this problem, our proposed OC3D adopts a two-stage\nstrategy. In the first stage, we initially design a novel dynamic and static\nclassification strategy and then propose the Click2Box and Click2Mask modules\nto generate box-level and mask-level pseudo-labels for static and dynamic\ninstances, respectively. In the second stage, we design a Mask2Box module,\nleveraging the learning capabilities of neural networks to update mask-level\npseudo-labels, which contain less information, to box-level pseudo-labels.\nExperimental results on the widely used KITTI and nuScenes datasets demonstrate\nthat our OC3D with only coarse clicks achieves state-of-the-art performance\ncompared to weakly-supervised 3D detection methods. Combining OC3D with a\nmissing click mining strategy, we propose an OC3D++ pipeline, which requires\nonly 0.2% annotation cost in the KITTI dataset to achieve performance\ncomparable to fully supervised methods.\n","authors":["Qiming Xia","Hongwei Lin","Wei Ye","Hai Wu","Yadan Luo","Shijia Zhao","Xin Li","Chenglu Wen"],"pdf_url":"https://arxiv.org/pdf/2408.08092v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08091v1","updated":"2024-08-15T11:34:33Z","published":"2024-08-15T11:34:33Z","title":"HAIR: Hypernetworks-based All-in-One Image Restoration","summary":" Image restoration involves recovering a high-quality clean image from its\ndegraded version, which is a fundamental task in computer vision. Recent\nprogress in image restoration has demonstrated the effectiveness of learning\nmodels capable of addressing various degradations simultaneously, i.e., the\nAll-in-One image restoration models. However, these existing methods typically\nutilize the same parameters when facing images with different degradation types,\nwhich forces the model to trade off between degradation types, thereby\nimpairing the overall performance. 
To solve this problem, we propose HAIR,\na Hypernetworks-based plug-in-and-play method that dynamically generates\nparameters for the corresponding networks based on the contents of input\nimages. HAIR consists of two main components: a Classifier (Cl) and a Hyper Selecting\nNet (HSN). To be more specific, the Classifier is a simple image classification\nnetwork used to generate a Global Information Vector (GIV) that\ncontains the degradation information of the input image, and the HSN can be\nseen as a simple fully-connected neural network that receives the GIV and outputs\nparameters for the corresponding modules. Extensive experiments show that\nincorporating HAIR into the architectures can significantly improve the\nperformance of different models on image restoration tasks at a low cost,\n\\textbf{although HAIR only generates parameters and does not change these models'\nlogical structures at all.} By incorporating HAIR into the popular\narchitecture Restormer, our method obtains superior or at least comparable\nperformance to current state-of-the-art methods on a range of image restoration\ntasks.\n\\href{https://github.com/toummHus/HAIR}{\\textcolor{blue}{$\\underline{\\textbf{Code\nand pre-trained checkpoints are available here.}}$}}\n","authors":["Jin Cao","Yi Cao","Li Pang","Deyu Meng","Xiangyong Cao"],"pdf_url":"https://arxiv.org/pdf/2408.08091v1.pdf","comment":"13 pages, 4 figures, 6 tables"},{"id":"http://arxiv.org/abs/2408.08087v1","updated":"2024-08-15T11:29:13Z","published":"2024-08-15T11:29:13Z","title":"ColorMamba: Towards High-quality NIR-to-RGB Spectral Translation with\n Mamba","summary":" Translating NIR to the visible spectrum is challenging due to cross-domain\ncomplexities. Current models struggle to balance a broad receptive field with\ncomputational efficiency, limiting practical use. Although the Selective\nStructured State Space Model, especially the improved version, Mamba, excels in\ngenerative tasks by capturing long-range dependencies with linear complexity,\nits default approach of converting 2D images into 1D sequences neglects local\ncontext. In this work, we propose a simple but effective backbone, dubbed\nColorMamba, which first introduces Mamba into spectral translation tasks. To\nexplore global long-range dependencies and local context for efficient spectral\ntranslation, we introduce learnable padding tokens to enhance the distinction\nof image boundaries and prevent potential confusion within the sequence model.\nFurthermore, local convolutional enhancement and agent attention are designed\nto improve the vanilla Mamba. Moreover, we exploit the HSV color space to provide\nmulti-scale guidance in the reconstruction process for more accurate spectral\ntranslation. Extensive experiments show that our ColorMamba achieves a 1.02\nimprovement in terms of PSNR compared with the state-of-the-art method. Our\ncode is available at https://github.com/AlexYangxx/ColorMamba.\n","authors":["Huiyu Zhai","Guang Jin","Xingxing Yang","Guosheng Kang"],"pdf_url":"https://arxiv.org/pdf/2408.08087v1.pdf","comment":"Code is available at https://github.com/AlexYangxx/ColorMamba"},{"id":"http://arxiv.org/abs/2408.08086v1","updated":"2024-08-15T11:27:18Z","published":"2024-08-15T11:27:18Z","title":"Single-image coherent reconstruction of objects and humans","summary":" Existing methods for reconstructing objects and humans from a monocular image\nsuffer from severe mesh collisions and performance limitations for interacting\noccluding objects. 
This paper introduces a method to obtain a globally\nconsistent 3D reconstruction of interacting objects and people from a single\nimage. Our contributions include: 1) an optimization framework, featuring a\ncollision loss, tailored to handle human-object and human-human interactions,\nensuring spatially coherent scene reconstruction; and 2) a novel technique to\nrobustly estimate 6 degrees of freedom (DOF) poses, specifically for heavily\noccluded objects, exploiting image inpainting. Notably, our proposed method\noperates effectively on images from real-world scenarios, without necessitating\nscene or object-level 3D supervision. Extensive qualitative and quantitative\nevaluation against existing methods demonstrates a significant reduction in\ncollisions in the final reconstructions of scenes with multiple interacting\nhumans and objects and a more coherent scene reconstruction.\n","authors":["Sarthak Batra","Partha P. Chakrabarti","Simon Hadfield","Armin Mustafa"],"pdf_url":"https://arxiv.org/pdf/2408.08086v1.pdf","comment":"Accepted at AI for 3D Generation, CVPR Workshop"},{"id":"http://arxiv.org/abs/2408.08078v1","updated":"2024-08-15T11:04:26Z","published":"2024-08-15T11:04:26Z","title":"Treat Stillness with Movement: Remote Sensing Change Detection via\n Coarse-grained Temporal Foregrounds Mining","summary":" Current works focus on addressing the remote sensing change detection task\nusing bi-temporal images. Although good performance can be achieved, few of\nthem consider the motion cues, which may also be vital. In this work,\nwe revisit the widely adopted bi-temporal image-based framework and propose a\nnovel Coarse-grained Temporal Mining Augmented (CTMA) framework. To be\nspecific, given the bi-temporal images, we first transform them into a video\nusing interpolation operations. Then, a set of temporal encoders is adopted to\nextract the motion features from the obtained video for coarse-grained changed\nregion prediction. Subsequently, we design a novel Coarse-grained Foregrounds\nAugmented Spatial Encoder module to integrate both global and local\ninformation. We also introduce a motion augmented strategy that leverages\nmotion cues as an additional output to aggregate with the spatial features for\nimproved results. Meanwhile, we feed the input image pairs into the ResNet to\nget the different features and also the spatial blocks for fine-grained feature\nlearning. More importantly, we propose a mask augmented strategy that utilizes\ncoarse-grained changed regions, incorporating them into the decoder blocks to\nenhance the final changed prediction. Extensive experiments conducted on\nmultiple benchmark datasets fully validate the effectiveness of our proposed\nframework for remote sensing image change detection. The source code of this\npaper will be released at\nhttps://github.com/Event-AHU/CTM_Remote_Sensing_Change_Detection\n","authors":["Xixi Wang","Zitian Wang","Jingtao Jiang","Lan Chen","Xiao Wang","Bo Jiang"],"pdf_url":"https://arxiv.org/pdf/2408.08078v1.pdf","comment":"In Peer Review"},{"id":"http://arxiv.org/abs/2408.05075v2","updated":"2024-08-15T11:03:41Z","published":"2024-08-09T14:04:21Z","title":"DeepInteraction++: Multi-Modality Interaction for Autonomous Driving","summary":" Existing top-performance autonomous driving systems typically rely on the\nmulti-modal fusion strategy for reliable scene understanding. 
This design is\nhowever fundamentally restricted due to overlooking the modality-specific\nstrengths and finally hampering the model performance. To address this\nlimitation, in this work, we introduce a novel modality interaction strategy\nthat allows individual per-modality representations to be learned and\nmaintained throughout, enabling their unique characteristics to be exploited\nduring the whole perception pipeline. To demonstrate the effectiveness of the\nproposed strategy, we design DeepInteraction++, a multi-modal interaction\nframework characterized by a multi-modal representational interaction encoder\nand a multi-modal predictive interaction decoder. Specifically, the encoder is\nimplemented as a dual-stream Transformer with specialized attention operation\nfor information exchange and integration between separate modality-specific\nrepresentations. Our multi-modal representational learning incorporates both\nobject-centric, precise sampling-based feature alignment and global dense\ninformation spreading, essential for the more challenging planning task. The\ndecoder is designed to iteratively refine the predictions by alternately\naggregating information from separate representations in a unified\nmodality-agnostic manner, realizing multi-modal predictive interaction.\nExtensive experiments demonstrate the superior performance of the proposed\nframework on both 3D object detection and end-to-end autonomous driving tasks.\nOur code is available at https://github.com/fudan-zvg/DeepInteraction.\n","authors":["Zeyu Yang","Nan Song","Wei Li","Xiatian Zhu","Li Zhang","Philip H. S. Torr"],"pdf_url":"https://arxiv.org/pdf/2408.05075v2.pdf","comment":"Journal extension of NeurIPS 2022. arXiv admin note: text overlap\n with arXiv:2208.11112"},{"id":"http://arxiv.org/abs/2408.07440v2","updated":"2024-08-15T10:45:18Z","published":"2024-08-14T10:18:42Z","title":"BAPLe: Backdoor Attacks on Medical Foundational Models using Prompt\n Learning","summary":" Medical foundation models are gaining prominence in the medical community for\ntheir ability to derive general representations from extensive collections of\nmedical image-text pairs. Recent research indicates that these models are\nsusceptible to backdoor attacks, which allow them to classify clean images\naccurately but fail when specific triggers are introduced. However, traditional\nbackdoor attacks necessitate a considerable amount of additional data to\nmaliciously pre-train a model. This requirement is often impractical in medical\nimaging applications due to the usual scarcity of data. Inspired by the latest\ndevelopments in learnable prompts, this work introduces a method to embed a\nbackdoor into the medical foundation model during the prompt learning phase. By\nincorporating learnable prompts within the text encoder and introducing\nimperceptible learnable noise trigger to the input images, we exploit the full\ncapabilities of the medical foundation models (Med-FM). Our method, BAPLe,\nrequires only a minimal subset of data to adjust the noise trigger and the text\nprompts for downstream tasks, enabling the creation of an effective backdoor\nattack. Through extensive experiments with four medical foundation models, each\npre-trained on different modalities and evaluated across six downstream\ndatasets, we demonstrate the efficacy of our approach. BAPLe achieves a high\nbackdoor success rate across all models and datasets, outperforming the\nbaseline backdoor attack methods. 
Our work highlights the vulnerability of\nMed-FMs towards backdoor attacks and strives to promote the safe adoption of\nMed-FMs before their deployment in real-world applications. Code is available\nat https://asif-hanif.github.io/baple/.\n","authors":["Asif Hanif","Fahad Shamshad","Muhammad Awais","Muzammal Naseer","Fahad Shahbaz Khan","Karthik Nandakumar","Salman Khan","Rao Muhammad Anwer"],"pdf_url":"https://arxiv.org/pdf/2408.07440v2.pdf","comment":"MICCAI 2024"},{"id":"http://arxiv.org/abs/2408.08070v1","updated":"2024-08-15T10:35:26Z","published":"2024-08-15T10:35:26Z","title":"MambaMIM: Pre-training Mamba with State Space Token-interpolation","summary":" Generative self-supervised learning demonstrates outstanding representation\nlearning capabilities in both Convolutional Neural Networks (CNNs) and Vision\nTransformers (ViTs). However, there are currently no generative pre-training\nmethods related to selective state space models (Mamba) that can handle\nlong-range dependencies effectively. To address this challenge, we introduce a\ngenerative self-supervised learning method for Mamba (MambaMIM) based on\nSelective Structure State Space Sequence Token-interpolation (S6T), a\ngeneral-purpose pre-training method for arbitrary Mamba architectures. Our\nmethod, MambaMIM, incorporates a bottom-up 3D hybrid masking strategy in the\nencoder to maintain masking consistency across different architectures.\nAdditionally, S6T is employed to learn causal relationships between the masked\nsequence in the state space. MambaMIM can be used on any single or hybrid Mamba\narchitectures to enhance the Mamba long-range representation capability.\nExtensive downstream experiments reveal the feasibility and advancement of\nusing Mamba for pre-training medical image tasks. The code is available at:\nhttps://github.com/FengheTan9/MambaMIM\n","authors":["Fenghe Tang","Bingkun Nian","Yingtai Li","Jie Yang","Liu Wei","S. Kevin Zhou"],"pdf_url":"https://arxiv.org/pdf/2408.08070v1.pdf","comment":"10 pages, 7 figures"},{"id":"http://arxiv.org/abs/2312.13764v3","updated":"2024-08-15T10:03:37Z","published":"2023-12-21T11:43:41Z","title":"A Semantic Space is Worth 256 Language Descriptions: Make Stronger\n Segmentation Models with Descriptive Properties","summary":" This paper introduces ProLab, a novel approach using property-level label\nspace for creating strong interpretable segmentation models. Instead of relying\nsolely on category-specific annotations, ProLab uses descriptive properties\ngrounded in common sense knowledge for supervising segmentation models. It is\nbased on two core designs. First, we employ Large Language Models (LLMs) and\ncarefully crafted prompts to generate descriptions of all involved categories\nthat carry meaningful common sense knowledge and follow a structured format.\nSecond, we introduce a description embedding model preserving semantic\ncorrelation across descriptions and then cluster them into a set of descriptive\nproperties (e.g., 256) using K-Means. These properties are based on\ninterpretable common sense knowledge consistent with theories of human\nrecognition. We empirically show that our approach makes segmentation models\nperform stronger on five classic benchmarks (e.g., ADE20K, COCO-Stuff, Pascal\nContext, Cityscapes, and BDD). Our method also shows better scalability with\nextended training steps than category-level supervision. 
Our interpretable\nsegmentation framework also emerges with the generalization ability to segment\nout-of-domain or unknown categories using only in-domain descriptive\nproperties. Code is available at https://github.com/lambert-x/ProLab.\n","authors":["Junfei Xiao","Ziqi Zhou","Wenxuan Li","Shiyi Lan","Jieru Mei","Zhiding Yu","Alan Yuille","Yuyin Zhou","Cihang Xie"],"pdf_url":"https://arxiv.org/pdf/2312.13764v3.pdf","comment":"Accepted to ECCV 2024. Code is available at\n https://github.com/lambert-x/ProLab"},{"id":"http://arxiv.org/abs/2406.17342v2","updated":"2024-08-15T09:59:58Z","published":"2024-06-25T07:57:03Z","title":"Masked Generative Extractor for Synergistic Representation and 3D\n Generation of Point Clouds","summary":" Representation and generative learning, as reconstruction-based methods, have\ndemonstrated their potential for mutual reinforcement across various domains.\nIn the field of point cloud processing, although existing studies have adopted\ntraining strategies from generative models to enhance representational\ncapabilities, these methods are limited by their inability to genuinely\ngenerate 3D shapes. To explore the benefits of deeply integrating 3D\nrepresentation learning and generative learning, we propose an innovative\nframework called \\textit{Point-MGE}. Specifically, this framework first\nutilizes a vector quantized variational autoencoder to reconstruct a neural\nfield representation of 3D shapes, thereby learning discrete semantic features\nof point patches. Subsequently, we design a sliding masking ratios to smooth\nthe transition from representation learning to generative learning. Moreover,\nour method demonstrates strong generalization capability in learning\nhigh-capacity models, achieving new state-of-the-art performance across\nmultiple downstream tasks. In shape classification, Point-MGE achieved an\naccuracy of 94.2% (+1.0%) on the ModelNet40 dataset and 92.9% (+5.5%) on the\nScanObjectNN dataset. Experimental results also confirmed that Point-MGE can\ngenerate high-quality 3D shapes in both unconditional and conditional settings.\n","authors":["Hongliang Zeng","Ping Zhang","Fang Li","Jiahua Wang","Tingyu Ye","Pengteng Guo"],"pdf_url":"https://arxiv.org/pdf/2406.17342v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08058v1","updated":"2024-08-15T09:55:51Z","published":"2024-08-15T09:55:51Z","title":"Navigating Data Scarcity using Foundation Models: A Benchmark of\n Few-Shot and Zero-Shot Learning Approaches in Medical Imaging","summary":" Data scarcity is a major limiting factor for applying modern machine learning\ntechniques to clinical tasks. Although sufficient data exists for some\nwell-studied medical tasks, there remains a long tail of clinically relevant\ntasks with poor data availability. Recently, numerous foundation models have\ndemonstrated high suitability for few-shot learning (FSL) and zero-shot\nlearning (ZSL), potentially making them more accessible to practitioners.\nHowever, it remains unclear which foundation model performs best on FSL medical\nimage analysis tasks and what the optimal methods are for learning from limited\ndata. We conducted a comprehensive benchmark study of ZSL and FSL using 16\npretrained foundation models on 19 diverse medical imaging datasets. Our\nresults indicate that BiomedCLIP, a model pretrained exclusively on medical\ndata, performs best on average for very small training set sizes, while very\nlarge CLIP models pretrained on LAION-2B perform best with slightly more\ntraining samples. 
However, simply fine-tuning a ResNet-18 pretrained on\nImageNet performs similarly with more than five training examples per class.\nOur findings also highlight the need for further research on foundation models\nspecifically tailored for medical applications and the collection of more\ndatasets to train these models.\n","authors":["Stefano Woerner","Christian F. Baumgartner"],"pdf_url":"https://arxiv.org/pdf/2408.08058v1.pdf","comment":"Accepted as an oral presentation in MICCAI 2024 2nd International\n Workshop on Foundation Models for General Medical AI"},{"id":"http://arxiv.org/abs/2402.17514v2","updated":"2024-08-15T09:38:42Z","published":"2024-02-27T13:55:17Z","title":"Robust Zero-Shot Crowd Counting and Localization With Adaptive\n Resolution SAM","summary":" The existing crowd counting models require extensive training data, which is\ntime-consuming to annotate. To tackle this issue, we propose a simple yet\neffective crowd counting method by utilizing the Segment-Everything-Everywhere\nModel (SEEM), an adaptation of the Segmentation Anything Model (SAM), to\ngenerate pseudo-labels for training crowd counting models. However, our initial\ninvestigation reveals that SEEM's performance in dense crowd scenes is limited,\nprimarily due to the omission of many persons in high-density areas. To\novercome this limitation, we propose an adaptive resolution SEEM to handle the\nscale variations, occlusions, and overlapping of people within crowd scenes.\nAlongside this, we introduce a robust localization method, based on Gaussian\nMixture Models, for predicting the head positions in the predicted people\nmasks. Given the mask and point pseudo-labels, we propose a robust loss\nfunction, which is designed to exclude uncertain regions based on SEEM's\npredictions, thereby enhancing the training process of the counting networks.\nFinally, we propose an iterative method for generating pseudo-labels. This\nmethod aims at improving the quality of the segmentation masks by identifying\nmore tiny persons in high-density regions, which are often missed in the first\npseudo-labeling stage. Overall, our proposed method achieves the best\nunsupervised performance in crowd counting, while also being comparable results\nto some supervised methods. This makes it a highly effective and versatile tool\nfor crowd counting, especially in situations where labeled data is not\navailable.\n","authors":["Jia Wan","Qiangqiang Wu","Wei Lin","Antoni B. Chan"],"pdf_url":"https://arxiv.org/pdf/2402.17514v2.pdf","comment":"Accepted to ECCV 2024"},{"id":"http://arxiv.org/abs/2404.07600v3","updated":"2024-08-15T09:34:34Z","published":"2024-04-11T09:39:58Z","title":"Implicit and Explicit Language Guidance for Diffusion-based Visual\n Perception","summary":" Text-to-image diffusion models have shown powerful ability on conditional\nimage synthesis. With large-scale vision-language pre-training, diffusion\nmodels are able to generate high-quality images with rich texture and\nreasonable structure under different text prompts. However, it is an open\nproblem to adapt the pre-trained diffusion model for visual perception. In this\npaper, we propose an implicit and explicit language guidance framework for\ndiffusion-based perception, named IEDP. Our IEDP comprises an implicit language\nguidance branch and an explicit language guidance branch. The implicit branch\nemploys frozen CLIP image encoder to directly generate implicit text embeddings\nthat are fed to diffusion model, without using explicit text prompts. 
The\nexplicit branch utilizes the ground-truth labels of corresponding images as\ntext prompts to condition the feature extraction of the diffusion model. During\ntraining, we jointly train the diffusion model by sharing the model weights of\nthese two branches. As a result, the implicit and explicit branches can jointly\nguide feature learning. During inference, we only employ the implicit branch for\nfinal prediction, which does not require any ground-truth labels. Experiments\nare performed on two typical perception tasks, including semantic segmentation\nand depth estimation. Our IEDP achieves promising performance on both tasks.\nFor semantic segmentation, our IEDP achieves an mIoU$^\\text{ss}$ score of 55.9% on\nthe ADE20K validation set, which outperforms the baseline method VPD by 2.2%. For\ndepth estimation, our IEDP outperforms the baseline method VPD with a relative\ngain of 11.0%.\n","authors":["Hefeng Wang","Jiale Cao","Jin Xie","Aiping Yang","Yanwei Pang"],"pdf_url":"https://arxiv.org/pdf/2404.07600v3.pdf","comment":"Accepted by IEEE TMM"},{"id":"http://arxiv.org/abs/2408.08050v1","updated":"2024-08-15T09:33:43Z","published":"2024-08-15T09:33:43Z","title":"CamoTeacher: Dual-Rotation Consistency Learning for Semi-Supervised\n Camouflaged Object Detection","summary":" Existing camouflaged object detection~(COD) methods depend heavily on\nlarge-scale pixel-level annotations. However, acquiring such annotations is\nlaborious due to the inherent camouflage characteristics of the\nobjects. Semi-supervised learning offers a promising solution to this\nchallenge. Yet, its application in COD is hindered by significant pseudo-label\nnoise, both pixel-level and instance-level. We introduce CamoTeacher, a novel\nsemi-supervised COD framework, utilizing Dual-Rotation Consistency\nLearning~(DRCL) to effectively address these noise issues. Specifically, DRCL\nminimizes pseudo-label noise by leveraging rotation views' consistency at the\npixel level and instance level. First, it employs Pixel-wise Consistency\nLearning~(PCL) to deal with pixel-level noise by reweighting the different\nparts within the pseudo-label. Second, Instance-wise Consistency Learning~(ICL)\nis used to adjust weights for pseudo-labels, which handles instance-level\nnoise. Extensive experiments on four COD benchmark datasets demonstrate that the\nproposed CamoTeacher not only achieves state-of-the-art performance compared with\nsemi-supervised learning methods, but also rivals established fully-supervised\nlearning methods. Our code will be available soon.\n","authors":["Xunfa Lai","Zhiyu Yang","Jie Hu","Shengchuan Zhang","Liujuan Cao","Guannan Jiang","Zhiyu Wang","Songan Zhang","Rongrong Ji"],"pdf_url":"https://arxiv.org/pdf/2408.08050v1.pdf","comment":"Accepted to ECCV 2024"},{"id":"http://arxiv.org/abs/2401.13961v4","updated":"2024-08-15T09:23:00Z","published":"2024-01-25T05:50:48Z","title":"TriSAM: Tri-Plane SAM for zero-shot cortical blood vessel segmentation\n in VEM images","summary":" While imaging techniques at macro and mesoscales have garnered substantial\nattention and resources, microscale Volume Electron Microscopy (vEM) imaging,\ncapable of revealing intricate vascular details, has lacked the necessary\nbenchmarking infrastructure. In this paper, we address a significant gap in\nthis field of neuroimaging by introducing the first-in-class public benchmark,\nBvEM, designed specifically for cortical blood vessel segmentation in vEM\nimages. Our BvEM benchmark is based on vEM image volumes from three mammals:\nadult mouse, macaque, and human. 
We standardized the resolution, addressed\nimaging variations, and meticulously annotated blood vessels through\nsemi-automatic, manual, and quality control processes, ensuring high-quality 3D\nsegmentation. Furthermore, we developed a zero-shot cortical blood vessel\nsegmentation method named TriSAM, which leverages the powerful segmentation\nmodel SAM for 3D segmentation. To extend SAM from 2D to 3D volume segmentation,\nTriSAM employs a multi-seed tracking framework, leveraging the reliability of\ncertain image planes for tracking while using others to identify potential\nturning points. This approach effectively achieves long-term 3D blood vessel\nsegmentation without model training or fine-tuning. Experimental results show\nthat TriSAM achieved superior performances on the BvEM benchmark across three\nspecies. Our dataset, code, and model are available online at\n\\url{https://jia-wan.github.io/bvem}.\n","authors":["Jia Wan","Wanhua Li","Jason Ken Adhinarta","Atmadeep Banerjee","Evelina Sjostedt","Jingpeng Wu","Jeff Lichtman","Hanspeter Pfister","Donglai Wei"],"pdf_url":"https://arxiv.org/pdf/2401.13961v4.pdf","comment":"BvEM-Mouse can be visualized at: https://tinyurl.com/yc2s38x9"},{"id":"http://arxiv.org/abs/2408.08038v1","updated":"2024-08-15T09:06:49Z","published":"2024-08-15T09:06:49Z","title":"PI-Att: Topology Attention for Segmentation Networks through Adaptive\n Persistence Image Representation","summary":" Segmenting multiple objects (e.g., organs) in medical images often requires\nan understanding of their topology, which simultaneously quantifies the shape\nof the objects and their positions relative to each other. This understanding\nis important for segmentation networks to generalize better with limited\ntraining data, which is common in medical image analysis. However, many popular\nnetworks were trained to optimize only pixel-wise performance, ignoring the\ntopological correctness of the segmentation. In this paper, we introduce a new\ntopology-aware loss function, which we call PI-Att, that explicitly forces the\nnetwork to minimize the topological dissimilarity between the ground truth and\nprediction maps. We quantify the topology of each map by the persistence image\nrepresentation, for the first time in the context of a segmentation network\nloss. Besides, we propose a new mechanism to adaptively calculate the\npersistence image at the end of each epoch based on the network's performance.\nThis adaptive calculation enables the network to learn topology outline in the\nfirst epochs, and then topology details towards the end of training. The\neffectiveness of the proposed PI-Att loss is demonstrated on two different\ndatasets for aorta and great vessel segmentation in computed tomography images.\n","authors":["Mehmet Bahadir Erden","Sinan Unver","Ilke Ali Gurses","Rustu Turkay","Cigdem Gunduz-Demir"],"pdf_url":"https://arxiv.org/pdf/2408.08038v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08035v1","updated":"2024-08-15T09:05:00Z","published":"2024-08-15T09:05:00Z","title":"An Advanced Deep Learning Based Three-Stream Hybrid Model for Dynamic\n Hand Gesture Recognition","summary":" In the modern context, hand gesture recognition has emerged as a focal point.\nThis is due to its wide range of applications, which include comprehending sign\nlanguage, factories, hands-free devices, and guiding robots. Many researchers\nhave attempted to develop more effective techniques for recognizing these hand\ngestures. 
However, there are challenges like dataset limitations, variations in\nhand forms, external environments, and inconsistent lighting conditions. To\naddress these challenges, we propose a novel three-stream hybrid model that\ncombines RGB pixel and skeleton-based features to recognize hand gestures. In\nthe procedure, we preprocessed the dataset, including augmentation, to make the\nsystem invariant to rotation, translation, and scaling. We employed a\nthree-stream hybrid model to extract fused multi-feature representations using the power\nof the deep learning module. In the first stream, we extracted the initial\nfeature using the pre-trained ImageNet module and then enhanced this feature by\nusing multiple layers of GRU and LSTM modules. In the second stream, we\nextracted the initial feature with the pre-trained ResNet module and enhanced\nit with various combinations of GRU and LSTM modules. In the third\nstream, we extracted the hand pose key points using MediaPipe and then\nenhanced them using stacked LSTMs to produce the hierarchical feature. After\nthat, we concatenated the three features to produce the final feature. Finally, we\nemployed a classification module to produce a probability map and generate the\npredicted output. We mainly produced a powerful feature vector by taking\nadvantage of the pixel-based deep learning feature and the pose-estimation-based\nstacked deep learning feature, combining a pre-trained model with a deep learning\nmodel trained from scratch for unequalled gesture detection capabilities.\n","authors":["Md Abdur Rahim","Abu Saleh Musa Miah","Hemel Sharker Akash","Jungpil Shin","Md. Imran Hossain","Md. Najmul Hossain"],"pdf_url":"https://arxiv.org/pdf/2408.08035v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.06162v2","updated":"2024-08-15T08:59:38Z","published":"2024-06-02T17:09:59Z","title":"RNNs, CNNs and Transformers in Human Action Recognition: A Survey and a\n Hybrid Model","summary":" Human Action Recognition (HAR) encompasses the task of monitoring human\nactivities across various domains, including but not limited to medical,\neducational, entertainment, visual surveillance, video retrieval, and the\nidentification of anomalous activities. Over the past decade, the field of HAR\nhas witnessed substantial progress by leveraging Convolutional Neural Networks\n(CNNs) to effectively extract and comprehend intricate information, thereby\nenhancing the overall performance of HAR systems. Recently, the domain of\ncomputer vision has witnessed the emergence of Vision Transformers (ViTs) as a\npotent solution. The efficacy of the transformer architecture has been validated\nbeyond the confines of image analysis, extending its applicability to diverse\nvideo-related tasks. Notably, within this landscape, the research community has\nshown keen interest in HAR, acknowledging its manifold utility and widespread\nadoption across various domains. This article aims to present an encompassing\nsurvey that focuses on CNNs and the evolution of Recurrent Neural Networks\n(RNNs) to ViTs given their importance in the domain of HAR. By conducting a\nthorough examination of existing literature and exploring emerging trends, this\nstudy undertakes a critical analysis and synthesis of the accumulated knowledge\nin this field. Additionally, it investigates the ongoing efforts to develop\nhybrid approaches. 
Following this direction, this article presents a novel\nhybrid model that seeks to integrate the inherent strengths of CNNs and ViTs.\n","authors":["Khaled Alomar","Halil Ibrahim Aysel","Xiaohao Cai"],"pdf_url":"https://arxiv.org/pdf/2407.06162v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.00640v2","updated":"2024-08-15T08:56:45Z","published":"2024-08-01T15:27:48Z","title":"AMAES: Augmented Masked Autoencoder Pretraining on Public Brain MRI Data\n for 3D-Native Segmentation","summary":" This study investigates the impact of self-supervised pretraining of 3D\nsemantic segmentation models on a large-scale, domain-specific dataset. We\nintroduce BRAINS-45K, a dataset of 44,756 brain MRI volumes from public\nsources, the largest public dataset available, and revisit a number of design\nchoices for pretraining modern segmentation architectures by simplifying and\noptimizing state-of-the-art methods, and combining them with a novel\naugmentation strategy. The resulting AMAES framework is based on\nmasked-image-modeling and intensity-based augmentation reversal and balances\nmemory usage, runtime, and finetuning performance. Using the popular U-Net and\nthe recent MedNeXt architecture as backbones, we evaluate the effect of\npretraining on three challenging downstream tasks, covering single-sequence,\nlow-resource settings, and out-of-domain generalization. The results highlight\nthat pretraining on the proposed dataset with AMAES significantly improves\nsegmentation performance in the majority of evaluated cases, and that it is\nbeneficial to pretrain the model with augmentations, despite pretraing on a\nlarge-scale dataset. Code and model checkpoints for reproducing results, as\nwell as the BRAINS-45K dataset are available at\n\\url{https://github.com/asbjrnmunk/amaes}.\n","authors":["Asbjørn Munk","Jakob Ambsdorf","Sebastian Llambias","Mads Nielsen"],"pdf_url":"https://arxiv.org/pdf/2408.00640v2.pdf","comment":"Accepted at ADSMI @ MICCAI 2024"},{"id":"http://arxiv.org/abs/2405.19730v3","updated":"2024-08-15T08:41:57Z","published":"2024-05-30T06:21:34Z","title":"Research on the Spatial Data Intelligent Large Model","summary":" This report focuses on spatial data intelligent large models, delving into\nthe principles, methods, and cutting-edge applications of these models. It\nprovides an in-depth discussion on the definition, development history, current\nstatus, and trends of spatial data intelligent large models, as well as the\nchallenges they face. The report systematically elucidates the key technologies\nof spatial data intelligent large models and their applications in urban\nenvironments, aerospace remote sensing, geography, transportation, and other\nscenarios. 
Additionally, it summarizes the latest application cases of spatial\ndata intelligent large models in themes such as urban development, multimodal\nsystems, remote sensing, smart transportation, and resource environments.\nFinally, the report concludes with an overview and outlook on the development\nprospects of spatial data intelligent large models.\n","authors":["Shaohua Wang","Xing Xie","Yong Li","Danhuai Guo","Zhi Cai","Yu Liu","Yang Yue","Xiao Pan","Feng Lu","Huayi Wu","Zhipeng Gui","Zhiming Ding","Bolong Zheng","Fuzheng Zhang","Jingyuan Wang","Zhengchao Chen","Hao Lu","Jiayi Li","Peng Yue","Wenhao Yu","Yao Yao","Leilei Sun","Yong Zhang","Longbiao Chen","Xiaoping Du","Xiang Li","Xueying Zhang","Kun Qin","Zhaoya Gong","Weihua Dong","Xiaofeng Meng"],"pdf_url":"https://arxiv.org/pdf/2405.19730v3.pdf","comment":"V1 and V2 are in Chinese language, other versions are in English"},{"id":"http://arxiv.org/abs/2408.08021v1","updated":"2024-08-15T08:37:24Z","published":"2024-08-15T08:37:24Z","title":"DIVE: Towards Descriptive and Diverse Visual Commonsense Generation","summary":" Towards human-level visual understanding, visual commonsense generation has\nbeen introduced to generate commonsense inferences beyond images. However,\ncurrent research on visual commonsense generation has overlooked an important\nhuman cognitive ability: generating descriptive and diverse inferences. In this\nwork, we propose a novel visual commonsense generation framework, called DIVE,\nwhich aims to improve the descriptiveness and diversity of generated\ninferences. DIVE involves two methods, generic inference filtering and\ncontrastive retrieval learning, which address the limitations of existing\nvisual commonsense resources and training objectives. Experimental results\nverify that DIVE outperforms state-of-the-art models for visual commonsense\ngeneration in terms of both descriptiveness and diversity, while showing a\nsuperior quality in generating unique and novel inferences. Notably, DIVE\nachieves human-level descriptiveness and diversity on Visual Commonsense\nGraphs. Furthermore, human evaluations confirm that DIVE aligns closely with\nhuman judgments on descriptiveness and diversity\\footnote{Our code and dataset\nare available at https://github.com/Park-ing-lot/DIVE.\n","authors":["Jun-Hyung Park","Hyuntae Park","Youjin Kang","Eojin Jeon","SangKeun Lee"],"pdf_url":"https://arxiv.org/pdf/2408.08021v1.pdf","comment":"19 pages, 10 figuers, EMNLP 2023 (main)"},{"id":"http://arxiv.org/abs/2408.08015v1","updated":"2024-08-15T08:25:50Z","published":"2024-08-15T08:25:50Z","title":"Asteroid: Resource-Efficient Hybrid Pipeline Parallelism for\n Collaborative DNN Training on Heterogeneous Edge Devices","summary":" On-device Deep Neural Network (DNN) training has been recognized as crucial\nfor privacy-preserving machine learning at the edge. However, the intensive\ntraining workload and limited onboard computing resources pose significant\nchallenges to the availability and efficiency of model training. While existing\nworks address these challenges through native resource management optimization,\nwe instead leverage our observation that edge environments usually comprise a\nrich set of accompanying trusted edge devices with idle resources beyond a\nsingle terminal. We propose Asteroid, a distributed edge training system that\nbreaks the resource walls across heterogeneous edge devices for efficient model\ntraining acceleration. 
Asteroid adopts hybrid pipeline parallelism to\norchestrate distributed training, along with judicious parallelism planning\nfor maximizing throughput under certain resource constraints. Furthermore, a\nfault-tolerant yet lightweight pipeline replay mechanism is developed to tame\nthe device-level dynamics for training robustness and performance stability. We\nimplement Asteroid on heterogeneous edge devices with both vision and language\nmodels, demonstrating up to 12.2x faster training than conventional parallelism\nmethods and 2.1x faster than state-of-the-art hybrid parallelism methods\nthrough evaluations. Furthermore, Asteroid can recover the training pipeline 14x\nfaster than baseline methods while preserving comparable throughput despite\nunexpected device exits and failures.\n","authors":["Shengyuan Ye","Liekang Zeng","Xiaowen Chu","Guoliang Xing","Xu Chen"],"pdf_url":"https://arxiv.org/pdf/2408.08015v1.pdf","comment":"Accepted by The 30th Annual International Conference on Mobile\n Computing and Networking (MobiCom'24)"},{"id":"http://arxiv.org/abs/2408.08013v1","updated":"2024-08-15T08:22:10Z","published":"2024-08-15T08:22:10Z","title":"Adaptive Learning of Consistency and Inconsistency Information for Fake\n News Detection","summary":" The rapid advancement of social media platforms has significantly reduced the\ncost of information dissemination, yet it has also led to a proliferation of\nfake news, posing a threat to societal trust and credibility. Most fake news\ndetection research has focused on integrating text and image information to\nrepresent the consistency of multiple modes in news content, while paying less\nattention to inconsistent information. Besides, existing methods that leverage\ninconsistent information often let one mode overshadow another, leading\nto ineffective use of inconsistent clues. To address these issues, we propose an\nadaptive multi-modal feature fusion network (MFF-Net). Inspired by human\njudgment processes for determining truth and falsity in news, MFF-Net focuses\non inconsistent parts when news content is generally consistent and consistent\nparts when it is generally inconsistent. Specifically, MFF-Net extracts\nsemantic and global features from images and texts respectively, and learns\nconsistency information between modes through a multiple feature fusion module.\nTo deal with the problem of modal information being easily masked, we design a\nsingle modal feature filtering strategy to capture inconsistent information\nfrom corresponding modes separately. Finally, similarity scores are calculated\nbased on global features with adaptive adjustments made to achieve weighted\nfusion of consistent and inconsistent features. Extensive experimental results\ndemonstrate that MFF-Net outperforms state-of-the-art methods across three\npublic news datasets derived from real social media.\n","authors":["Aohan Li","Jiaxin Chen","Xin Liao","Dengyong Zhang"],"pdf_url":"https://arxiv.org/pdf/2408.08013v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08000v1","updated":"2024-08-15T07:57:28Z","published":"2024-08-15T07:57:28Z","title":"MVInpainter: Learning Multi-View Consistent Inpainting to Bridge 2D and\n 3D Editing","summary":" Novel View Synthesis (NVS) and 3D generation have recently achieved prominent\nimprovements. 
However, these works mainly focus on confined categories or\nsynthetic 3D assets, which are discouraged from generalizing to challenging\nin-the-wild scenes and fail to be employed with 2D synthesis directly.\nMoreover, these methods heavily depended on camera poses, limiting their\nreal-world applications. To overcome these issues, we propose MVInpainter,\nre-formulating the 3D editing as a multi-view 2D inpainting task. Specifically,\nMVInpainter partially inpaints multi-view images with the reference guidance\nrather than intractably generating an entirely novel view from scratch, which\nlargely simplifies the difficulty of in-the-wild NVS and leverages unmasked\nclues instead of explicit pose conditions. To ensure cross-view consistency,\nMVInpainter is enhanced by video priors from motion components and appearance\nguidance from concatenated reference key&value attention. Furthermore,\nMVInpainter incorporates slot attention to aggregate high-level optical flow\nfeatures from unmasked regions to control the camera movement with pose-free\ntraining and inference. Sufficient scene-level experiments on both\nobject-centric and forward-facing datasets verify the effectiveness of\nMVInpainter, including diverse tasks, such as multi-view object removal,\nsynthesis, insertion, and replacement. The project page is\nhttps://ewrfcas.github.io/MVInpainter/.\n","authors":["Chenjie Cao","Chaohui Yu","Yanwei Fu","Fan Wang","Xiangyang Xue"],"pdf_url":"https://arxiv.org/pdf/2408.08000v1.pdf","comment":"Project page: https://ewrfcas.github.io/MVInpainter/"},{"id":"http://arxiv.org/abs/2408.07999v1","updated":"2024-08-15T07:56:02Z","published":"2024-08-15T07:56:02Z","title":"Co-Fix3D: Enhancing 3D Object Detection with Collaborative Refinement","summary":" In the realm of autonomous driving,accurately detecting occluded or distant\nobjects,referred to as weak positive sample ,presents significant challenges.\nThese challenges predominantly arise during query initialization, where an\nover-reliance on heatmap confidence often results in a high rate of false\npositives, consequently masking weaker detections and impairing system\nperformance. To alleviate this issue, we propose a novel approach, Co-Fix3D,\nwhich employs a collaborative hybrid multi-stage parallel query generation\nmechanism for BEV representations. Our method incorporates the Local-Global\nFeature Enhancement (LGE) module, which refines BEV features to more\neffectively highlight weak positive samples. It uniquely leverages the Discrete\nWavelet Transform (DWT) for accurate noise reduction and features refinement in\nlocalized areas, and incorporates an attention mechanism to more\ncomprehensively optimize global BEV features. Moreover, our method increases\nthe volume of BEV queries through a multi-stage parallel processing of the LGE,\nsignificantly enhancing the probability of selecting weak positive samples.\nThis enhancement not only improves training efficiency within the decoder\nframework but also boosts overall system performance. Notably, Co-Fix3D\nachieves superior results on the stringent nuScenes benchmark, outperforming\nall previous models with a 69.1% mAP and 72.9% NDS on the LiDAR-based\nbenchmark, and 72.3% mAP and 74.1% NDS on the multi-modality benchmark, without\nrelying on test-time augmentation or additional datasets. 
The source code will\nbe made publicly available upon acceptance.\n","authors":["Wenxuan Li","Qin Zou","Chi Chen","Bo Du","Long Chen"],"pdf_url":"https://arxiv.org/pdf/2408.07999v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.07996v1","updated":"2024-08-15T07:46:51Z","published":"2024-08-15T07:46:51Z","title":"Monte Carlo Path Tracing and Statistical Event Detection for Event\n Camera Simulation","summary":" This paper presents a novel event camera simulation system fully based on\nphysically based Monte Carlo path tracing with adaptive path sampling. The\nadaptive sampling performed in the proposed method is based on a statistical\ntechnique, hypothesis testing for the hypothesis whether the difference of\nlogarithmic luminances at two distant periods is significantly larger than a\npredefined event threshold. To this end, our rendering system collects\nlogarithmic luminances rather than raw luminance in contrast to the\nconventional rendering system imitating conventional RGB cameras. Then, based\non the central limit theorem, we reasonably assume that the distribution of the\npopulation mean of logarithmic luminance can be modeled as a normal\ndistribution, allowing us to model the distribution of the difference of\nlogarithmic luminance as a normal distribution. Then, using Student's t-test,\nwe can test the hypothesis and determine whether to discard the null hypothesis\nfor event non-occurrence. When we sample a sufficiently large number of path\nsamples to satisfy the central limit theorem and obtain a clean set of events,\nour method achieves significant speed up compared to a simple approach of\nsampling paths uniformly at every pixel. To our knowledge, we are the first to\nsimulate the behavior of event cameras in a physically accurate manner using an\nadaptive sampling technique in Monte Carlo path tracing, and we believe this\nstudy will contribute to the development of computer vision applications using\nevent cameras.\n","authors":["Yuichiro Manabe","Tatsuya Yatagawa","Shigeo Morishima","Hiroyuki Kubo"],"pdf_url":"https://arxiv.org/pdf/2408.07996v1.pdf","comment":"10 pages, 7 figures, Presented at ICCP 2024"},{"id":"http://arxiv.org/abs/2408.07989v1","updated":"2024-08-15T07:30:47Z","published":"2024-08-15T07:30:47Z","title":"IIU: Independent Inference Units for Knowledge-based Visual Question\n Answering","summary":" Knowledge-based visual question answering requires external knowledge beyond\nvisible content to answer the question correctly. One limitation of existing\nmethods is that they focus more on modeling the inter-modal and intra-modal\ncorrelations, which entangles complex multimodal clues by implicit embeddings\nand lacks interpretability and generalization ability. The key challenge to\nsolve the above problem is to separate the information and process it\nseparately at the functional level. By reusing each processing unit, the\ngeneralization ability of the model to deal with different data can be\nincreased. In this paper, we propose Independent Inference Units (IIU) for\nfine-grained multi-modal reasoning to decompose intra-modal information by the\nfunctionally independent units. Specifically, IIU processes each\nsemantic-specific intra-modal clue by an independent inference unit, which also\ncollects complementary information by communication from different units. To\nfurther reduce the impact of redundant information, we propose a memory update\nmodule to maintain semantic-relevant memory along with the reasoning process\ngradually. 
In comparison with existing non-pretrained multi-modal reasoning\nmodels on standard datasets, our model achieves a new state-of-the-art,\nenhancing performance by 3%, and surpassing basic pretrained multi-modal\nmodels. The experimental results show that our IIU model is effective in\ndisentangling intra-modal clues as well as reasoning units to provide\nexplainable reasoning evidence. Our code is available at\nhttps://github.com/Lilidamowang/IIU.\n","authors":["Yili Li","Jing Yu","Keke Gai","Gang Xiong"],"pdf_url":"https://arxiv.org/pdf/2408.07989v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.07988v1","updated":"2024-08-15T07:30:21Z","published":"2024-08-15T07:30:21Z","title":"Exploring learning environments for label-efficient cancer diagnosis","summary":" Despite significant research efforts and advancements, cancer remains a\nleading cause of mortality. Early cancer prediction has become a crucial focus\nin cancer research to streamline patient care and improve treatment outcomes.\nManual tumor detection by histopathologists can be time-consuming, prompting\nthe need for computerized methods to expedite treatment planning. Traditional\napproaches to tumor detection rely on supervised learning, which necessitates a large\namount of annotated data for model training. However, acquiring such extensive\nlabeled data can be laborious and time-intensive. This research examines three\nlearning environments: supervised learning (SL), semi-supervised\nlearning (Semi-SL), and self-supervised learning (Self-SL), to predict\nkidney, lung, and breast cancer. Three pre-trained deep learning models\n(Residual Network-50, Visual Geometry Group-16, and EfficientNetB0) are\nevaluated based on these learning settings using seven carefully curated\ntraining sets. To create the first training set (TS1), SL is applied to all\nannotated image samples. Five training sets (TS2-TS6) with different ratios of\nlabeled and unlabeled cancer images are used to evaluate Semi-SL. Unlabeled\ncancer images from the final training set (TS7) are utilized for Self-SL\nassessment. Among the different learning environments, outcomes from the Semi-SL\nsetting show a strong degree of agreement with the outcomes achieved in the SL\nsetting. The uniform pattern of observations from the pre-trained models\nacross all three datasets validates the methodology and techniques of the\nresearch. Based on a modest number of labeled samples and minimal computing cost,\nour study suggests that the Semi-SL option can be a highly viable replacement\nfor the SL option under label annotation constraint scenarios.\n","authors":["Samta Rani","Tanvir Ahmad","Sarfaraz Masood","Chandni Saxena"],"pdf_url":"https://arxiv.org/pdf/2408.07988v1.pdf","comment":"Submitted to the journal"},{"id":"http://arxiv.org/abs/2408.07985v1","updated":"2024-08-15T07:10:17Z","published":"2024-08-15T07:10:17Z","title":"Analytical Uncertainty-Based Loss Weighting in Multi-Task Learning","summary":" With the rise of neural networks in various domains, multi-task learning\n(MTL) has gained significant relevance. A key challenge in MTL is balancing\nindividual task losses during neural network training to improve performance\nand efficiency through knowledge sharing across tasks. To address these\nchallenges, we propose a novel task-weighting method by building on the most\nprevalent approach of Uncertainty Weighting and computing analytically optimal\nuncertainty-based weights, normalized by a softmax function with tunable\ntemperature. 
Our approach yields comparable results to the combinatorially\nprohibitive, brute-force approach of Scalarization while offering a more\ncost-effective yet high-performing alternative. We conduct an extensive\nbenchmark on various datasets and architectures. Our method consistently\noutperforms six other common weighting methods. Furthermore, we report\nnoteworthy experimental findings for the practical application of MTL. For\nexample, larger networks diminish the influence of weighting methods, and\ntuning the weight decay has a low impact compared to the learning rate.\n","authors":["Lukas Kirchdorfer","Cathrin Elich","Simon Kutsche","Heiner Stuckenschmidt","Lukas Schott","Jan M. Köhler"],"pdf_url":"https://arxiv.org/pdf/2408.07985v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.07981v1","updated":"2024-08-15T07:00:20Z","published":"2024-08-15T07:00:20Z","title":"LLaVA-Surg: Towards Multimodal Surgical Assistant via Structured\n Surgical Video Learning","summary":" Multimodal large language models (LLMs) have achieved notable success across\nvarious domains, while research in the medical field has largely focused on\nunimodal images. Meanwhile, current general-domain multimodal models for videos\nstill lack the capabilities to understand and engage in conversations about\nsurgical videos. One major contributing factor is the absence of datasets in\nthe surgical field. In this paper, we create a new dataset, Surg-QA, consisting\nof 102,000 surgical video-instruction pairs, the largest of its kind so far. To\nbuild such a dataset, we propose a novel two-stage question-answer generation\npipeline with LLM to learn surgical knowledge in a structured manner from the\npublicly available surgical lecture videos. The pipeline breaks down the\ngeneration process into two stages to significantly reduce the task complexity,\nallowing us to use a more affordable, locally deployed open-source LLM than the\npremium paid LLM services. It also mitigates the risk of LLM hallucinations\nduring question-answer generation, thereby enhancing the overall quality of the\ngenerated data. We further train LLaVA-Surg, a novel vision-language\nconversational assistant capable of answering open-ended questions about\nsurgical videos, on this Surg-QA dataset, and conduct comprehensive evaluations\non zero-shot surgical video question-answering tasks. We show that LLaVA-Surg\nsignificantly outperforms all previous general-domain models, demonstrating\nexceptional multimodal conversational skills in answering open-ended questions\nabout surgical videos. We will release our code, model, and the\ninstruction-tuning dataset.\n","authors":["Jiajie Li","Garrett Skinner","Gene Yang","Brian R Quaranto","Steven D Schwaitzberg","Peter C W Kim","Jinjun Xiong"],"pdf_url":"https://arxiv.org/pdf/2408.07981v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.07975v1","updated":"2024-08-15T06:40:38Z","published":"2024-08-15T06:40:38Z","title":"Polaris: Open-ended Interactive Robotic Manipulation via Syn2Real Visual\n Grounding and Large Language Models","summary":" This paper investigates the task of the open-ended interactive robotic\nmanipulation on table-top scenarios. While recent Large Language Models (LLMs)\nenhance robots' comprehension of user instructions, their lack of visual\ngrounding constrains their ability to physically interact with the environment.\nThis is because the robot needs to locate the target object for manipulation\nwithin the physical workspace. 
To this end, we introduce an interactive robotic\nmanipulation framework called Polaris, which integrates perception and\ninteraction by utilizing GPT-4 alongside grounded vision models. For precise\nmanipulation, it is essential that such grounded vision models produce detailed\nobject pose for the target object, rather than merely identifying pixels\nbelonging to them in the image. Consequently, we propose a novel\nSynthetic-to-Real (Syn2Real) pose estimation pipeline. This pipeline utilizes\nrendered synthetic data for training and is then transferred to real-world\nmanipulation tasks. The real-world performance demonstrates the efficacy of our\nproposed pipeline and underscores its potential for extension to more general\ncategories. Moreover, real-robot experiments have showcased the impressive\nperformance of our framework in grasping and executing multiple manipulation\ntasks. This indicates its potential to generalize to scenarios beyond the\ntabletop. More information and video results are available here:\nhttps://star-uu-wang.github.io/Polaris/\n","authors":["Tianyu Wang","Haitao Lin","Junqiu Yu","Yanwei Fu"],"pdf_url":"https://arxiv.org/pdf/2408.07975v1.pdf","comment":"Accepted by IROS 2024. 8 pages, 5 figures. See\n https://star-uu-wang.github.io/Polaris/"},{"id":"http://arxiv.org/abs/2408.07967v1","updated":"2024-08-15T06:27:42Z","published":"2024-08-15T06:27:42Z","title":"FlashGS: Efficient 3D Gaussian Splatting for Large-scale and\n High-resolution Rendering","summary":" This work introduces FlashGS, an open-source CUDA Python library, designed to\nfacilitate the efficient differentiable rasterization of 3D Gaussian Splatting\nthrough algorithmic and kernel-level optimizations. FlashGS is developed based\non the observations from a comprehensive analysis of the rendering process to\nenhance computational efficiency and bring the technique to wide adoption. The\npaper includes a suite of optimization strategies, encompassing redundancy\nelimination, efficient pipelining, refined control and scheduling mechanisms,\nand memory access optimizations, all of which are meticulously integrated to\namplify the performance of the rasterization process. An extensive evaluation\nof FlashGS' performance has been conducted across a diverse spectrum of\nsynthetic and real-world large-scale scenes, encompassing a variety of image\nresolutions. The empirical findings demonstrate that FlashGS consistently\nachieves an average 4x acceleration over mobile consumer GPUs, coupled with\nreduced memory consumption. These results underscore the superior performance\nand resource optimization capabilities of FlashGS, positioning it as a\nformidable tool in the domain of 3D rendering.\n","authors":["Guofeng Feng","Siyan Chen","Rong Fu","Zimu Liao","Yi Wang","Tao Liu","Zhilin Pei","Hengjie Li","Xingcheng Zhang","Bo Dai"],"pdf_url":"https://arxiv.org/pdf/2408.07967v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.00416v3","updated":"2024-08-15T05:56:32Z","published":"2023-06-01T07:48:34Z","title":"Interactive Character Control with Auto-Regressive Motion Diffusion\n Models","summary":" Real-time character control is an essential component for interactive\nexperiences, with a broad range of applications, including physics simulations,\nvideo games, and virtual reality. The success of diffusion models for image\nsynthesis has led to the use of these models for motion synthesis. 
However, the\nmajority of these motion diffusion models are primarily designed for offline\napplications, where space-time models are used to synthesize an entire sequence\nof frames simultaneously with a pre-specified length. To enable real-time\nmotion synthesis with a diffusion model that allows time-varying controls, we\npropose A-MDM (Auto-regressive Motion Diffusion Model). Our conditional\ndiffusion model takes an initial pose as input, and auto-regressively generates\nsuccessive motion frames conditioned on the previous frame. Despite its\nstreamlined network architecture, which uses simple MLPs, our framework is\ncapable of generating diverse, long-horizon, and high-fidelity motion\nsequences. Furthermore, we introduce a suite of techniques for incorporating\ninteractive controls into A-MDM, such as task-oriented sampling, in-painting,\nand hierarchical reinforcement learning. These techniques enable a pre-trained\nA-MDM to be efficiently adapted for a variety of new downstream tasks. We\nconduct a comprehensive suite of experiments to demonstrate the effectiveness\nof A-MDM, and compare its performance against state-of-the-art auto-regressive\nmethods.\n","authors":["Yi Shi","Jingbo Wang","Xuekun Jiang","Bingkun Lin","Bo Dai","Xue Bin Peng"],"pdf_url":"https://arxiv.org/pdf/2306.00416v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.07947v1","updated":"2024-08-15T05:43:46Z","published":"2024-08-15T05:43:46Z","title":"Conditional Brownian Bridge Diffusion Model for VHR SAR to Optical Image\n Translation","summary":" Synthetic Aperture Radar (SAR) imaging technology provides the unique\nadvantage of being able to collect data regardless of weather conditions and\ntime. However, SAR images exhibit complex backscatter patterns and speckle\nnoise, which necessitate expertise for interpretation. To deal with this\nchallenge, research has been conducted on translating SAR images into\noptical-like representations to aid the interpretation of SAR data.\nNevertheless, existing studies have predominantly utilized low-resolution\nsatellite imagery datasets and have largely been based on Generative\nAdversarial Networks (GANs), which are known for their training instability and\nlow fidelity. To overcome these limitations of low-resolution data usage and\nGAN-based approaches, this paper introduces a conditional image-to-image\ntranslation approach based on Brownian Bridge Diffusion Model (BBDM). We\nconducted comprehensive experiments on the MSAW dataset, a collection of paired SAR and\noptical images at 0.5m Very-High-Resolution (VHR). The\nexperimental results indicate that our method surpasses both the Conditional\nDiffusion Model (CDM) and the GAN-based models in diverse perceptual quality\nmetrics.\n","authors":["Seon-Hoon Kim","Dae-won Chung"],"pdf_url":"https://arxiv.org/pdf/2408.07947v1.pdf","comment":"5 pages, 2 figures, 1 table"},{"id":"http://arxiv.org/abs/2408.07944v1","updated":"2024-08-15T05:35:52Z","published":"2024-08-15T05:35:52Z","title":"Training Spatial-Frequency Visual Prompts and Probabilistic Clusters for\n Accurate Black-Box Transfer Learning","summary":" Despite the growing prevalence of black-box pre-trained models (PTMs) such as\nprediction API services, there remains a significant challenge in directly\napplying general models to real-world scenarios due to the data distribution\ngap. 
Considering a data deficiency and constrained computational resource\nscenario, this paper proposes a novel parameter-efficient transfer learning\nframework for vision recognition models in the black-box setting. Our framework\nincorporates two novel training techniques. First, we align the input space\n(i.e., image) of PTMs to the target data distribution by generating visual\nprompts of spatial and frequency domain. Along with the novel spatial-frequency\nhybrid visual prompter, we design a novel training technique based on\nprobabilistic clusters, which can enhance class separation in the output space\n(i.e., prediction probabilities). In experiments, our model demonstrates\nsuperior performance in a few-shot transfer learning setting across extensive\nvisual recognition datasets, surpassing state-of-the-art baselines.\nAdditionally, we show that the proposed method efficiently reduces\ncomputational costs for training and inference phases.\n","authors":["Wonwoo Cho","Kangyeol Kim","Saemee Choi","Jaegul Choo"],"pdf_url":"https://arxiv.org/pdf/2408.07944v1.pdf","comment":"ACM Multimedia 2024"},{"id":"http://arxiv.org/abs/2407.18552v2","updated":"2024-08-15T05:14:38Z","published":"2024-07-26T07:05:04Z","title":"Multimodal Emotion Recognition using Audio-Video Transformer Fusion with\n Cross Attention","summary":" Understanding emotions is a fundamental aspect of human communication.\nIntegrating audio and video signals offers a more comprehensive understanding\nof emotional states compared to traditional methods that rely on a single data\nsource, such as speech or facial expressions. Despite its potential, multimodal\nemotion recognition faces significant challenges, particularly in\nsynchronization, feature extraction, and fusion of diverse data sources. To\naddress these issues, this paper introduces a novel transformer-based model\nnamed Audio-Video Transformer Fusion with Cross Attention (AVT-CA). The AVT-CA\nmodel employs a transformer fusion approach to effectively capture and\nsynchronize interlinked features from both audio and video inputs, thereby\nresolving synchronization problems. Additionally, the Cross Attention mechanism\nwithin AVT-CA selectively extracts and emphasizes critical features while\ndiscarding irrelevant ones from both modalities, addressing feature extraction\nand fusion challenges. Extensive experimental analysis conducted on the\nCMU-MOSEI, RAVDESS and CREMA-D datasets demonstrates the efficacy of the\nproposed model. The results underscore the importance of AVT-CA in developing\nprecise and reliable multimodal emotion recognition systems for practical\napplications.\n","authors":["Joe Dhanith P R","Shravan Venkatraman","Modigari Narendra","Vigya Sharma","Santhosh Malarvannan","Amir H. Gandomi"],"pdf_url":"https://arxiv.org/pdf/2407.18552v2.pdf","comment":"38 Pages, 9 Tables, 12 Figures"},{"id":"http://arxiv.org/abs/2403.06517v2","updated":"2024-08-15T05:04:21Z","published":"2024-03-11T08:45:31Z","title":"Active Generation for Image Classification","summary":" Recently, the growing capabilities of deep generative models have underscored\ntheir potential in enhancing image classification accuracy. However, existing\nmethods often demand the generation of a disproportionately large number of\nimages compared to the original dataset, while having only marginal\nimprovements in accuracy. This computationally expensive and time-consuming\nprocess hampers the practicality of such approaches. 
In this paper, we propose\nto address the efficiency of image generation by focusing on the specific needs\nand characteristics of the model. With a central tenet of active learning, our\nmethod, named ActGen, takes a training-aware approach to image generation. It\naims to create images akin to the challenging or misclassified samples\nencountered by the current model and incorporates these generated images into\nthe training set to augment model performance. ActGen introduces an attentive\nimage guidance technique, using real images as guides during the denoising\nprocess of a diffusion model. The model's attention on class prompt is\nleveraged to ensure the preservation of similar foreground object while\ndiversifying the background. Furthermore, we introduce a gradient-based\ngeneration guidance method, which employs two losses to generate more\nchallenging samples and prevent the generated images from being too similar to\npreviously generated ones. Experimental results on the CIFAR and ImageNet\ndatasets demonstrate that our method achieves better performance with a\nsignificantly reduced number of generated images. Code is available at\nhttps://github.com/hunto/ActGen.\n","authors":["Tao Huang","Jiaqi Liu","Shan You","Chang Xu"],"pdf_url":"https://arxiv.org/pdf/2403.06517v2.pdf","comment":"ECCV 2024"},{"id":"http://arxiv.org/abs/2408.07932v1","updated":"2024-08-15T05:03:14Z","published":"2024-08-15T05:03:14Z","title":"MobileMEF: Fast and Efficient Method for Multi-Exposure Fusion","summary":" Recent advances in camera design and imaging technology have enabled the\ncapture of high-quality images using smartphones. However, due to the limited\ndynamic range of digital cameras, the quality of photographs captured in\nenvironments with highly imbalanced lighting often results in poor-quality\nimages. To address this issue, most devices capture multi-exposure frames and\nthen use some multi-exposure fusion method to merge those frames into a final\nfused image. Nevertheless, most traditional and current deep learning\napproaches are unsuitable for real-time applications on mobile devices due to\ntheir heavy computational and memory requirements. We propose a new method for\nmulti-exposure fusion based on an encoder-decoder deep learning architecture\nwith efficient building blocks tailored for mobile devices. This efficient\ndesign makes our model capable of processing 4K resolution images in less than\n2 seconds on mid-range smartphones. Our method outperforms state-of-the-art\ntechniques regarding full-reference quality measures and computational\nefficiency (runtime and memory usage), making it ideal for real-time\napplications on hardware-constrained devices. Our code is available at:\nhttps://github.com/LucasKirsten/MobileMEF.\n","authors":["Lucas Nedel Kirsten","Zhicheng Fu","Nikhil Ambha Madhusudhana"],"pdf_url":"https://arxiv.org/pdf/2408.07932v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.07931v1","updated":"2024-08-15T04:59:12Z","published":"2024-08-15T04:59:12Z","title":"Surgical SAM 2: Real-time Segment Anything in Surgical Video by\n Efficient Frame Pruning","summary":" Surgical video segmentation is a critical task in computer-assisted surgery\nand is vital for enhancing surgical quality and patient outcomes. Recently, the\nSegment Anything Model 2 (SAM2) framework has shown superior advancements in\nimage and video segmentation. 
However, SAM2 struggles with efficiency due to\nthe high computational demands of processing high-resolution images and complex\nand long-range temporal dynamics in surgical videos. To address these\nchallenges, we introduce Surgical SAM 2 (SurgSAM-2), an advanced model to\nutilize SAM2 with an Efficient Frame Pruning (EFP) mechanism, to facilitate\nreal-time surgical video segmentation. The EFP mechanism dynamically manages\nthe memory bank by selectively retaining only the most informative frames,\nreducing memory usage and computational cost while maintaining high\nsegmentation accuracy. Our extensive experiments demonstrate that SurgSAM-2\nsignificantly improves both efficiency and segmentation accuracy compared to\nthe vanilla SAM2. Remarkably, SurgSAM-2 achieves a 3$\\times$ FPS compared with\nSAM2, while also delivering state-of-the-art performance after fine-tuning with\nlower-resolution data. These advancements establish SurgSAM-2 as a leading\nmodel for surgical video analysis, making real-time surgical video segmentation\nin resource-constrained environments a feasible reality.\n","authors":["Haofeng Liu","Erli Zhang","Junde Wu","Mingxuan Hong","Yueming Jin"],"pdf_url":"https://arxiv.org/pdf/2408.07931v1.pdf","comment":"16 pages, 2 figures"},{"id":"http://arxiv.org/abs/2403.04024v2","updated":"2024-08-15T04:53:51Z","published":"2024-03-06T20:10:41Z","title":"Enhancing chest X-ray datasets with privacy-preserving large language\n models and multi-type annotations: a data-driven approach for improved\n classification","summary":" In chest X-ray (CXR) image analysis, rule-based systems are usually employed\nto extract labels from reports for dataset releases. However, there is still\nroom for improvement in label quality. These labelers typically output only\npresence labels, sometimes with binary uncertainty indicators, which limits\ntheir usefulness. Supervised deep learning models have also been developed for\nreport labeling but lack adaptability, similar to rule-based systems. In this\nwork, we present MAPLEZ (Medical report Annotations with Privacy-preserving\nLarge language model using Expeditious Zero shot answers), a novel approach\nleveraging a locally executable Large Language Model (LLM) to extract and\nenhance findings labels on CXR reports. MAPLEZ extracts not only binary labels\nindicating the presence or absence of a finding but also the location,\nseverity, and radiologists' uncertainty about the finding. Over eight\nabnormalities from five test sets, we show that our method can extract these\nannotations with an increase of 3.6 percentage points (pp) in macro F1 score\nfor categorical presence annotations and more than 20 pp increase in F1 score\nfor the location annotations over competing labelers. Additionally, using the\ncombination of improved annotations and multi-type annotations in\nclassification supervision, we demonstrate substantial advancements in model\nquality, with an increase of 1.1 pp in AUROC over models trained with\nannotations from the best alternative approach. 
We share code and annotations.\n","authors":["Ricardo Bigolin Lanfredi","Pritam Mukherjee","Ronald Summers"],"pdf_url":"https://arxiv.org/pdf/2403.04024v2.pdf","comment":"Code and data:\n https://github.com/rsummers11/CADLab/tree/master/MAPLEZ_LLM_report_labeler/"},{"id":"http://arxiv.org/abs/2408.03388v2","updated":"2024-08-15T04:24:00Z","published":"2024-08-06T18:18:37Z","title":"A Non-negative VAE:the Generalized Gamma Belief Network","summary":" The gamma belief network (GBN), often regarded as a deep topic model, has\ndemonstrated its potential for uncovering multi-layer interpretable latent\nrepresentations in text data. Its notable capability to acquire interpretable\nlatent factors is partially attributed to sparse and non-negative\ngamma-distributed latent variables. However, the existing GBN and its\nvariations are constrained by the linear generative model, thereby limiting\ntheir expressiveness and applicability. To address this limitation, we\nintroduce the generalized gamma belief network (Generalized GBN) in this paper,\nwhich extends the original linear generative model to a more expressive\nnon-linear generative model. Since the parameters of the Generalized GBN no\nlonger possess an analytic conditional posterior, we further propose an\nupward-downward Weibull inference network to approximate the posterior\ndistribution of the latent variables. The parameters of both the generative\nmodel and the inference network are jointly trained within the variational\ninference framework. Finally, we conduct comprehensive experiments on both\nexpressivity and disentangled representation learning tasks to evaluate the\nperformance of the Generalized GBN against state-of-the-art Gaussian\nvariational autoencoders serving as baselines.\n","authors":["Zhibin Duan","Tiansheng Wen","Muyao Wang","Bo Chen","Mingyuan Zhou"],"pdf_url":"https://arxiv.org/pdf/2408.03388v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.07922v1","updated":"2024-08-15T04:18:40Z","published":"2024-08-15T04:18:40Z","title":"A Deep Features-Based Approach Using Modified ResNet50 and Gradient\n Boosting for Visual Sentiments Classification","summary":" The versatile nature of Visual Sentiment Analysis (VSA) is one reason for its\nrising profile. It isn't easy to efficiently manage social media data with\nvisual information since previous research has concentrated on Sentiment\nAnalysis (SA) of single modalities, like textual. In addition, most visual\nsentiment studies need to adequately classify sentiment because they are mainly\nfocused on simply merging modal attributes without investigating their\nintricate relationships. This prompted the suggestion of developing a fusion of\ndeep learning and machine learning algorithms. In this research, a deep\nfeature-based method for multiclass classification has been used to extract\ndeep features from modified ResNet50. Furthermore, gradient boosting algorithm\nhas been used to classify photos containing emotional content. The approach is\nthoroughly evaluated on two benchmarked datasets, CrowdFlower and GAPED.\nFinally, cutting-edge deep learning and machine learning models were used to\ncompare the proposed strategy. 
When compared to state-of-the-art approaches,\nthe proposed method demonstrates exceptional performance on the datasets\npresented.\n","authors":["Muhammad Arslan","Muhammad Mubeen","Arslan Akram","Saadullah Farooq Abbasi","Muhammad Salman Ali","Muhammad Usman Tariq"],"pdf_url":"https://arxiv.org/pdf/2408.07922v1.pdf","comment":"4 pages, 4 figures, 3 tables, IEEE International Conference on\n Multimedia Information Processing and Retrieval (MIPR) 2024"},{"id":"http://arxiv.org/abs/2408.07543v2","updated":"2024-08-15T04:01:53Z","published":"2024-08-14T13:23:43Z","title":"MathScape: Evaluating MLLMs in multimodal Math Scenarios through a\n Hierarchical Benchmark","summary":" With the development of Multimodal Large Language Models (MLLMs), the\nevaluation of multimodal models in the context of mathematical problems has\nbecome a valuable research field. Multimodal visual-textual mathematical\nreasoning serves as a critical indicator for evaluating the comprehension and\ncomplex multi-step quantitative reasoning abilities of MLLMs. However, previous\nmultimodal math benchmarks have not sufficiently integrated visual and textual\ninformation. To address this gap, we proposed MathScape, a new benchmark that\nemphasizes the understanding and application of combined visual and textual\ninformation. MathScape is designed to evaluate photo-based math problem\nscenarios, assessing the theoretical understanding and application ability of\nMLLMs through a categorical hierarchical approach. We conduct a\nmulti-dimensional evaluation on 11 advanced MLLMs, revealing that our benchmark\nis challenging even for the most sophisticated models. By analyzing the\nevaluation results, we identify the limitations of MLLMs, offering valuable\ninsights for enhancing model performance.\n","authors":["Minxuan Zhou","Hao Liang","Tianpeng Li","Zhiyu Wu","Mingan Lin","Linzhuang Sun","Yaqi Zhou","Yan Zhang","Xiaoqin Huang","Yicong Chen","Yujing Qiao","Weipeng Chen","Bin Cui","Wentao Zhang","Zenan Zhou"],"pdf_url":"https://arxiv.org/pdf/2408.07543v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.07576v2","updated":"2024-08-15T03:55:11Z","published":"2024-08-14T14:16:52Z","title":"MetaSeg: MetaFormer-based Global Contexts-aware Network for Efficient\n Semantic Segmentation","summary":" Beyond the Transformer, it is important to explore how to exploit the\ncapacity of the MetaFormer, an architecture that is fundamental to the\nperformance improvements of the Transformer. Previous studies have exploited it\nonly for the backbone network. Unlike previous studies, we explore the capacity\nof the Metaformer architecture more extensively in the semantic segmentation\ntask. We propose a powerful semantic segmentation network, MetaSeg, which\nleverages the Metaformer architecture from the backbone to the decoder. Our\nMetaSeg shows that the MetaFormer architecture plays a significant role in\ncapturing the useful contexts for the decoder as well as for the backbone. In\naddition, recent segmentation methods have shown that using a CNN-based\nbackbone for extracting the spatial information and a decoder for extracting\nthe global information is more effective than using a transformer-based\nbackbone with a CNN-based decoder. 
This motivates us to adopt the CNN-based\nbackbone using the MetaFormer block and design our MetaFormer-based decoder,\nwhich consists of a novel self-attention module to capture the global contexts.\nTo consider both the global contexts extraction and the computational\nefficiency of the self-attention for semantic segmentation, we propose a\nChannel Reduction Attention (CRA) module that reduces the channel dimension of\nthe query and key into the one dimension. In this way, our proposed MetaSeg\noutperforms the previous state-of-the-art methods with more efficient\ncomputational costs on popular semantic segmentation and a medical image\nsegmentation benchmark, including ADE20K, Cityscapes, COCO-stuff, and Synapse.\nThe code is available at https://github.com/hyunwoo137/MetaSeg.\n","authors":["Beoungwoo Kang","Seunghun Moon","Yubin Cho","Hyunwoo Yu","Suk-Ju Kang"],"pdf_url":"https://arxiv.org/pdf/2408.07576v2.pdf","comment":"Accepted by WACV 2024"},{"id":"http://arxiv.org/abs/2408.07917v1","updated":"2024-08-15T03:54:33Z","published":"2024-08-15T03:54:33Z","title":"GOReloc: Graph-based Object-Level Relocalization for Visual SLAM","summary":" This article introduces a novel method for object-level relocalization of\nrobotic systems. It determines the pose of a camera sensor by robustly\nassociating the object detections in the current frame with 3D objects in a\nlightweight object-level map. Object graphs, considering semantic\nuncertainties, are constructed for both the incoming camera frame and the\npre-built map. Objects are represented as graph nodes, and each node employs\nunique semantic descriptors based on our devised graph kernels. We extract a\nsubgraph from the target map graph by identifying potential object associations\nfor each object detection, then refine these associations and pose estimations\nusing a RANSAC-inspired strategy. Experiments on various datasets demonstrate\nthat our method achieves more accurate data association and significantly\nincreases relocalization success rates compared to baseline methods. The\nimplementation of our method is released at\n\\url{https://github.com/yutongwangBIT/GOReloc}.\n","authors":["Yutong Wang","Chaoyang Jiang","Xieyuanli Chen"],"pdf_url":"https://arxiv.org/pdf/2408.07917v1.pdf","comment":"8 pages, accepted by IEEE RAL"},{"id":"http://arxiv.org/abs/2408.05426v2","updated":"2024-08-15T03:36:43Z","published":"2024-08-10T04:14:14Z","title":"SAM-FNet: SAM-Guided Fusion Network for Laryngo-Pharyngeal Tumor\n Detection","summary":" Laryngo-pharyngeal cancer (LPC) is a highly fatal malignant disease affecting\nthe head and neck region. Previous studies on endoscopic tumor detection,\nparticularly those leveraging dual-branch network architectures, have shown\nsignificant advancements in tumor detection. These studies highlight the\npotential of dual-branch networks in improving diagnostic accuracy by\neffectively integrating global and local (lesion) feature extraction. However,\nthey are still limited in their capabilities to accurately locate the lesion\nregion and capture the discriminative feature information between the global\nand local branches. To address these issues, we propose a novel SAM-guided\nfusion network (SAM-FNet), a dual-branch network for laryngo-pharyngeal tumor\ndetection. By leveraging the powerful object segmentation capabilities of the\nSegment Anything Model (SAM), we introduce the SAM into the SAM-FNet to\naccurately segment the lesion region. 
Furthermore, we propose a GAN-like\nfeature optimization (GFO) module to capture the discriminative features\nbetween the global and local branches, enhancing the fusion feature\ncomplementarity. Additionally, we collect two LPC datasets from the First\nAffiliated Hospital (FAHSYSU) and the Sixth Affiliated Hospital (SAHSYSU) of\nSun Yat-sen University. The FAHSYSU dataset is used as the internal dataset for\ntraining the model, while the SAHSYSU dataset is used as the external dataset\nfor evaluating the model's performance. Extensive experiments on both datasets\nof FAHSYSU and SAHSYSU demonstrate that the SAM-FNet can achieve competitive\nresults, outperforming the state-of-the-art counterparts. The source code of\nSAM-FNet is available at the URL of https://github.com/VVJia/SAM-FNet.\n","authors":["Jia Wei","Yun Li","Meiyu Qiu","Hongyu Chen","Xiaomao Fan","Wenbin Lei"],"pdf_url":"https://arxiv.org/pdf/2408.05426v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.07910v1","updated":"2024-08-15T03:34:02Z","published":"2024-08-15T03:34:02Z","title":"DM2RM: Dual-Mode Multimodal Ranking for Target Objects and Receptacles\n Based on Open-Vocabulary Instructions","summary":" In this study, we aim to develop a domestic service robot (DSR) that, guided\nby open-vocabulary instructions, can carry everyday objects to the specified\npieces of furniture. Few existing methods handle mobile manipulation tasks with\nopen-vocabulary instructions in the image retrieval setting, and most do not\nidentify both the target objects and the receptacles. We propose the Dual-Mode\nMultimodal Ranking model (DM2RM), which enables images of both the target\nobjects and receptacles to be retrieved using a single model based on\nmultimodal foundation models. We introduce a switching mechanism that leverages\na mode token and phrase identification via a large language model to switch the\nembedding space based on the prediction target. To evaluate the DM2RM, we\nconstruct a novel dataset including real-world images collected from hundreds\nof building-scale environments and crowd-sourced instructions with referring\nexpressions. The evaluation results show that the proposed DM2RM outperforms\nprevious approaches in terms of standard metrics in image retrieval settings.\nFurthermore, we demonstrate the application of the DM2RM on a standardized\nreal-world DSR platform including fetch-and-carry actions, where it achieves a\ntask success rate of 82% despite the zero-shot transfer setting. Demonstration\nvideos, code, and more materials are available at\nhttps://kkrr10.github.io/dm2rm/.\n","authors":["Ryosuke Korekata","Kanta Kaneda","Shunya Nagashima","Yuto Imai","Komei Sugiura"],"pdf_url":"https://arxiv.org/pdf/2408.07910v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.08556v2","updated":"2024-08-15T03:30:39Z","published":"2024-03-13T14:08:25Z","title":"SM4Depth: Seamless Monocular Metric Depth Estimation across Multiple\n Cameras and Scenes by One Model","summary":" In the last year, universal monocular metric depth estimation (universal\nMMDE) has gained considerable attention, serving as the foundation model for\nvarious multimedia tasks, such as video and image editing. Nonetheless, current\napproaches face challenges in maintaining consistent accuracy across diverse\nscenes without scene-specific parameters and pre-training, hindering the\npracticality of MMDE. 
Furthermore, these methods rely on extensive datasets\ncomprising millions, if not tens of millions, of samples for training, leading to\nsignificant time and hardware expenses. This paper presents SM$^4$Depth, a\nmodel that seamlessly works for both indoor and outdoor scenes, without needing\nextensive training data and GPU clusters. Firstly, to obtain consistent depth\nacross diverse scenes, we propose a novel metric scale modeling, i.e.,\nvariation-based unnormalized depth bins. It reduces the ambiguity of the\nconventional metric bins and enables better adaptation to large depth gaps of\nscenes during training. Secondly, we propose a \"divide and conquer\" solution to\nreduce reliance on massive training data. Instead of estimating directly from\nthe vast solution space, the metric bins are estimated from multiple solution\nsub-spaces to reduce complexity. Additionally, we introduce an uncut depth\ndataset, BUPT Depth, to evaluate the depth accuracy and consistency across\nvarious indoor and outdoor scenes. Trained on a consumer-grade GPU using just\n150K RGB-D pairs, SM$^4$Depth achieves outstanding performance on most\nnever-before-seen datasets, in particular maintaining consistent accuracy across\nindoor and outdoor scenes. The code can be found at\nhttps://github.com/mRobotit/SM4Depth.\n","authors":["Yihao Liu","Feng Xue","Anlong Ming","Mingshuai Zhao","Huadong Ma","Nicu Sebe"],"pdf_url":"https://arxiv.org/pdf/2403.08556v2.pdf","comment":"Accepted by ACM MultiMedia 24, Project Page:\n xuefeng-cvr.github.io/SM4Depth"},{"id":"http://arxiv.org/abs/2207.09792v2","updated":"2024-08-15T03:25:15Z","published":"2022-07-20T10:09:53Z","title":"Unsupervised Industrial Anomaly Detection via Pattern Generative and\n Contrastive Networks","summary":" It is hard to collect enough flaw images for training deep learning networks\nin industrial production. Therefore, existing industrial anomaly detection\nmethods prefer to use CNN-based unsupervised detection and localization networks\nto achieve this task. However, these methods always fail when variations\noccur in new signals, since traditional end-to-end networks struggle to fit\nnonlinear models in high-dimensional space. Moreover, they essentially build a\nmemory library by clustering the features of normal images,\nwhich makes them not robust to texture changes. To this end, we propose a\nVision Transformer based (ViT-based) unsupervised anomaly detection network. It\nutilizes hierarchical task learning and human experience to enhance its\ninterpretability. Our network consists of pattern generation and comparison\nnetworks. The pattern generation network uses two ViT-based encoder modules to\nextract the features of two consecutive image patches, then uses a ViT-based\ndecoder module to learn the human-designed style of these features and predict\nthe third image patch. After this, we use a Siamese-based network to compute\nthe similarity between the generated image patch and the original image patch. Finally,\nwe refine the anomaly localization with a bi-directional inference strategy.\nComparison experiments on the public MVTec dataset show our method achieves\n99.8% AUC, surpassing previous state-of-the-art methods. In addition, we\ngive a qualitative illustration on our own leather and cloth datasets. 
The\naccurate segmentation results further demonstrate the effectiveness of our method in anomaly\ndetection.\n","authors":["Jianfeng Huang","Chenyang Li","Yimin Lin","Shiguo Lian"],"pdf_url":"https://arxiv.org/pdf/2207.09792v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.07905v1","updated":"2024-08-15T03:24:00Z","published":"2024-08-15T03:24:00Z","title":"Persistence Image from 3D Medical Image: Superpixel and Optimized\n Gaussian Coefficient","summary":" Topological data analysis (TDA) uncovers crucial properties of objects in\nmedical imaging. Methods based on persistent homology have demonstrated their\nadvantages in capturing topological features that traditional deep learning\nmethods cannot detect in both radiology and pathology. However, previous\nresearch primarily focused on 2D image analysis, neglecting the comprehensive\n3D context. In this paper, we propose an innovative 3D TDA approach that\nincorporates the concept of superpixels to transform 3D medical image features\ninto point cloud data. By utilizing an Optimized Gaussian Coefficient, the\nproposed 3D TDA method, for the first time, efficiently generates holistic\nPersistence Images for 3D volumetric data. Our 3D TDA method exhibits superior\nperformance on the MedMNIST3D dataset when compared to other traditional\nmethods, showcasing its potential effectiveness in modeling 3D persistent\nhomology-based topological analysis for classification tasks. The\nsource code is publicly available at\nhttps://github.com/hrlblab/TopologicalDataAnalysis3D.\n","authors":["Yanfan Zhu","Yash Singh","Khaled Younis","Shunxing Bao","Yuankai Huo"],"pdf_url":"https://arxiv.org/pdf/2408.07905v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.07903v1","updated":"2024-08-15T03:13:53Z","published":"2024-08-15T03:13:53Z","title":"Deep Joint Denoising and Detection for Enhanced Intracellular Particle\n Analysis","summary":" Reliable analysis of intracellular dynamic processes in time-lapse\nfluorescence microscopy images requires complete and accurate tracking of all\nsmall particles in all time frames of the image sequences. A fundamental first\nstep towards this goal is particle detection. Given the small size of the\nparticles, their detection is greatly affected by image noise. Recent studies\nhave shown that applying image denoising as a preprocessing step indeed\nimproves particle detection and their subsequent tracking. Deep learning based\nparticle detection methods have shown superior results compared to traditional\ndetection methods. However, they do not explicitly aim to remove noise from the\nimages to facilitate detection. Thus, we hypothesize that their performance\ncould be further improved. In this paper, we propose a new deep neural network,\ncalled DENODET (denoising-detection network), which performs image denoising\nand particle detection simultaneously. We show that integrative denoising and\ndetection yields more accurate detection results. 
Our method achieves superior\nresults compared to state-of-the-art particle detection methods on the particle\ntracking challenge dataset and our own real fluorescence microscopy image data.\n","authors":["Yao Yao","Ihor Smal","Ilya Grigoriev","Anna Akhmanova","Erik Meijering"],"pdf_url":"https://arxiv.org/pdf/2408.07903v1.pdf","comment":"11 pages, 4 figures, 4 tables"},{"id":"http://arxiv.org/abs/2408.07891v1","updated":"2024-08-15T02:32:50Z","published":"2024-08-15T02:32:50Z","title":"Quantum-inspired Interpretable Deep Learning Architecture for Text\n Sentiment Analysis","summary":" Text has become the predominant form of communication on social media,\nembedding a wealth of emotional nuances. Consequently, the extraction of\nemotional information from text is of paramount importance. Despite previous\nresearch making some progress, existing text sentiment analysis models still\nface challenges in integrating diverse semantic information and lack\ninterpretability. To address these issues, we propose a quantum-inspired deep\nlearning architecture that combines fundamental principles of quantum mechanics\n(QM principles) with deep learning models for text sentiment analysis.\nSpecifically, we analyze the commonalities between text representation and QM\nprinciples to design a quantum-inspired text representation method and further\ndevelop a quantum-inspired text embedding layer. Additionally, we design a\nfeature extraction layer based on long short-term memory (LSTM) networks and\nself-attention mechanisms (SAMs). Finally, we calculate the text density matrix\nusing the quantum complex numbers principle and apply 2D-convolution neural\nnetworks (CNNs) for feature condensation and dimensionality reduction. Through\na series of visualization, comparative, and ablation experiments, we\ndemonstrate that our model not only shows significant advantages in accuracy\nand efficiency compared to previous related models but also achieves a certain\nlevel of interpretability by integrating QM principles. Our code is available\nat QISA.\n","authors":["Bingyu Li","Da Zhang","Zhiyuan Zhao","Junyu Gao","Yuan Yuan"],"pdf_url":"https://arxiv.org/pdf/2408.07891v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.07889v1","updated":"2024-08-15T02:29:00Z","published":"2024-08-15T02:29:00Z","title":"MambaVT: Spatio-Temporal Contextual Modeling for robust RGB-T Tracking","summary":" Existing RGB-T tracking algorithms have made remarkable progress by\nleveraging the global interaction capability and extensive pre-trained models\nof the Transformer architecture. Nonetheless, these methods mainly adopt\nimagepair appearance matching and face challenges of the intrinsic high\nquadratic complexity of the attention mechanism, resulting in constrained\nexploitation of temporal information. Inspired by the recently emerged State\nSpace Model Mamba, renowned for its impressive long sequence modeling\ncapabilities and linear computational complexity, this work innovatively\nproposes a pure Mamba-based framework (MambaVT) to fully exploit\nspatio-temporal contextual modeling for robust visible-thermal tracking.\nSpecifically, we devise the long-range cross-frame integration component to\nglobally adapt to target appearance variations, and introduce short-term\nhistorical trajectory prompts to predict the subsequent target states based on\nlocal temporal location clues. 
Extensive experiments show the significant\npotential of vision Mamba for RGB-T tracking, with MambaVT achieving\nstate-of-the-art performance on four mainstream benchmarks while requiring\nlower computational costs. We aim for this work to serve as a simple yet strong\nbaseline, stimulating future research in this field. The code and pre-trained\nmodels will be made available.\n","authors":["Simiao Lai","Chang Liu","Jiawen Zhu","Ben Kang","Yang Liu","Dong Wang","Huchuan Lu"],"pdf_url":"https://arxiv.org/pdf/2408.07889v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.07516v2","updated":"2024-08-15T02:14:18Z","published":"2024-08-14T12:49:50Z","title":"DIffSteISR: Harnessing Diffusion Prior for Superior Real-world Stereo\n Image Super-Resolution","summary":" We introduce DiffSteISR, a pioneering framework for reconstructing real-world\nstereo images. DiffSteISR utilizes the powerful prior knowledge embedded in\npre-trained text-to-image model to efficiently recover the lost texture details\nin low-resolution stereo images. Specifically, DiffSteISR implements a\ntime-aware stereo cross attention with temperature adapter (TASCATA) to guide\nthe diffusion process, ensuring that the generated left and right views exhibit\nhigh texture consistency thereby reducing disparity error between the\nsuper-resolved images and the ground truth (GT) images. Additionally, a stereo\nomni attention control network (SOA ControlNet) is proposed to enhance the\nconsistency of super-resolved images with GT images in the pixel, perceptual,\nand distribution space. Finally, DiffSteISR incorporates a stereo semantic\nextractor (SSE) to capture unique viewpoint soft semantic information and\nshared hard tag semantic information, thereby effectively improving the\nsemantic accuracy and consistency of the generated left and right images.\nExtensive experimental results demonstrate that DiffSteISR accurately\nreconstructs natural and precise textures from low-resolution stereo images\nwhile maintaining a high consistency of semantic and texture between the left\nand right views.\n","authors":["Yuanbo Zhou","Xinlin Zhang","Wei Deng","Tao Wang","Tao Tan","Qinquan Gao","Tong Tong"],"pdf_url":"https://arxiv.org/pdf/2408.07516v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.06070v2","updated":"2024-08-15T02:08:08Z","published":"2024-08-12T11:41:18Z","title":"ControlNeXt: Powerful and Efficient Control for Image and Video\n Generation","summary":" Diffusion models have demonstrated remarkable and robust abilities in both\nimage and video generation. To achieve greater control over generated results,\nresearchers introduce additional architectures, such as ControlNet, Adapters\nand ReferenceNet, to integrate conditioning controls. However, current\ncontrollable generation methods often require substantial additional\ncomputational resources, especially for video generation, and face challenges\nin training or exhibit weak control. In this paper, we propose ControlNeXt: a\npowerful and efficient method for controllable image and video generation. We\nfirst design a more straightforward and efficient architecture, replacing heavy\nadditional branches with minimal additional cost compared to the base model.\nSuch a concise structure also allows our method to seamlessly integrate with\nother LoRA weights, enabling style alteration without the need for additional\ntraining. As for training, we reduce up to 90% of learnable parameters compared\nto the alternatives. 
Furthermore, we propose another method called Cross\nNormalization (CN) as a replacement for 'Zero-Convolution' to achieve fast and\nstable training convergence. We have conducted various experiments with\ndifferent base models across images and videos, demonstrating the robustness of\nour method.\n","authors":["Bohao Peng","Jian Wang","Yuechen Zhang","Wenbo Li","Ming-Chang Yang","Jiaya Jia"],"pdf_url":"https://arxiv.org/pdf/2408.06070v2.pdf","comment":"controllable generation"},{"id":"http://arxiv.org/abs/2305.12691v3","updated":"2024-08-15T01:59:26Z","published":"2023-05-22T03:58:25Z","title":"Hi-ResNet: Edge Detail Enhancement for High-Resolution Remote Sensing\n Segmentation","summary":" High-resolution remote sensing (HRS) semantic segmentation extracts key\nobjects from high-resolution coverage areas. However, objects of the same\ncategory within HRS images generally show significant differences in scale and\nshape across diverse geographical environments, making it difficult to fit the\ndata distribution. Additionally, a complex background environment causes\nobjects of different categories to appear similar, which leads to a\nsubstantial number of objects being misclassified as background. These\nissues make existing learning algorithms sub-optimal. In this work, we solve\nthe above-mentioned problems by proposing a High-resolution remote sensing\nnetwork (Hi-ResNet) with efficient network structure designs, which sequentially consists of\na funnel module, a multi-branch module with stacks of information aggregation\n(IA) blocks, and a feature refinement module, together with a Class-agnostic\nEdge Aware (CEA) loss. Specifically, we propose a funnel module to downsample,\nwhich reduces the computational cost, and to extract high-resolution semantic\ninformation from the initial input image. Secondly, we downsample the processed\nfeature images into multi-resolution branches incrementally to capture image\nfeatures at different scales and apply IA blocks, which capture key latent\ninformation by leveraging attention mechanisms, for effective feature\naggregation, distinguishing image features of the same class with variant\nscales and shapes. Finally, our feature refinement module integrates the CEA\nloss function, which disambiguates inter-class objects with similar shapes and\nincreases the data distribution distance for correct predictions. With\neffective pre-training strategies, we demonstrate the superiority of Hi-ResNet\nover state-of-the-art methods on three HRS segmentation benchmarks.\n","authors":["Yuxia Chen","Pengcheng Fang","Jianhui Yu","Xiaoling Zhong","Xiaoming Zhang","Tianrui Li"],"pdf_url":"https://arxiv.org/pdf/2305.12691v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.07883v1","updated":"2024-08-15T01:54:39Z","published":"2024-08-15T01:54:39Z","title":"To Impute or Not: Recommendations for Multibiometric Fusion","summary":" Combining match scores from different biometric systems via fusion is a\nwell-established approach to improving recognition accuracy. However, missing\nscores can degrade performance as well as limit the possible fusion techniques\nthat can be applied. Imputation is a promising technique in multibiometric\nsystems for replacing missing data. In this paper, we evaluate various score\nimputation approaches on three multimodal biometric score datasets, viz. NIST\nBSSR1, BIOCOP2008, and MIT LL Trimodal, and investigate the factors which might\ninfluence the effectiveness of imputation. 
Our studies reveal three key\nobservations: (1) Imputation is preferable over not imputing missing scores,\neven when the fusion rule does not require complete score data. (2) Balancing\nthe classes in the training data is crucial to mitigate negative biases in the\nimputation technique towards the under-represented class, even if it involves\ndropping a substantial number of score vectors. (3) Multivariate imputation\napproaches seem to be beneficial when scores between modalities are correlated,\nwhile univariate approaches seem to benefit scenarios where scores between\nmodalities are less correlated.\n","authors":["Melissa R Dale","Elliot Singer","Bengt J. Borgström","Arun Ross"],"pdf_url":"https://arxiv.org/pdf/2408.07883v1.pdf","comment":"Proc. of IEEE International Workshop on Information Forensics and\n Security (WIFS), (Nuremberg, Germany), December 2023"},{"id":"http://arxiv.org/abs/2408.07666v2","updated":"2024-08-15T01:49:29Z","published":"2024-08-14T16:58:48Z","title":"Model Merging in LLMs, MLLMs, and Beyond: Methods, Theories,\n Applications and Opportunities","summary":" Model merging is an efficient empowerment technique in the machine learning\ncommunity that does not require the collection of raw training data and does\nnot require expensive computation. As model merging becomes increasingly\nprevalent across various fields, it is crucial to understand the available\nmodel merging techniques comprehensively. However, there is a significant gap\nin the literature regarding a systematic and thorough review of these\ntechniques. This survey provides a comprehensive overview of model merging\nmethods and theories, their applications in various domains and settings, and\nfuture research directions. Specifically, we first propose a new taxonomic\napproach that exhaustively discusses existing model merging methods. Secondly,\nwe discuss the application of model merging techniques in large language\nmodels, multimodal large language models, and 10+ machine learning subfields,\nincluding continual learning, multi-task learning, few-shot learning, etc.\nFinally, we highlight the remaining challenges of model merging and discuss\nfuture research directions. A comprehensive list of papers about model merging\nis available at\n\\url{https://github.com/EnnengYang/Awesome-Model-Merging-Methods-Theories-Applications}.\n","authors":["Enneng Yang","Li Shen","Guibing Guo","Xingwei Wang","Xiaochun Cao","Jie Zhang","Dacheng Tao"],"pdf_url":"https://arxiv.org/pdf/2408.07666v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.04461v2","updated":"2024-08-15T01:31:29Z","published":"2024-07-05T12:11:33Z","title":"VCD-Texture: Variance Alignment based 3D-2D Co-Denoising for Text-Guided\n Texturing","summary":" Recent research on texture synthesis for 3D shapes benefits a lot from\ndramatically developed 2D text-to-image diffusion models, including\ninpainting-based and optimization-based approaches. However, these methods\nignore the modal gap between the 2D diffusion model and 3D objects, which\nprimarily render 3D objects into 2D images and texture each image separately.\nIn this paper, we revisit the texture synthesis and propose a Variance\nalignment based 3D-2D Collaborative Denoising framework, dubbed VCD-Texture, to\naddress these issues. Formally, we first unify both 2D and 3D latent feature\nlearning in diffusion self-attention modules with re-projected 3D attention\nreceptive fields. 
Subsequently, the denoised multi-view 2D latent features are\naggregated into 3D space and then rasterized back to formulate more consistent\n2D predictions. However, the rasterization process suffers from an intractable\nvariance bias, which is theoretically addressed by the proposed variance\nalignment, achieving high-fidelity texture synthesis. Moreover, we present an\ninpainting refinement to further improve the details with conflicting regions.\nNotably, there is not a publicly available benchmark to evaluate texture\nsynthesis, which hinders its development. Thus we construct a new evaluation\nset built upon three open-source 3D datasets and propose to use four metrics to\nthoroughly validate the texturing performance. Comprehensive experiments\ndemonstrate that VCD-Texture achieves superior performance against other\ncounterparts.\n","authors":["Shang Liu","Chaohui Yu","Chenjie Cao","Wen Qian","Fan Wang"],"pdf_url":"https://arxiv.org/pdf/2407.04461v2.pdf","comment":"ECCV 2024"},{"id":"http://arxiv.org/abs/2408.02906v2","updated":"2024-08-15T01:30:06Z","published":"2024-08-06T02:38:22Z","title":"Dual-View Pyramid Pooling in Deep Neural Networks for Improved Medical\n Image Classification and Confidence Calibration","summary":" Spatial pooling (SP) and cross-channel pooling (CCP) operators have been\napplied to aggregate spatial features and pixel-wise features from feature maps\nin deep neural networks (DNNs), respectively. Their main goal is to reduce\ncomputation and memory overhead without visibly weakening the performance of\nDNNs. However, SP often faces the problem of losing the subtle feature\nrepresentations, while CCP has a high possibility of ignoring salient feature\nrepresentations, which may lead to both miscalibration of confidence issues and\nsuboptimal medical classification results. To address these problems, we\npropose a novel dual-view framework, the first to systematically investigate\nthe relative roles of SP and CCP by analyzing the difference between spatial\nfeatures and pixel-wise features. Based on this framework, we propose a new\npooling method, termed dual-view pyramid pooling (DVPP), to aggregate\nmulti-scale dual-view features. DVPP aims to boost both medical image\nclassification and confidence calibration performance by fully leveraging the\nmerits of SP and CCP operators from a dual-axis perspective. Additionally, we\ndiscuss how to fulfill DVPP with five parameter-free implementations. Extensive\nexperiments on six 2D/3D medical image classification tasks show that our DVPP\nsurpasses state-of-the-art pooling methods in terms of medical image\nclassification results and confidence calibration across different DNNs.\n","authors":["Xiaoqing Zhang","Qiushi Nie","Zunjie Xiao","Jilu Zhao","Xiao Wu","Pengxin Guo","Runzhi Li","Jin Liu","Yanjie Wei","Yi Pan"],"pdf_url":"https://arxiv.org/pdf/2408.02906v2.pdf","comment":"30"},{"id":"http://arxiv.org/abs/2303.08815v3","updated":"2024-08-15T01:04:32Z","published":"2023-03-15T17:59:13Z","title":"Lane Graph as Path: Continuity-preserving Path-wise Modeling for Online\n Lane Graph Construction","summary":" Online lane graph construction is a promising but challenging task in\nautonomous driving. Previous methods usually model the lane graph at the pixel\nor piece level, and recover the lane graph by pixel-wise or piece-wise\nconnection, which breaks down the continuity of the lane and results in\nsuboptimal performance. 
Human drivers focus on and drive along the continuous\nand complete paths instead of considering lane pieces. Autonomous vehicles also\nrequire path-specific guidance from lane graph for trajectory planning. We\nargue that the path, which indicates the traffic flow, is the primitive of the\nlane graph. Motivated by this, we propose to model the lane graph in a novel\npath-wise manner, which well preserves the continuity of the lane and encodes\ntraffic information for planning. We present a path-based online lane graph\nconstruction method, termed LaneGAP, which end-to-end learns the path and\nrecovers the lane graph via a Path2Graph algorithm. We qualitatively and\nquantitatively demonstrate the superior accuracy and efficiency of LaneGAP over\nconventional pixel-based and piece-based methods on the challenging nuScenes\nand Argoverse2 datasets under controllable and fair conditions. Compared to the\nrecent state-of-the-art piece-wise method TopoNet on the OpenLane-V2 dataset,\nLaneGAP still outperforms by 1.6 mIoU, further validating the effectiveness of\npath-wise modeling. Abundant visualizations in the supplementary material show\nLaneGAP can cope with diverse traffic conditions. Code is released at\n\\url{https://github.com/hustvl/LaneGAP}.\n","authors":["Bencheng Liao","Shaoyu Chen","Bo Jiang","Tianheng Cheng","Qian Zhang","Wenyu Liu","Chang Huang","Xinggang Wang"],"pdf_url":"https://arxiv.org/pdf/2303.08815v3.pdf","comment":"Accepted to ECCV 2024"},{"id":"http://arxiv.org/abs/2408.07867v1","updated":"2024-08-15T00:45:21Z","published":"2024-08-15T00:45:21Z","title":"Continuous Perception Benchmark","summary":" Humans continuously perceive and process visual signals. However, current\nvideo models typically either sample key frames sparsely or divide videos into\nchunks and densely sample within each chunk. This approach stems from the fact\nthat most existing video benchmarks can be addressed by analyzing key frames or\naggregating information from separate chunks. We anticipate that the next\ngeneration of vision models will emulate human perception by processing visual\ninput continuously and holistically. To facilitate the development of such\nmodels, we propose the Continuous Perception Benchmark, a video question\nanswering task that cannot be solved by focusing solely on a few frames or by\ncaptioning small chunks and then summarizing using language models. Extensive\nexperiments demonstrate that existing models, whether commercial or\nopen-source, struggle with these tasks, indicating the need for new technical\nadvancements in this direction.\n","authors":["Zeyu Wang","Zhenzhen Weng","Serena Yeung-Levy"],"pdf_url":"https://arxiv.org/pdf/2408.07867v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.07860v1","updated":"2024-08-15T00:17:00Z","published":"2024-08-15T00:17:00Z","title":"A Novel Generative Artificial Intelligence Method for Interference Study\n on Multiplex Brightfield Immunohistochemistry Images","summary":" Multiplex brightfield imaging offers the advantage of simultaneously\nanalyzing multiple biomarkers on a single slide, as opposed to single biomarker\nlabeling on multiple consecutive slides. 
To accurately analyze multiple\nbiomarkers localized at the same cellular compartment, two representative\nbiomarker sets were selected as assay models - cMET-PDL1-EGFR and\nCD8-LAG3-PDL1, where all three biomarkers can co-localize on the cell membrane.\nOne of the most crucial preliminary stages for analyzing such assay is\nidentifying each unique chromogen on individual cells. This is a challenging\nproblem due to the co-localization of membrane stains from all the three\nbiomarkers. It requires advanced color unmixing for creating the equivalent\nsingleplex images from each triplex image for each biomarker.\n In this project, we developed a cycle-Generative Adversarial Network\n(cycle-GAN) method for unmixing the triplex images generated from the\nabove-mentioned assays. Three different models were designed to generate the\nsingleplex image for each of the three stains Tamra (purple), QM-Dabsyl\n(yellow) and Green. A notable novelty of our approach was that the input to the\nnetwork were images in the optical density domain instead of conventionally\nused RGB images. The use of the optical density domain helped in reducing the\nblurriness of the synthetic singleplex images, which was often observed when\nthe network was trained on RGB images.\n The cycle-GAN models were validated on 10,800 lung, gastric and colon images\nfor the cMET-PDL1-EGFR assay and 3600 colon images for the CD8-LAG3-PDL1 assay.\nVisual as well as quantified assessments demonstrated that the proposed method\nis effective and efficient when compared with the manual reviewing results and\nis readily applicable to various multiplex assays.\n","authors":["Satarupa Mukherjee","Jim Martin","Yao Nie"],"pdf_url":"https://arxiv.org/pdf/2408.07860v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08459v1","updated":"2024-08-15T23:57:02Z","published":"2024-08-15T23:57:02Z","title":"JPEG-LM: LLMs as Image Generators with Canonical Codec Representations","summary":" Recent work in image and video generation has been adopting the\nautoregressive LLM architecture due to its generality and potentially easy\nintegration into multi-modal systems. The crux of applying autoregressive\ntraining in language generation to visual generation is discretization --\nrepresenting continuous data like images and videos as discrete tokens. Common\nmethods of discretizing images and videos include modeling raw pixel values,\nwhich are prohibitively lengthy, or vector quantization, which requires\nconvoluted pre-hoc training. In this work, we propose to directly model images\nand videos as compressed files saved on computers via canonical codecs (e.g.,\nJPEG, AVC/H.264). Using the default Llama architecture without any\nvision-specific modifications, we pretrain JPEG-LM from scratch to generate\nimages (and AVC-LM to generate videos as a proof of concept), by directly\noutputting compressed file bytes in JPEG and AVC formats. Evaluation of image\ngeneration shows that this simple and straightforward approach is more\neffective than pixel-based modeling and sophisticated vector quantization\nbaselines (on which our method yields a 31% reduction in FID). Our analysis\nshows that JPEG-LM has an especial advantage over vector quantization models in\ngenerating long-tail visual elements. 
Overall, we show that using canonical\ncodec representations can help lower the barriers between language generation\nand visual generation, facilitating future research on multi-modal\nlanguage/image/video LLMs.\n","authors":["Xiaochuang Han","Marjan Ghazvininejad","Pang Wei Koh","Yulia Tsvetkov"],"pdf_url":"https://arxiv.org/pdf/2408.08459v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08456v1","updated":"2024-08-15T23:46:37Z","published":"2024-08-15T23:46:37Z","title":"Efficient Data-Sketches and Fine-Tuning for Early Detection of\n Distributional Drift in Medical Imaging","summary":" Distributional drift detection is important in medical applications as it\nhelps ensure the accuracy and reliability of models by identifying changes in\nthe underlying data distribution that could affect diagnostic or treatment\ndecisions. However, current methods have limitations in detecting drift; for\nexample, the inclusion of abnormal datasets can lead to unfair comparisons.\nThis paper presents an accurate and sensitive approach to detect distributional\ndrift in CT-scan medical images by leveraging data-sketching and fine-tuning\ntechniques. We developed a robust baseline library model for real-time anomaly\ndetection, allowing for efficient comparison of incoming images and\nidentification of anomalies. Additionally, we fine-tuned a vision transformer\npre-trained model to extract relevant features using breast cancer images as an\nexample, significantly enhancing model accuracy to 99.11\\%. Combining with\ndata-sketches and fine-tuning, our feature extraction evaluation demonstrated\nthat cosine similarity scores between similar datasets provide greater\nimprovements, from around 50\\% increased to 100\\%. Finally, the sensitivity\nevaluation shows that our solutions are highly sensitive to even 1\\%\nsalt-and-pepper and speckle noise, and it is not sensitive to lighting noise\n(e.g., lighting conditions have no impact on data drift). The proposed methods\noffer a scalable and reliable solution for maintaining the accuracy of\ndiagnostic models in dynamic clinical environments.\n","authors":["Yusen Wu","Hao Chen","Alex Pissinou Makki","Phuong Nguyen","Yelena Yesha"],"pdf_url":"https://arxiv.org/pdf/2408.08456v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08454v1","updated":"2024-08-15T23:34:04Z","published":"2024-08-15T23:34:04Z","title":"Beyond Uniform Query Distribution: Key-Driven Grouped Query Attention","summary":" The Transformer architecture has revolutionized deep learning through its\nSelf-Attention mechanism, which effectively captures contextual information.\nHowever, the memory footprint of Self-Attention presents significant challenges\nfor long-sequence tasks. Grouped Query Attention (GQA) addresses this issue by\ngrouping queries and mean-pooling the corresponding key-value heads - reducing\nthe number of overall parameters and memory requirements in a flexible manner\nwithout adversely compromising model accuracy. In this work, we introduce\nenhancements to GQA, focusing on two novel approaches that deviate from the\nstatic nature of grouping: Key-Distributed GQA (KDGQA) and Dynamic\nKey-Distributed GQA (DGQA), which leverage information from the norms of the\nkey heads to inform query allocation. Specifically, KDGQA looks at the ratios\nof the norms of the key heads during each forward pass, while DGQA examines the\nratios of the norms as they evolve through training. 
Additionally, we present\nPerturbed GQA (PGQA) as a case-study, which introduces variability in (static)\ngroup formation via subtracting noise from the attention maps. Our experiments\nwith up-trained Vision Transformers, for Image Classification on datasets such\nas CIFAR-10, CIFAR-100, Food101, and Tiny ImageNet, demonstrate the promise of\nthese variants in improving upon the original GQA through more informed and\nadaptive grouping mechanisms: specifically ViT-L experiences accuracy gains of\nup to 8% when utilizing DGQA in comparison to GQA and other variants. We\nfurther analyze the impact of the number of Key-Value Heads on performance,\nunderscoring the importance of utilizing query-key affinities.\n","authors":["Zohaib Khan","Muhammad Khaquan","Omer Tafveez","Agha Ali Raza"],"pdf_url":"https://arxiv.org/pdf/2408.08454v1.pdf","comment":"11 pages, 9 figures"},{"id":"http://arxiv.org/abs/2408.08447v1","updated":"2024-08-15T22:55:59Z","published":"2024-08-15T22:55:59Z","title":"SpectralEarth: Training Hyperspectral Foundation Models at Scale","summary":" Foundation models have triggered a paradigm shift in computer vision and are\nincreasingly being adopted in remote sensing, particularly for multispectral\nimagery. Yet, their potential in hyperspectral imaging (HSI) remains untapped\ndue to the absence of comprehensive and globally representative hyperspectral\ndatasets. To close this gap, we introduce SpectralEarth, a large-scale\nmulti-temporal dataset designed to pretrain hyperspectral foundation models\nleveraging data from the Environmental Mapping and Analysis Program (EnMAP).\nSpectralEarth comprises 538,974 image patches covering 415,153 unique locations\nfrom more than 11,636 globally distributed EnMAP scenes spanning two years of\narchive. Additionally, 17.5% of these locations include multiple timestamps,\nenabling multi-temporal HSI analysis. Utilizing state-of-the-art\nself-supervised learning (SSL) algorithms, we pretrain a series of foundation\nmodels on SpectralEarth. We integrate a spectral adapter into classical vision\nbackbones to accommodate the unique characteristics of HSI. In tandem, we\nconstruct four downstream datasets for land-cover and crop-type mapping,\nproviding benchmarks for model evaluation. Experimental results support the\nversatility of our models, showcasing their generalizability across different\ntasks and sensors. We also highlight computational efficiency during model\nfine-tuning. The dataset, models, and source code will be made publicly\navailable.\n","authors":["Nassim Ait Ali Braham","Conrad M Albrecht","Julien Mairal","Jocelyn Chanussot","Yi Wang","Xiao Xiang Zhu"],"pdf_url":"https://arxiv.org/pdf/2408.08447v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2008.07588v3","updated":"2024-08-15T22:33:25Z","published":"2020-08-12T20:08:04Z","title":"Uncertainty Quantification using Variational Inference for Biomedical\n Image Segmentation","summary":" Deep learning motivated by convolutional neural networks has been highly\nsuccessful in a range of medical imaging problems like image classification,\nimage segmentation, image synthesis etc. However for validation and\ninterpretability, not only do we need the predictions made by the model but\nalso how confident it is while making those predictions. This is important in\nsafety critical applications for the people to accept it. In this work, we used\nan encoder decoder architecture based on variational inference techniques for\nsegmenting brain tumour images. 
We evaluate our work on the publicly available\nBRATS dataset using Dice Similarity Coefficient (DSC) and Intersection Over\nUnion (IOU) as the evaluation metrics. Our model is able to segment brain\ntumours while taking into account both aleatoric uncertainty and epistemic\nuncertainty in a principled bayesian manner.\n","authors":["Abhinav Sagar"],"pdf_url":"https://arxiv.org/pdf/2008.07588v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08437v1","updated":"2024-08-15T22:10:10Z","published":"2024-08-15T22:10:10Z","title":"PQV-Mobile: A Combined Pruning and Quantization Toolkit to Optimize\n Vision Transformers for Mobile Applications","summary":" While Vision Transformers (ViTs) are extremely effective at computer vision\ntasks and are replacing convolutional neural networks as the new\nstate-of-the-art, they are complex and memory-intensive models. In order to\neffectively run these models on resource-constrained mobile/edge systems, there\nis a need to not only compress these models but also to optimize them and\nconvert them into deployment-friendly formats. To this end, this paper presents\na combined pruning and quantization tool, called PQV-Mobile, to optimize vision\ntransformers for mobile applications. The tool is able to support different\ntypes of structured pruning based on magnitude importance, Taylor importance,\nand Hessian importance. It also supports quantization from FP32 to FP16 and\nint8, targeting different mobile hardware backends. We demonstrate the\ncapabilities of our tool and show important latency-memory-accuracy trade-offs\nfor different amounts of pruning and int8 quantization with Facebook Data\nEfficient Image Transformer (DeiT) models. Our results show that even pruning a\nDeiT model by 9.375% and quantizing it to int8 from FP32 followed by optimizing\nfor mobile applications, we find a latency reduction by 7.18X with a small\naccuracy loss of 2.24%. The tool is open source.\n","authors":["Kshitij Bhardwaj"],"pdf_url":"https://arxiv.org/pdf/2408.08437v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.05269v4","updated":"2024-08-15T21:51:01Z","published":"2022-11-10T00:04:46Z","title":"Generative Adversarial Networks for Weakly Supervised Generation and\n Evaluation of Brain Tumor Segmentations on MR Images","summary":" Segmentation of regions of interest (ROIs) for identifying abnormalities is a\nleading problem in medical imaging. Using machine learning for this problem\ngenerally requires manually annotated ground-truth segmentations, demanding\nextensive time and resources from radiologists. This work presents a weakly\nsupervised approach that utilizes binary image-level labels, which are much\nsimpler to acquire, to effectively segment anomalies in 2D magnetic resonance\nimages without ground truth annotations. We train a generative adversarial\nnetwork (GAN) that converts cancerous images to healthy variants, which are\nused along with localization seeds as priors to generate improved weakly\nsupervised segmentations. The non-cancerous variants can also be used to\nevaluate the segmentations in a weakly supervised fashion, which allows for the\nmost effective segmentations to be identified and then applied to downstream\nclinical classification tasks. On the Multimodal Brain Tumor Segmentation\n(BraTS) 2020 dataset, our proposed method generates and identifies\nsegmentations that achieve test Dice coefficients of 83.91%. 
Using these\nsegmentations for pathology classification results with a test AUC of 93.32%\nwhich is comparable to the test AUC of 95.80% achieved when using true\nsegmentations.\n","authors":["Jay J. Yoo","Khashayar Namdar","Matthias W. Wagner","Liana Nobre","Uri Tabori","Cynthia Hawkins","Birgit B. Ertl-Wagner","Farzad Khalvati"],"pdf_url":"https://arxiv.org/pdf/2211.05269v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08432v1","updated":"2024-08-15T21:49:43Z","published":"2024-08-15T21:49:43Z","title":"Predictive uncertainty estimation in deep learning for lung carcinoma\n classification in digital pathology under real dataset shifts","summary":" Deep learning has shown tremendous progress in a wide range of digital\npathology and medical image classification tasks. Its integration into safe\nclinical decision-making support requires robust and reliable models. However,\nreal-world data comes with diversities that often lie outside the intended\nsource distribution. Moreover, when test samples are dramatically different,\nclinical decision-making is greatly affected. Quantifying predictive\nuncertainty in models is crucial for well-calibrated predictions and\ndetermining when (or not) to trust a model. Unfortunately, many works have\noverlooked the importance of predictive uncertainty estimation. This paper\nevaluates whether predictive uncertainty estimation adds robustness to deep\nlearning-based diagnostic decision-making systems. We investigate the effect of\nvarious carcinoma distribution shift scenarios on predictive performance and\ncalibration. We first systematically investigate three popular methods for\nimproving predictive uncertainty: Monte Carlo dropout, deep ensemble, and\nfew-shot learning on lung adenocarcinoma classification as a primary disease in\nwhole slide images. Secondly, we compare the effectiveness of the methods in\nterms of performance and calibration under clinically relevant distribution\nshifts such as in-distribution shifts comprising primary disease sub-types and\nother characterization analysis data; out-of-distribution shifts comprising\nwell-differentiated cases, different organ origin, and imaging modality shifts.\nWhile studies on uncertainty estimation exist, to our best knowledge, no\nrigorous large-scale benchmark compares predictive uncertainty estimation\nincluding these dataset shifts for lung carcinoma classification.\n","authors":["Abdur R. Fayjie","Jutika Borah","Florencia Carbone","Jan Tack","Patrick Vandewalle"],"pdf_url":"https://arxiv.org/pdf/2408.08432v1.pdf","comment":"17 pages, 2 figures, 5 tables"},{"id":"http://arxiv.org/abs/2403.09875v3","updated":"2024-08-15T21:46:42Z","published":"2024-03-14T21:09:59Z","title":"Touch-GS: Visual-Tactile Supervised 3D Gaussian Splatting","summary":" In this work, we propose a novel method to supervise 3D Gaussian Splatting\n(3DGS) scenes using optical tactile sensors. Optical tactile sensors have\nbecome widespread in their use in robotics for manipulation and object\nrepresentation; however, raw optical tactile sensor data is unsuitable to\ndirectly supervise a 3DGS scene. Our representation leverages a Gaussian\nProcess Implicit Surface to implicitly represent the object, combining many\ntouches into a unified representation with uncertainty. We merge this model\nwith a monocular depth estimation network, which is aligned in a two stage\nprocess, coarsely aligning with a depth camera and then finely adjusting to\nmatch our touch data. 
For every training image, our method produces a\ncorresponding fused depth and uncertainty map. Utilizing this additional\ninformation, we propose a new loss function, variance weighted depth supervised\nloss, for training the 3DGS scene model. We leverage the DenseTact optical\ntactile sensor and RealSense RGB-D camera to show that combining touch and\nvision in this manner leads to quantitatively and qualitatively better results\nthan vision or touch alone in a few-view scene syntheses on opaque as well as\non reflective and transparent objects. Please see our project page at\nhttp://armlabstanford.github.io/touch-gs\n","authors":["Aiden Swann","Matthew Strong","Won Kyung Do","Gadiel Sznaier Camps","Mac Schwager","Monroe Kennedy III"],"pdf_url":"https://arxiv.org/pdf/2403.09875v3.pdf","comment":"8 pages, 7 figures"},{"id":"http://arxiv.org/abs/2403.16260v2","updated":"2024-08-15T21:30:42Z","published":"2024-03-24T18:43:04Z","title":"Out-of-Distribution Detection via Deep Multi-Comprehension Ensemble","summary":" Recent research underscores the pivotal role of the Out-of-Distribution (OOD)\nfeature representation field scale in determining the efficacy of models in OOD\ndetection. Consequently, the adoption of model ensembles has emerged as a\nprominent strategy to augment this feature representation field, capitalizing\non anticipated model diversity.\n However, our introduction of novel qualitative and quantitative model\nensemble evaluation methods, specifically Loss Basin/Barrier Visualization and\nthe Self-Coupling Index, reveals a critical drawback in existing ensemble\nmethods. We find that these methods incorporate weights that are\naffine-transformable, exhibiting limited variability and thus failing to\nachieve the desired diversity in feature representation.\n To address this limitation, we elevate the dimensions of traditional model\nensembles, incorporating various factors such as different weight\ninitializations, data holdout, etc., into distinct supervision tasks. This\ninnovative approach, termed Multi-Comprehension (MC) Ensemble, leverages\ndiverse training tasks to generate distinct comprehensions of the data and\nlabels, thereby extending the feature representation field.\n Our experimental results demonstrate the superior performance of the MC\nEnsemble strategy in OOD detection compared to both the naive Deep Ensemble\nmethod and a standalone model of comparable size. This underscores the\neffectiveness of our proposed approach in enhancing the model's capability to\ndetect instances outside its training distribution.\n","authors":["Chenhui Xu","Fuxun Yu","Zirui Xu","Nathan Inkawhich","Xiang Chen"],"pdf_url":"https://arxiv.org/pdf/2403.16260v2.pdf","comment":"ICML 2024"},{"id":"http://arxiv.org/abs/2407.16126v3","updated":"2024-08-15T21:11:10Z","published":"2024-07-23T02:21:11Z","title":"MxT: Mamba x Transformer for Image Inpainting","summary":" Image inpainting, or image completion, is a crucial task in computer vision\nthat aims to restore missing or damaged regions of images with semantically\ncoherent content. This technique requires a precise balance of local texture\nreplication and global contextual understanding to ensure the restored image\nintegrates seamlessly with its surroundings. Traditional methods using\nConvolutional Neural Networks (CNNs) are effective at capturing local patterns\nbut often struggle with broader contextual relationships due to the limited\nreceptive fields. 
Recent advancements have incorporated transformers,\nleveraging their ability to understand global interactions. However, these\nmethods face computational inefficiencies and struggle to maintain fine-grained\ndetails. To overcome these challenges, we introduce MxT composed of the\nproposed Hybrid Module (HM), which combines Mamba with the transformer in a\nsynergistic manner. Mamba is adept at efficiently processing long sequences\nwith linear computational costs, making it an ideal complement to the\ntransformer for handling long-scale data interactions. Our HM facilitates\ndual-level interaction learning at both pixel and patch levels, greatly\nenhancing the model to reconstruct images with high quality and contextual\naccuracy. We evaluate MxT on the widely-used CelebA-HQ and Places2-standard\ndatasets, where it consistently outperformed existing state-of-the-art methods.\nThe code will be released: {\\url{https://github.com/ChrisChen1023/MxT}}.\n","authors":["Shuang Chen","Amir Atapour-Abarghouei","Haozheng Zhang","Hubert P. H. Shum"],"pdf_url":"https://arxiv.org/pdf/2407.16126v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.09066v2","updated":"2024-08-15T21:07:45Z","published":"2024-03-14T03:13:01Z","title":"Hyperparameters in Continual Learning: A Reality Check","summary":" In this paper, we argue that the conventional evaluation protocol in\ncontinual learning (CL) research deviates from the fundamental principle in\nmachine learning evaluation. The primary objective of CL algorithm is to\nbalance the trade-off between plasticity (learning new knowledge from new\ntasks) and stability (retaining knowledge from previous tasks). To evaluate it,\na CL scenario is constructed by using a benchmark dataset, where a neural\nnetwork model is continually trained on the training data of each task, and the\nbest hyperparameters for a CL algorithm are selected based on validation\ndata.The final evaluation involves assessing the model trained with these\nhyperparameters on the test data from the same scenario. This evaluation\nprotocol primarily aims to assess how well a CL algorithm performs on unseen\ndata within that specific scenario. However, to accurately evaluate the CL\nalgorithm, the focus should be on assessing generalizability of each\nalgorithm's CL capacity to handle unseen scenarios. To achieve this evaluation\ngoal, we propose a revised evaluation protocol. Our protocol consists of two\nphases: hyperparameter tuning and evaluation. Both phases share the same\nscenario configuration (e.g., the number of tasks) but the scenarios for each\nphase are generated from different datasets. During the hyperparameter tuning\nphase, the best hyperparameters are identified, which are then used to train\nthe model using the CL algorithm in the evaluation phase. Finally, the result\nfrom this phase is reported as the final evaluation. We apply the proposed\nevaluation protocol to class-incremental learning algorithms, both with and\nwithout a pretrained model. 
Through extensive experiments involving\napproximately 5000 trials, we demonstrate that most state-of-the-art algorithms\nfail to exhibit the reported performance, revealing a lack of generalizability.\n","authors":["Sungmin Cha","Kyunghyun Cho"],"pdf_url":"https://arxiv.org/pdf/2403.09066v2.pdf","comment":"Preprint"},{"id":"http://arxiv.org/abs/2408.06382v2","updated":"2024-08-15T20:58:24Z","published":"2024-08-10T00:32:39Z","title":"FedRobo: Federated Learning Driven Autonomous Inter Robots Communication\n For Optimal Chemical Sprays","summary":" Federated Learning enables robots to learn from each other's experiences\nwithout relying on centralized data collection. Each robot independently\nmaintains a model of crop conditions and chemical spray effectiveness, which is\nperiodically shared with other robots in the fleet. A communication protocol is\ndesigned to optimize chemical spray applications by facilitating the exchange\nof information about crop conditions, weather, and other critical factors. The\nfederated learning algorithm leverages this shared data to continuously refine\nthe chemical spray strategy, reducing waste and improving crop yields. This\napproach has the potential to revolutionize the agriculture industry by\noffering a scalable and efficient solution for crop protection. However,\nsignificant challenges remain, including the development of a secure and robust\ncommunication protocol, the design of a federated learning algorithm that\neffectively integrates data from multiple sources, and ensuring the safety and\nreliability of autonomous robots. The proposed cluster-based federated learning\napproach also effectively reduces the computational load on the global server\nand minimizes communication overhead among clients.\n","authors":["Jannatul Ferdaus","Sameera Pisupati","Mahedi Hasan","Sathwick Paladugu"],"pdf_url":"https://arxiv.org/pdf/2408.06382v2.pdf","comment":"This research article is going to be submitted to a best-fit\n conference. We are looking for a conference"},{"id":"http://arxiv.org/abs/2408.08412v1","updated":"2024-08-15T20:38:31Z","published":"2024-08-15T20:38:31Z","title":"Penny-Wise and Pound-Foolish in Deepfake Detection","summary":" The diffusion of deepfake technologies has sparked serious concerns about its\npotential misuse across various domains, prompting the urgent need for robust\ndetection methods. Despite advancement, many current approaches prioritize\nshort-term gains at expense of long-term effectiveness. This paper critiques\nthe overly specialized approach of fine-tuning pre-trained models solely with a\npenny-wise objective on a single deepfake dataset, while disregarding the\npound-wise balance for generalization and knowledge retention. To address this\n\"Penny-Wise and Pound-Foolish\" issue, we propose a novel learning framework\n(PoundNet) for generalization of deepfake detection on a pre-trained\nvision-language model. PoundNet incorporates a learnable prompt design and a\nbalanced objective to preserve broad knowledge from upstream tasks (object\nclassification) while enhancing generalization for downstream tasks (deepfake\ndetection). We train PoundNet on a standard single deepfake dataset, following\ncommon practice in the literature. We then evaluate its performance across 10\npublic large-scale deepfake datasets with 5 main evaluation metrics-forming the\nlargest benchmark test set for assessing the generalization ability of deepfake\ndetection models, to our knowledge. 
The comprehensive benchmark evaluation\ndemonstrates the proposed PoundNet is significantly less \"Penny-Wise and\nPound-Foolish\", achieving a remarkable improvement of 19% in deepfake detection\nperformance compared to state-of-the-art methods, while maintaining a strong\nperformance of 63% on object classification tasks, where other deepfake\ndetection models tend to be ineffective. Code and data are open-sourced at\nhttps://github.com/iamwangyabin/PoundNet.\n","authors":["Yabin Wang","Zhiwu Huang","Su Zhou","Adam Prugel-Bennett","Xiaopeng Hong"],"pdf_url":"https://arxiv.org/pdf/2408.08412v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.03208v2","updated":"2024-08-15T19:57:59Z","published":"2024-08-06T14:06:53Z","title":"Personalizing Federated Instrument Segmentation with Visual Trait Priors\n in Robotic Surgery","summary":" Personalized federated learning (PFL) for surgical instrument segmentation\n(SIS) is a promising approach. It enables multiple clinical sites to\ncollaboratively train a series of models in privacy, with each model tailored\nto the individual distribution of each site. Existing PFL methods rarely\nconsider the personalization of multi-headed self-attention, and do not account\nfor appearance diversity and instrument shape similarity, both inherent in\nsurgical scenes. We thus propose PFedSIS, a novel PFL method with visual trait\npriors for SIS, incorporating global-personalized disentanglement (GPD),\nappearance-regulation personalized enhancement (APE), and shape-similarity\nglobal enhancement (SGE), to boost SIS performance in each site. GPD represents\nthe first attempt at head-wise assignment for multi-headed self-attention\npersonalization. To preserve the unique appearance representation of each site\nand gradually leverage the inter-site difference, APE introduces appearance\nregulation and provides customized layer-wise aggregation solutions via\nhypernetworks for each site's personalized parameters. The mutual shape\ninformation of instruments is maintained and shared via SGE, which enhances the\ncross-style shape consistency on the image level and computes the\nshape-similarity contribution of each site on the prediction level for updating\nthe global parameters. PFedSIS outperforms state-of-the-art methods with +1.51%\nDice, +2.11% IoU, -2.79 ASSD, -15.55 HD95 performance gains. The corresponding\ncode and models will be released at https://github.com/wzjialang/PFedSIS.\n","authors":["Jialang Xu","Jiacheng Wang","Lequan Yu","Danail Stoyanov","Yueming Jin","Evangelos B. Mazomenos"],"pdf_url":"https://arxiv.org/pdf/2408.03208v2.pdf","comment":"9 pages, 3 figures, under review"},{"id":"http://arxiv.org/abs/2408.08396v1","updated":"2024-08-15T19:46:21Z","published":"2024-08-15T19:46:21Z","title":"Level Up Your Tutorials: VLMs for Game Tutorials Quality Assessment","summary":" Designing effective game tutorials is crucial for a smooth learning curve for\nnew players, especially in games with many rules and complex core mechanics.\nEvaluating the effectiveness of these tutorials usually requires multiple\niterations with testers who have no prior knowledge of the game. Recent\nVision-Language Models (VLMs) have demonstrated significant capabilities in\nunderstanding and interpreting visual content. VLMs can analyze images, provide\ndetailed insights, and answer questions about their content. They can recognize\nobjects, actions, and contexts in visual data, making them valuable tools for\nvarious applications, including automated game testing. 
In this work, we\npropose an automated game-testing solution to evaluate the quality of game\ntutorials. Our approach leverages VLMs to analyze frames from video game\ntutorials, answer relevant questions to simulate human perception, and provide\nfeedback. This feedback is compared with expected results to identify confusing\nor problematic scenes and highlight potential errors for developers. In\naddition, we publish complete tutorial videos and annotated frames from\ndifferent game versions used in our tests. This solution reduces the need for\nextensive manual testing, especially by speeding up and simplifying the initial\ndevelopment stages of the tutorial to improve the final game experience.\n","authors":["Daniele Rege Cambrin","Gabriele Scaffidi Militone","Luca Colomba","Giovanni Malnati","Daniele Apiletti","Paolo Garza"],"pdf_url":"https://arxiv.org/pdf/2408.08396v1.pdf","comment":"Accepted at ECCV 2024 CV2 Workshop"},{"id":"http://arxiv.org/abs/2405.01461v3","updated":"2024-08-15T19:41:15Z","published":"2024-05-02T16:50:41Z","title":"SATO: Stable Text-to-Motion Framework","summary":" Is the Text to Motion model robust? Recent advancements in Text to Motion\nmodels primarily stem from more accurate predictions of specific actions.\nHowever, the text modality typically relies solely on pre-trained Contrastive\nLanguage-Image Pretraining (CLIP) models. Our research has uncovered a\nsignificant issue with the text-to-motion model: its predictions often exhibit\ninconsistent outputs, resulting in vastly different or even incorrect poses\nwhen presented with semantically similar or identical text inputs. In this\npaper, we undertake an analysis to elucidate the underlying causes of this\ninstability, establishing a clear link between the unpredictability of model\noutputs and the erratic attention patterns of the text encoder module.\nConsequently, we introduce a formal framework aimed at addressing this issue,\nwhich we term the Stable Text-to-Motion Framework (SATO). SATO consists of\nthree modules, each dedicated to stable attention, stable prediction, and\nmaintaining a balance between accuracy and robustness trade-off. We present a\nmethodology for constructing an SATO that satisfies the stability of attention\nand prediction. To verify the stability of the model, we introduced a new\ntextual synonym perturbation dataset based on HumanML3D and KIT-ML. Results\nshow that SATO is significantly more stable against synonyms and other slight\nperturbations while keeping its high accuracy performance.\n","authors":["Wenshuo Chen","Hongru Xiao","Erhang Zhang","Lijie Hu","Lei Wang","Mengyuan Liu","Chen Chen"],"pdf_url":"https://arxiv.org/pdf/2405.01461v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.18383v2","updated":"2024-08-15T19:04:26Z","published":"2024-05-28T17:25:43Z","title":"Brain Tumor Segmentation (BraTS) Challenge 2024: Meningioma Radiotherapy\n Planning Automated Segmentation","summary":" The 2024 Brain Tumor Segmentation Meningioma Radiotherapy (BraTS-MEN-RT)\nchallenge aims to advance automated segmentation algorithms using the largest\nknown multi-institutional dataset of radiotherapy planning brain MRIs with\nexpert-annotated target labels for patients with intact or postoperative\nmeningioma that underwent either conventional external beam radiotherapy or\nstereotactic radiosurgery. 
Each case includes a defaced 3D post-contrast\nT1-weighted radiotherapy planning MRI in its native acquisition space,\naccompanied by a single-label \"target volume\" representing the gross tumor\nvolume (GTV) and any at-risk postoperative site. Target volume annotations\nadhere to established radiotherapy planning protocols, ensuring consistency\nacross cases and institutions. For preoperative meningiomas, the target volume\nencompasses the entire GTV and associated nodular dural tail, while for\npostoperative cases, it includes at-risk resection cavity margins as determined\nby the treating institution. Case annotations were reviewed and approved by\nexpert neuroradiologists and radiation oncologists. Participating teams will\ndevelop, containerize, and evaluate automated segmentation models using this\ncomprehensive dataset. Model performance will be assessed using an adapted\nlesion-wise Dice Similarity Coefficient and the 95% Hausdorff distance. The\ntop-performing teams will be recognized at the Medical Image Computing and\nComputer Assisted Intervention Conference in October 2024. BraTS-MEN-RT is\nexpected to significantly advance automated radiotherapy planning by enabling\nprecise tumor segmentation and facilitating tailored treatment, ultimately\nimproving patient outcomes.\n","authors":["Dominic LaBella","Katherine Schumacher","Michael Mix","Kevin Leu","Shan McBurney-Lin","Pierre Nedelec","Javier Villanueva-Meyer","Jonathan Shapey","Tom Vercauteren","Kazumi Chia","Omar Al-Salihi","Justin Leu","Lia Halasz","Yury Velichko","Chunhao Wang","John Kirkpatrick","Scott Floyd","Zachary J. Reitman","Trey Mullikin","Ulas Bagci","Sean Sachdev","Jona A. Hattangadi-Gluth","Tyler Seibert","Nikdokht Farid","Connor Puett","Matthew W. Pease","Kevin Shiue","Syed Muhammad Anwar","Shahriar Faghani","Muhammad Ammar Haider","Pranav Warman","Jake Albrecht","András Jakab","Mana Moassefi","Verena Chung","Alejandro Aristizabal","Alexandros Karargyris","Hasan Kassem","Sarthak Pati","Micah Sheller","Christina Huang","Aaron Coley","Siddharth Ghanta","Alex Schneider","Conrad Sharp","Rachit Saluja","Florian Kofler","Philipp Lohmann","Phillipp Vollmuth","Louis Gagnon","Maruf Adewole","Hongwei Bran Li","Anahita Fathi Kazerooni","Nourel Hoda Tahon","Udunna Anazodo","Ahmed W. Moawad","Bjoern Menze","Marius George Linguraru","Mariam Aboian","Benedikt Wiestler","Ujjwal Baid","Gian-Marco Conte","Andreas M. Rauschecker","Ayman Nada","Aly H. Abayazeed","Raymond Huang","Maria Correia de Verdier","Jeffrey D. Rudie","Spyridon Bakas","Evan Calabrese"],"pdf_url":"https://arxiv.org/pdf/2405.18383v2.pdf","comment":"14 pages, 9 figures, 1 table"},{"id":"http://arxiv.org/abs/2408.08381v1","updated":"2024-08-15T18:54:31Z","published":"2024-08-15T18:54:31Z","title":"Pre-processing and Compression: Understanding Hidden Representation\n Refinement Across Imaging Domains via Intrinsic Dimension","summary":" In recent years, there has been interest in how geometric properties such as\nintrinsic dimension (ID) of a neural network's hidden representations evolve\nthrough its layers, and how such properties are predictive of important model\nbehavior such as generalization ability. However, evidence has begun to emerge\nthat such behavior can change significantly depending on the domain of the\nnetwork's training data, such as natural versus medical images. 
Here, we\nfurther this inquiry by exploring how the ID of a network's learned\nrepresentations evolves through its layers, in essence, characterizing how the\nnetwork successively refines the information content of input data to be used\nfor predictions. Analyzing eleven natural and medical image datasets across six\nnetwork architectures, we find that the shape of this ID evolution curve\ndiffers noticeably between natural and medical image models: medical image\nmodels peak in representation ID earlier in the network, implying a difference\nin the image features and their abstractness that are typically used for\ndownstream tasks in these domains. Additionally, we discover a strong\ncorrelation of this peak representation ID with the ID of the data in its input\nspace, implying that the intrinsic information content of a model's learned\nrepresentations is guided by that of the data it was trained on. Overall, our\nfindings emphasize notable discrepancies in network behavior between natural\nand non-natural imaging domains regarding hidden representation information\ncontent, and provide further insights into how a network's learned features are\nshaped by its training data.\n","authors":["Nicholas Konz","Maciej A. Mazurowski"],"pdf_url":"https://arxiv.org/pdf/2408.08381v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.07075v2","updated":"2024-08-15T18:27:45Z","published":"2024-07-29T23:15:15Z","title":"UniFed: A Universal Federation of a Mixture of Highly Heterogeneous\n Medical Image Classification Tasks","summary":" A fundamental challenge in federated learning lies in mixing heterogeneous\ndatasets and classification tasks while minimizing the high communication cost\ncaused by clients as well as the exchange of weight updates with the server\nover a fixed number of rounds. This results in divergent model convergence\nrates and performance, which may hinder their deployment in precision medicine.\nIn real-world scenarios, client data is collected from different hospitals with\nextremely varying components (e.g., imaging modality, organ type, etc).\nPrevious studies often overlooked the convoluted heterogeneity during the\ntraining stage where the target learning tasks vary across clients as well as\nthe dataset type and their distributions. To address such limitations, we\nunprecedentedly introduce UniFed, a universal federated learning paradigm that\naims to classify any disease from any imaging modality. UniFed also handles the\nissue of varying convergence times in the client-specific optimization based on\nthe complexity of their learning tasks. Specifically, by dynamically adjusting\nboth local and global models, UniFed considers the varying task complexities of\nclients and the server, enhancing its adaptability to real-world scenarios,\nthereby mitigating issues related to overtraining and excessive communication.\nFurthermore, our framework incorporates a sequential model transfer mechanism\nthat takes into account the diverse tasks among hospitals and a dynamic\ntask-complexity based ordering. We demonstrate the superiority of our framework\nin terms of accuracy, communication cost, and convergence time over relevant\nbenchmarks in diagnosing retina, histopathology, and liver tumour diseases\nunder federated learning. 
Our UniFed code is available at\nhttps://github.com/basiralab/UniFed.\n","authors":["Atefe Hassani","Islem Rekik"],"pdf_url":"https://arxiv.org/pdf/2408.07075v2.pdf","comment":"MLMI@MICCAI 2024"},{"id":"http://arxiv.org/abs/2205.12224v2","updated":"2024-08-15T18:14:34Z","published":"2022-05-24T17:34:14Z","title":"GLObal Building heights for Urban Studies (UT-GLOBUS) for city- and\n street- scale urban simulations: Development and first applications","summary":" We introduce University of Texas - Global Building heights for Urban Studies\n(UT-GLOBUS), a dataset providing building heights and urban canopy parameters\n(UCPs) for more than 1200 cities or locales worldwide. UT-GLOBUS combines\nopen-source spaceborne altimetry (ICESat-2 and GEDI) and coarse-resolution\nurban canopy elevation data with a machine-learning model to estimate\nbuilding-level information. Validation using LiDAR data from six US cities\nshowed UT-GLOBUS-derived building heights had a root mean squared error (RMSE)\nof 9.1 meters. Validation of mean building heights within 1-km^2 grid cells,\nincluding data from Hamburg and Sydney, resulted in an RMSE of 7.8 meters.\nTesting the UCPs in the urban Weather Research and Forecasting (WRF-Urban)\nmodel resulted in a significant improvement (55% in RMSE) in intra-urban air\ntemperature representation compared to the existing table-based local climate\nzone approach in Houston, TX. Additionally, we demonstrated the dataset's\nutility for simulating heat mitigation strategies and building energy\nconsumption using WRF-Urban, with test cases in Chicago, IL, and Austin, TX.\nStreet-scale mean radiant temperature simulations using the Solar and LongWave\nEnvironmental Irradiance Geometry (SOLWEIG) model, incorporating UT-GLOBUS and\nLiDAR-derived building heights, confirmed the dataset's effectiveness in\nmodeling human thermal comfort in Baltimore, MD (daytime RMSE = 2.85 C). Thus,\nUT-GLOBUS can be used for modeling urban hazards with significant socioeconomic\nand biometeorological risks, enabling finer scale urban climate simulations and\novercoming previous limitations due to the lack of building information.\n","authors":["Harsh G. Kamath","Manmeet Singh","Neetiraj Malviya","Alberto Martilli","Liu He","Daniel Aliaga","Cenlin He","Fei Chen","Lori A. Magruder","Zong-Liang Yang","Dev Niyogi"],"pdf_url":"https://arxiv.org/pdf/2205.12224v2.pdf","comment":"20 pages, 10 figures"},{"id":"http://arxiv.org/abs/2408.08345v1","updated":"2024-08-15T17:58:10Z","published":"2024-08-15T17:58:10Z","title":"5%>100%: Breaking Performance Shackles of Full Fine-Tuning on Visual\n Recognition Tasks","summary":" Pre-training & fine-tuning can enhance the transferring efficiency and\nperformance in visual tasks. Recent delta-tuning methods provide more options\nfor visual classification tasks. Despite their success, existing visual\ndelta-tuning art fails to exceed the upper limit of full fine-tuning on\nchallenging tasks like object detection and segmentation. To find a competitive\nalternative to full fine-tuning, we propose the Multi-cognitive Visual Adapter\n(Mona) tuning, a novel adapter-based tuning method. First, we introduce\nmultiple vision-friendly filters into the adapter to enhance its ability to\nprocess visual signals, while previous methods mainly rely on language-friendly\nlinear filters. Second, we add the scaled normalization layer in the adapter to\nregulate the distribution of input features for visual filters. 
To fully\ndemonstrate the practicality and generality of Mona, we conduct experiments on\nmultiple representative visual tasks, including instance segmentation on COCO,\nsemantic segmentation on ADE20K, object detection on Pascal VOC, oriented\nobject detection on DOTA/STAR, and image classification on three common\ndatasets. Exciting results illustrate that Mona surpasses full fine-tuning on\nall these tasks, and is the only delta-tuning method outperforming full\nfine-tuning on the above various tasks. For example, Mona achieves 1%\nperformance gain on the COCO dataset compared to full fine-tuning.\nComprehensive results suggest that Mona-tuning is more suitable for retaining\nand utilizing the capabilities of pre-trained models than full fine-tuning. We\nwill make the code publicly available.\n","authors":["Dongshuo Yin","Leiyi Hu","Bin Li","Youqun Zhang","Xue Yang"],"pdf_url":"https://arxiv.org/pdf/2408.08345v1.pdf","comment":"arXiv admin note: substantial text overlap with arXiv:2311.15010"},{"id":"http://arxiv.org/abs/2408.08342v1","updated":"2024-08-15T14:41:34Z","published":"2024-08-15T14:41:34Z","title":"CT4D: Consistent Text-to-4D Generation with Animatable Meshes","summary":" Text-to-4D generation has recently been demonstrated viable by integrating a\n2D image diffusion model with a video diffusion model. However, existing models\ntend to produce results with inconsistent motions and geometric structures over\ntime. To this end, we present a novel framework, coined CT4D, which directly\noperates on animatable meshes for generating consistent 4D content from\narbitrary user-supplied prompts. The primary challenges of our mesh-based\nframework involve stably generating a mesh with details that align with the\ntext prompt while directly driving it and maintaining surface continuity. Our\nCT4D framework incorporates a unique Generate-Refine-Animate (GRA) algorithm to\nenhance the creation of text-aligned meshes. To improve surface continuity, we\ndivide a mesh into several smaller regions and implement a uniform driving\nfunction within each area. Additionally, we constrain the animating stage with\na rigidity regulation to ensure cross-region continuity. Our experimental\nresults, both qualitative and quantitative, demonstrate that our CT4D framework\nsurpasses existing text-to-4D techniques in maintaining interframe consistency\nand preserving global geometry. Furthermore, we showcase that this enhanced\nrepresentation inherently possesses the capability for combinational 4D\ngeneration and texture editing.\n","authors":["Ce Chen","Shaoli Huang","Xuelin Chen","Guangyi Chen","Xiaoguang Han","Kun Zhang","Mingming Gong"],"pdf_url":"https://arxiv.org/pdf/2408.08342v1.pdf","comment":null}],"Information Retrieval":[{"id":"http://arxiv.org/abs/2408.07278v2","updated":"2024-08-15T17:40:33Z","published":"2024-08-03T13:03:31Z","title":"Scene-wise Adaptive Network for Dynamic Cold-start Scenes Optimization\n in CTR Prediction","summary":" In the realm of modern mobile E-commerce, providing users with nearby\ncommercial service recommendations through location-based online services has\nbecome increasingly vital. While machine learning approaches have shown promise\nin multi-scene recommendation, existing methodologies often struggle to address\ncold-start problems in unprecedented scenes: the increasing diversity of\ncommercial choices, along with the short online lifespan of scenes, give rise\nto the complexity of effective recommendations in online and dynamic scenes. 
In\nthis work, we propose Scene-wise Adaptive Network (SwAN), a novel approach that\nemphasizes high-performance cold-start online recommendations for new scenes.\nOur approach introduces several crucial capabilities, including scene\nsimilarity learning, user-specific scene transition cognition, scene-specific\ninformation construction for the new scene, and enhancing the diverged logical\ninformation between scenes. We demonstrate SwAN's potential to optimize dynamic\nmulti-scene recommendation problems by effectively online handling cold-start\nrecommendations for any newly arrived scenes. More encouragingly, SwAN has been\nsuccessfully deployed in Meituan's online catering recommendation service,\nwhich serves millions of customers per day, and SwAN has achieved a 5.64% CTR\nindex improvement relative to the baselines and a 5.19% increase in daily order\nvolume proportion.\n","authors":["Wenhao Li","Jie Zhou","Chuan Luo","Chao Tang","Kun Zhang","Shixiong Zhao"],"pdf_url":"https://arxiv.org/pdf/2408.07278v2.pdf","comment":"10 pages, 6 figures, accepted by Recsys 2024"},{"id":"http://arxiv.org/abs/2408.08231v1","updated":"2024-08-15T15:56:23Z","published":"2024-08-15T15:56:23Z","title":"DaRec: A Disentangled Alignment Framework for Large Language Model and\n Recommender System","summary":" Benefiting from the strong reasoning capabilities, Large language models\n(LLMs) have demonstrated remarkable performance in recommender systems. Various\nefforts have been made to distill knowledge from LLMs to enhance collaborative\nmodels, employing techniques like contrastive learning for representation\nalignment. In this work, we prove that directly aligning the representations of\nLLMs and collaborative models is sub-optimal for enhancing downstream\nrecommendation tasks performance, based on the information theorem.\nConsequently, the challenge of effectively aligning semantic representations\nbetween collaborative models and LLMs remains unresolved. Inspired by this\nviewpoint, we propose a novel plug-and-play alignment framework for LLMs and\ncollaborative models. Specifically, we first disentangle the latent\nrepresentations of both LLMs and collaborative models into specific and shared\ncomponents via projection layers and representation regularization.\nSubsequently, we perform both global and local structure alignment on the\nshared representations to facilitate knowledge transfer. Additionally, we\ntheoretically prove that the specific and shared representations contain more\npertinent and less irrelevant information, which can enhance the effectiveness\nof downstream recommendation tasks. Extensive experimental results on benchmark\ndatasets demonstrate that our method is superior to existing state-of-the-art\nalgorithms.\n","authors":["Xihong Yang","Heming Jing","Zixing Zhang","Jindong Wang","Huakang Niu","Shuaiqiang Wang","Yu Lu","Junfeng Wang","Dawei Yin","Xinwang Liu","En Zhu","Defu Lian","Erxue Min"],"pdf_url":"https://arxiv.org/pdf/2408.08231v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08209v1","updated":"2024-08-15T15:18:55Z","published":"2024-08-15T15:18:55Z","title":"Modeling Domain and Feedback Transitions for Cross-Domain Sequential\n Recommendation","summary":" Nowadays, many recommender systems encompass various domains to cater to\nusers' diverse needs, leading to user behaviors transitioning across different\ndomains. In fact, user behaviors across different domains reveal changes in\npreference toward recommended items. 
For instance, a shift from negative\nfeedback to positive feedback indicates improved user satisfaction. However,\nexisting cross-domain sequential recommendation methods typically model user\ninterests by focusing solely on information about domain transitions, often\noverlooking the valuable insights provided by users' feedback transitions. In\nthis paper, we propose $\\text{Transition}^2$, a novel method to model\ntransitions across both domains and types of user feedback. Specifically,\n$\\text{Transition}^2$ introduces a transition-aware graph encoder based on user\nhistory, assigning different weights to edges according to the feedback type.\nThis enables the graph encoder to extract historical embeddings that capture\nthe transition information between different domains and feedback types.\nSubsequently, we encode the user history using a cross-transition multi-head\nself-attention, incorporating various masks to distinguish different types of\ntransitions. Finally, we integrate these modules to make predictions across\ndifferent domains. Experimental results on two public datasets demonstrate the\neffectiveness of $\\text{Transition}^2$.\n","authors":["Changshuo Zhang","Teng Shi","Xiao Zhang","Qi Liu","Ruobing Xie","Jun Xu","Ji-Rong Wen"],"pdf_url":"https://arxiv.org/pdf/2408.08209v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08208v1","updated":"2024-08-15T15:18:46Z","published":"2024-08-15T15:18:46Z","title":"LLM4DSR: Leveraing Large Language Model for Denoising Sequential\n Recommendation","summary":" Sequential recommendation systems fundamentally rely on users' historical\ninteraction sequences, which are often contaminated by noisy interactions.\nIdentifying these noisy interactions accurately without additional information\nis particularly difficult due to the lack of explicit supervisory signals to\ndenote noise. Large Language Models (LLMs), equipped with extensive open\nknowledge and semantic reasoning abilities, present a promising avenue to\nbridge this information gap. However, employing LLMs for denoising in\nsequential recommendation introduces notable challenges: 1) Direct application\nof pretrained LLMs may not be competent for the denoising task, frequently\ngenerating nonsensical responses; 2) Even after fine-tuning, the reliability of\nLLM outputs remains questionable, especially given the complexity of the task\nand th inherent hallucinatory issue of LLMs.\n To tackle these challenges, we propose LLM4DSR, a tailored approach for\ndenoising sequential recommendation using LLMs. We constructed a\nself-supervised fine-tuning task to activate LLMs' capabilities to identify\nnoisy items and suggest replacements. Furthermore, we developed an uncertainty\nestimation module that ensures only high-confidence responses are utilized for\nsequence corrections. Remarkably, LLM4DSR is model-agnostic, allowing the\ncorrected sequences to be flexibly applied across various recommendation\nmodels. 
Extensive experiments validate the superiority of LLM4DSR over existing\nmethods across three datasets and three recommendation backbones.\n","authors":["Bohao Wang","Feng Liu","Jiawei Chen","Yudi Wu","Xingyu Lou","Jun Wang","Yan Feng","Chun Chen","Can Wang"],"pdf_url":"https://arxiv.org/pdf/2408.08208v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08203v1","updated":"2024-08-15T15:11:06Z","published":"2024-08-15T15:11:06Z","title":"From Clicks to Carbon: The Environmental Toll of Recommender Systems","summary":" As global warming soars, evaluating the environmental impact of research is\nmore critical now than ever before. However, we find that few to no recommender\nsystems research papers document their impact on the environment. Consequently,\nin this paper, we conduct a comprehensive analysis of the environmental impact\nof recommender system research by reproducing a characteristic recommender\nsystems experimental pipeline. We focus on estimating the carbon footprint of\nrecommender systems research papers, highlighting the evolution of the\nenvironmental impact of recommender systems research experiments over time. We\nthoroughly evaluated all 79 full papers from the ACM RecSys conference in the\nyears 2013 and 2023 to analyze representative experimental pipelines for papers\nutilizing traditional, so-called good old-fashioned AI algorithms and deep\nlearning algorithms, respectively. We reproduced these representative\nexperimental pipelines, measured electricity consumption using a hardware\nenergy meter, and converted the measured energy consumption into CO2\nequivalents to estimate the environmental impact. Our results show that a\nrecommender systems research paper utilizing deep learning algorithms emits\napproximately 42 times more CO2 equivalents than a paper utilizing traditional\nalgorithms. Furthermore, on average, such a paper produces 3,297 kilograms of\nCO2 equivalents, which is more than one person produces by flying from New York\nCity to Melbourne or the amount one tree sequesters in 300 years.\n","authors":["Tobias Vente","Lukas Wegmeth","Alan Said","Joeran Beel"],"pdf_url":"https://arxiv.org/pdf/2408.08203v1.pdf","comment":"Accepted for presentation at the 18th ACM Conference on Recommender\n Systems in the Reproducibility Track"},{"id":"http://arxiv.org/abs/2408.08088v1","updated":"2024-08-15T11:32:46Z","published":"2024-08-15T11:32:46Z","title":"KGV: Integrating Large Language Models with Knowledge Graphs for Cyber\n Threat Intelligence Credibility Assessment","summary":" Cyber threat intelligence is a critical tool that many organizations and\nindividuals use to protect themselves from sophisticated, organized,\npersistent, and weaponized cyber attacks. However, few studies have focused on\nthe quality assessment of threat intelligence provided by intelligence\nplatforms, and this work still requires manual analysis by cybersecurity\nexperts. In this paper, we propose a knowledge graph-based verifier, a novel\nCyber Threat Intelligence (CTI) quality assessment framework that combines\nknowledge graphs and Large Language Models (LLMs). Our approach introduces LLMs\nto automatically extract OSCTI key claims to be verified and utilizes a\nknowledge graph consisting of paragraphs for fact-checking. This method differs\nfrom the traditional way of constructing complex knowledge graphs with entities\nas nodes. 
By constructing knowledge graphs with paragraphs as nodes and\nsemantic similarity as edges, it effectively enhances the semantic\nunderstanding ability of the model and simplifies labeling requirements.\nAdditionally, to fill the gap in the research field, we created and made public\nthe first dataset for threat intelligence assessment from heterogeneous\nsources. To the best of our knowledge, this work is the first to create a\ndataset on threat intelligence reliability verification, providing a reference\nfor future research. Experimental results show that KGV (Knowledge Graph\nVerifier) significantly improves the performance of LLMs in intelligence\nquality assessment. Compared with traditional methods, we reduce a large amount\nof data annotation while the model still exhibits strong reasoning\ncapabilities. Finally, our method can achieve XXX accuracy in network threat\nassessment.\n","authors":["Zongzong Wu","Fengxiao Tang","Ming Zhao","Yufeng Li"],"pdf_url":"https://arxiv.org/pdf/2408.08088v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08073v1","updated":"2024-08-15T10:54:55Z","published":"2024-08-15T10:54:55Z","title":"Extracting Sentence Embeddings from Pretrained Transformer Models","summary":" Background/introduction: Pre-trained transformer models shine in many natural\nlanguage processing tasks and therefore are expected to bear the representation\nof the input sentence or text meaning. These sentence-level embeddings are also\nimportant in retrieval-augmented generation. But do commonly used plain\naveraging or prompt templates surface it enough?\n Methods: Given 110M parameters BERT's hidden representations from multiple\nlayers and multiple tokens we tried various ways to extract optimal sentence\nrepresentations. We tested various token aggregation and representation\npost-processing techniques. We also tested multiple ways of using a general\nWikitext dataset to complement BERTs sentence representations. All methods were\ntested on 8 Semantic Textual Similarity (STS), 6 short text clustering, and 12\nclassification tasks. We also evaluated our representation-shaping techniques\non other static models, including random token representations.\n Results: Proposed representation extraction methods improved the performance\non STS and clustering tasks for all models considered. Very high improvements\nfor static token-based models, especially random embeddings for STS tasks\nalmost reach the performance of BERT-derived representations.\n Conclusions: Our work shows that for multiple tasks simple baselines with\nrepresentation shaping techniques reach or even outperform more complex\nBERT-based models or are able to contribute to their performance.\n","authors":["Lukas Stankevičius","Mantas Lukoševičius"],"pdf_url":"https://arxiv.org/pdf/2408.08073v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08066v1","updated":"2024-08-15T10:15:37Z","published":"2024-08-15T10:15:37Z","title":"Mamba Retriever: Utilizing Mamba for Effective and Efficient Dense\n Retrieval","summary":" In the information retrieval (IR) area, dense retrieval (DR) models use deep\nlearning techniques to encode queries and passages into embedding space to\ncompute their semantic relations. It is important for DR models to balance both\nefficiency and effectiveness. 
Pre-trained language models (PLMs), especially\nTransformer-based PLMs, have been proven to be effective encoders of DR models.\nHowever, the self-attention component in Transformer-based PLM results in a\ncomputational complexity that grows quadratically with sequence length, and\nthus exhibits a slow inference speed for long-text retrieval. Some recently\nproposed non-Transformer PLMs, especially the Mamba architecture PLMs, have\ndemonstrated not only comparable effectiveness to Transformer-based PLMs on\ngenerative language tasks but also better efficiency due to linear time scaling\nin sequence length. This paper implements the Mamba Retriever to explore\nwhether Mamba can serve as an effective and efficient encoder of DR model for\nIR tasks. We fine-tune the Mamba Retriever on the classic short-text MS MARCO\npassage ranking dataset and the long-text LoCoV0 dataset. Experimental results\nshow that (1) on the MS MARCO passage ranking dataset and BEIR, the Mamba\nRetriever achieves comparable or better effectiveness compared to\nTransformer-based retrieval models, and the effectiveness grows with the size\nof the Mamba model; (2) on the long-text LoCoV0 dataset, the Mamba Retriever\ncan extend to longer text length than its pre-trained length after fine-tuning\non retrieval task, and it has comparable or better effectiveness compared to\nother long-text retrieval models; (3) the Mamba Retriever has superior\ninference speed for long-text retrieval. In conclusion, Mamba Retriever is both\neffective and efficient, making it a practical model, especially for long-text\nretrieval.\n","authors":["Hanqi Zhang","Chong Chen","Lang Mei","Qi Liu","Jiaxin Mao"],"pdf_url":"https://arxiv.org/pdf/2408.08066v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08047v1","updated":"2024-08-15T09:26:26Z","published":"2024-08-15T09:26:26Z","title":"An Efficient Continuous Control Perspective for\n Reinforcement-Learning-based Sequential Recommendation","summary":" Sequential recommendation, where user preference is dynamically inferred from\nsequential historical behaviors, is a critical task in recommender systems\n(RSs). To further optimize long-term user engagement, offline\nreinforcement-learning-based RSs have become a mainstream technique as they\nprovide an additional advantage in avoiding global explorations that may harm\nonline users' experiences. However, previous studies mainly focus on discrete\naction and policy spaces, which might have difficulties in handling\ndramatically growing items efficiently.\n To mitigate this issue, in this paper, we aim to design an algorithmic\nframework applicable to continuous policies. To facilitate the control in the\nlow-dimensional but dense user preference space, we propose an\n\\underline{\\textbf{E}}fficient \\underline{\\textbf{Co}}ntinuous\n\\underline{\\textbf{C}}ontrol framework (ECoC). Based on a statistically tested\nassumption, we first propose the novel unified action representation abstracted\nfrom normalized user and item spaces. Then, we develop the corresponding policy\nevaluation and policy improvement procedures. During this process, strategic\nexploration and directional control in terms of unified actions are carefully\ndesigned and crucial to final recommendation decisions. 
Moreover, beneficial\nfrom unified actions, the conservatism regularization for policies and value\nfunctions are combined and perfectly compatible with the continuous framework.\nThe resulting dual regularization ensures the successful offline training of\nRL-based recommendation policies. Finally, we conduct extensive experiments to\nvalidate the effectiveness of our framework. The results show that compared to\nthe discrete baselines, our ECoC is trained far more efficiently. Meanwhile,\nthe final policies outperform baselines in both capturing the offline data and\ngaining long-term rewards.\n","authors":["Jun Wang","Likang Wu","Qi Liu","Yu Yang"],"pdf_url":"https://arxiv.org/pdf/2408.08047v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.01609v2","updated":"2024-08-15T06:11:27Z","published":"2024-05-28T04:22:28Z","title":"Judgement Citation Retrieval using Contextual Similarity","summary":" Traditionally in the domain of legal research, the retrieval of pertinent\ncitations from intricate case descriptions has demanded manual effort and\nkeyword-based search applications that mandate expertise in understanding legal\njargon. Legal case descriptions hold pivotal information for legal\nprofessionals and researchers, necessitating more efficient and automated\napproaches. We propose a methodology that combines natural language processing\n(NLP) and machine learning techniques to enhance the organization and\nutilization of legal case descriptions. This approach revolves around the\ncreation of textual embeddings with the help of state-of-art embedding models.\nOur methodology addresses two primary objectives: unsupervised clustering and\nsupervised citation retrieval, both designed to automate the citation\nextraction process. Although the proposed methodology can be used for any\ndataset, we employed the Supreme Court of The United States (SCOTUS) dataset,\nyielding remarkable results. Our methodology achieved an impressive accuracy\nrate of 90.9%. By automating labor-intensive processes, we pave the way for a\nmore efficient, time-saving, and accessible landscape in legal research,\nbenefiting legal professionals, academics, and researchers.\n","authors":["Akshat Mohan Dasula","Hrushitha Tigulla","Preethika Bhukya"],"pdf_url":"https://arxiv.org/pdf/2406.01609v2.pdf","comment":"14 pages, 16 images"},{"id":"http://arxiv.org/abs/2408.07907v1","updated":"2024-08-15T03:25:56Z","published":"2024-08-15T03:25:56Z","title":"AIE: Auction Information Enhanced Framework for CTR Prediction in Online\n Advertising","summary":" Click-Through Rate (CTR) prediction is a fundamental technique for online\nadvertising recommendation and the complex online competitive auction process\nalso brings many difficulties to CTR optimization. Recent studies have shown\nthat introducing posterior auction information contributes to the performance\nof CTR prediction. However, existing work doesn't fully capitalize on the\nbenefits of auction information and overlooks the data bias brought by the\nauction, leading to biased and suboptimal results. 
To address these\nlimitations, we propose Auction Information Enhanced Framework (AIE) for CTR\nprediction in online advertising, which delves into the problem of insufficient\nutilization of auction signals and first reveals the auction bias.\nSpecifically, AIE introduces two pluggable modules, namely Adaptive\nMarket-price Auxiliary Module (AM2) and Bid Calibration Module (BCM), which\nwork collaboratively to excavate the posterior auction signals better and\nenhance the performance of CTR prediction. Furthermore, the two proposed\nmodules are lightweight, model-agnostic, and friendly to inference latency.\nExtensive experiments are conducted on a public dataset and an industrial\ndataset to demonstrate the effectiveness and compatibility of AIE. Besides, a\none-month online A/B test in a large-scale advertising platform shows that AIE\nimproves the base model by 5.76% and 2.44% in terms of eCPM and CTR,\nrespectively.\n","authors":["Yang Yang","Bo Chen","Chenxu Zhu","Menghui Zhu","Xinyi Dai","Huifeng Guo","Muyu Zhang","Zhenhua Dong","Ruiming Tang"],"pdf_url":"https://arxiv.org/pdf/2408.07907v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.07897v1","updated":"2024-08-15T03:01:02Z","published":"2024-08-15T03:01:02Z","title":"The Nah Bandit: Modeling User Non-compliance in Recommendation Systems","summary":" Recommendation systems now pervade the digital world, ranging from\nadvertising to entertainment. However, it remains challenging to implement\neffective recommendation systems in the physical world, such as in mobility or\nhealth. This work focuses on a key challenge: in the physical world, it is\noften easy for the user to opt out of taking any recommendation if they are not\nto her liking, and to fall back to her baseline behavior. It is thus crucial in\ncyber-physical recommendation systems to operate with an interaction model that\nis aware of such user behavior, lest the user abandon the recommendations\naltogether. This paper thus introduces the Nah Bandit, a tongue-in-cheek\nreference to describe a Bandit problem where users can say `nah' to the\nrecommendation and opt for their preferred option instead. As such, this\nproblem lies in between a typical bandit setup and supervised learning. We\nmodel the user non-compliance by parameterizing an anchoring effect of\nrecommendations on users. We then propose the Expert with Clustering (EWC)\nalgorithm, a hierarchical approach that incorporates feedback from both\nrecommended and non-recommended options to accelerate user preference learning.\nIn a recommendation scenario with $N$ users, $T$ rounds per user, and $K$\nclusters, EWC achieves a regret bound of $O(N\\sqrt{T\\log K} + NT)$, achieving\nsuperior theoretical performance in the short term compared to LinUCB\nalgorithm. Experimental results also highlight that EWC outperforms both\nsupervised learning and traditional contextual bandit approaches. This\nadvancement reveals that effective use of non-compliance feedback can\naccelerate preference learning and improve recommendation accuracy. 
This work\nlays the foundation for future research in Nah Bandit, providing a robust\nframework for more effective recommendation systems.\n","authors":["Tianyue Zhou","Jung-Hoon Cho","Cathy Wu"],"pdf_url":"https://arxiv.org/pdf/2408.07897v1.pdf","comment":"12 pages, 8 figures, under review"},{"id":"http://arxiv.org/abs/2408.08444v1","updated":"2024-08-15T22:34:44Z","published":"2024-08-15T22:34:44Z","title":"W-RAG: Weakly Supervised Dense Retrieval in RAG for Open-domain Question\n Answering","summary":" In knowledge-intensive tasks such as open-domain question answering (OpenQA),\nLarge Language Models (LLMs) often struggle to generate factual answers relying\nsolely on their internal (parametric) knowledge. To address this limitation,\nRetrieval-Augmented Generation (RAG) systems enhance LLMs by retrieving\nrelevant information from external sources, thereby positioning the retriever\nas a pivotal component. Although dense retrieval demonstrates state-of-the-art\nperformance, its training poses challenges due to the scarcity of ground-truth\nevidence, largely attributed to the high costs of human annotation. In this\npaper, we propose W-RAG by utilizing the ranking capabilities of LLMs to create\nweakly labeled data for training dense retrievers. Specifically, we rerank the\ntop-$K$ passages retrieved via BM25 by assessing the probability that LLMs will\ngenerate the correct answer based on the question and each passage. The\nhighest-ranking passages are then used as positive training examples for dense\nretrieval. Our comprehensive experiments across four publicly available OpenQA\ndatasets demonstrate that our approach enhances both retrieval and OpenQA\nperformance compared to baseline models.\n","authors":["Jinming Nian","Zhiyuan Peng","Qifan Wang","Yi Fang"],"pdf_url":"https://arxiv.org/pdf/2408.08444v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.17587v2","updated":"2024-08-15T22:28:55Z","published":"2024-05-27T18:40:49Z","title":"RAGSys: Item-Cold-Start Recommender as RAG System","summary":" Large Language Models (LLM) hold immense promise for real-world applications,\nbut their generic knowledge often falls short of domain-specific needs.\nFine-tuning, a common approach, can suffer from catastrophic forgetting and\nhinder generalizability. In-Context Learning (ICL) offers an alternative, which\ncan leverage Retrieval-Augmented Generation (RAG) to provide LLMs with relevant\ndemonstrations for few-shot learning tasks. This paper explores the desired\nqualities of a demonstration retrieval system for ICL. 
We argue that ICL\nretrieval in this context resembles item-cold-start recommender systems,\nprioritizing discovery and maximizing information gain over strict relevance.\nWe propose a novel evaluation method that measures the LLM's subsequent\nperformance on NLP tasks, eliminating the need for subjective diversity scores.\nOur findings demonstrate the critical role of diversity and quality bias in\nretrieved demonstrations for effective ICL, and highlight the potential of\nrecommender system techniques in this domain.\n","authors":["Emile Contal","Garrin McGoldrick"],"pdf_url":"https://arxiv.org/pdf/2405.17587v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08379v1","updated":"2024-08-15T18:43:50Z","published":"2024-08-15T18:43:50Z","title":"Towards Realistic Synthetic User-Generated Content: A Scaffolding\n Approach to Generating Online Discussions","summary":" The emergence of synthetic data represents a pivotal shift in modern machine\nlearning, offering a solution to satisfy the need for large volumes of data in\ndomains where real data is scarce, highly private, or difficult to obtain. We\ninvestigate the feasibility of creating realistic, large-scale synthetic\ndatasets of user-generated content, noting that such content is increasingly\nprevalent and a source of frequently sought information. Large language models\n(LLMs) offer a starting point for generating synthetic social media discussion\nthreads, due to their ability to produce diverse responses that typify online\ninteractions. However, as we demonstrate, straightforward application of LLMs\nyields limited success in capturing the complex structure of online\ndiscussions, and standard prompting mechanisms lack sufficient control. We\ntherefore propose a multi-step generation process, predicated on the idea of\ncreating compact representations of discussion threads, referred to as\nscaffolds. Our framework is generic yet adaptable to the unique characteristics\nof specific social media platforms. We demonstrate its feasibility using data\nfrom two distinct online discussion platforms. To address the fundamental\nchallenge of ensuring the representativeness and realism of synthetic data, we\npropose a portfolio of evaluation measures to compare various instantiations of\nour framework.\n","authors":["Krisztian Balog","John Palowitch","Barbara Ikica","Filip Radlinski","Hamidreza Alvari","Mehdi Manshadi"],"pdf_url":"https://arxiv.org/pdf/2408.08379v1.pdf","comment":null}],"Machine Learning":[{"id":"http://arxiv.org/abs/2408.08313v1","updated":"2024-08-15T17:59:57Z","published":"2024-08-15T17:59:57Z","title":"Can Large Language Models Understand Symbolic Graphics Programs?","summary":" Assessing the capabilities of large language models (LLMs) is often\nchallenging, in part, because it is hard to find tasks to which they have not\nbeen exposed during training. We take one step to address this challenge by\nturning to a new task: focusing on symbolic graphics programs, which are a\npopular representation for graphics content that procedurally generates visual\ndata. LLMs have shown exciting promise towards program synthesis, but do they\nunderstand symbolic graphics programs? Unlike conventional programs, symbolic\ngraphics programs can be translated to graphics content. Here, we characterize\nan LLM's understanding of symbolic programs in terms of their ability to answer\nquestions related to the graphics content. 
This task is challenging as the\nquestions are difficult to answer from the symbolic programs alone -- yet, they\nwould be easy to answer from the corresponding graphics content as we verify\nthrough a human experiment. To understand symbolic programs, LLMs may need to\npossess the ability to imagine how the corresponding graphics content would\nlook without directly accessing the rendered visual content. We use this task\nto evaluate LLMs by creating a large benchmark for the semantic understanding\nof symbolic graphics programs. This benchmark is built via program-graphics\ncorrespondence, hence requiring minimal human efforts. We evaluate current LLMs\non our benchmark to elucidate a preliminary assessment of their ability to\nreason about visual scenes from programs. We find that this task distinguishes\nexisting LLMs and models considered good at reasoning perform better. Lastly,\nwe introduce Symbolic Instruction Tuning (SIT) to improve this ability.\nSpecifically, we query GPT4-o with questions and images generated by symbolic\nprograms. Such data are then used to finetune an LLM. We also find that SIT\ndata can improve the general instruction following ability of LLMs.\n","authors":["Zeju Qiu","Weiyang Liu","Haiwen Feng","Zhen Liu","Tim Z. Xiao","Katherine M. Collins","Joshua B. Tenenbaum","Adrian Weller","Michael J. Black","Bernhard Schölkopf"],"pdf_url":"https://arxiv.org/pdf/2408.08313v1.pdf","comment":"Technical Report v1 (44 pages, 23 figures, project page:\n https://sgp-bench.github.io/)"},{"id":"http://arxiv.org/abs/2408.08307v1","updated":"2024-08-15T17:59:06Z","published":"2024-08-15T17:59:06Z","title":"Understanding the Local Geometry of Generative Model Manifolds","summary":" Deep generative models learn continuous representations of complex data\nmanifolds using a finite number of samples during training. For a pre-trained\ngenerative model, the common way to evaluate the quality of the manifold\nrepresentation learned, is by computing global metrics like Fr\\'echet Inception\nDistance using a large number of generated and real samples. However,\ngenerative model performance is not uniform across the learned manifold, e.g.,\nfor \\textit{foundation models} like Stable Diffusion generation performance can\nvary significantly based on the conditioning or initial noise vector being\ndenoised. In this paper we study the relationship between the \\textit{local\ngeometry of the learned manifold} and downstream generation. Based on the\ntheory of continuous piecewise-linear (CPWL) generators, we use three geometric\ndescriptors - scaling ($\\psi$), rank ($\\nu$), and complexity ($\\delta$) - to\ncharacterize a pre-trained generative model manifold locally. We provide\nquantitative and qualitative evidence showing that for a given latent, the\nlocal descriptors are correlated with generation aesthetics, artifacts,\nuncertainty, and even memorization. Finally we demonstrate that training a\n\\textit{reward model} on the local geometry can allow controlling the\nlikelihood of a generated sample under the learned distribution.\n","authors":["Ahmed Imtiaz Humayun","Ibtihel Amara","Candice Schumann","Golnoosh Farnadi","Negar Rostamzadeh","Mohammad Havaei"],"pdf_url":"https://arxiv.org/pdf/2408.08307v1.pdf","comment":"Pre-print. 
11 pages main, 8 pages app., 28 figures"},{"id":"http://arxiv.org/abs/2303.06815v3","updated":"2024-08-15T17:58:42Z","published":"2023-03-13T02:14:42Z","title":"On Model Compression for Neural Networks: Framework, Algorithm, and\n Convergence Guarantee","summary":" Model compression is a crucial part of deploying neural networks (NNs),\nespecially when the memory and storage of computing devices are limited in many\napplications. This paper focuses on two model compression techniques: low-rank\napproximation and weight pruning in neural networks, which are very popular\nnowadays. However, training NN with low-rank approximation and weight pruning\nalways suffers significant accuracy loss and convergence issues. In this paper,\na holistic framework is proposed for model compression from a novel perspective\nof nonconvex optimization by designing an appropriate objective function. Then,\nwe introduce NN-BCD, a block coordinate descent (BCD) algorithm to solve the\nnonconvex optimization. One advantage of our algorithm is that an efficient\niteration scheme can be derived with closed-form, which is gradient-free.\nTherefore, our algorithm will not suffer from vanishing/exploding gradient\nproblems. Furthermore, with the Kurdyka-{\\L}ojasiewicz (K{\\L}) property of our\nobjective function, we show that our algorithm globally converges to a critical\npoint at the rate of O(1/k), where k denotes the number of iterations. Lastly,\nextensive experiments with tensor train decomposition and weight pruning\ndemonstrate the efficiency and superior performance of the proposed framework.\nOur code implementation is available at https://github.com/ChenyangLi-97/NN-BCD\n","authors":["Chenyang Li","Jihoon Chung","Mengnan Du","Haimin Wang","Xianlian Zhou","Bo Shen"],"pdf_url":"https://arxiv.org/pdf/2303.06815v3.pdf","comment":"44 pages"},{"id":"http://arxiv.org/abs/2408.08302v1","updated":"2024-08-15T17:55:45Z","published":"2024-08-15T17:55:45Z","title":"Benchmarking the Capabilities of Large Language Models in Transportation\n System Engineering: Accuracy, Consistency, and Reasoning Behaviors","summary":" In this paper, we explore the capabilities of state-of-the-art large language\nmodels (LLMs) such as GPT-4, GPT-4o, Claude 3.5 Sonnet, Claude 3 Opus, Gemini\n1.5 Pro, Llama 3, and Llama 3.1 in solving some selected undergraduate-level\ntransportation engineering problems. We introduce TransportBench, a benchmark\ndataset that includes a sample of transportation engineering problems on a wide\nrange of subjects in the context of planning, design, management, and control\nof transportation systems. This dataset is used by human experts to evaluate\nthe capabilities of various commercial and open-sourced LLMs, especially their\naccuracy, consistency, and reasoning behaviors, in solving transportation\nengineering problems. Our comprehensive analysis uncovers the unique strengths\nand limitations of each LLM, e.g. our analysis shows the impressive accuracy\nand some unexpected inconsistent behaviors of Claude 3.5 Sonnet in solving\nTransportBench problems. 
Our study marks a thrilling first step toward\nharnessing artificial general intelligence for complex transportation\nchallenges.\n","authors":["Usman Syed","Ethan Light","Xingang Guo","Huan Zhang","Lianhui Qin","Yanfeng Ouyang","Bin Hu"],"pdf_url":"https://arxiv.org/pdf/2408.08302v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08300v1","updated":"2024-08-15T17:54:31Z","published":"2024-08-15T17:54:31Z","title":"HELP: Hierarchical Embeddings-based Log Parsing","summary":" Logs are a first-hand source of information for software maintenance and\nfailure diagnosis. Log parsing, which converts semi-structured log messages\ninto structured templates, is a prerequisite for automated log analysis tasks\nsuch as anomaly detection, troubleshooting, and root cause analysis. However,\nexisting log parsers fail in real-world systems for three main reasons. First,\ntraditional heuristics-based parsers require handcrafted features and domain\nknowledge, which are difficult to generalize at scale. Second, existing large\nlanguage model-based parsers rely on periodic offline processing, limiting\ntheir effectiveness in real-time use cases. Third, existing online parsing\nalgorithms are susceptible to log drift, where slight log changes create false\npositives that drown out real anomalies. To address these challenges, we\npropose HELP, a Hierarchical Embeddings-based Log Parser. HELP is the first\nonline semantic-based parser to leverage LLMs for performant and cost-effective\nlog parsing. We achieve this through a novel hierarchical embeddings module,\nwhich fine-tunes a text embedding model to cluster logs before parsing,\nreducing querying costs by multiple orders of magnitude. To combat log drift,\nwe also develop an iterative rebalancing module, which periodically updates\nexisting log groupings. We evaluate HELP extensively on 14 public large-scale\ndatasets, showing that HELP achieves significantly higher F1-weighted grouping\nand parsing accuracy than current state-of-the-art online log parsers. We also\nimplement HELP into Iudex's production observability platform, confirming\nHELP's practicality in a production environment. Our results show that HELP is\neffective and efficient for high-throughput real-world log parsing.\n","authors":["Andy Xu","Arno Gau"],"pdf_url":"https://arxiv.org/pdf/2408.08300v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08295v1","updated":"2024-08-15T17:50:07Z","published":"2024-08-15T17:50:07Z","title":"SLCA++: Unleash the Power of Sequential Fine-tuning for Continual\n Learning with Pre-training","summary":" In recent years, continual learning with pre-training (CLPT) has received\nwidespread interest, instead of its traditional focus of training from scratch.\nThe use of strong pre-trained models (PTMs) can greatly facilitate knowledge\ntransfer and alleviate catastrophic forgetting, but also suffers from\nprogressive overfitting of pre-trained knowledge into specific downstream\ntasks. A majority of current efforts often keep the PTMs frozen and incorporate\ntask-specific prompts to instruct representation learning, coupled with a\nprompt selection process for inference. However, due to the limited capacity of\nprompt parameters, this strategy demonstrates only sub-optimal performance in\ncontinual learning. 
In comparison, tuning all parameters of PTMs often provides\nthe greatest potential for representation learning, making sequential\nfine-tuning (Seq FT) a fundamental baseline that has been overlooked in CLPT.\nTo this end, we present an in-depth analysis of the progressive overfitting\nproblem from the lens of Seq FT. Considering that the overly fast\nrepresentation learning and the biased classification layer constitute this\nparticular problem, we introduce the advanced Slow Learner with Classifier\nAlignment (SLCA++) framework to unleash the power of Seq FT, serving as a\nstrong baseline approach for CLPT. Our approach involves a Slow Learner to\nselectively reduce the learning rate of backbone parameters, and a Classifier\nAlignment to align the disjoint classification layers in a post-hoc fashion. We\nfurther enhance the efficacy of SL with a symmetric cross-entropy loss, as well\nas employ a parameter-efficient strategy to implement Seq FT with SLCA++.\nAcross a variety of continual learning scenarios on image classification\nbenchmarks, our approach provides substantial improvements and outperforms\nstate-of-the-art methods by a large margin. Code:\nhttps://github.com/GengDavid/SLCA.\n","authors":["Gengwei Zhang","Liyuan Wang","Guoliang Kang","Ling Chen","Yunchao Wei"],"pdf_url":"https://arxiv.org/pdf/2408.08295v1.pdf","comment":"This paper is an extension of our ICCV 23 paper (arXiv:2303.05118)"},{"id":"http://arxiv.org/abs/2408.08294v1","updated":"2024-08-15T17:49:24Z","published":"2024-08-15T17:49:24Z","title":"Aliasing and Label-Independent Decomposition of Risk: Beyond the\n bias-variance trade-off","summary":" A central problem in data science is to use potentially noisy samples of an\nunknown function to predict function values for unseen inputs. In classical\nstatistics, the predictive error is understood as a trade-off between the bias\nand the variance that balances model simplicity with its ability to fit complex\nfunctions. However, over-parameterized models exhibit counter-intuitive\nbehaviors, such as \"double descent\" in which models of increasing complexity\nexhibit decreasing generalization error. We introduce an alternative paradigm\ncalled the generalized aliasing decomposition. We explain the asymptotically\nsmall error of complex models as a systematic \"de-aliasing\" that occurs in the\nover-parameterized regime. In the limit of large models, the contribution due\nto aliasing vanishes, leaving an expression for the asymptotic total error we\ncall the invertibility failure of very large models on few training points.\nBecause the generalized aliasing decomposition can be explicitly calculated\nfrom the relationship between model class and samples without seeing any data\nlabels, it can answer questions related to experimental design and model\nselection before collecting data or performing experiments. We demonstrate this\napproach using several examples, including classical regression problems and a\ncluster expansion model used in materials science.\n","authors":["Mark K. Transtrum","Gus L. W. Hart","Tyler J. Jarvis","Jared P. Whitehead"],"pdf_url":"https://arxiv.org/pdf/2408.08294v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08286v1","updated":"2024-08-15T17:40:11Z","published":"2024-08-15T17:40:11Z","title":"Absence of Closed-Form Descriptions for Gradient Flow in Two-Layer\n Narrow Networks","summary":" In the field of machine learning, comprehending the intricate training\ndynamics of neural networks poses a significant challenge. 
This paper explores\nthe training dynamics of neural networks, particularly whether these dynamics\ncan be expressed in a general closed-form solution. We demonstrate that the\ndynamics of the gradient flow in two-layer narrow networks is not an integrable\nsystem. Integrable systems are characterized by trajectories confined to\nsubmanifolds defined by level sets of first integrals (invariants),\nfacilitating predictable and reducible dynamics. In contrast, non-integrable\nsystems exhibit complex behaviors that are difficult to predict. To establish\nthe non-integrability, we employ differential Galois theory, which focuses on\nthe solvability of linear differential equations. We demonstrate that under\nmild conditions, the identity component of the differential Galois group of the\nvariational equations of the gradient flow is non-solvable. This result\nconfirms the system's non-integrability and implies that the training dynamics\ncannot be represented by Liouvillian functions, precluding a closed-form\nsolution for describing these dynamics. Our findings highlight the necessity of\nemploying numerical methods to tackle optimization problems within neural\nnetworks. The results contribute to a deeper understanding of neural network\ntraining dynamics and their implications for machine learning optimization\nstrategies.\n","authors":["Yeachan Park"],"pdf_url":"https://arxiv.org/pdf/2408.08286v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08284v1","updated":"2024-08-15T17:37:36Z","published":"2024-08-15T17:37:36Z","title":"Accurate and efficient structure elucidation from routine\n one-dimensional NMR spectra using multitask machine learning","summary":" Rapid determination of molecular structures can greatly accelerate workflows\nacross many chemical disciplines. However, elucidating structure using only\none-dimensional (1D) NMR spectra, the most readily accessible data, remains an\nextremely challenging problem because of the combinatorial explosion of the\nnumber of possible molecules as the number of constituent atoms is increased.\nHere, we introduce a multitask machine learning framework that predicts the\nmolecular structure (formula and connectivity) of an unknown compound solely\nbased on its 1D 1H and/or 13C NMR spectra. First, we show how a transformer\narchitecture can be constructed to efficiently solve the task, traditionally\nperformed by chemists, of assembling large numbers of molecular fragments into\nmolecular structures. Integrating this capability with a convolutional neural\nnetwork (CNN), we build an end-to-end model for predicting structure from\nspectra that is fast and accurate. We demonstrate the effectiveness of this\nframework on molecules with up to 19 heavy (non-hydrogen) atoms, a size for\nwhich there are trillions of possible structures. Without relying on any prior\nchemical knowledge such as the molecular formula, we show that our approach\npredicts the exact molecule 69.6% of the time within the first 15 predictions,\nreducing the search space by up to 11 orders of magnitude.\n","authors":["Frank Hu","Michael S. Chen","Grant M. Rotskoff","Matthew W. Kanan","Thomas E. 
Markland"],"pdf_url":"https://arxiv.org/pdf/2408.08284v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08282v1","updated":"2024-08-15T17:33:32Z","published":"2024-08-15T17:33:32Z","title":"Autonomous Behavior Planning For Humanoid Loco-manipulation Through\n Grounded Language Model","summary":" Enabling humanoid robots to perform autonomously loco-manipulation in\nunstructured environments is crucial and highly challenging for achieving\nembodied intelligence. This involves robots being able to plan their actions\nand behaviors in long-horizon tasks while using multi-modality to perceive\ndeviations between task execution and high-level planning. Recently, large\nlanguage models (LLMs) have demonstrated powerful planning and reasoning\ncapabilities for comprehension and processing of semantic information through\nrobot control tasks, as well as the usability of analytical judgment and\ndecision-making for multi-modal inputs. To leverage the power of LLMs towards\nhumanoid loco-manipulation, we propose a novel language-model based framework\nthat enables robots to autonomously plan behaviors and low-level execution\nunder given textual instructions, while observing and correcting failures that\nmay occur during task execution. To systematically evaluate this framework in\ngrounding LLMs, we created the robot 'action' and 'sensing' behavior library\nfor task planning, and conducted mobile manipulation tasks and experiments in\nboth simulated and real environments using the CENTAURO robot, and verified the\neffectiveness and application of this approach in robotic tasks with autonomous\nbehavioral planning.\n","authors":["Jin Wang","Arturo Laurenzi","Nikos Tsagarakis"],"pdf_url":"https://arxiv.org/pdf/2408.08282v1.pdf","comment":"Paper accepted by IROS 2024"},{"id":"http://arxiv.org/abs/2408.08274v1","updated":"2024-08-15T17:19:12Z","published":"2024-08-15T17:19:12Z","title":"BAM! Just Like That: Simple and Efficient Parameter Upcycling for\n Mixture of Experts","summary":" The Mixture of Experts (MoE) framework has become a popular architecture for\nlarge language models due to its superior performance over dense models.\nHowever, training MoEs from scratch in a large-scale regime is prohibitively\nexpensive. Existing methods mitigate this by pre-training multiple dense expert\nmodels independently and using them to initialize an MoE. This is done by using\nexperts' feed-forward network (FFN) to initialize the MoE's experts while\nmerging other parameters. However, this method limits the reuse of dense model\nparameters to only the FFN layers, thereby constraining the advantages when\n\"upcycling\" these models into MoEs. We propose BAM (Branch-Attend-Mix), a\nsimple yet effective method that addresses this shortcoming. BAM makes full use\nof specialized dense models by not only using their FFN to initialize the MoE\nlayers but also leveraging experts' attention parameters fully by initializing\nthem into a soft-variant of Mixture of Attention (MoA) layers. We explore two\nmethods for upcycling attention parameters: 1) initializing separate attention\nexperts from dense models including all attention parameters for the best model\nperformance; and 2) sharing key and value parameters across all experts to\nfacilitate for better inference efficiency. To further improve efficiency, we\nadopt a parallel attention transformer architecture to MoEs, which allows the\nattention experts and FFN experts to be computed concurrently. 
Our experiments\non seed models ranging from 590 million to 2 billion parameters demonstrate\nthat BAM surpasses baselines in both perplexity and downstream task\nperformance, within the same computational and data constraints.\n","authors":["Qizhen Zhang","Nikolas Gritsch","Dwaraknath Gnaneshwar","Simon Guo","David Cairuz","Bharat Venkitesh","Jakob Foerster","Phil Blunsom","Sebastian Ruder","Ahmet Ustun","Acyr Locatelli"],"pdf_url":"https://arxiv.org/pdf/2408.08274v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08272v1","updated":"2024-08-15T17:17:56Z","published":"2024-08-15T17:17:56Z","title":"Is Knowledge Power? On the (Im)possibility of Learning from Strategic\n Interaction","summary":" When learning in strategic environments, a key question is whether agents can\novercome uncertainty about their preferences to achieve outcomes they could\nhave achieved absent any uncertainty. Can they do this solely through\ninteractions with each other? We focus this question on the ability of agents\nto attain the value of their Stackelberg optimal strategy and study the impact\nof information asymmetry. We study repeated interactions in fully strategic\nenvironments where players' actions are decided based on learning algorithms\nthat take into account their observed histories and knowledge of the game. We\nstudy the pure Nash equilibria (PNE) of a meta-game where players choose these\nalgorithms as their actions. We demonstrate that if one player has perfect\nknowledge about the game, then any initial informational gap persists. That is,\nwhile there is always a PNE in which the informed agent achieves her\nStackelberg value, there is a game where no PNE of the meta-game allows the\npartially informed player to achieve her Stackelberg value. On the other hand,\nif both players start with some uncertainty about the game, the quality of\ninformation alone does not determine which agent can achieve her Stackelberg\nvalue. In this case, the concept of information asymmetry becomes nuanced and\ndepends on the game's structure. Overall, our findings suggest that repeated\nstrategic interactions alone cannot facilitate learning effectively enough to\nearn an uninformed player her Stackelberg value.\n","authors":["Nivasini Ananthakrishnan","Nika Haghtalab","Chara Podimata","Kunhe Yang"],"pdf_url":"https://arxiv.org/pdf/2408.08272v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.10237v3","updated":"2024-08-15T17:10:19Z","published":"2023-12-15T22:09:04Z","title":"A Distributed Privacy Preserving Model for the Detection of Alzheimer's\n Disease","summary":" BACKGROUND: Segmentation of medical data, concerns about personal health\ninformation (PHI) breaches, and the direct and indirect costs of consolidating\nand managing such segmented data should motivate diagnostic machine learning\n(DML) researchers to identify privacy-preserving machine learning algorithms\nthat can train on distributed or decentralized datasets of different\nmodalities. Federated learning models provide such a decentralized machine\nlearning framework in which multiple investigators in possession of disparate\ndatasets and working on different devices or servers can collaboratively train\na global machine learning model without ever having to exchange local data and\nthus can meet statutory PHI protections. 
To this end, a vertical federated\nlearning model is devised and tested for efficacy in the detection of\nAlzheimer's Disease (AD).\n METHODS: The second version of Open Access Series of Imaging Studies -- with\nits panoply of demographic, imaging, and clinical assessment datasets -- was\nused to test a multimodal vertical federated learning (VFL) model for AD\ndetection.\n RESULTS: By training and validating this VFL model on the demographic,\nclinical, and MRI data in OASIS-2, an 82.9\% accuracy rate is achieved,\nconsistent with previously reported results.\n CONCLUSIONS: The VFL architecture proposed herein offers a novel distributed\narchitecture, enabling collaborative learning across diverse sources of medical\ndata while respecting statutory privacy constraints. By leveraging multiple\nmodalities of data, the robustness and accuracy of AD detection can be\nenhanced. This model not only contributes to the advancement of federated\nlearning techniques but also holds promise for overcoming the hurdles posed by\ndata segmentation in medical research.\n","authors":["Paul K. Mandal"],"pdf_url":"https://arxiv.org/pdf/2312.10237v3.pdf","comment":"17 pages, 7 figures, 2 tables"},{"id":"http://arxiv.org/abs/2408.08264v1","updated":"2024-08-15T17:07:40Z","published":"2024-08-15T17:07:40Z","title":"InVAErt networks for amortized inference and identifiability analysis of\n lumped parameter hemodynamic models","summary":" Estimation of cardiovascular model parameters from electronic health records\n(EHR) poses a significant challenge primarily due to lack of identifiability.\nStructural non-identifiability arises when a manifold in the space of\nparameters is mapped to a common output, while practical non-identifiability\ncan result due to limited data, model misspecification, or noise corruption. To\naddress the resulting ill-posed inverse problem, optimization-based or Bayesian\ninference approaches typically use regularization, thereby limiting the\npossibility of discovering multiple solutions. In this study, we use inVAErt\nnetworks, a neural network-based, data-driven framework for enhanced digital\ntwin analysis of stiff dynamical systems. We demonstrate the flexibility and\neffectiveness of inVAErt networks in the context of physiological inversion of\na six-compartment lumped parameter hemodynamic model from synthetic data to\nreal data with missing components.\n","authors":["Guoxiang Grayson Tong","Carlos A. Sing Long","Daniele E. Schiavazzi"],"pdf_url":"https://arxiv.org/pdf/2408.08264v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08260v1","updated":"2024-08-15T17:01:00Z","published":"2024-08-15T17:01:00Z","title":"GSVD-NMF: Recovering Missing Features in Non-negative Matrix\n Factorization","summary":" Non-negative matrix factorization (NMF) is an important tool in signal\nprocessing and widely used to separate mixed sources into their components.\nHowever, NMF is NP-hard and thus may fail to discover the ideal factorization;\nmoreover, the number of components may not be known in advance and thus\nfeatures may be missed or incompletely separated. To recover missing components\nfrom under-complete NMF, we introduce GSVD-NMF, which proposes new components\nbased on the generalized singular value decomposition (GSVD) between\npreliminary NMF results and the SVD of the original matrix. 
Simulation and\nexperimental results demonstrate that GSVD-NMF often recovers missing features\nfrom under-complete NMF and helps NMF achieve better local optima.\n","authors":["Youdong Guo","Timothy E. Holy"],"pdf_url":"https://arxiv.org/pdf/2408.08260v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08258v1","updated":"2024-08-15T16:59:15Z","published":"2024-08-15T16:59:15Z","title":"Snuffy: Efficient Whole Slide Image Classifier","summary":" Whole Slide Image (WSI) classification with multiple instance learning (MIL)\nin digital pathology faces significant computational challenges. Current\nmethods mostly rely on extensive self-supervised learning (SSL) for\nsatisfactory performance, requiring long training periods and considerable\ncomputational resources. At the same time, no pre-training affects performance\ndue to domain shifts from natural images to WSIs. We introduce\n\\textbf{\\textit{Snuffy}} architecture, a novel MIL-pooling method based on\nsparse transformers that mitigates performance loss with limited pre-training\nand enables continual few-shot pre-training as a competitive option. Our\nsparsity pattern is tailored for pathology and is theoretically proven to be a\nuniversal approximator with the tightest probabilistic sharp bound on the\nnumber of layers for sparse transformers, to date. We demonstrate Snuffy's\neffectiveness on CAMELYON16 and TCGA Lung cancer datasets, achieving superior\nWSI and patch-level accuracies. The code is available on\n\\url{https://github.com/jafarinia/snuffy}.\n","authors":["Hossein Jafarinia","Alireza Alipanah","Danial Hamdi","Saeed Razavi","Nahal Mirzaie","Mohammad Hossein Rohban"],"pdf_url":"https://arxiv.org/pdf/2408.08258v1.pdf","comment":"Accepted for ECCV 2024"},{"id":"http://arxiv.org/abs/2408.08252v1","updated":"2024-08-15T16:47:59Z","published":"2024-08-15T16:47:59Z","title":"Derivative-Free Guidance in Continuous and Discrete Diffusion Models\n with Soft Value-Based Decoding","summary":" Diffusion models excel at capturing the natural design spaces of images,\nmolecules, DNA, RNA, and protein sequences. However, rather than merely\ngenerating designs that are natural, we often aim to optimize downstream reward\nfunctions while preserving the naturalness of these design spaces. Existing\nmethods for achieving this goal often require ``differentiable'' proxy models\n(\\textit{e.g.}, classifier guidance or DPS) or involve computationally\nexpensive fine-tuning of diffusion models (\\textit{e.g.}, classifier-free\nguidance, RL-based fine-tuning). In our work, we propose a new method to\naddress these challenges. Our algorithm is an iterative sampling method that\nintegrates soft value functions, which looks ahead to how intermediate noisy\nstates lead to high rewards in the future, into the standard inference\nprocedure of pre-trained diffusion models. Notably, our approach avoids\nfine-tuning generative models and eliminates the need to construct\ndifferentiable models. This enables us to (1) directly utilize\nnon-differentiable features/reward feedback, commonly used in many scientific\ndomains, and (2) apply our method to recent discrete diffusion models in a\nprincipled way. Finally, we demonstrate the effectiveness of our algorithm\nacross several domains, including image generation, molecule generation, and\nDNA/RNA sequence generation. 
The code is available at\n\\href{https://github.com/masa-ue/SVDD}{https://github.com/masa-ue/SVDD}.\n","authors":["Xiner Li","Yulai Zhao","Chenyu Wang","Gabriele Scalia","Gokcen Eraslan","Surag Nair","Tommaso Biancalani","Aviv Regev","Sergey Levine","Masatoshi Uehara"],"pdf_url":"https://arxiv.org/pdf/2408.08252v1.pdf","comment":"The code is available at https://github.com/masa-ue/SVDD"},{"id":"http://arxiv.org/abs/2408.08242v1","updated":"2024-08-15T16:10:25Z","published":"2024-08-15T16:10:25Z","title":"A Conflicts-free, Speed-lossless KAN-based Reinforcement Learning\n Decision System for Interactive Driving in Roundabouts","summary":" Safety and efficiency are crucial for autonomous driving in roundabouts,\nespecially in the context of mixed traffic where autonomous vehicles (AVs) and\nhuman-driven vehicles coexist. This paper introduces a learning-based algorithm\ntailored to foster safe and efficient driving behaviors across varying levels\nof traffic flows in roundabouts. The proposed algorithm employs a deep\nQ-learning network to effectively learn safe and efficient driving strategies\nin complex multi-vehicle roundabouts. Additionally, a KAN (Kolmogorov-Arnold\nnetwork) enhances the AVs' ability to learn their surroundings robustly and\nprecisely. An action inspector is integrated to replace dangerous actions to\navoid collisions when the AV interacts with the environment, and a route\nplanner is proposed to enhance the driving efficiency and safety of the AVs.\nMoreover, a model predictive control is adopted to ensure stability and\nprecision of the driving actions. The results show that our proposed system\nconsistently achieves safe and efficient driving whilst maintaining a stable\ntraining process, as evidenced by the smooth convergence of the reward function\nand the low variance in the training curves across various traffic flows.\nCompared to state-of-the-art benchmarks, the proposed algorithm achieves a\nlower number of collisions and reduced travel time to destination.\n","authors":["Zhihao Lin","Zhen Tian","Qi Zhang","Ziyang Ye","Hanyang Zhuang","Jianglin Lan"],"pdf_url":"https://arxiv.org/pdf/2408.08242v1.pdf","comment":"15 pages, 12 figures, submitted to an IEEE journal"},{"id":"http://arxiv.org/abs/2402.10686v2","updated":"2024-08-15T16:04:29Z","published":"2024-02-16T13:41:18Z","title":"On the Impact of Uncertainty and Calibration on Likelihood-Ratio\n Membership Inference Attacks","summary":" In a membership inference attack (MIA), an attacker exploits the\noverconfidence exhibited by typical machine learning models to determine\nwhether a specific data point was used to train a target model. In this paper,\nwe analyze the performance of the state-of-the-art likelihood ratio attack\n(LiRA) within an information-theoretical framework that allows the\ninvestigation of the impact of the aleatoric uncertainty in the true data\ngeneration process, of the epistemic uncertainty caused by a limited training\ndata set, and of the calibration level of the target model. We compare three\ndifferent settings, in which the attacker receives decreasingly informative\nfeedback from the target model: confidence vector (CV) disclosure, in which the\noutput probability vector is released; true label confidence (TLC) disclosure,\nin which only the probability assigned to the true label is made available by\nthe model; and decision set (DS) disclosure, in which an adaptive prediction\nset is produced as in conformal prediction. 
We derive bounds on the advantage\nof an MIA adversary with the aim of offering insights into the impact of\nuncertainty and calibration on the effectiveness of MIAs. Simulation results\ndemonstrate that the derived analytical bounds predict well the effectiveness\nof MIAs.\n","authors":["Meiyi Zhu","Caili Guo","Chunyan Feng","Osvaldo Simeone"],"pdf_url":"https://arxiv.org/pdf/2402.10686v2.pdf","comment":"13 pages, 20 figures"},{"id":"http://arxiv.org/abs/2408.08233v1","updated":"2024-08-15T15:58:07Z","published":"2024-08-15T15:58:07Z","title":"The Z-Gromov-Wasserstein Distance","summary":" The Gromov-Wasserstein (GW) distance is a powerful tool for comparing metric\nmeasure spaces which has found broad applications in data science and machine\nlearning. Driven by the need to analyze datasets whose objects have\nincreasingly complex structure (such as node and edge-attributed graphs),\nseveral variants of GW distance have been introduced in the recent literature.\nWith a view toward establishing a general framework for the theory of GW-like\ndistances, this paper considers a vast generalization of the notion of a metric\nmeasure space: for an arbitrary metric space $Z$, we define a $Z$-network to be\na measure space endowed with a kernel valued in $Z$. We introduce a method for\ncomparing $Z$-networks by defining a generalization of GW distance, which we\nrefer to as $Z$-Gromov-Wasserstein ($Z$-GW) distance. This construction\nsubsumes many previously known metrics and offers a unified approach to\nunderstanding their shared properties. The paper demonstrates that the $Z$-GW\ndistance defines a metric on the space of $Z$-networks which retains desirable\nproperties of $Z$, such as separability, completeness, and geodesicity. Many of\nthese properties were unknown for existing variants of GW distance that fall\nunder our framework. Our focus is on foundational theory, but our results also\ninclude computable lower bounds and approximations of the distance which will\nbe useful for practical applications.\n","authors":["Martin Bauer","Facundo Mémoli","Tom Needham","Mao Nishino"],"pdf_url":"https://arxiv.org/pdf/2408.08233v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08230v1","updated":"2024-08-15T15:56:15Z","published":"2024-08-15T15:56:15Z","title":"Explaining an Agent's Future Beliefs through Temporally Decomposing\n Future Reward Estimators","summary":" Future reward estimation is a core component of reinforcement learning\nagents; i.e., Q-value and state-value functions, predicting an agent's sum of\nfuture rewards. Their scalar output, however, obfuscates when or what\nindividual future rewards an agent may expect to receive. We address this by\nmodifying an agent's future reward estimator to predict their next N expected\nrewards, referred to as Temporal Reward Decomposition (TRD). This unlocks novel\nexplanations of agent behaviour. Through TRD we can: estimate when an agent may\nexpect to receive a reward, the value of the reward and the agent's confidence\nin receiving it; measure an input feature's temporal importance to the agent's\naction decisions; and predict the influence of different actions on future\nrewards. Furthermore, we show that DQN agents trained on Atari environments can\nbe efficiently retrained to incorporate TRD with minimal impact on performance.\n","authors":["Mark Towers","Yali Du","Christopher Freeman","Timothy J. Norman"],"pdf_url":"https://arxiv.org/pdf/2408.08230v1.pdf","comment":"7 pages + 3 pages of supplementary material. 
Published at ECAI 2024"},{"id":"http://arxiv.org/abs/2312.01236v2","updated":"2024-08-15T15:53:11Z","published":"2023-12-02T22:01:49Z","title":"Evetac: An Event-based Optical Tactile Sensor for Robotic Manipulation","summary":" Optical tactile sensors have recently become popular. They provide high\nspatial resolution, but struggle to offer fine temporal resolutions. To\novercome this shortcoming, we study the idea of replacing the RGB camera with\nan event-based camera and introduce a new event-based optical tactile sensor\ncalled Evetac. Along with hardware design, we develop touch processing\nalgorithms to process its measurements online at 1000 Hz. We devise an\nefficient algorithm to track the elastomer's deformation through the imprinted\nmarkers despite the sensor's sparse output. Benchmarking experiments\ndemonstrate Evetac's capabilities of sensing vibrations up to 498 Hz,\nreconstructing shear forces, and significantly reducing data rates compared to\nRGB optical tactile sensors. Moreover, Evetac's output and the marker tracking\nprovide meaningful features for learning data-driven slip detection and\nprediction models. The learned models form the basis for a robust and adaptive\nclosed-loop grasp controller capable of handling a wide range of objects. We\nbelieve that fast and efficient event-based tactile sensors like Evetac will be\nessential for bringing human-like manipulation capabilities to robotics. The\nsensor design is open-sourced at https://sites.google.com/view/evetac .\n","authors":["Niklas Funk","Erik Helmut","Georgia Chalvatzaki","Roberto Calandra","Jan Peters"],"pdf_url":"https://arxiv.org/pdf/2312.01236v2.pdf","comment":"Accepted at IEEE Transactions On Robotics. Project Website:\n https://sites.google.com/view/evetac"},{"id":"http://arxiv.org/abs/2408.06292v2","updated":"2024-08-15T15:42:50Z","published":"2024-08-12T16:58:11Z","title":"The AI Scientist: Towards Fully Automated Open-Ended Scientific\n Discovery","summary":" One of the grand challenges of artificial general intelligence is developing\nagents capable of conducting scientific research and discovering new knowledge.\nWhile frontier models have already been used as aides to human scientists, e.g.\nfor brainstorming ideas, writing code, or prediction tasks, they still conduct\nonly a small part of the scientific process. This paper presents the first\ncomprehensive framework for fully automatic scientific discovery, enabling\nfrontier large language models to perform research independently and\ncommunicate their findings. We introduce The AI Scientist, which generates\nnovel research ideas, writes code, executes experiments, visualizes results,\ndescribes its findings by writing a full scientific paper, and then runs a\nsimulated review process for evaluation. In principle, this process can be\nrepeated to iteratively develop ideas in an open-ended fashion, acting like the\nhuman scientific community. We demonstrate its versatility by applying it to\nthree distinct subfields of machine learning: diffusion modeling,\ntransformer-based language modeling, and learning dynamics. Each idea is\nimplemented and developed into a full paper at a cost of less than $15 per\npaper. To evaluate the generated papers, we design and validate an automated\nreviewer, which we show achieves near-human performance in evaluating paper\nscores. The AI Scientist can produce papers that exceed the acceptance\nthreshold at a top machine learning conference as judged by our automated\nreviewer. 
This approach signifies the beginning of a new era in scientific\ndiscovery in machine learning: bringing the transformative benefits of AI\nagents to the entire research process of AI itself, and taking us closer to a\nworld where endless affordable creativity and innovation can be unleashed on\nthe world's most challenging problems. Our code is open-sourced at\nhttps://github.com/SakanaAI/AI-Scientist\n","authors":["Chris Lu","Cong Lu","Robert Tjarko Lange","Jakob Foerster","Jeff Clune","David Ha"],"pdf_url":"https://arxiv.org/pdf/2408.06292v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08222v1","updated":"2024-08-15T15:40:57Z","published":"2024-08-15T15:40:57Z","title":"Enhancing Sharpness-Aware Minimization by Learning Perturbation Radius","summary":" Sharpness-aware minimization (SAM) is to improve model generalization by\nsearching for flat minima in the loss landscape. The SAM update consists of one\nstep for computing the perturbation and the other for computing the update\ngradient. Within the two steps, the choice of the perturbation radius is\ncrucial to the performance of SAM, but finding an appropriate perturbation\nradius is challenging. In this paper, we propose a bilevel optimization\nframework called LEarning the perTurbation radiuS (LETS) to learn the\nperturbation radius for sharpness-aware minimization algorithms. Specifically,\nin the proposed LETS method, the upper-level problem aims at seeking a good\nperturbation radius by minimizing the squared generalization gap between the\ntraining and validation losses, while the lower-level problem is the SAM\noptimization problem. Moreover, the LETS method can be combined with any\nvariant of SAM. Experimental results on various architectures and benchmark\ndatasets in computer vision and natural language processing demonstrate the\neffectiveness of the proposed LETS method in improving the performance of SAM.\n","authors":["Xuehao Wang","Weisen Jiang","Shuai Fu","Yu Zhang"],"pdf_url":"https://arxiv.org/pdf/2408.08222v1.pdf","comment":"Accepted by ECML PKDD 2024"},{"id":"http://arxiv.org/abs/2408.08217v1","updated":"2024-08-15T15:28:37Z","published":"2024-08-15T15:28:37Z","title":"RED-CT: A Systems Design Methodology for Using LLM-labeled Data to Train\n and Deploy Edge Classifiers for Computational Social Science","summary":" Large language models (LLMs) have enhanced our ability to rapidly analyze and\nclassify unstructured natural language data. However, concerns regarding cost,\nnetwork limitations, and security constraints have posed challenges for their\nintegration into work processes. In this study, we adopt a systems design\napproach to employing LLMs as imperfect data annotators for downstream\nsupervised learning tasks, introducing novel system intervention measures aimed\nat improving classification performance. 
Our methodology outperforms\nLLM-generated labels in seven of eight tests, demonstrating an effective\nstrategy for incorporating LLMs into the design and deployment of specialized,\nsupervised learning models present in many industry use cases.\n","authors":["David Farr","Nico Manzonelli","Iain Cruickshank","Jevin West"],"pdf_url":"https://arxiv.org/pdf/2408.08217v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.02969v2","updated":"2024-08-15T15:28:34Z","published":"2023-07-06T13:12:19Z","title":"DPM: Clustering Sensitive Data through Separation","summary":" Clustering is an important tool for data exploration where the goal is to\nsubdivide a data set into disjoint clusters that fit well into the underlying\ndata structure. When dealing with sensitive data, privacy-preserving algorithms\naim to approximate the non-private baseline while minimising the leakage of\nsensitive information. State-of-the-art privacy-preserving clustering\nalgorithms tend to output clusters that are good in terms of the standard\nmetrics, inertia, silhouette score, and clustering accuracy, however, the\nclustering result strongly deviates from the non-private KMeans baseline. In\nthis work, we present a privacy-preserving clustering algorithm called \\DPM\nthat recursively separates a data set into clusters based on a geometrical\nclustering approach. In addition, \\DPM estimates most of the data-dependent\nhyper-parameters in a privacy-preserving way. We prove that \\DPM preserves\nDifferential Privacy and analyse the utility guarantees of \\DPM. Finally, we\nconduct an extensive empirical evaluation for synthetic and real-life data\nsets. We show that \\DPM achieves state-of-the-art utility on the standard\nclustering metrics and yields a clustering result much closer to that of the\npopular non-private KMeans algorithm without requiring the number of classes.\n","authors":["Johannes Liebenow","Yara Schütt","Tanya Braun","Marcel Gehrke","Florian Thaeter","Esfandiar Mohammadi"],"pdf_url":"https://arxiv.org/pdf/2307.02969v2.pdf","comment":"The first two authors equally contributed to this work"},{"id":"http://arxiv.org/abs/2405.18299v2","updated":"2024-08-15T15:25:13Z","published":"2024-05-28T15:51:18Z","title":"Deep Learning Innovations for Underwater Waste Detection: An In-Depth\n Analysis","summary":" Addressing the issue of submerged underwater trash is crucial for\nsafeguarding aquatic ecosystems and preserving marine life. While identifying\ndebris present on the surface of water bodies is straightforward, assessing the\nunderwater submerged waste is a challenge due to the image distortions caused\nby factors such as light refraction, absorption, suspended particles, color\nshifts, and occlusion. This paper conducts a comprehensive review of\nstate-of-the-art architectures and on the existing datasets to establish a\nbaseline for submerged waste and trash detection. The primary goal remains to\nestablish the benchmark of the object localization techniques to be leveraged\nby advanced underwater sensors and autonomous underwater vehicles. The ultimate\nobjective is to explore the underwater environment, to identify, and remove\nunderwater debris. The absence of benchmarks (dataset or algorithm) in many\nresearches emphasizes the need for a more robust algorithmic solution. 
Through\nthis research, we aim to give performance comparative analysis of various\nunderwater trash detection algorithms.\n","authors":["Jaskaran Singh Walia","Pavithra L K"],"pdf_url":"https://arxiv.org/pdf/2405.18299v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08215v1","updated":"2024-08-15T15:23:37Z","published":"2024-08-15T15:23:37Z","title":"Moving Healthcare AI-Support Systems for Visually Detectable Diseases\n onto Constrained Devices","summary":" Image classification usually requires connectivity and access to the cloud\nwhich is often limited in many parts of the world, including hard to reach\nrural areas. TinyML aims to solve this problem by hosting AI assistants on\nconstrained devices, eliminating connectivity issues by processing data within\nthe device itself, without internet or cloud access. This pilot study explores\nthe use of tinyML to provide healthcare support with low spec devices in low\nconnectivity environments, focusing on diagnosis of skin diseases and the\nethical use of AI assistants in a healthcare setting. To investigate this,\n10,000 images of skin lesions were used to train a model for classifying\nvisually detectable diseases (VDDs). The model weights were then offloaded to a\nRaspberry Pi with a webcam attached, to be used for the classification of skin\nlesions without internet access. It was found that the developed prototype\nachieved a test accuracy of 78% and a test loss of 1.08.\n","authors":["Tess Watt","Christos Chrysoulas","Peter J Barclay"],"pdf_url":"https://arxiv.org/pdf/2408.08215v1.pdf","comment":"6 pages, 5 figures"},{"id":"http://arxiv.org/abs/2408.08214v1","updated":"2024-08-15T15:23:32Z","published":"2024-08-15T15:23:32Z","title":"Federated Fairness Analytics: Quantifying Fairness in Federated Learning","summary":" Federated Learning (FL) is a privacy-enhancing technology for distributed ML.\nBy training models locally and aggregating updates - a federation learns\ntogether, while bypassing centralised data collection. FL is increasingly\npopular in healthcare, finance and personal computing. However, it inherits\nfairness challenges from classical ML and introduces new ones, resulting from\ndifferences in data quality, client participation, communication constraints,\naggregation methods and underlying hardware. Fairness remains an unresolved\nissue in FL and the community has identified an absence of succinct definitions\nand metrics to quantify fairness; to address this, we propose Federated\nFairness Analytics - a methodology for measuring fairness. Our definition of\nfairness comprises four notions with novel, corresponding metrics. They are\nsymptomatically defined and leverage techniques originating from XAI,\ncooperative game-theory and networking engineering. We tested a range of\nexperimental settings, varying the FL approach, ML task and data settings. The\nresults show that statistical heterogeneity and client participation affect\nfairness and fairness conscious approaches such as Ditto and q-FedAvg\nmarginally improve fairness-performance trade-offs. Using our techniques, FL\npractitioners can uncover previously unobtainable insights into their system's\nfairness, at differing levels of granularity in order to address fairness\nchallenges in FL. 
We have open-sourced our work at:\nhttps://github.com/oscardilley/federated-fairness.\n","authors":["Oscar Dilley","Juan Marcelo Parra-Ullauri","Rasheed Hussain","Dimitra Simeonidou"],"pdf_url":"https://arxiv.org/pdf/2408.08214v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08210v1","updated":"2024-08-15T15:19:11Z","published":"2024-08-15T15:19:11Z","title":"Does Reasoning Emerge? Examining the Probabilities of Causation in Large\n Language Models","summary":" Recent advances in AI have been significantly driven by the capabilities of\nlarge language models (LLMs) to solve complex problems in ways that resemble\nhuman thinking. However, there is an ongoing debate about the extent to which\nLLMs are capable of actual reasoning. Central to this debate are two key\nprobabilistic concepts that are essential for connecting causes to their\neffects: the probability of necessity (PN) and the probability of sufficiency\n(PS). This paper introduces a framework that is both theoretical and practical,\naimed at assessing how effectively LLMs are able to replicate real-world\nreasoning mechanisms using these probabilistic measures. By viewing LLMs as\nabstract machines that process information through a natural language\ninterface, we examine the conditions under which it is possible to compute\nsuitable approximations of PN and PS. Our research marks an important step\ntowards gaining a deeper understanding of when LLMs are capable of reasoning,\nas illustrated by a series of math examples.\n","authors":["Javier González","Aditya V. Nori"],"pdf_url":"https://arxiv.org/pdf/2408.08210v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.04698v5","updated":"2024-08-15T15:19:01Z","published":"2023-11-08T14:10:19Z","title":"Examining Common Paradigms in Multi-Task Learning","summary":" While multi-task learning (MTL) has gained significant attention in recent\nyears, its underlying mechanisms remain poorly understood. Recent methods did\nnot yield consistent performance improvements over single task learning (STL)\nbaselines, underscoring the importance of gaining more profound insights about\nchallenges specific to MTL. In our study, we investigate paradigms in MTL in\nthe context of STL: First, the impact of the choice of optimizer has only been\nmildly investigated in MTL. We show the pivotal role of common STL tools such\nas the Adam optimizer in MTL empirically in various experiments. To further\ninvestigate Adam's effectiveness, we theoretical derive a partial loss-scale\ninvariance under mild assumptions. Second, the notion of gradient conflicts has\noften been phrased as a specific problem in MTL. We delve into the role of\ngradient conflicts in MTL and compare it to STL. For angular gradient alignment\nwe find no evidence that this is a unique problem in MTL. We emphasize\ndifferences in gradient magnitude as the main distinguishing factor. Overall,\nwe find surprising similarities between STL and MTL suggesting to consider\nmethods from both fields in a broader context.\n","authors":["Cathrin Elich","Lukas Kirchdorfer","Jan M. 
Köhler","Lukas Schott"],"pdf_url":"https://arxiv.org/pdf/2311.04698v5.pdf","comment":"Accepted for publication in German Conference for Pattern Recognition\n (GCPR), 2024"},{"id":"http://arxiv.org/abs/2402.03468v2","updated":"2024-08-15T15:07:32Z","published":"2024-02-02T13:26:38Z","title":"Exact Tensor Completion Powered by Slim Transforms","summary":" In this work, a tensor completion problem is studied, which aims to perfectly\nrecover the tensor from partial observations. The existing theoretical\nguarantee requires the involved transform to be orthogonal, which hinders its\napplications. In this paper, jumping out of the constraints of isotropy and\nself-adjointness, the theoretical guarantee of exact tensor completion with\narbitrary linear transforms is established by directly operating the tensors in\nthe transform domain. With the enriched choices of transforms, a new analysis\nobtained by the proof discloses why slim transforms outperform their square\ncounterparts from a theoretical level. Our model and proof greatly enhance the\nflexibility of tensor completion and extensive experiments validate the\nsuperiority of the proposed method.\n","authors":["Li Ge","Lin Chen","Yudong Chen","Xue Jiang"],"pdf_url":"https://arxiv.org/pdf/2402.03468v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08192v1","updated":"2024-08-15T14:51:50Z","published":"2024-08-15T14:51:50Z","title":"Stochastic Semi-Gradient Descent for Learning Mean Field Games with\n Population-Aware Function Approximation","summary":" Mean field games (MFGs) model the interactions within a large-population\nmulti-agent system using the population distribution. Traditional learning\nmethods for MFGs are based on fixed-point iteration (FPI), which calculates\nbest responses and induced population distribution separately and sequentially.\nHowever, FPI-type methods suffer from inefficiency and instability, due to\noscillations caused by the forward-backward procedure. This paper considers an\nonline learning method for MFGs, where an agent updates its policy and\npopulation estimates simultaneously and fully asynchronously, resulting in a\nsimple stochastic gradient descent (SGD) type method called SemiSGD. Not only\ndoes SemiSGD exhibit numerical stability and efficiency, but it also provides a\nnovel perspective by treating the value function and population distribution as\na unified parameter. We theoretically show that SemiSGD directs this unified\nparameter along a descent direction to the mean field equilibrium. Motivated by\nthis perspective, we develop a linear function approximation (LFA) for both the\nvalue function and the population distribution, resulting in the first\npopulation-aware LFA for MFGs on continuous state-action space. Finite-time\nconvergence and approximation error analysis are provided for SemiSGD equipped\nwith population-aware LFA.\n","authors":["Chenyu Zhang","Xu Chen","Xuan Di"],"pdf_url":"https://arxiv.org/pdf/2408.08192v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08185v1","updated":"2024-08-15T14:42:28Z","published":"2024-08-15T14:42:28Z","title":"Data-driven identification of latent port-Hamiltonian systems","summary":" Conventional physics-based modeling techniques involve high effort, e.g.,\ntime and expert knowledge, while data-driven methods often lack\ninterpretability, structure, and sometimes reliability. To mitigate this, we\npresent a data-driven system identification framework that derives models in\nthe port-Hamiltonian (pH) formulation. 
This formulation is suitable for\nmulti-physical systems while guaranteeing the useful system theoretical\nproperties of passivity and stability. Our framework combines linear and\nnonlinear reduction with structured, physics-motivated system identification.\nIn this process, high-dimensional state data obtained from possibly nonlinear\nsystems serves as input for an autoencoder, which then performs two tasks: (i)\nnonlinearly transforming and (ii) reducing this data onto a low-dimensional\nlatent space. In this space, a linear pH system, that satisfies the pH\nproperties per construction, is parameterized by the weights of a neural\nnetwork. The mathematical requirements are met by defining the pH matrices\nthrough Cholesky factorizations. The neural networks that define the coordinate\ntransformation and the pH system are identified in a joint optimization process\nto match the dynamics observed in the data while defining a linear pH system in\nthe latent space. The learned, low-dimensional pH system can describe even\nnonlinear systems and is rapidly computable due to its small size. The method\nis exemplified by a parametric mass-spring-damper and a nonlinear pendulum\nexample, as well as the high-dimensional model of a disc brake with linear\nthermoelastic behavior.\n","authors":["Johannes Rettberg","Jonas Kneifl","Julius Herb","Patrick Buchfink","Jörg Fehr","Bernard Haasdonk"],"pdf_url":"https://arxiv.org/pdf/2408.08185v1.pdf","comment":"33 pages, 8 figures"},{"id":"http://arxiv.org/abs/2408.08184v1","updated":"2024-08-15T14:42:02Z","published":"2024-08-15T14:42:02Z","title":"Not Every Image is Worth a Thousand Words: Quantifying Originality in\n Stable Diffusion","summary":" This work addresses the challenge of quantifying originality in text-to-image\n(T2I) generative diffusion models, with a focus on copyright originality. We\nbegin by evaluating T2I models' ability to innovate and generalize through\ncontrolled experiments, revealing that stable diffusion models can effectively\nrecreate unseen elements with sufficiently diverse training data. Then, our key\ninsight is that concepts and combinations of image elements the model is\nfamiliar with, and saw more during training, are more concisly represented in\nthe model's latent space. We hence propose a method that leverages textual\ninversion to measure the originality of an image based on the number of tokens\nrequired for its reconstruction by the model. Our approach is inspired by legal\ndefinitions of originality and aims to assess whether a model can produce\noriginal content without relying on specific prompts or having the training\ndata of the model. We demonstrate our method using both a pre-trained stable\ndiffusion model and a synthetic dataset, showing a correlation between the\nnumber of tokens and image originality. 
This work contributes to the\nunderstanding of originality in generative models and has implications for\ncopyright infringement cases.\n","authors":["Adi Haviv","Shahar Sarfaty","Uri Hacohen","Niva Elkin-Koren","Roi Livni","Amit H Bermano"],"pdf_url":"https://arxiv.org/pdf/2408.08184v1.pdf","comment":"GenLaw ICML 2024"},{"id":"http://arxiv.org/abs/2408.07673v2","updated":"2024-08-15T14:35:24Z","published":"2024-08-14T17:16:50Z","title":"Deep Learning: a Heuristic Three-stage Mechanism for Grid Searches to\n Optimize the Future Risk Prediction of Breast Cancer Metastasis Using\n EHR-based Clinical Data","summary":" A grid search, at the cost of training and testing a large number of models,\nis an effective way to optimize the prediction performance of deep learning\nmodels. A challenging task concerning grid search is the time management.\nWithout a good time management scheme, a grid search can easily be set off as a\nmission that will not finish in our lifetime. In this study, we introduce a\nheuristic three-stage mechanism for managing the running time of low-budget\ngrid searches, and the sweet-spot grid search (SSGS) and randomized grid search\n(RGS) strategies for improving model prediction performance, in predicting the\n5-year, 10-year, and 15-year risk of breast cancer metastasis. We develop deep\nfeedforward neural network (DFNN) models and optimize them through grid\nsearches. We conduct eight cycles of grid searches by applying our three-stage\nmechanism and SSGS and RGS strategies. We conduct various SHAP analyses\nincluding unique ones that interpret the importance of the DFNN-model\nhyperparameters. Our results show that grid search can greatly improve model\nprediction. The grid searches we conducted improved the risk prediction of\n5-year, 10-year, and 15-year breast cancer metastasis by 18.6%, 16.3%, and\n17.3% respectively, over the average performance of all corresponding models we\ntrained using the RGS strategy. We not only demonstrate best model performance\nbut also characterize grid searches from various aspects such as their\ncapabilities of discovering decent models and the unit grid search time. The\nthree-stage mechanism worked effectively. It made our low-budget grid searches\nfeasible and manageable, and in the meantime helped improve model prediction\nperformance. Our SHAP analyses identified both clinical risk factors important\nfor the prediction of future risk of breast cancer metastasis, and DFNN-model\nhyperparameters important to the prediction of performance scores.\n","authors":["Xia Jiang","Yijun Zhou","Chuhan Xu","Adam Brufsky","Alan Wells"],"pdf_url":"https://arxiv.org/pdf/2408.07673v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08179v1","updated":"2024-08-15T14:33:09Z","published":"2024-08-15T14:33:09Z","title":"Machine learning empowered Modulation detection for OFDM-based signals","summary":" We propose a blind ML-based modulation detection for OFDM-based technologies.\nUnlike previous works that assume an ideal environment with precise knowledge\nof subcarrier count and cyclic prefix location, we consider blind modulation\ndetection while accounting for realistic environmental parameters and\nimperfections. Our approach employs a ResNet network to simultaneously detect\nthe modulation type and accurately locate the cyclic prefix. Specifically,\nafter eliminating the environmental impact from the signal and accurately\nextracting the OFDM symbols, we convert these symbols into scatter plots. 
Due\nto their unique shapes, these scatter plots are then classified using ResNet.\nAs a result, our proposed modulation classification method can be applied to\nany OFDM-based technology without prior knowledge of the transmitted signal. We\nevaluate its performance across various modulation schemes and subcarrier\nnumbers. Simulation results show that our method achieves a modulation\ndetection accuracy exceeding $80\\%$ at an SNR of $10$ dB and $95\\%$ at an SNR\nof $25$ dB.\n","authors":["Ali Pourranjbar","Georges Kaddoum","Verdier Assoume Mba","Sahil Garg","Satinder Singh"],"pdf_url":"https://arxiv.org/pdf/2408.08179v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.01069v4","updated":"2024-08-15T14:30:44Z","published":"2023-09-03T03:54:43Z","title":"Separable Hamiltonian Neural Networks","summary":" Hamiltonian neural networks (HNNs) are state-of-the-art models that regress\nthe vector field of a dynamical system under the learning bias of Hamilton's\nequations. A recent observation is that embedding a bias regarding the additive\nseparability of the Hamiltonian reduces the regression complexity and improves\nregression performance. We propose separable HNNs that embed additive\nseparability within HNNs using observational, learning, and inductive biases.\nWe show that the proposed models are more effective than the HNN at regressing\nthe Hamiltonian and the vector field. Consequently, the proposed models predict\nthe dynamics and conserve the total energy of the Hamiltonian system more\naccurately.\n","authors":["Zi-Yu Khoo","Dawen Wu","Jonathan Sze Choong Low","Stéphane Bressan"],"pdf_url":"https://arxiv.org/pdf/2309.01069v4.pdf","comment":"13 pages"},{"id":"http://arxiv.org/abs/2403.17695v2","updated":"2024-08-15T14:30:02Z","published":"2024-03-26T13:35:10Z","title":"PlainMamba: Improving Non-Hierarchical Mamba in Visual Recognition","summary":" We present PlainMamba: a simple non-hierarchical state space model (SSM)\ndesigned for general visual recognition. The recent Mamba model has shown how\nSSMs can be highly competitive with other architectures on sequential data and\ninitial attempts have been made to apply it to images. In this paper, we\nfurther adapt the selective scanning process of Mamba to the visual domain,\nenhancing its ability to learn features from two-dimensional images by (i) a\ncontinuous 2D scanning process that improves spatial continuity by ensuring\nadjacency of tokens in the scanning sequence, and (ii) direction-aware updating\nwhich enables the model to discern the spatial relations of tokens by encoding\ndirectional information. Our architecture is designed to be easy to use and\neasy to scale, formed by stacking identical PlainMamba blocks, resulting in a\nmodel with constant width throughout all layers. The architecture is further\nsimplified by removing the need for special tokens. We evaluate PlainMamba on a\nvariety of visual recognition tasks, achieving performance gains over previous\nnon-hierarchical models and is competitive with hierarchical alternatives. For\ntasks requiring high-resolution inputs, in particular, PlainMamba requires much\nless computing while maintaining high performance. Code and models are\navailable at: https://github.com/ChenhongyiYang/PlainMamba .\n","authors":["Chenhongyi Yang","Zehui Chen","Miguel Espinosa","Linus Ericsson","Zhenyu Wang","Jiaming Liu","Elliot J. 
Crowley"],"pdf_url":"https://arxiv.org/pdf/2403.17695v2.pdf","comment":"Accepted to BMVC 2024"},{"id":"http://arxiv.org/abs/2408.08172v1","updated":"2024-08-15T14:19:13Z","published":"2024-08-15T14:19:13Z","title":"Towards flexible perception with visual memory","summary":" Training a neural network is a monolithic endeavor, akin to carving knowledge\ninto stone: once the process is completed, editing the knowledge in a network\nis nearly impossible, since all information is distributed across the network's\nweights. We here explore a simple, compelling alternative by marrying the\nrepresentational power of deep neural networks with the flexibility of a\ndatabase. Decomposing the task of image classification into image similarity\n(from a pre-trained embedding) and search (via fast nearest neighbor retrieval\nfrom a knowledge database), we build a simple and flexible visual memory that\nhas the following key capabilities: (1.) The ability to flexibly add data\nacross scales: from individual samples all the way to entire classes and\nbillion-scale data; (2.) The ability to remove data through unlearning and\nmemory pruning; (3.) An interpretable decision-mechanism on which we can\nintervene to control its behavior. Taken together, these capabilities\ncomprehensively demonstrate the benefits of an explicit visual memory. We hope\nthat it might contribute to a conversation on how knowledge should be\nrepresented in deep vision models -- beyond carving it in ``stone'' weights.\n","authors":["Robert Geirhos","Priyank Jaini","Austin Stone","Sourabh Medapati","Xi Yi","George Toderici","Abhijit Ogale","Jonathon Shlens"],"pdf_url":"https://arxiv.org/pdf/2408.08172v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.21372v2","updated":"2024-08-15T14:01:43Z","published":"2024-07-31T06:54:24Z","title":"Two Completely Parameter-Free Alternating Gradient Projection Algorithms\n for Nonconvex-(strongly) Concave Minimax Problems","summary":" Due to their importance in various emerging applications, efficient\nalgorithms for solving minimax problems have recently received increasing\nattention. However, many existing algorithms require prior knowledge of the\nproblem parameters in order to achieve optimal iteration complexity. In this\npaper, we propose two completely parameter-free alternating gradient projection\nalgorithms, i.e., the PF-AGP-NSC algorithm and the PF-AGP-NC algorithm, to\nsolve the smooth nonconvex-strongly concave and nonconvex-concave minimax\nproblems respectively using a backtracking strategy, which does not require\nprior knowledge of parameters such as the Lipschtiz constant $L$ or the\nstrongly concave constant $\\mu$. Moreover, we show that the total number of\ngradient calls of the PF-AGP-NSC algorithm and the PF-AGP-NC algorithm to\nobtain an $\\varepsilon$-stationary point is upper bounded by $\\mathcal{O}\\left(\nL\\kappa^3\\varepsilon^{-2} \\right)$ and $\\mathcal{O}\\left( L^4\\varepsilon^{-4}\n\\right)$ respectively, where $\\kappa$ is the condition number. As far as we\nknow, the PF-AGP-NSC algorithm and the PF-AGP-NC algorithm are the first\ncompletely parameter-free algorithms for solving nonconvex-strongly concave\nminimax problems and nonconvex-concave minimax problems respectively. 
Numerical\nresults validate the efficiency of the proposed PF-AGP algorithm.\n","authors":["Junnan Yang","Huiling Zhang","Zi Xu"],"pdf_url":"https://arxiv.org/pdf/2407.21372v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06162v3","updated":"2024-08-15T13:59:08Z","published":"2024-04-09T09:34:25Z","title":"Characterizing Multimodal Long-form Summarization: A Case Study on\n Financial Reports","summary":" As large language models (LLMs) expand the power of natural language\nprocessing to handle long inputs, rigorous and systematic analyses are\nnecessary to understand their abilities and behavior. A salient application is\nsummarization, due to its ubiquity and controversy (e.g., researchers have\ndeclared the death of summarization). In this paper, we use financial report\nsummarization as a case study because financial reports are not only long but\nalso use numbers and tables extensively. We propose a computational framework\nfor characterizing multimodal long-form summarization and investigate the\nbehavior of Claude 2.0/2.1, GPT-4/3.5, and Cohere. We find that GPT-3.5 and\nCohere fail to perform this summarization task meaningfully. For Claude 2 and\nGPT-4, we analyze the extractiveness of the summary and identify a position\nbias in LLMs. This position bias disappears after shuffling the input for\nClaude, which suggests that Claude seems to recognize important information. We\nalso conduct a comprehensive investigation on the use of numeric data in\nLLM-generated summaries and offer a taxonomy of numeric hallucination. We\nemploy prompt engineering to improve GPT-4's use of numbers with limited\nsuccess. Overall, our analyses highlight the strong capability of Claude 2 in\nhandling long multimodal inputs compared to GPT-4. The generated summaries and\nevaluation code are available at\nhttps://github.com/ChicagoHAI/characterizing-multimodal-long-form-summarization.\n","authors":["Tianyu Cao","Natraj Raman","Danial Dervovic","Chenhao Tan"],"pdf_url":"https://arxiv.org/pdf/2404.06162v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.16927v3","updated":"2024-08-15T13:55:30Z","published":"2023-06-29T14:17:24Z","title":"End-to-end Autonomous Driving: Challenges and Frontiers","summary":" The autonomous driving community has witnessed a rapid growth in approaches\nthat embrace an end-to-end algorithm framework, utilizing raw sensor input to\ngenerate vehicle motion plans, instead of concentrating on individual tasks\nsuch as detection and motion prediction. End-to-end systems, in comparison to\nmodular pipelines, benefit from joint feature optimization for perception and\nplanning. This field has flourished due to the availability of large-scale\ndatasets, closed-loop evaluation, and the increasing need for autonomous\ndriving algorithms to perform effectively in challenging scenarios. In this\nsurvey, we provide a comprehensive analysis of more than 270 papers, covering\nthe motivation, roadmap, methodology, challenges, and future trends in\nend-to-end autonomous driving. We delve into several critical challenges,\nincluding multi-modality, interpretability, causal confusion, robustness, and\nworld models, amongst others. Additionally, we discuss current advancements in\nfoundation models and visual pre-training, as well as how to incorporate these\ntechniques within the end-to-end driving framework. 
we maintain an active\nrepository that contains up-to-date literature and open-source projects at\nhttps://github.com/OpenDriveLab/End-to-end-Autonomous-Driving.\n","authors":["Li Chen","Penghao Wu","Kashyap Chitta","Bernhard Jaeger","Andreas Geiger","Hongyang Li"],"pdf_url":"https://arxiv.org/pdf/2306.16927v3.pdf","comment":"Accepted by IEEE TPAMI"},{"id":"http://arxiv.org/abs/2408.08152v1","updated":"2024-08-15T13:40:03Z","published":"2024-08-15T13:40:03Z","title":"DeepSeek-Prover-V1.5: Harnessing Proof Assistant Feedback for\n Reinforcement Learning and Monte-Carlo Tree Search","summary":" We introduce DeepSeek-Prover-V1.5, an open-source language model designed for\ntheorem proving in Lean 4, which enhances DeepSeek-Prover-V1 by optimizing both\ntraining and inference processes. Pre-trained on DeepSeekMath-Base with\nspecialization in formal mathematical languages, the model undergoes supervised\nfine-tuning using an enhanced formal theorem proving dataset derived from\nDeepSeek-Prover-V1. Further refinement is achieved through reinforcement\nlearning from proof assistant feedback (RLPAF). Beyond the single-pass\nwhole-proof generation approach of DeepSeek-Prover-V1, we propose RMaxTS, a\nvariant of Monte-Carlo tree search that employs an intrinsic-reward-driven\nexploration strategy to generate diverse proof paths. DeepSeek-Prover-V1.5\ndemonstrates significant improvements over DeepSeek-Prover-V1, achieving new\nstate-of-the-art results on the test set of the high school level miniF2F\nbenchmark ($63.5\\%$) and the undergraduate level ProofNet benchmark ($25.3\\%$).\n","authors":["Huajian Xin","Z. Z. Ren","Junxiao Song","Zhihong Shao","Wanjia Zhao","Haocheng Wang","Bo Liu","Liyue Zhang","Xuan Lu","Qiushi Du","Wenjun Gao","Qihao Zhu","Dejian Yang","Zhibin Gou","Z. F. Wu","Fuli Luo","Chong Ruan"],"pdf_url":"https://arxiv.org/pdf/2408.08152v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2206.12532v5","updated":"2024-08-15T13:38:10Z","published":"2022-06-25T02:15:22Z","title":"Inferring Effect Ordering Without Causal Effect Estimation","summary":" Predictive models are often employed to guide interventions across various\ndomains, such as advertising, customer retention, and personalized medicine.\nThese models often do not estimate the actual effects of interventions but\nserve as proxies, suggesting potential effectiveness based on predicted\noutcomes. Our paper addresses the critical question of when and how these\npredictive models can be interpreted causally, specifically focusing on using\nthe models for inferring effect ordering rather than precise effect sizes. We\nformalize two assumptions, full latent mediation and latent monotonicity, that\nare jointly sufficient for inferring effect ordering without direct causal\neffect estimation. We explore the utility of these assumptions in assessing the\nfeasibility of proxies for inferring effect ordering in scenarios where there\nis no data on how individuals behave when intervened or no data on the primary\noutcome of interest. Additionally, we provide practical guidelines for\npractitioners to make their own assessments about proxies. Our findings reveal\nnot only when it is possible to reasonably infer effect ordering from proxies,\nbut also conditions under which modeling these proxies can outperform direct\neffect estimation. 
This study underscores the importance of broadening causal\ninference to encompass alternative causal interpretations beyond effect\nestimation, offering a foundation for future research to enhance\ndecision-making processes when direct effect estimation is not feasible.\n","authors":["Carlos Fernández-Loría","Jorge Loría"],"pdf_url":"https://arxiv.org/pdf/2206.12532v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08147v1","updated":"2024-08-15T13:32:25Z","published":"2024-08-15T13:32:25Z","title":"P/D-Serve: Serving Disaggregated Large Language Model at Scale","summary":" Serving disaggregated large language models (LLMs) over tens of thousands of\nxPU devices (GPUs or NPUs) with reliable performance faces multiple challenges.\n1) Ignoring the diversity (various prefixes and tidal requests), treating all\nthe prompts in a mixed pool is inadequate. To facilitate the similarity per\nscenario and minimize the inner mismatch on P/D (prefill and decoding)\nprocessing, fine-grained organization is required, dynamically adjusting P/D\nratios for better performance. 2) Due to inaccurate estimation on workload\n(queue status or maintained connections), the global scheduler easily incurs\nunnecessary timeouts in prefill. 3) Block-fixed device-to-device (D2D) KVCache\ntransfer over cluster-level RDMA (remote direct memory access) fails to achieve\ndesired D2D utilization as expected. To overcome previous problems, this paper\nproposes an end-to-end system P/D-Serve, complying with the paradigm of MLOps\n(machine learning operations), which models end-to-end (E2E) P/D performance\nand enables: 1) fine-grained P/D organization, mapping the service with RoCE\n(RDMA over converged ethernet) as needed, to facilitate similar processing and\ndynamic adjustments on P/D ratios; 2) on-demand forwarding upon rejections for\nidle prefill, decoupling the scheduler from regular inaccurate reports and\nlocal queues, to avoid timeouts in prefill; and 3) efficient KVCache transfer\nvia optimized D2D access. P/D-Serve is implemented upon Ascend and MindSpore,\nhas been deployed over tens of thousands of NPUs for more than eight months in\ncommercial use, and further achieves 60\\%, 42\\% and 46\\% improvements on E2E\nthroughput, time-to-first-token (TTFT) SLO (service level objective) and D2D\ntransfer time. As the E2E system with optimizations, P/D-Serve achieves 6.7x\nincrease on throughput, compared with aggregated LLMs.\n","authors":["Yibo Jin","Tao Wang","Huimin Lin","Mingyang Song","Peiyang Li","Yipeng Ma","Yicheng Shan","Zhengfan Yuan","Cailong Li","Yajing Sun","Tiandeng Wu","Xing Chu","Ruizhi Huan","Li Ma","Xiao You","Wenting Zhou","Yunpeng Ye","Wen Liu","Xiangkun Xu","Yongsheng Zhang","Tiantian Dong","Jiawei Zhu","Zhe Wang","Xijian Ju","Jianxun Song","Haoliang Cheng","Xiaojing Li","Jiandong Ding","Hefei Guo","Zhengyong Zhang"],"pdf_url":"https://arxiv.org/pdf/2408.08147v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08142v1","updated":"2024-08-15T13:23:59Z","published":"2024-08-15T13:23:59Z","title":"Impact of Comprehensive Data Preprocessing on Predictive Modelling of\n COVID-19 Mortality","summary":" Accurate predictive models are crucial for analysing COVID-19 mortality\ntrends. This study evaluates the impact of a custom data preprocessing pipeline\non ten machine learning models predicting COVID-19 mortality using data from\nOur World in Data (OWID). Our pipeline differs from a standard preprocessing\npipeline through four key steps. 
Firstly, it transforms weekly reported totals\ninto daily updates, correcting reporting biases and providing more accurate\nestimates. Secondly, it uses localised outlier detection and processing to\npreserve data variance and enhance accuracy. Thirdly, it utilises computational\ndependencies among columns to ensure data consistency. Finally, it incorporates\nan iterative feature selection process to optimise the feature set and improve\nmodel performance. Results show a significant improvement with the custom\npipeline: the MLP Regressor achieved a test RMSE of 66.556 and a test R-squared\nof 0.991, surpassing the DecisionTree Regressor from the standard pipeline,\nwhich had a test RMSE of 222.858 and a test R-squared of 0.817. These findings\nhighlight the importance of tailored preprocessing techniques in enhancing\npredictive modelling accuracy for COVID-19 mortality. Although specific to this\nstudy, these methodologies offer valuable insights into diverse datasets and\ndomains, improving predictive performance across various contexts.\n","authors":["Sangita Das","Subhrajyoti Maji"],"pdf_url":"https://arxiv.org/pdf/2408.08142v1.pdf","comment":"8 pages, 5 figures, 2 tables"},{"id":"http://arxiv.org/abs/2312.03492v2","updated":"2024-08-15T13:21:26Z","published":"2023-12-06T13:32:17Z","title":"Learning From Scenarios for Stochastic Repairable Scheduling","summary":" When optimizing problems with uncertain parameter values in a linear\nobjective, decision-focused learning enables end-to-end learning of these\nvalues. We are interested in a stochastic scheduling problem, in which\nprocessing times are uncertain, which brings uncertain values in the\nconstraints, and thus repair of an initial schedule may be needed. Historical\nrealizations of the stochastic processing times are available. We show how\nexisting decision-focused learning techniques based on stochastic smoothing can\nbe adapted to this scheduling problem. We include an extensive experimental\nevaluation to investigate in which situations decision-focused learning\noutperforms the state of the art for such situations: scenario-based stochastic\noptimization.\n","authors":["Kim van den Houten","David M. J. Tax","Esteban Freydell","Mathijs de Weerdt"],"pdf_url":"https://arxiv.org/pdf/2312.03492v2.pdf","comment":"8 pages, updated according to camera-ready version CPAIOR'24"},{"id":"http://arxiv.org/abs/2408.08137v1","updated":"2024-08-15T13:13:17Z","published":"2024-08-15T13:13:17Z","title":"Normalized AOPC: Fixing Misleading Faithfulness Metrics for Feature\n Attribution Explainability","summary":" Deep neural network predictions are notoriously difficult to interpret.\nFeature attribution methods aim to explain these predictions by identifying the\ncontribution of each input feature. Faithfulness, often evaluated using the\narea over the perturbation curve (AOPC), reflects feature attributions'\naccuracy in describing the internal mechanisms of deep neural networks.\nHowever, many studies rely on AOPC to compare faithfulness across different\nmodels, which we show can lead to false conclusions about models' faithfulness.\nSpecifically, we find that AOPC is sensitive to variations in the model,\nresulting in unreliable cross-model comparisons. Moreover, AOPC scores are\ndifficult to interpret in isolation without knowing the model-specific lower\nand upper limits. 
To address these issues, we propose a normalization approach,\nNormalized AOPC (NAOPC), enabling consistent cross-model evaluations and more\nmeaningful interpretation of individual scores. Our experiments demonstrate\nthat this normalization can radically change AOPC results, questioning the\nconclusions of earlier studies and offering a more robust framework for\nassessing feature attribution faithfulness.\n","authors":["Joakim Edin","Andreas Geert Motzfeldt","Casper L. Christensen","Tuukka Ruotsalo","Lars Maaløe","Maria Maistro"],"pdf_url":"https://arxiv.org/pdf/2408.08137v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.17885v3","updated":"2024-08-15T13:08:00Z","published":"2024-06-25T18:47:50Z","title":"Enabling Regional Explainability by Automatic and Model-agnostic Rule\n Extraction","summary":" In Explainable AI, rule extraction translates model knowledge into logical\nrules, such as IF-THEN statements, crucial for understanding patterns learned\nby black-box models. This could significantly aid in fields like disease\ndiagnosis, disease progression estimation, or drug discovery. However, such\napplication domains often contain imbalanced data, with the class of interest\nunderrepresented. Existing methods inevitably compromise the performance of\nrules for the minor class to maximise the overall performance. As the first\nattempt in this field, we propose a model-agnostic approach for extracting\nrules from specific subgroups of data, featuring automatic rule generation for\nnumerical features. This method enhances the regional explainability of machine\nlearning models and offers wider applicability compared to existing methods. We\nadditionally introduce a new method for selecting features to compose rules,\nreducing computational costs in high-dimensional spaces. Experiments across\nvarious datasets and models demonstrate the effectiveness of our methods.\n","authors":["Yu Chen","Tianyu Cui","Alexander Capstick","Nan Fletcher-Loyd","Payam Barnaghi"],"pdf_url":"https://arxiv.org/pdf/2406.17885v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08133v1","updated":"2024-08-15T13:07:51Z","published":"2024-08-15T13:07:51Z","title":"EXPLAIN, AGREE, LEARN: Scaling Learning for Neural Probabilistic Logic","summary":" Neural probabilistic logic systems follow the neuro-symbolic (NeSy) paradigm\nby combining the perceptive and learning capabilities of neural networks with\nthe robustness of probabilistic logic. Learning corresponds to likelihood\noptimization of the neural networks. However, to obtain the likelihood exactly,\nexpensive probabilistic logic inference is required. To scale learning to more\ncomplex systems, we therefore propose to instead optimize a sampling based\nobjective. We prove that the objective has a bounded error with respect to the\nlikelihood, which vanishes when increasing the sample count. Furthermore, the\nerror vanishes faster by exploiting a new concept of sample diversity. We then\ndevelop the EXPLAIN, AGREE, LEARN (EXAL) method that uses this objective.\nEXPLAIN samples explanations for the data. AGREE reweighs each explanation in\nconcordance with the neural component. LEARN uses the reweighed explanations as\na signal for learning. 
In contrast to previous NeSy methods, EXAL can scale to\nlarger problem sizes while retaining theoretical guarantees on the error.\nExperimentally, our theoretical claims are verified and EXAL outperforms recent\nNeSy methods when scaling up the MNIST addition and Warcraft pathfinding\nproblems.\n","authors":["Victor Verreet","Lennert De Smet","Luc De Raedt","Emanuele Sansone"],"pdf_url":"https://arxiv.org/pdf/2408.08133v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08119v1","updated":"2024-08-15T12:38:10Z","published":"2024-08-15T12:38:10Z","title":"The Unreasonable Effectiveness of Solving Inverse Problems with Neural\n Networks","summary":" Finding model parameters from data is an essential task in science and\nengineering, from weather and climate forecasts to plasma control. Previous\nworks have employed neural networks to greatly accelerate finding solutions to\ninverse problems. Of particular interest are end-to-end models which utilize\ndifferentiable simulations in order to backpropagate feedback from the\nsimulated process to the network weights and enable roll-out of multiple time\nsteps. So far, it has been assumed that, while model inference is faster than\nclassical optimization, this comes at the cost of a decrease in solution\naccuracy. We show that this is generally not true. In fact, neural networks\ntrained to learn solutions to inverse problems can find better solutions than\nclassical optimizers even on their training set. To demonstrate this, we\nperform both a theoretical analysis as well an extensive empirical evaluation\non challenging problems involving local minima, chaos, and zero-gradient\nregions. Our findings suggest an alternative use for neural networks: rather\nthan generalizing to new data for fast inference, they can also be used to find\nbetter solutions on known data.\n","authors":["Philipp Holl","Nils Thuerey"],"pdf_url":"https://arxiv.org/pdf/2408.08119v1.pdf","comment":"Source code to follow soon: https://ge.in.tum.de"},{"id":"http://arxiv.org/abs/2408.08115v1","updated":"2024-08-15T12:24:22Z","published":"2024-08-15T12:24:22Z","title":"Learned denoising with simulated and experimental low-dose CT data","summary":" Like in many other research fields, recent developments in computational\nimaging have focused on developing machine learning (ML) approaches to tackle\nits main challenges. To improve the performance of computational imaging\nalgorithms, machine learning methods are used for image processing tasks such\nas noise reduction. Generally, these ML methods heavily rely on the\navailability of high-quality data on which they are trained. This work explores\nthe application of ML methods, specifically convolutional neural networks\n(CNNs), in the context of noise reduction for computed tomography (CT) imaging.\nWe utilize a large 2D computed tomography dataset for machine learning to carry\nout for the first time a comprehensive study on the differences between the\nobserved performances of algorithms trained on simulated noisy data and on\nreal-world experimental noisy data. The study compares the performance of two\ncommon CNN architectures, U-Net and MSD-Net, that are trained and evaluated on\nboth simulated and experimental noisy data. The results show that while\nsinogram denoising performed better with simulated noisy data if evaluated in\nthe sinogram domain, the performance did not carry over to the reconstruction\ndomain where training on experimental noisy data shows a higher performance in\ndenoising experimental noisy data. 
Training the algorithms in an end-to-end\nfashion from sinogram to reconstruction significantly improved model\nperformance, emphasizing the importance of matching raw measurement data to\nhigh-quality CT reconstructions. The study furthermore suggests the need for\nmore sophisticated noise simulation approaches to bridge the gap between\nsimulated and real-world data in CT image denoising applications and gives\ninsights into the challenges and opportunities in leveraging simulated data for\nmachine learning in computational imaging.\n","authors":["Maximilian B. Kiss","Ander Biguri","Carola-Bibiane Schönlieb","K. Joost Batenburg","Felix Lucka"],"pdf_url":"https://arxiv.org/pdf/2408.08115v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.03785v3","updated":"2024-08-15T12:17:11Z","published":"2024-01-08T10:06:52Z","title":"Identifying Important Group of Pixels using Interactions","summary":" To better understand the behavior of image classifiers, it is useful to\nvisualize the contribution of individual pixels to the model prediction. In\nthis study, we propose a method, MoXI ($\\textbf{Mo}$del e$\\textbf{X}$planation\nby $\\textbf{I}$nteractions), that efficiently and accurately identifies a group\nof pixels with high prediction confidence. The proposed method employs\ngame-theoretic concepts, Shapley values and interactions, taking into account\nthe effects of individual pixels and the cooperative influence of pixels on\nmodel confidence. Theoretical analysis and experiments demonstrate that our\nmethod better identifies the pixels that are highly contributing to the model\noutputs than widely-used visualization by Grad-CAM, Attention rollout, and\nShapley value. While prior studies have suffered from the exponential\ncomputational cost in the computation of Shapley value and interactions, we\nshow that this can be reduced to quadratic cost for our task. The code is\navailable at https://github.com/KosukeSumiyasu/MoXI.\n","authors":["Kosuke Sumiyasu","Kazuhiko Kawamoto","Hiroshi Kera"],"pdf_url":"https://arxiv.org/pdf/2401.03785v3.pdf","comment":"CVPR 2024 (update: minor typos, new references, Eqs. (12) and (13))"},{"id":"http://arxiv.org/abs/2408.08109v1","updated":"2024-08-15T12:13:23Z","published":"2024-08-15T12:13:23Z","title":"Hearing Your Blood Sugar: Non-Invasive Glucose Measurement Through\n Simple Vocal Signals, Transforming any Speech into a Sensor with Machine\n Learning","summary":" Effective diabetes management relies heavily on the continuous monitoring of\nblood glucose levels, traditionally achieved through invasive and uncomfortable\nmethods. While various non-invasive techniques have been explored, such as\noptical, microwave, and electrochemical approaches, none have effectively\nsupplanted these invasive technologies due to issues related to complexity,\naccuracy, and cost. In this study, we present a transformative and\nstraightforward method that utilizes voice analysis to predict blood glucose\nlevels. Our research investigates the relationship between fluctuations in\nblood glucose and vocal characteristics, highlighting the influence of blood\nvessel dynamics during voice production. By applying advanced machine learning\nalgorithms, we analyzed vocal signal variations and established a significant\ncorrelation with blood glucose levels. We developed a predictive model using\nartificial intelligence, based on voice recordings and corresponding glucose\nmeasurements from participants, utilizing logistic regression and Ridge\nregularization. 
Our findings indicate that voice analysis may serve as a viable\nnon-invasive alternative for glucose monitoring. This innovative approach not\nonly has the potential to streamline and reduce the costs associated with\ndiabetes management but also aims to enhance the quality of life for\nindividuals living with diabetes by providing a painless and user-friendly\nmethod for monitoring blood sugar levels.\n","authors":["Nihat Ahmadli","Mehmet Ali Sarsil","Onur Ergen"],"pdf_url":"https://arxiv.org/pdf/2408.08109v1.pdf","comment":"5 figure and 5 tables. This manuscript is a pre-print to be submitted\n to a journal or/and a conference. arXiv admin note: substantial text overlap\n with arXiv:2402.13812"},{"id":"http://arxiv.org/abs/2408.08106v1","updated":"2024-08-15T12:10:50Z","published":"2024-08-15T12:10:50Z","title":"Adaptation of uncertainty-penalized Bayesian information criterion for\n parametric partial differential equation discovery","summary":" Data-driven discovery of partial differential equations (PDEs) has emerged as\na promising approach for deriving governing physics when domain knowledge about\nobserved data is limited. Despite recent progress, the identification of\ngoverning equations and their parametric dependencies using conventional\ninformation criteria remains challenging in noisy situations, as the criteria\ntend to select overly complex PDEs. In this paper, we introduce an extension of\nthe uncertainty-penalized Bayesian information criterion (UBIC), which is\nadapted to solve parametric PDE discovery problems efficiently without\nrequiring computationally expensive PDE simulations. This extended UBIC uses\nquantified PDE uncertainty over different temporal or spatial points to prevent\noverfitting in model selection. The UBIC is computed with data transformation\nbased on power spectral densities to discover the governing parametric PDE that\ntruly captures qualitative features in frequency space with a few significant\nterms and their parametric dependencies (i.e., the varying PDE coefficients),\nevaluated with confidence intervals. Numerical experiments on canonical PDEs\ndemonstrate that our extended UBIC can identify the true number of terms and\ntheir varying coefficients accurately, even in the presence of noise. The code\nis available at\n\\url{https://github.com/Pongpisit-Thanasutives/parametric-discovery}.\n","authors":["Pongpisit Thanasutives","Ken-ichi Fukui"],"pdf_url":"https://arxiv.org/pdf/2408.08106v1.pdf","comment":"17 pages, 10 figures"},{"id":"http://arxiv.org/abs/2408.07680v2","updated":"2024-08-15T12:07:00Z","published":"2024-08-14T17:28:58Z","title":"A Spitting Image: Modular Superpixel Tokenization in Vision Transformers","summary":" Vision Transformer (ViT) architectures traditionally employ a grid-based\napproach to tokenization independent of the semantic content of an image. We\npropose a modular superpixel tokenization strategy which decouples tokenization\nand feature extraction; a shift from contemporary approaches where these are\ntreated as an undifferentiated whole. Using on-line content-aware tokenization\nand scale- and shape-invariant positional embeddings, we perform experiments\nand ablations that contrast our approach with patch-based tokenization and\nrandomized partitions as baselines. We show that our method significantly\nimproves the faithfulness of attributions, gives pixel-level granularity on\nzero-shot unsupervised dense prediction tasks, while maintaining predictive\nperformance in classification tasks. 
Our approach provides a modular\ntokenization framework commensurable with standard architectures, extending the\nspace of ViTs to a larger class of semantically-rich models.\n","authors":["Marius Aasan","Odd Kolbjørnsen","Anne Schistad Solberg","Adín Ramirez Rivera"],"pdf_url":"https://arxiv.org/pdf/2408.07680v2.pdf","comment":"To appear in ECCV (MELEX) 2024 Workshop Proceedings"},{"id":"http://arxiv.org/abs/2403.07818v2","updated":"2024-08-15T11:51:57Z","published":"2024-03-12T16:57:56Z","title":"Label Dropout: Improved Deep Learning Echocardiography Segmentation\n Using Multiple Datasets With Domain Shift and Partial Labelling","summary":" Echocardiography (echo) is the first imaging modality used when assessing\ncardiac function. The measurement of functional biomarkers from echo relies\nupon the segmentation of cardiac structures and deep learning models have been\nproposed to automate the segmentation process. However, in order to translate\nthese tools to widespread clinical use it is important that the segmentation\nmodels are robust to a wide variety of images (e.g. acquired from different\nscanners, by operators with different levels of expertise etc.). To achieve\nthis level of robustness it is necessary that the models are trained with\nmultiple diverse datasets. A significant challenge faced when training with\nmultiple diverse datasets is the variation in label presence, i.e. the combined\ndata are often partially-labelled. Adaptations of the cross entropy loss\nfunction have been proposed to deal with partially labelled data. In this paper\nwe show that training naively with such a loss function and multiple diverse\ndatasets can lead to a form of shortcut learning, where the model associates\nlabel presence with domain characteristics, leading to a drop in performance.\nTo address this problem, we propose a novel label dropout scheme to break the\nlink between domain characteristics and the presence or absence of labels. We\ndemonstrate that label dropout improves echo segmentation Dice score by 62% and\n25% on two cardiac structures when training using multiple diverse partially\nlabelled datasets.\n","authors":["Iman Islam","Esther Puyol-Antón","Bram Ruijsink","Andrew J. Reader","Andrew P. King"],"pdf_url":"https://arxiv.org/pdf/2403.07818v2.pdf","comment":"10 pages, 5 figures, ASMUS 2024, Held in Conjunction with MICCAI 2024"},{"id":"http://arxiv.org/abs/2406.12614v2","updated":"2024-08-15T11:43:23Z","published":"2024-06-18T13:43:22Z","title":"EUvsDisinfo: a Dataset for Multilingual Detection of Pro-Kremlin\n Disinformation in News Articles","summary":" This work introduces EUvsDisinfo, a multilingual dataset of trustworthy and\ndisinformation articles related to pro-Kremlin themes. It is sourced directly\nfrom the debunk articles written by experts leading the EUvsDisinfo project.\nOur dataset is the largest to-date resource in terms of the overall number of\narticles and distinct languages. It also provides the largest topical and\ntemporal coverage. Using this dataset, we investigate the dissemination of\npro-Kremlin disinformation across different languages, uncovering\nlanguage-specific patterns targeting specific disinformation topics. We further\nanalyse the evolution of topic distribution over an eight-year period, noting a\nsignificant surge in disinformation content before the full-scale invasion of\nUkraine in 2022. 
Lastly, we demonstrate the dataset's applicability in training\nmodels to effectively distinguish between disinformation and trustworthy\ncontent in multilingual settings.\n","authors":["João A. Leite","Olesya Razuvayevskaya","Kalina Bontcheva","Carolina Scarton"],"pdf_url":"https://arxiv.org/pdf/2406.12614v2.pdf","comment":"Published at CIKM 2024"},{"id":"http://arxiv.org/abs/2403.00680v2","updated":"2024-08-15T11:30:39Z","published":"2024-03-01T17:12:53Z","title":"Scalable Learning of Item Response Theory Models","summary":" Item Response Theory (IRT) models aim to assess latent abilities of $n$\nexaminees along with latent difficulty characteristics of $m$ test items from\ncategorical data that indicates the quality of their corresponding answers.\nClassical psychometric assessments are based on a relatively small number of\nexaminees and items, say a class of $200$ students solving an exam comprising\n$10$ problems. More recent global large scale assessments such as PISA, or\ninternet studies, may lead to significantly increased numbers of participants.\nAdditionally, in the context of Machine Learning where algorithms take the role\nof examinees and data analysis problems take the role of items, both $n$ and\n$m$ may become very large, challenging the efficiency and scalability of\ncomputations. To learn the latent variables in IRT models from large data, we\nleverage the similarity of these models to logistic regression, which can be\napproximated accurately using small weighted subsets called coresets. We\ndevelop coresets for their use in alternating IRT training algorithms,\nfacilitating scalable learning from large data.\n","authors":["Susanne Frick","Amer Krivošija","Alexander Munteanu"],"pdf_url":"https://arxiv.org/pdf/2403.00680v2.pdf","comment":"Published in AISTATS 2024. V2: References updated"},{"id":"http://arxiv.org/abs/2408.08084v1","updated":"2024-08-15T11:26:28Z","published":"2024-08-15T11:26:28Z","title":"An Efficient Replay for Class-Incremental Learning with Pre-trained\n Models","summary":" In general class-incremental learning, researchers typically use sample sets\nas a tool to avoid catastrophic forgetting during continuous learning. At the\nsame time, researchers have also noted the differences between\nclass-incremental learning and Oracle training and have attempted to make\ncorrections. In recent years, researchers have begun to develop\nclass-incremental learning algorithms utilizing pre-trained models, achieving\nsignificant results. This paper observes that in class-incremental learning,\nthe steady state among the weight guided by each class center is disrupted,\nwhich is significantly correlated with catastrophic forgetting. Based on this,\nwe propose a new method to overcoming forgetting . In some cases, by retaining\nonly a single sample unit of each class in memory for replay and applying\nsimple gradient constraints, very good results can be achieved. 
Experimental\nresults indicate that under the condition of pre-trained models, our method can\nachieve competitive performance with very low computational cost and by simply\nusing the cross-entropy loss.\n","authors":["Weimin Yin","Bin Chen adn Chunzhao Xie","Zhenhao Tan"],"pdf_url":"https://arxiv.org/pdf/2408.08084v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08075v1","updated":"2024-08-15T11:02:05Z","published":"2024-08-15T11:02:05Z","title":"Independent Policy Mirror Descent for Markov Potential Games: Scaling to\n Large Number of Players","summary":" Markov Potential Games (MPGs) form an important sub-class of Markov games,\nwhich are a common framework to model multi-agent reinforcement learning\nproblems. In particular, MPGs include as a special case the identical-interest\nsetting where all the agents share the same reward function. Scaling the\nperformance of Nash equilibrium learning algorithms to a large number of agents\nis crucial for multi-agent systems. To address this important challenge, we\nfocus on the independent learning setting where agents can only have access to\ntheir local information to update their own policy. In prior work on MPGs, the\niteration complexity for obtaining $\\epsilon$-Nash regret scales linearly with\nthe number of agents $N$. In this work, we investigate the iteration complexity\nof an independent policy mirror descent (PMD) algorithm for MPGs. We show that\nPMD with KL regularization, also known as natural policy gradient, enjoys a\nbetter $\\sqrt{N}$ dependence on the number of agents, improving over PMD with\nEuclidean regularization and prior work. Furthermore, the iteration complexity\nis also independent of the sizes of the agents' action spaces.\n","authors":["Pragnya Alatur","Anas Barakat","Niao He"],"pdf_url":"https://arxiv.org/pdf/2408.08075v1.pdf","comment":"16 pages, CDC 2024"},{"id":"http://arxiv.org/abs/2408.08074v1","updated":"2024-08-15T11:01:35Z","published":"2024-08-15T11:01:35Z","title":"A Survey on Integrated Sensing, Communication, and Computation","summary":" The forthcoming generation of wireless technology, 6G, promises a\nrevolutionary leap beyond traditional data-centric services. It aims to usher\nin an era of ubiquitous intelligent services, where everything is\ninterconnected and intelligent. This vision requires the seamless integration\nof three fundamental modules: Sensing for information acquisition,\ncommunication for information sharing, and computation for information\nprocessing and decision-making. These modules are intricately linked,\nespecially in complex tasks such as edge learning and inference. However, the\nperformance of these modules is interdependent, creating a resource competition\nfor time, energy, and bandwidth. Existing techniques like integrated\ncommunication and computation (ICC), integrated sensing and computation (ISC),\nand integrated sensing and communication (ISAC) have made partial strides in\naddressing this challenge, but they fall short of meeting the extreme\nperformance requirements. To overcome these limitations, it is essential to\ndevelop new techniques that comprehensively integrate sensing, communication,\nand computation. This integrated approach, known as Integrated Sensing,\nCommunication, and Computation (ISCC), offers a systematic perspective for\nenhancing task performance. This paper begins with a comprehensive survey of\nhistoric and related techniques such as ICC, ISC, and ISAC, highlighting their\nstrengths and limitations. 
It then explores the state-of-the-art signal designs\nfor ISCC, along with network resource management strategies specifically\ntailored for ISCC. Furthermore, this paper discusses the exciting research\nopportunities that lie ahead for implementing ISCC in future advanced networks.\nBy embracing ISCC, we can unlock the full potential of intelligent\nconnectivity, paving the way for groundbreaking applications and services.\n","authors":["Dingzhu Wen","Yong Zhou","Xiaoyang Li","Yuanming Shi","Kaibin Huang","Khaled B. Letaief"],"pdf_url":"https://arxiv.org/pdf/2408.08074v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.18458v2","updated":"2024-08-15T10:58:17Z","published":"2024-05-28T17:27:20Z","title":"Asymmetrical estimator for training encapsulated deep photonic neural\n networks","summary":" Scalable isomorphic physical neural networks (PNNs) are emerging NN\nacceleration paradigms for their high-bandwidth, in-propagation computation.\nAlthough backpropagation (BP)-based training is often the industry standard for\nits robustness and fast gradient convergence, existing BP-PNN training methods\nneed to truncate the propagation of the analogue signal at each layer and\nacquire accurate hidden neuron readouts for deep networks. This compromises the\nappeal of PNNs for fast in-propagation processing. In addition, the required\nreadouts introduce massive bottlenecks due to the conversions between the\nanalogue-digital interfaces to shuttle information across. These factors limit\nboth the time and energy efficiency during training. Here we introduce the\nasymmetrical training (AT) method, a BP-based method that can perform training\non an encapsulated deep network, where the information propagation is\nmaintained within the analogue domain until the output layer. AT's minimal\ninformation access bypasses the analogue-digital interface bottleneck wherever\npossible. For any deep network structure, AT offers significantly improved time\nand energy efficiency compared to existing BP-PNN methods, and scales well for\nlarge network sizes. We demonstrated AT's error-tolerant and calibration-free\ntraining for encapsulated integrated photonic deep networks to achieve\nnear-ideal BP performance. AT's well-behaved training is demonstrated repeatably\nacross different datasets and network structures.\n","authors":["Yizhi Wang","Minjia Chen","Chunhui Yao","Jie Ma","Ting Yan","Richard Penty","Qixiang Cheng"],"pdf_url":"https://arxiv.org/pdf/2405.18458v2.pdf","comment":"21 pages, 6 figures"},{"id":"http://arxiv.org/abs/2408.08073v1","updated":"2024-08-15T10:54:55Z","published":"2024-08-15T10:54:55Z","title":"Extracting Sentence Embeddings from Pretrained Transformer Models","summary":" Background/introduction: Pre-trained transformer models shine in many natural\nlanguage processing tasks and therefore are expected to bear the representation\nof the input sentence or text meaning. These sentence-level embeddings are also\nimportant in retrieval-augmented generation. But do commonly used plain\naveraging or prompt templates surface it enough?\n Methods: Given the 110M-parameter BERT's hidden representations from multiple\nlayers and multiple tokens, we tried various ways to extract optimal sentence\nrepresentations. We tested various token aggregation and representation\npost-processing techniques. We also tested multiple ways of using a general\nWikitext dataset to complement BERT's sentence representations. 
All methods were\ntested on 8 Semantic Textual Similarity (STS), 6 short text clustering, and 12\nclassification tasks. We also evaluated our representation-shaping techniques\non other static models, including random token representations.\n Results: Proposed representation extraction methods improved the performance\non STS and clustering tasks for all models considered. Improvements are especially\nhigh for static token-based models; in particular, random embeddings on STS tasks\nalmost reach the performance of BERT-derived representations.\n Conclusions: Our work shows that, for multiple tasks, simple baselines with\nrepresentation-shaping techniques reach or even outperform more complex\nBERT-based models, or are able to contribute to their performance.\n","authors":["Lukas Stankevičius","Mantas Lukoševičius"],"pdf_url":"https://arxiv.org/pdf/2408.08073v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08071v1","updated":"2024-08-15T10:44:33Z","published":"2024-08-15T10:44:33Z","title":"Universality of Real Minimal Complexity Reservoir","summary":" Reservoir Computing (RC) models, a subclass of recurrent neural networks, are\ndistinguished by their fixed, non-trainable input layer and dynamically coupled\nreservoir, with only the static readout layer being trained. This design\ncircumvents the issues associated with backpropagating error signals through\ntime, thereby enhancing both stability and training efficiency. RC models have\nbeen successfully applied across a broad range of application domains.\nCrucially, they have been demonstrated to be universal approximators of\ntime-invariant dynamic filters with fading memory, under various settings of\napproximation norms and input driving sources.\n Simple Cycle Reservoirs (SCR) represent a specialized class of RC models with\na highly constrained reservoir architecture, characterized by uniform ring\nconnectivity and binary input-to-reservoir weights with an aperiodic sign\npattern. For linear reservoirs, given the reservoir size, the reservoir\nconstruction has only one degree of freedom -- the reservoir cycle weight. Such\narchitectures are particularly amenable to hardware implementations without\nsignificant performance degradation in many practical tasks. In this study we\nendow these observations with solid theoretical foundations by proving that\nSCRs operating in the real domain are universal approximators of time-invariant\ndynamic filters with fading memory. Our results supplement recent research\nshowing that SCRs in the complex domain can approximate, to arbitrary\nprecision, any unrestricted linear reservoir with a non-linear readout. We\nfurthermore introduce a novel method to drastically reduce the number of SCR\nunits, making such highly constrained architectures natural candidates for\nlow-complexity hardware implementations. Our findings are supported by\nempirical studies on real-world time series datasets.\n","authors":["Robert Simon Fong","Boyu Li","Peter Tiňo"],"pdf_url":"https://arxiv.org/pdf/2408.08071v1.pdf","comment":"19 pages, 5 figures"},{"id":"http://arxiv.org/abs/2312.13764v3","updated":"2024-08-15T10:03:37Z","published":"2023-12-21T11:43:41Z","title":"A Semantic Space is Worth 256 Language Descriptions: Make Stronger\n Segmentation Models with Descriptive Properties","summary":" This paper introduces ProLab, a novel approach using property-level label\nspace for creating strong interpretable segmentation models. 
Instead of relying\nsolely on category-specific annotations, ProLab uses descriptive properties\ngrounded in common sense knowledge for supervising segmentation models. It is\nbased on two core designs. First, we employ Large Language Models (LLMs) and\ncarefully crafted prompts to generate descriptions of all involved categories\nthat carry meaningful common sense knowledge and follow a structured format.\nSecond, we introduce a description embedding model preserving semantic\ncorrelation across descriptions and then cluster them into a set of descriptive\nproperties (e.g., 256) using K-Means. These properties are based on\ninterpretable common sense knowledge consistent with theories of human\nrecognition. We empirically show that our approach makes segmentation models\nperform stronger on five classic benchmarks (e.g., ADE20K, COCO-Stuff, Pascal\nContext, Cityscapes, and BDD). Our method also shows better scalability with\nextended training steps than category-level supervision. Our interpretable\nsegmentation framework also emerges with the generalization ability to segment\nout-of-domain or unknown categories using only in-domain descriptive\nproperties. Code is available at https://github.com/lambert-x/ProLab.\n","authors":["Junfei Xiao","Ziqi Zhou","Wenxuan Li","Shiyi Lan","Jieru Mei","Zhiding Yu","Alan Yuille","Yuyin Zhou","Cihang Xie"],"pdf_url":"https://arxiv.org/pdf/2312.13764v3.pdf","comment":"Accepted to ECCV 2024. Code is available at\n https://github.com/lambert-x/ProLab"},{"id":"http://arxiv.org/abs/2408.08062v1","updated":"2024-08-15T10:03:30Z","published":"2024-08-15T10:03:30Z","title":"BINDy -- Bayesian identification of nonlinear dynamics with\n reversible-jump Markov-chain Monte-Carlo","summary":" Model parsimony is an important \\emph{cognitive bias} in data-driven\nmodelling that aids interpretability and helps to prevent over-fitting. Sparse\nidentification of nonlinear dynamics (SINDy) methods are able to learn sparse\nrepresentations of complex dynamics directly from data, given a basis of\nlibrary functions. In this work, a novel Bayesian treatment of dictionary\nlearning system identification, as an alternative to SINDy, is envisaged. The\nproposed method -- Bayesian identification of nonlinear dynamics (BINDy) -- is\ndistinct from previous approaches in that it targets the full joint posterior\ndistribution over both the terms in the library and their parameterisation in\nthe model. This formulation confers the advantage that an arbitrary prior may\nbe placed over the model structure to produce models that are sparse in the\nmodel space rather than in parameter space. Because this posterior is defined\nover parameter vectors that can change in dimension, the inference cannot be\nperformed by standard techniques. Instead, a Gibbs sampler based on\nreversible-jump Markov-chain Monte-Carlo is proposed. BINDy is shown to compare\nfavourably to ensemble SINDy in three benchmark case-studies. In particular, it\nis seen that the proposed method is better able to assign high probability to\ncorrect model terms.\n","authors":["Max D. Champneys","Timothy J. 
Rogers"],"pdf_url":"https://arxiv.org/pdf/2408.08062v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.16663v4","updated":"2024-08-15T10:03:16Z","published":"2024-04-25T15:04:27Z","title":"Conditional Fairness for Generative AIs","summary":" The deployment of generative AI (GenAI) models raises significant fairness\nconcerns, addressed in this paper through novel characterization and\nenforcement techniques specific to GenAI. Unlike standard AI performing\nspecific tasks, GenAI's broad functionality requires \"conditional fairness\"\ntailored to the context being generated, such as demographic fairness in\ngenerating images of poor people versus successful business leaders. We define\ntwo fairness levels: the first evaluates fairness in generated outputs,\nindependent of prompts and models; the second assesses inherent fairness with\nneutral prompts. Given the complexity of GenAI and challenges in fairness\nspecifications, we focus on bounding the worst case, considering a GenAI system\nunfair if the distance between appearances of a specific group exceeds preset\nthresholds. We also explore combinatorial testing for assessing relative\ncompleteness in intersectional fairness. By bounding the worst case, we develop\na prompt injection scheme within an agent-based framework to enforce\nconditional fairness with minimal intervention, validated on state-of-the-art\nGenAI systems.\n","authors":["Chih-Hong Cheng","Harald Ruess","Changshun Wu","Xingyu Zhao"],"pdf_url":"https://arxiv.org/pdf/2404.16663v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08059v1","updated":"2024-08-15T09:59:26Z","published":"2024-08-15T09:59:26Z","title":"Maximally Permissive Reward Machines","summary":" Reward machines allow the definition of rewards for temporally extended tasks\nand behaviors. Specifying \"informative\" reward machines can be challenging. One\nway to address this is to generate reward machines from a high-level abstract\ndescription of the learning environment, using techniques such as AI planning.\nHowever, previous planning-based approaches generate a reward machine based on\na single (sequential or partial-order) plan, and do not allow maximum\nflexibility to the learning agent. In this paper we propose a new approach to\nsynthesising reward machines which is based on the set of partial order plans\nfor a goal. We prove that learning using such \"maximally permissive\" reward\nmachines results in higher rewards than learning using RMs based on a single\nplan. We present experimental results which support our theoretical claims by\nshowing that our approach obtains higher rewards than the single-plan approach\nin practice.\n","authors":["Giovanni Varricchione","Natasha Alechina","Mehdi Dastani","Brian Logan"],"pdf_url":"https://arxiv.org/pdf/2408.08059v1.pdf","comment":"Paper accepted for publication at the European Conference on\n Artificial Intelligence (ECAI) 2024"},{"id":"http://arxiv.org/abs/2408.08058v1","updated":"2024-08-15T09:55:51Z","published":"2024-08-15T09:55:51Z","title":"Navigating Data Scarcity using Foundation Models: A Benchmark of\n Few-Shot and Zero-Shot Learning Approaches in Medical Imaging","summary":" Data scarcity is a major limiting factor for applying modern machine learning\ntechniques to clinical tasks. Although sufficient data exists for some\nwell-studied medical tasks, there remains a long tail of clinically relevant\ntasks with poor data availability. 
Recently, numerous foundation models have\ndemonstrated high suitability for few-shot learning (FSL) and zero-shot\nlearning (ZSL), potentially making them more accessible to practitioners.\nHowever, it remains unclear which foundation model performs best on FSL medical\nimage analysis tasks and what the optimal methods are for learning from limited\ndata. We conducted a comprehensive benchmark study of ZSL and FSL using 16\npretrained foundation models on 19 diverse medical imaging datasets. Our\nresults indicate that BiomedCLIP, a model pretrained exclusively on medical\ndata, performs best on average for very small training set sizes, while very\nlarge CLIP models pretrained on LAION-2B perform best with slightly more\ntraining samples. However, simply fine-tuning a ResNet-18 pretrained on\nImageNet performs similarly with more than five training examples per class.\nOur findings also highlight the need for further research on foundation models\nspecifically tailored for medical applications and the collection of more\ndatasets to train these models.\n","authors":["Stefano Woerner","Christian F. Baumgartner"],"pdf_url":"https://arxiv.org/pdf/2408.08058v1.pdf","comment":"Accepted as an oral presentation in MICCAI 2024 2nd International\n Workshop on Foundation Models for General Medical AI"},{"id":"http://arxiv.org/abs/2408.08056v1","updated":"2024-08-15T09:50:11Z","published":"2024-08-15T09:50:11Z","title":"DATTA: Towards Diversity Adaptive Test-Time Adaptation in Dynamic Wild\n World","summary":" Test-time adaptation (TTA) effectively addresses distribution shifts between\ntraining and testing data by adjusting models on test samples, which is crucial\nfor improving model inference in real-world applications. However, traditional\nTTA methods typically follow a fixed pattern to address the dynamic data\npatterns (low-diversity or high-diversity patterns), often leading to\nperformance degradation and consequently a decline in Quality of Experience\n(QoE). The primary issues we observed are: Different scenarios require different\nnormalization methods (e.g., Instance Normalization is optimal in mixed domains\nbut not in static domains). Model fine-tuning can potentially harm the model\nand waste time. Hence, it is crucial to design strategies for effectively\nmeasuring and managing distribution diversity to minimize its negative impact\non model performance. Based on these observations, this paper proposes a new\ngeneral method, named Diversity Adaptive Test-Time Adaptation (DATTA), aimed at\nimproving QoE. DATTA dynamically selects the best batch normalization methods\nand fine-tuning strategies by leveraging the Diversity Score to differentiate\nbetween high and low diversity score batches. It features three key components:\nDiversity Discrimination (DD) to assess batch diversity, Diversity Adaptive\nBatch Normalization (DABN) to tailor normalization methods based on DD\ninsights, and Diversity Adaptive Fine-Tuning (DAFT) to selectively fine-tune\nthe model. Experimental results show that our method achieves up to a 21%\nincrease in accuracy compared to state-of-the-art methodologies, indicating\nthat our method maintains good model performance while demonstrating its\nrobustness. 
Our code will be released soon.\n","authors":["Chuyang Ye","Dongyan Wei","Zhendong Liu","Yuanyi Pang","Yixi Lin","Jiarong Liao","Qinting Jiang","Xianghua Fu","Qing Li","Jingyan Jiang"],"pdf_url":"https://arxiv.org/pdf/2408.08056v1.pdf","comment":"16 pages, 2 figures"},{"id":"http://arxiv.org/abs/2408.08055v1","updated":"2024-08-15T09:49:37Z","published":"2024-08-15T09:49:37Z","title":"COTODE: COntinuous Trajectory neural Ordinary Differential Equations for\n modelling event sequences","summary":" Observation of the underlying actors that generate event sequences reveals\nthat they often evolve continuously. Most modern methods, however, tend to\nmodel such processes through at most piecewise-continuous trajectories. To\naddress this, we adopt a way of viewing events not as standalone phenomena but\ninstead as observations of a Gaussian Process, which in turn governs the\nactor's dynamics. We propose integrating these obtained dynamics, resulting in\na continuous-trajectory modification of the widely successful Neural ODE model.\nThrough Gaussian Process theory, we were able to evaluate the uncertainty in an\nactor's representation, which arises from not observing them between events.\nThis estimate led us to develop a novel, theoretically backed negative feedback\nmechanism. Empirical studies indicate that our model with Gaussian process\ninterpolation and negative feedback achieves state-of-the-art performance, with\nimprovements up to 20% AUROC against similar architectures.\n","authors":["Ilya Kuleshov","Galina Boeva","Vladislav Zhuzhel","Evgenia Romanenkova","Evgeni Vorsin","Alexey Zaytsev"],"pdf_url":"https://arxiv.org/pdf/2408.08055v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08047v1","updated":"2024-08-15T09:26:26Z","published":"2024-08-15T09:26:26Z","title":"An Efficient Continuous Control Perspective for\n Reinforcement-Learning-based Sequential Recommendation","summary":" Sequential recommendation, where user preference is dynamically inferred from\nsequential historical behaviors, is a critical task in recommender systems\n(RSs). To further optimize long-term user engagement, offline\nreinforcement-learning-based RSs have become a mainstream technique as they\nprovide an additional advantage in avoiding global explorations that may harm\nonline users' experiences. However, previous studies mainly focus on discrete\naction and policy spaces, which might have difficulties in handling\ndramatically growing items efficiently.\n To mitigate this issue, in this paper, we aim to design an algorithmic\nframework applicable to continuous policies. To facilitate the control in the\nlow-dimensional but dense user preference space, we propose an\n\\underline{\\textbf{E}}fficient \\underline{\\textbf{Co}}ntinuous\n\\underline{\\textbf{C}}ontrol framework (ECoC). Based on a statistically tested\nassumption, we first propose the novel unified action representation abstracted\nfrom normalized user and item spaces. Then, we develop the corresponding policy\nevaluation and policy improvement procedures. During this process, strategic\nexploration and directional control in terms of unified actions are carefully\ndesigned and crucial to final recommendation decisions. Moreover, beneficial\nfrom unified actions, the conservatism regularization for policies and value\nfunctions are combined and perfectly compatible with the continuous framework.\nThe resulting dual regularization ensures the successful offline training of\nRL-based recommendation policies. 
Finally, we conduct extensive experiments to\nvalidate the effectiveness of our framework. The results show that compared to\nthe discrete baselines, our ECoC is trained far more efficiently. Meanwhile,\nthe final policies outperform baselines in both capturing the offline data and\ngaining long-term rewards.\n","authors":["Jun Wang","Likang Wu","Qi Liu","Yu Yang"],"pdf_url":"https://arxiv.org/pdf/2408.08047v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08041v1","updated":"2024-08-15T09:19:42Z","published":"2024-08-15T09:19:42Z","title":"The Clever Hans Effect in Unsupervised Learning","summary":" Unsupervised learning has become an essential building block of AI systems.\nThe representations it produces, e.g. in foundation models, are critical to a\nwide variety of downstream applications. It is therefore important to carefully\nexamine unsupervised models to ensure not only that they produce accurate\npredictions, but also that these predictions are not \"right for the wrong\nreasons\", the so-called Clever Hans (CH) effect. Using specially developed\nExplainable AI techniques, we show for the first time that CH effects are\nwidespread in unsupervised learning. Our empirical findings are enriched by\ntheoretical insights, which interestingly point to inductive biases in the\nunsupervised learning machine as a primary source of CH effects. Overall, our\nwork sheds light on unexplored risks associated with practical applications of\nunsupervised learning and suggests ways to make unsupervised learning more\nrobust.\n","authors":["Jacob Kauffmann","Jonas Dippel","Lukas Ruff","Wojciech Samek","Klaus-Robert Müller","Grégoire Montavon"],"pdf_url":"https://arxiv.org/pdf/2408.08041v1.pdf","comment":"12 pages + supplement"},{"id":"http://arxiv.org/abs/2404.10635v3","updated":"2024-08-15T09:17:18Z","published":"2024-03-26T15:36:47Z","title":"Compressed Federated Reinforcement Learning with a Generative Model","summary":" Reinforcement learning has recently gained unprecedented popularity, yet it\nstill grapples with sample inefficiency. Addressing this challenge, federated\nreinforcement learning (FedRL) has emerged, wherein agents collaboratively\nlearn a single policy by aggregating local estimations. However, this\naggregation step incurs significant communication costs. In this paper, we\npropose CompFedRL, a communication-efficient FedRL approach incorporating both\n\\textit{periodic aggregation} and (direct/error-feedback) compression\nmechanisms. Specifically, we consider compressed federated $Q$-learning with a\ngenerative model setup, where a central server learns an optimal $Q$-function\nby periodically aggregating compressed $Q$-estimates from local agents. For the\nfirst time, we characterize the impact of these two mechanisms (which have\nremained elusive) by providing a finite-time analysis of our algorithm,\ndemonstrating strong convergence behaviors when utilizing either direct or\nerror-feedback compression. Our bounds indicate improved solution accuracy\nconcerning the number of agents and other federated hyperparameters while\nsimultaneously reducing communication costs. 
To corroborate our theory, we also\nconduct in-depth numerical experiments to verify our findings, considering\nTop-$K$ and Sparsified-$K$ sparsification operators.\n","authors":["Ali Beikmohammadi","Sarit Khirirat","Sindri Magnússon"],"pdf_url":"https://arxiv.org/pdf/2404.10635v3.pdf","comment":"European Conference on Machine Learning and Principles and Practice\n of Knowledge Discovery in Databases (ECML-PKDD 2024)"},{"id":"http://arxiv.org/abs/2407.06162v2","updated":"2024-08-15T08:59:38Z","published":"2024-06-02T17:09:59Z","title":"RNNs, CNNs and Transformers in Human Action Recognition: A Survey and a\n Hybrid Model","summary":" Human Action Recognition (HAR) encompasses the task of monitoring human\nactivities across various domains, including but not limited to medical,\neducational, entertainment, visual surveillance, video retrieval, and the\nidentification of anomalous activities. Over the past decade, the field of HAR\nhas witnessed substantial progress by leveraging Convolutional Neural Networks\n(CNNs) to effectively extract and comprehend intricate information, thereby\nenhancing the overall performance of HAR systems. Recently, the domain of\ncomputer vision has witnessed the emergence of Vision Transformers (ViTs) as a\npotent solution. The efficacy of transformer architecture has been validated\nbeyond the confines of image analysis, extending their applicability to diverse\nvideo-related tasks. Notably, within this landscape, the research community has\nshown keen interest in HAR, acknowledging its manifold utility and widespread\nadoption across various domains. This article aims to present an encompassing\nsurvey that focuses on CNNs and the evolution of Recurrent Neural Networks\n(RNNs) to ViTs given their importance in the domain of HAR. By conducting a\nthorough examination of existing literature and exploring emerging trends, this\nstudy undertakes a critical analysis and synthesis of the accumulated knowledge\nin this field. Additionally, it investigates the ongoing efforts to develop\nhybrid approaches. Following this direction, this article presents a novel\nhybrid model that seeks to integrate the inherent strengths of CNNs and ViTs.\n","authors":["Khaled Alomar","Halil Ibrahim Aysel","Xiaohao Cai"],"pdf_url":"https://arxiv.org/pdf/2407.06162v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08024v1","updated":"2024-08-15T08:47:35Z","published":"2024-08-15T08:47:35Z","title":"Adaptive User Journeys in Pharma E-Commerce with Reinforcement Learning:\n Insights from SwipeRx","summary":" This paper introduces a reinforcement learning (RL) platform that enhances\nend-to-end user journeys in healthcare digital tools through personalization.\nWe explore a case study with SwipeRx, the most popular all-in-one app for\npharmacists in Southeast Asia, demonstrating how the platform can be used to\npersonalize and adapt user experiences. Our RL framework is tested through a\nseries of experiments with product recommendations tailored to each pharmacy\nbased on real-time information on their purchasing history and in-app\nengagement, showing a significant increase in basket size. 
By integrating\nadaptive interventions into existing mobile health solutions and enriching user\njourneys, our platform offers a scalable solution to improve pharmaceutical\nsupply chain management, health worker capacity building, and clinical decision\nand patient care, ultimately contributing to better healthcare outcomes.\n","authors":["Ana Fernández del Río","Michael Brennan Leong","Paulo Saraiva","Ivan Nazarov","Aditya Rastogi","Moiz Hassan","Dexian Tang","África Periáñez"],"pdf_url":"https://arxiv.org/pdf/2408.08024v1.pdf","comment":"Presented at the Third Workshop on End-to-End Customer Journey\n Optimization at KDD 2024 (KDD CJ Workshop '24), August 26, Barcelona, Spain"},{"id":"http://arxiv.org/abs/2408.08023v1","updated":"2024-08-15T08:43:28Z","published":"2024-08-15T08:43:28Z","title":"Causal Discovery from Time-Series Data with Short-Term Invariance-Based\n Convolutional Neural Networks","summary":" Causal discovery from time-series data aims to capture both intra-slice\n(contemporaneous) and inter-slice (time-lagged) causality between variables\nwithin the temporal chain, which is crucial for various scientific disciplines.\nCompared to causal discovery from non-time-series data, causal discovery from\ntime-series data necessitates more serialized samples with a larger amount of\nobserved time steps. To address the challenges, we propose a novel\ngradient-based causal discovery approach STIC, which focuses on\n\\textbf{S}hort-\\textbf{T}erm \\textbf{I}nvariance using \\textbf{C}onvolutional\nneural networks to uncover the causal relationships from time-series data.\nSpecifically, STIC leverages both the short-term time and mechanism invariance\nof causality within each window observation, which possesses the property of\nindependence, to enhance sample efficiency. Furthermore, we construct two\ncausal convolution kernels, which correspond to the short-term time and\nmechanism invariance respectively, to estimate the window causal graph. To\ndemonstrate the necessity of convolutional neural networks for causal discovery\nfrom time-series data, we theoretically derive the equivalence between\nconvolution and the underlying generative principle of time-series data under\nthe assumption that the additive noise model is identifiable. Experimental\nevaluations conducted on both synthetic and FMRI benchmark datasets demonstrate\nthat our STIC outperforms baselines significantly and achieves the\nstate-of-the-art performance, particularly when the datasets contain a limited\nnumber of observed time steps. Code is available at\n\\url{https://github.com/HITshenrj/STIC}.\n","authors":["Rujia Shen","Boran Wang","Chao Zhao","Yi Guan","Jingchi Jiang"],"pdf_url":"https://arxiv.org/pdf/2408.08023v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.19730v3","updated":"2024-08-15T08:41:57Z","published":"2024-05-30T06:21:34Z","title":"Research on the Spatial Data Intelligent Large Model","summary":" This report focuses on spatial data intelligent large models, delving into\nthe principles, methods, and cutting-edge applications of these models. It\nprovides an in-depth discussion on the definition, development history, current\nstatus, and trends of spatial data intelligent large models, as well as the\nchallenges they face. The report systematically elucidates the key technologies\nof spatial data intelligent large models and their applications in urban\nenvironments, aerospace remote sensing, geography, transportation, and other\nscenarios. 
Additionally, it summarizes the latest application cases of spatial\ndata intelligent large models in themes such as urban development, multimodal\nsystems, remote sensing, smart transportation, and resource environments.\nFinally, the report concludes with an overview and outlook on the development\nprospects of spatial data intelligent large models.\n","authors":["Shaohua Wang","Xing Xie","Yong Li","Danhuai Guo","Zhi Cai","Yu Liu","Yang Yue","Xiao Pan","Feng Lu","Huayi Wu","Zhipeng Gui","Zhiming Ding","Bolong Zheng","Fuzheng Zhang","Jingyuan Wang","Zhengchao Chen","Hao Lu","Jiayi Li","Peng Yue","Wenhao Yu","Yao Yao","Leilei Sun","Yong Zhang","Longbiao Chen","Xiaoping Du","Xiang Li","Xueying Zhang","Kun Qin","Zhaoya Gong","Weihua Dong","Xiaofeng Meng"],"pdf_url":"https://arxiv.org/pdf/2405.19730v3.pdf","comment":"V1 and V2 are in Chinese language, other versions are in English"},{"id":"http://arxiv.org/abs/2408.08019v1","updated":"2024-08-15T08:34:00Z","published":"2024-08-15T08:34:00Z","title":"Accelerating High-Fidelity Waveform Generation via Adversarial Flow\n Matching Optimization","summary":" This paper introduces PeriodWave-Turbo, a high-fidelity and high-efficient\nwaveform generation model via adversarial flow matching optimization. Recently,\nconditional flow matching (CFM) generative models have been successfully\nadopted for waveform generation tasks, leveraging a single vector field\nestimation objective for training. Although these models can generate\nhigh-fidelity waveform signals, they require significantly more ODE steps\ncompared to GAN-based models, which only need a single generation step.\nAdditionally, the generated samples often lack high-frequency information due\nto noisy vector field estimation, which fails to ensure high-frequency\nreproduction. To address this limitation, we enhance pre-trained CFM-based\ngenerative models by incorporating a fixed-step generator modification. We\nutilized reconstruction losses and adversarial feedback to accelerate\nhigh-fidelity waveform generation. Through adversarial flow matching\noptimization, it only requires 1,000 steps of fine-tuning to achieve\nstate-of-the-art performance across various objective metrics. Moreover, we\nsignificantly reduce inference speed from 16 steps to 2 or 4 steps.\nAdditionally, by scaling up the backbone of PeriodWave from 29M to 70M\nparameters for improved generalization, PeriodWave-Turbo achieves unprecedented\nperformance, with a perceptual evaluation of speech quality (PESQ) score of\n4.454 on the LibriTTS dataset. Audio samples, source code and checkpoints will\nbe available at https://github.com/sh-lee-prml/PeriodWave.\n","authors":["Sang-Hoon Lee","Ha-Yeong Choi","Seong-Whan Lee"],"pdf_url":"https://arxiv.org/pdf/2408.08019v1.pdf","comment":"9 pages, 9 tables, 1 figure,"},{"id":"http://arxiv.org/abs/2404.19288v2","updated":"2024-08-15T08:32:26Z","published":"2024-04-30T06:36:43Z","title":"Training-free Graph Neural Networks and the Power of Labels as Features","summary":" We propose training-free graph neural networks (TFGNNs), which can be used\nwithout training and can also be improved with optional training, for\ntransductive node classification. We first advocate labels as features (LaF),\nwhich is an admissible but not explored technique. We show that LaF provably\nenhances the expressive power of graph neural networks. We design TFGNNs based\non this analysis. 
In the experiments, we confirm that TFGNNs outperform\nexisting GNNs in the training-free setting and converge with much fewer\ntraining iterations than traditional GNNs.\n","authors":["Ryoma Sato"],"pdf_url":"https://arxiv.org/pdf/2404.19288v2.pdf","comment":"TMLR 2024"},{"id":"http://arxiv.org/abs/2408.04963v2","updated":"2024-08-15T08:26:56Z","published":"2024-08-09T09:29:02Z","title":"LiD-FL: Towards List-Decodable Federated Learning","summary":" Federated learning is often used in environments with many unverified\nparticipants. Therefore, federated learning under adversarial attacks receives\nsignificant attention. This paper proposes an algorithmic framework for\nlist-decodable federated learning, where a central server maintains a list of\nmodels, with at least one guaranteed to perform well. The framework has no\nstrict restriction on the fraction of honest workers, extending the\napplicability of Byzantine federated learning to the scenario with more than\nhalf adversaries. Under proper assumptions on the loss function, we prove a\nconvergence theorem for our method. Experimental results, including image\nclassification tasks with both convex and non-convex losses, demonstrate that\nthe proposed algorithm can withstand the malicious majority under various\nattacks.\n","authors":["Hong Liu","Liren Shan","Han Bao","Ronghui You","Yuhao Yi","Jiancheng Lv"],"pdf_url":"https://arxiv.org/pdf/2408.04963v2.pdf","comment":"26 pages, 5 figures"},{"id":"http://arxiv.org/abs/2408.08015v1","updated":"2024-08-15T08:25:50Z","published":"2024-08-15T08:25:50Z","title":"Asteroid: Resource-Efficient Hybrid Pipeline Parallelism for\n Collaborative DNN Training on Heterogeneous Edge Devices","summary":" On-device Deep Neural Network (DNN) training has been recognized as crucial\nfor privacy-preserving machine learning at the edge. However, the intensive\ntraining workload and limited onboard computing resources pose significant\nchallenges to the availability and efficiency of model training. While existing\nworks address these challenges through native resource management optimization,\nwe instead leverage our observation that edge environments usually comprise a\nrich set of accompanying trusted edge devices with idle resources beyond a\nsingle terminal. We propose Asteroid, a distributed edge training system that\nbreaks the resource walls across heterogeneous edge devices for efficient model\ntraining acceleration. Asteroid adopts a hybrid pipeline parallelism to\norchestrate distributed training, along with a judicious parallelism planning\nfor maximizing throughput under certain resource constraints. Furthermore, a\nfault-tolerant yet lightweight pipeline replay mechanism is developed to tame\nthe device-level dynamics for training robustness and performance stability. We\nimplement Asteroid on heterogeneous edge devices with both vision and language\nmodels, demonstrating up to 12.2x faster training than conventional parallelism\nmethods and 2.1x faster than state-of-the-art hybrid parallelism methods\nthrough evaluations. 
Furthermore, Asteroid can recover the training pipeline 14x\nfaster than baseline methods while preserving comparable throughput despite\nunexpected device exiting and failure.\n","authors":["Shengyuan Ye","Liekang Zeng","Xiaowen Chu","Guoliang Xing","Xu Chen"],"pdf_url":"https://arxiv.org/pdf/2408.08015v1.pdf","comment":"Accepted by The 30th Annual International Conference on Mobile\n Computing and Networking (MobiCom'24)"},{"id":"http://arxiv.org/abs/2408.08006v1","updated":"2024-08-15T08:16:11Z","published":"2024-08-15T08:16:11Z","title":"Hessian QM9: A quantum chemistry database of molecular Hessians in\n implicit solvents","summary":" A significant challenge in computational chemistry is developing\napproximations that accelerate \emph{ab initio} methods while preserving\naccuracy. Machine learning interatomic potentials (MLIPs) have emerged as a\npromising solution for constructing atomistic potentials that can be\ntransferred across different molecular and crystalline systems. Most MLIPs are\ntrained only on energies and forces in vacuum, while an improved description of\nthe potential energy surface could be achieved by including the curvature of\nthe potential energy surface. We present Hessian QM9, the first database of\nequilibrium configurations and numerical Hessian matrices, consisting of 41,645\nmolecules from the QM9 dataset at the $\omega$B97x/6-31G* level. Molecular\nHessians were calculated in vacuum, as well as water, tetrahydrofuran, and\ntoluene using an implicit solvation model. To demonstrate the utility of this\ndataset, we show that incorporating second derivatives of the potential energy\nsurface into the loss function of an MLIP significantly improves the prediction\nof vibrational frequencies in all solvent environments, thus making this\ndataset extremely useful for studying organic molecules in realistic solvent\nenvironments for experimental characterization.\n","authors":["Nicholas J. Williams","Lara Kabalan","Ljiljana Stojanovic","Viktor Zolyomi","Edward O. Pyzer-Knapp"],"pdf_url":"https://arxiv.org/pdf/2408.08006v1.pdf","comment":"7 pages, 2 figures"},{"id":"http://arxiv.org/abs/2408.08005v1","updated":"2024-08-15T08:15:06Z","published":"2024-08-15T08:15:06Z","title":"Inversion-DeepONet: A Novel DeepONet-Based Network with Encoder-Decoder\n for Full Waveform Inversion","summary":" Full waveform inversion (FWI) plays a crucial role in the field of\ngeophysics. There has been a lot of research on applying deep learning (DL)\nmethods to FWI. The success of DL-FWI relies significantly on the quantity and\ndiversity of the datasets. Nevertheless, existing FWI datasets, like OpenFWI,\nwhere sources have fixed locations or identical frequencies, provide limited\ninformation and do not represent the complex real-world scene. For instance,\nlow frequencies help in resolving larger-scale structures, while high\nfrequencies allow for more detailed subsurface features. We consider that\nsimultaneously using sources with different frequencies, instead of performing\ninversion using low-frequency data and then gradually introducing\nhigher-frequency data, is well-motivated and potentially advantageous. Hence,\nwe develop three enhanced datasets based on OpenFWI where each source has\nvarying locations, frequencies, or both. Moreover, we propose a novel deep operator\nnetwork (DeepONet) architecture, Inversion-DeepONet, for FWI. We utilize a\nconvolutional neural network (CNN) to extract features from seismic data in the\nbranch net. Source parameters, such as locations and frequencies, are fed to the\ntrunk net. Then another CNN is employed as the decoder of DeepONet to\nreconstruct the velocity models more effectively. Through experiments, we\nconfirm the superior accuracy and generalization ability of our\nnetwork compared with existing data-driven FWI methods.\n","authors":["Zekai Guo","Lihui Chai","Shengjun Huang","Ye Li"],"pdf_url":"https://arxiv.org/pdf/2408.08005v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2011.14721v4","updated":"2024-08-15T07:37:47Z","published":"2020-11-30T12:02:26Z","title":"Probabilistic Load Forecasting Based on Adaptive Online Learning","summary":" Load forecasting is crucial for multiple energy management tasks such as\nscheduling generation capacity, planning supply and demand, and minimizing\nenergy trade costs. Such relevance has increased even more in recent years due\nto the integration of renewable energies, electric cars, and microgrids.\nConventional load forecasting techniques obtain single-value load forecasts by\nexploiting consumption patterns of past load demand. However, such techniques\ncannot assess intrinsic uncertainties in load demand, and cannot capture\ndynamic changes in consumption patterns. To address these problems, this paper\npresents a method for probabilistic load forecasting based on the adaptive\nonline learning of hidden Markov models. We propose learning and forecasting\ntechniques with theoretical guarantees, and experimentally assess their\nperformance in multiple scenarios. In particular, we develop adaptive online\nlearning techniques that update model parameters recursively, and sequential\nprediction techniques that obtain probabilistic forecasts using the most recent\nparameters. The performance of the method is evaluated using multiple datasets\ncorresponding to regions that have different sizes and display assorted\ntime-varying consumption patterns. The results show that the proposed method\ncan significantly improve the performance of existing techniques for a wide\nrange of scenarios.\n","authors":["Verónica Álvarez","Santiago Mazuelas","José A. Lozano"],"pdf_url":"https://arxiv.org/pdf/2011.14721v4.pdf","comment":"\copyright 2021 IEEE. Personal use of this material is permitted.\n Permission from IEEE must be obtained for all other uses, in any current or\n future media, including reprinting/republishing this material for advertising\n or promotional purposes, creating new collective works, for resale or\n redistribution to servers or lists, or reuse of any copyrighted component of\n this work in other works"},{"id":"http://arxiv.org/abs/2408.07986v1","updated":"2024-08-15T07:25:52Z","published":"2024-08-15T07:25:52Z","title":"Experimental evaluation of offline reinforcement learning for HVAC\n control in buildings","summary":" Reinforcement learning (RL) techniques have been increasingly investigated\nfor dynamic HVAC control in buildings. However, most studies focus on exploring\nsolutions in online or off-policy scenarios without discussing in detail the\nimplementation feasibility or effectiveness of dealing with purely offline\ndatasets or trajectories. The lack of these works limits the real-world\ndeployment of RL-based HVAC controllers, especially considering the abundance\nof historical data. 
To this end, this paper comprehensively evaluates the\nstrengths and limitations of state-of-the-art offline RL algorithms by\nconducting analytical and numerical studies. The analysis is conducted from two\nperspectives: algorithms and dataset characteristics. As a prerequisite, the\nnecessity of applying offline RL algorithms is first confirmed in two building\nenvironments. The ability of observation history modeling to reduce violations\nand enhance performance is subsequently studied. Next, the performance of\nRL-based controllers under datasets with different qualitative and quantitative\nconditions is investigated, including constraint satisfaction and power\nconsumption. Finally, the sensitivity of certain hyperparameters is also\nevaluated. The results indicate that datasets of a certain suboptimality level\nand relatively small scale can be utilized to effectively train a\nwell-performed RL-based HVAC controller. Specifically, such controllers can\nreduce at most 28.5% violation ratios of indoor temperatures and achieve at\nmost 12.1% power savings compared to the baseline controller. In summary, this\npaper presents our well-structured investigations and new findings when\napplying offline reinforcement learning to building HVAC systems.\n","authors":["Jun Wang","Linyan Li","Qi Liu","Yu Yang"],"pdf_url":"https://arxiv.org/pdf/2408.07986v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.07985v1","updated":"2024-08-15T07:10:17Z","published":"2024-08-15T07:10:17Z","title":"Analytical Uncertainty-Based Loss Weighting in Multi-Task Learning","summary":" With the rise of neural networks in various domains, multi-task learning\n(MTL) gained significant relevance. A key challenge in MTL is balancing\nindividual task losses during neural network training to improve performance\nand efficiency through knowledge sharing across tasks. To address these\nchallenges, we propose a novel task-weighting method by building on the most\nprevalent approach of Uncertainty Weighting and computing analytically optimal\nuncertainty-based weights, normalized by a softmax function with tunable\ntemperature. Our approach yields comparable results to the combinatorially\nprohibitive, brute-force approach of Scalarization while offering a more\ncost-effective yet high-performing alternative. We conduct an extensive\nbenchmark on various datasets and architectures. Our method consistently\noutperforms six other common weighting methods. Furthermore, we report\nnoteworthy experimental findings for the practical application of MTL. For\nexample, larger networks diminish the influence of weighting methods, and\ntuning the weight decay has a low impact compared to the learning rate.\n","authors":["Lukas Kirchdorfer","Cathrin Elich","Simon Kutsche","Heiner Stuckenschmidt","Lukas Schott","Jan M. Köhler"],"pdf_url":"https://arxiv.org/pdf/2408.07985v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.03337v3","updated":"2024-08-15T06:58:08Z","published":"2024-07-22T07:19:12Z","title":"PsyDI: Towards a Personalized and Progressively In-depth Chatbot for\n Psychological Measurements","summary":" In the field of psychology, traditional assessment methods, such as\nstandardized scales, are frequently critiqued for their static nature, lack of\npersonalization, and reduced participant engagement, while comprehensive\ncounseling evaluations are often inaccessible. The complexity of quantifying\npsychological traits further limits these methods. 
Despite advances with large\nlanguage models (LLMs), many still depend on single-round Question-and-Answer\ninteractions. To bridge this gap, we introduce PsyDI, a personalized and\nprogressively in-depth chatbot designed for psychological measurements,\nexemplified by its application in the Myers-Briggs Type Indicator (MBTI)\nframework. PsyDI leverages user-related multi-modal information and engages in\ncustomized, multi-turn interactions to provide personalized, easily accessible\nmeasurements, while ensuring precise MBTI type determination. To address the\nchallenge of unquantifiable psychological traits, we introduce a novel training\nparadigm that involves learning the ranking of proxy variables associated with\nthese traits, culminating in a robust score model for MBTI measurements. The\nscore model enables PsyDI to conduct comprehensive and precise measurements\nthrough multi-turn interactions within a unified estimation context. Through\nvarious experiments, we validate the efficacy of both the score model and the\nPsyDI pipeline, demonstrating its potential to serve as a general framework for\npsychological measurements. Furthermore, the online deployment of PsyDI has\ngarnered substantial user engagement, with over 3,000 visits, resulting in the\ncollection of numerous multi-turn dialogues annotated with MBTI types, which\nfacilitates further research. The source code for the training and web service\ncomponents is publicly available as a part of OpenDILab at:\nhttps://github.com/opendilab/PsyDI\n","authors":["Xueyan Li","Xinyan Chen","Yazhe Niu","Shuai Hu","Yu Liu"],"pdf_url":"https://arxiv.org/pdf/2408.03337v3.pdf","comment":"29 pages, 15 figures"},{"id":"http://arxiv.org/abs/2408.07978v1","updated":"2024-08-15T06:52:24Z","published":"2024-08-15T06:52:24Z","title":"Coupling without Communication and Drafter-Invariant Speculative\n Decoding","summary":" Suppose Alice has a distribution $P$ and Bob has a distribution $Q$. Alice\nwants to generate a sample $a\sim P$ and Bob a sample $b \sim Q$ such that $a =\nb$ with as high a probability as possible. It is well-known that, by\nsampling from an optimal coupling between the distributions, Alice and Bob can\nachieve $Pr[a = b] = 1 - D_{TV}(P,Q)$, where $D_{TV}(P,Q)$ is the total\nvariation distance. What if Alice and Bob must solve this same problem without\ncommunicating at all? Perhaps surprisingly, with access to public randomness,\nthey can still achieve $Pr[a = b] \geq \frac{1 - D_{TV}(P,Q)}{1 + D_{TV}(P,Q)}\n\geq 1-2D_{TV}(P,Q)$. In fact, this bound can be obtained using a simple\nprotocol based on the Weighted MinHash algorithm. In this work, we explore\ncommunication-free coupling in greater depth. First, we show that an equally\nsimple protocol based on Gumbel sampling matches the worst-case guarantees of\nthe Weighted MinHash approach, but tends to perform better in practice.\nConversely, we prove that both approaches are actually sharp: no\ncommunication-free protocol can achieve $Pr[a=b]>\frac{1 - D_{TV}(P,Q)}{1 +\nD_{TV}(P,Q)}$ in the worst case. Finally, we prove that, for distributions over\n$n$ items, there exists a scheme that uses just $O(\log(n/\epsilon))$ bits of\ncommunication to achieve $Pr[a = b] = 1 - D_{TV}(P,Q) - \epsilon$, i.e. to\nessentially match optimal coupling. 
Beyond our theoretical results, we\ndemonstrate an application of communication-free coupling to speculative\ndecoding, a recent method for accelerating autoregressive large language models\n[Leviathan, Kalman, Matias, ICML 2023]. We show that communication-free\nprotocols yield a variant of speculative decoding that we call\nDrafter-Invariant Speculative Decoding, which has the desirable property that\nthe output of the method is fixed given a fixed random seed, regardless of what\ndrafter is used for speculation.\n","authors":["Majid Daliri","Christopher Musco","Ananda Theertha Suresh"],"pdf_url":"https://arxiv.org/pdf/2408.07978v1.pdf","comment":"16 pages"},{"id":"http://arxiv.org/abs/2408.07966v1","updated":"2024-08-15T06:26:46Z","published":"2024-08-15T06:26:46Z","title":"Addressing Skewed Heterogeneity via Federated Prototype Rectification\n with Personalization","summary":" Federated learning is an efficient framework designed to facilitate\ncollaborative model training across multiple distributed devices while\npreserving user data privacy. A significant challenge of federated learning is\ndata-level heterogeneity, i.e., skewed or long-tailed distribution of private\ndata. Although various methods have been proposed to address this challenge,\nmost of them assume that the underlying global data is uniformly distributed\nacross all clients. This paper investigates data-level heterogeneity federated\nlearning with a brief review and redefines a more practical and challenging\nsetting called Skewed Heterogeneous Federated Learning (SHFL). Accordingly, we\npropose a novel Federated Prototype Rectification with Personalization which\nconsists of two parts: Federated Personalization and Federated Prototype\nRectification. The former aims to construct balanced decision boundaries\nbetween dominant and minority classes based on private data, while the latter\nexploits both inter-class discrimination and intra-class consistency to rectify\nempirical prototypes. Experiments on three popular benchmarks show that the\nproposed approach outperforms current state-of-the-art methods and achieves\nbalanced performance in both personalization and generalization.\n","authors":["Shunxin Guo","Hongsong Wang","Shuxia Lin","Zhiqiang Kou","Xin Geng"],"pdf_url":"https://arxiv.org/pdf/2408.07966v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.07962v1","updated":"2024-08-15T06:18:50Z","published":"2024-08-15T06:18:50Z","title":"Meta SAC-Lag: Towards Deployable Safe Reinforcement Learning via\n MetaGradient-based Hyperparameter Tuning","summary":" Safe Reinforcement Learning (Safe RL) is one of the prevalently studied\nsubcategories of trial-and-error-based methods with the intention to be\ndeployed on real-world systems. In safe RL, the goal is to maximize reward\nperformance while minimizing constraints, often achieved by setting bounds on\nconstraint functions and utilizing the Lagrangian method. However, deploying\nLagrangian-based safe RL in real-world scenarios is challenging due to the\nnecessity of threshold fine-tuning, as imprecise adjustments may lead to\nsuboptimal policy convergence. To mitigate this challenge, we propose a unified\nLagrangian-based model-free architecture called Meta Soft Actor-Critic\nLagrangian (Meta SAC-Lag). Meta SAC-Lag uses meta-gradient optimization to\nautomatically update the safety-related hyperparameters. The proposed method is\ndesigned to address safe exploration and threshold adjustment with minimal\nhyperparameter tuning requirement. 
In our pipeline, the inner parameters are\nupdated through the conventional formulation and the hyperparameters are\nadjusted using the meta-objectives which are defined based on the updated\nparameters. Our results show that the agent can reliably adjust the safety\nperformance due to the relatively fast convergence rate of the safety\nthreshold. We evaluate the performance of Meta SAC-Lag in five simulated\nenvironments against Lagrangian baselines, and the results demonstrate its\ncapability to create synergy between parameters, yielding better or competitive\nresults. Furthermore, we conduct a real-world experiment involving a robotic\narm tasked with pouring coffee into a cup without spillage. Meta SAC-Lag is\nsuccessfully trained to execute the task, while minimizing effort constraints.\n","authors":["Homayoun Honari","Amir Mehdi Soufi Enayati","Mehran Ghafarian Tamizi","Homayoun Najjaran"],"pdf_url":"https://arxiv.org/pdf/2408.07962v1.pdf","comment":"Main text accepted to the IEEE/RSJ International Conference on\n Intelligent Robots and Systems (IROS) 2024, 10 pages, 4 figures, 3 tables"},{"id":"http://arxiv.org/abs/2406.01609v2","updated":"2024-08-15T06:11:27Z","published":"2024-05-28T04:22:28Z","title":"Judgement Citation Retrieval using Contextual Similarity","summary":" Traditionally in the domain of legal research, the retrieval of pertinent\ncitations from intricate case descriptions has demanded manual effort and\nkeyword-based search applications that mandate expertise in understanding legal\njargon. Legal case descriptions hold pivotal information for legal\nprofessionals and researchers, necessitating more efficient and automated\napproaches. We propose a methodology that combines natural language processing\n(NLP) and machine learning techniques to enhance the organization and\nutilization of legal case descriptions. This approach revolves around the\ncreation of textual embeddings with the help of state-of-art embedding models.\nOur methodology addresses two primary objectives: unsupervised clustering and\nsupervised citation retrieval, both designed to automate the citation\nextraction process. Although the proposed methodology can be used for any\ndataset, we employed the Supreme Court of The United States (SCOTUS) dataset,\nyielding remarkable results. Our methodology achieved an impressive accuracy\nrate of 90.9%. By automating labor-intensive processes, we pave the way for a\nmore efficient, time-saving, and accessible landscape in legal research,\nbenefiting legal professionals, academics, and researchers.\n","authors":["Akshat Mohan Dasula","Hrushitha Tigulla","Preethika Bhukya"],"pdf_url":"https://arxiv.org/pdf/2406.01609v2.pdf","comment":"14 pages, 16 images"},{"id":"http://arxiv.org/abs/2408.07956v1","updated":"2024-08-15T06:09:19Z","published":"2024-08-15T06:09:19Z","title":"RandomNet: Clustering Time Series Using Untrained Deep Neural Networks","summary":" Neural networks are widely used in machine learning and data mining.\nTypically, these networks need to be trained, implying the adjustment of\nweights (parameters) within the network based on the input data. In this work,\nwe propose a novel approach, RandomNet, that employs untrained deep neural\nnetworks to cluster time series. RandomNet uses different sets of random\nweights to extract diverse representations of time series and then ensembles\nthe clustering relationships derived from these different representations to\nbuild the final clustering results. 
By extracting diverse representations, our\nmodel can effectively handle time series with different characteristics. Since\nall parameters are randomly generated, no training is required during the\nprocess. We provide a theoretical analysis of the effectiveness of the method.\nTo validate its performance, we conduct extensive experiments on all of the 128\ndatasets in the well-known UCR time series archive and perform statistical\nanalysis of the results. These datasets have different sizes, sequence lengths,\nand they are from diverse fields. The experimental results show that the\nproposed method is competitive compared with existing state-of-the-art methods.\n","authors":["Xiaosheng Li","Wenjie Xi","Jessica Lin"],"pdf_url":"https://arxiv.org/pdf/2408.07956v1.pdf","comment":"25 pages, 10 figures"},{"id":"http://arxiv.org/abs/2408.07941v1","updated":"2024-08-15T05:31:26Z","published":"2024-08-15T05:31:26Z","title":"Robust Offline Active Learning on Graphs","summary":" We consider the problem of active learning on graphs, which has crucial\napplications in many real-world networks where labeling node responses is\nexpensive. In this paper, we propose an offline active learning method that\nselects nodes to query by explicitly incorporating information from both the\nnetwork structure and node covariates. Building on graph signal recovery\ntheories and the random spectral sparsification technique, the proposed method\nadopts a two-stage biased sampling strategy that takes both informativeness and\nrepresentativeness into consideration for node querying. Informativeness refers\nto the complexity of graph signals that are learnable from the responses of\nqueried nodes, while representativeness refers to the capacity of queried nodes\nto control generalization errors given noisy node-level information. We\nestablish a theoretical relationship between generalization error and the\nnumber of nodes selected by the proposed method. Our theoretical results\ndemonstrate the trade-off between informativeness and representativeness in\nactive learning. Extensive numerical experiments show that the proposed method\nis competitive with existing graph-based active learning methods, especially\nwhen node covariates and responses contain noises. Additionally, the proposed\nmethod is applicable to both regression and classification tasks on graphs.\n","authors":["Yuanchen Wu","Yubai Yuan"],"pdf_url":"https://arxiv.org/pdf/2408.07941v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.02247v3","updated":"2024-08-15T05:28:10Z","published":"2024-08-05T05:41:16Z","title":"Contrastive Learning and Abstract Concepts: The Case of Natural Numbers","summary":" Contrastive Learning (CL) has been successfully applied to classification and\nother downstream tasks related to concrete concepts, such as objects contained\nin the ImageNet dataset. No attempts seem to have been made so far in applying\nthis promising scheme to more abstract entities. A prominent example of these\ncould be the concept of (discrete) Quantity. CL can be frequently interpreted\nas a self-supervised scheme guided by some profound and ubiquitous conservation\nprinciple (e.g. conservation of identity in object classification tasks). In\nthis introductory work we apply a suitable conservation principle to the\nsemi-abstract concept of natural numbers by which discrete quantities can be\nestimated or predicted. 
We experimentally show, by means of a toy problem, that\ncontrastive learning can be trained to count at a glance with high accuracy\nboth at human as well as at super-human ranges. We compare this with the\nresults of a trained-to-count at a glance supervised learning (SL) neural\nnetwork scheme of similar architecture. We show that both schemes exhibit\nsimilarly good performance on baseline experiments, where the distributions of\nthe training and testing stages are equal. Importantly, we demonstrate that in\nsome generalization scenarios, where training and testing distributions differ,\nCL boasts more robust and much better error performance.\n","authors":["Daniel N. Nissani"],"pdf_url":"https://arxiv.org/pdf/2408.02247v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.18552v2","updated":"2024-08-15T05:14:38Z","published":"2024-07-26T07:05:04Z","title":"Multimodal Emotion Recognition using Audio-Video Transformer Fusion with\n Cross Attention","summary":" Understanding emotions is a fundamental aspect of human communication.\nIntegrating audio and video signals offers a more comprehensive understanding\nof emotional states compared to traditional methods that rely on a single data\nsource, such as speech or facial expressions. Despite its potential, multimodal\nemotion recognition faces significant challenges, particularly in\nsynchronization, feature extraction, and fusion of diverse data sources. To\naddress these issues, this paper introduces a novel transformer-based model\nnamed Audio-Video Transformer Fusion with Cross Attention (AVT-CA). The AVT-CA\nmodel employs a transformer fusion approach to effectively capture and\nsynchronize interlinked features from both audio and video inputs, thereby\nresolving synchronization problems. Additionally, the Cross Attention mechanism\nwithin AVT-CA selectively extracts and emphasizes critical features while\ndiscarding irrelevant ones from both modalities, addressing feature extraction\nand fusion challenges. Extensive experimental analysis conducted on the\nCMU-MOSEI, RAVDESS and CREMA-D datasets demonstrates the efficacy of the\nproposed model. The results underscore the importance of AVT-CA in developing\nprecise and reliable multimodal emotion recognition systems for practical\napplications.\n","authors":["Joe Dhanith P R","Shravan Venkatraman","Modigari Narendra","Vigya Sharma","Santhosh Malarvannan","Amir H. Gandomi"],"pdf_url":"https://arxiv.org/pdf/2407.18552v2.pdf","comment":"38 Pages, 9 Tables, 12 Figures"},{"id":"http://arxiv.org/abs/2408.07932v1","updated":"2024-08-15T05:03:14Z","published":"2024-08-15T05:03:14Z","title":"MobileMEF: Fast and Efficient Method for Multi-Exposure Fusion","summary":" Recent advances in camera design and imaging technology have enabled the\ncapture of high-quality images using smartphones. However, due to the limited\ndynamic range of digital cameras, the quality of photographs captured in\nenvironments with highly imbalanced lighting often results in poor-quality\nimages. To address this issue, most devices capture multi-exposure frames and\nthen use some multi-exposure fusion method to merge those frames into a final\nfused image. Nevertheless, most traditional and current deep learning\napproaches are unsuitable for real-time applications on mobile devices due to\ntheir heavy computational and memory requirements. We propose a new method for\nmulti-exposure fusion based on an encoder-decoder deep learning architecture\nwith efficient building blocks tailored for mobile devices. 
This efficient\ndesign makes our model capable of processing 4K resolution images in less than\n2 seconds on mid-range smartphones. Our method outperforms state-of-the-art\ntechniques regarding full-reference quality measures and computational\nefficiency (runtime and memory usage), making it ideal for real-time\napplications on hardware-constrained devices. Our code is available at:\nhttps://github.com/LucasKirsten/MobileMEF.\n","authors":["Lucas Nedel Kirsten","Zhicheng Fu","Nikhil Ambha Madhusudhana"],"pdf_url":"https://arxiv.org/pdf/2408.07932v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.07081v2","updated":"2024-08-15T04:51:43Z","published":"2024-08-07T18:07:15Z","title":"MathBridge: A Large Corpus Dataset for Translating Spoken Mathematical\n Expressions into $LaTeX$ Formulas for Improved Readability","summary":" Understanding sentences that contain mathematical expressions in text form\nposes significant challenges. To address this, the importance of converting\nthese expressions into a compiled formula is highlighted. For instance, the\nexpression ``x equals minus b plus or minus the square root of b squared minus\nfour a c, all over two a'' from automatic speech recognition (ASR) is more\nreadily comprehensible when displayed as a compiled formula $x = \\frac{-b \\pm\n\\sqrt{b^2 - 4ac}}{2a}$. To develop a text-to-formula conversion system, we can\nbreak down the process into text-to-LaTeX and LaTeX-to-formula conversions,\nwith the latter managed by various existing LaTeX engines. However, the former\napproach has been notably hindered by the severe scarcity of text-to-LaTeX\npaired data, which presents a significant challenge in this field. In this\ncontext, we introduce MathBridge, the first extensive dataset for translating\nmathematical spoken expressions into LaTeX, to establish a robust baseline for\nfuture research on text-to-LaTeX translation. MathBridge comprises\napproximately 23 million LaTeX formulas paired with the corresponding spoken\nEnglish expressions. Through comprehensive evaluations, including fine-tuning\nand testing with data, we discovered that MathBridge significantly enhances the\ncapabilities of pretrained language models for text-to-LaTeX translation.\nSpecifically, for the T5-large model, the sacreBLEU score increased from 4.77\nto 46.8, demonstrating substantial enhancement. Our findings indicate the need\nfor a new metric, specifically for text-to-LaTeX conversion evaluations.\n","authors":["Kyudan Jung","Sieun Hyeon","Jeong Youn Kwon","Nam-Joon Kim","Hyun Gon Ryu","Hyuk-Jae Lee","Jaeyoung Do"],"pdf_url":"https://arxiv.org/pdf/2408.07081v2.pdf","comment":"9page, 6 figures"},{"id":"http://arxiv.org/abs/2408.07925v1","updated":"2024-08-15T04:38:24Z","published":"2024-08-15T04:38:24Z","title":"A Single Channel-Based Neonatal Sleep-Wake Classification using Hjorth\n Parameters and Improved Gradient Boosting","summary":" Sleep plays a crucial role in neonatal development. Monitoring the sleep\npatterns in neonates in a Neonatal Intensive Care Unit (NICU) is imperative for\nunderstanding the maturation process. While polysomnography (PSG) is considered\nthe best practice for sleep classification, its expense and reliance on human\nannotation pose challenges. Existing research often relies on multichannel EEG\nsignals; however, concerns arise regarding the vulnerability of neonates and\nthe potential impact on their sleep quality. 
This paper introduces a novel\napproach to neonatal sleep stage classification using a single-channel gradient\nboosting algorithm with Hjorth features. The gradient boosting parameters are\nfine-tuned using random search cross-validation (randomsearchCV), achieving an\naccuracy of 82.35% for neonatal sleep-wake classification. Validation is\nconducted through 5-fold cross-validation. The proposed algorithm not only\nenhances existing neonatal sleep algorithms but also opens avenues for broader\napplications.\n","authors":["Muhammad Arslan","Muhammad Mubeen","Saadullah Farooq Abbasi","Muhammad Shahbaz Khan","Wadii Boulila","Jawad Ahmad"],"pdf_url":"https://arxiv.org/pdf/2408.07925v1.pdf","comment":"8 pages, 5 figures, 3 tables, International Polydisciplinary\n Conference on Artificial Intelligence and New Technologies"},{"id":"http://arxiv.org/abs/2408.03388v2","updated":"2024-08-15T04:24:00Z","published":"2024-08-06T18:18:37Z","title":"A Non-negative VAE:the Generalized Gamma Belief Network","summary":" The gamma belief network (GBN), often regarded as a deep topic model, has\ndemonstrated its potential for uncovering multi-layer interpretable latent\nrepresentations in text data. Its notable capability to acquire interpretable\nlatent factors is partially attributed to sparse and non-negative\ngamma-distributed latent variables. However, the existing GBN and its\nvariations are constrained by the linear generative model, thereby limiting\ntheir expressiveness and applicability. To address this limitation, we\nintroduce the generalized gamma belief network (Generalized GBN) in this paper,\nwhich extends the original linear generative model to a more expressive\nnon-linear generative model. Since the parameters of the Generalized GBN no\nlonger possess an analytic conditional posterior, we further propose an\nupward-downward Weibull inference network to approximate the posterior\ndistribution of the latent variables. The parameters of both the generative\nmodel and the inference network are jointly trained within the variational\ninference framework. Finally, we conduct comprehensive experiments on both\nexpressivity and disentangled representation learning tasks to evaluate the\nperformance of the Generalized GBN against state-of-the-art Gaussian\nvariational autoencoders serving as baselines.\n","authors":["Zhibin Duan","Tiansheng Wen","Muyao Wang","Bo Chen","Mingyuan Zhou"],"pdf_url":"https://arxiv.org/pdf/2408.03388v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.07922v1","updated":"2024-08-15T04:18:40Z","published":"2024-08-15T04:18:40Z","title":"A Deep Features-Based Approach Using Modified ResNet50 and Gradient\n Boosting for Visual Sentiments Classification","summary":" The versatile nature of Visual Sentiment Analysis (VSA) is one reason for its\nrising profile. It isn't easy to efficiently manage social media data with\nvisual information since previous research has concentrated on Sentiment\nAnalysis (SA) of single modalities, like textual. In addition, most visual\nsentiment studies need to adequately classify sentiment because they are mainly\nfocused on simply merging modal attributes without investigating their\nintricate relationships. This prompted the suggestion of developing a fusion of\ndeep learning and machine learning algorithms. In this research, a deep\nfeature-based method for multiclass classification has been used to extract\ndeep features from modified ResNet50. 
Furthermore, gradient boosting algorithm\nhas been used to classify photos containing emotional content. The approach is\nthoroughly evaluated on two benchmarked datasets, CrowdFlower and GAPED.\nFinally, cutting-edge deep learning and machine learning models were used to\ncompare the proposed strategy. When compared to state-of-the-art approaches,\nthe proposed method demonstrates exceptional performance on the datasets\npresented.\n","authors":["Muhammad Arslan","Muhammad Mubeen","Arslan Akram","Saadullah Farooq Abbasi","Muhammad Salman Ali","Muhammad Usman Tariq"],"pdf_url":"https://arxiv.org/pdf/2408.07922v1.pdf","comment":"4 pages, 4 figures, 3 tables, IEEE International Conference on\n Multimedia Information Processing and Retrieval (MIPR) 2024"},{"id":"http://arxiv.org/abs/2408.07921v1","updated":"2024-08-15T04:16:45Z","published":"2024-08-15T04:16:45Z","title":"Physics-Informed Neural Network for Predicting Out-of-Training-Range\n TCAD Solution with Minimized Domain Expertise","summary":" Machine learning (ML) is promising in assisting technology computer-aided\ndesign (TCAD) simulations to alleviate difficulty in convergence and prolonged\nsimulation time. While ML is widely used in TCAD, they either require access to\nthe internal solver, require extensive domain expertise, are only trained by\nterminal quantities such as currents and voltages, and/or lack\nout-of-training-range prediction capability. In this paper, using Si nanowire\nas an example, we demonstrate that it is possible to use a physics-informed\nneural network (PINN) to predict out-of-training-range TCAD solutions without\naccessing the internal solver and with minimal domain expertise. The machine\nnot only can predict a 2.5 times larger range than the training but also can\npredict the inversion region by only being trained with subthreshold region\ndata. The physics-informed module is also trained with data without the need\nfor human-coded equations making this easier to be extended to more\nsophisticated systems.\n","authors":["Albert Lu","Yu Foon Chau","Hiu Yung Wong"],"pdf_url":"https://arxiv.org/pdf/2408.07921v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.03320v2","updated":"2024-08-15T04:10:05Z","published":"2024-08-06T17:55:58Z","title":"Hedge Fund Portfolio Construction Using PolyModel Theory and\n iTransformer","summary":" When constructing portfolios, a key problem is that a lot of financial time\nseries data are sparse, making it challenging to apply machine learning\nmethods. Polymodel theory can solve this issue and demonstrate superiority in\nportfolio construction from various aspects. To implement the PolyModel theory\nfor constructing a hedge fund portfolio, we begin by identifying an asset pool,\nutilizing over 10,000 hedge funds for the past 29 years' data. PolyModel theory\nalso involves choosing a wide-ranging set of risk factors, which includes\nvarious financial indices, currencies, and commodity prices. This comprehensive\nselection mirrors the complexities of the real-world environment. Leveraging on\nthe PolyModel theory, we create quantitative measures such as Long-term Alpha,\nLong-term Ratio, and SVaR. We also use more classical measures like the Sharpe\nratio or Morningstar's MRAR. To enhance the performance of the constructed\nportfolio, we also employ the latest deep learning techniques (iTransformer) to\ncapture the upward trend, while efficiently controlling the downside, using all\nthe features. 
The iTransformer model is specifically designed to address the\nchallenges in high-dimensional time series forecasting and could largely\nimprove our strategies. More precisely, our strategies achieve a better Sharpe\nratio and annualized return. The above process enables us to create multiple\nportfolio strategies aiming for high returns and low risks when compared to\nvarious benchmarks.\n","authors":["Siqiao Zhao","Zhikang Dong","Zeyu Cao","Raphael Douady"],"pdf_url":"https://arxiv.org/pdf/2408.03320v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.14463v2","updated":"2024-08-15T04:07:25Z","published":"2024-07-19T17:06:03Z","title":"SurvReLU: Inherently Interpretable Survival Analysis via Deep ReLU\n Networks","summary":" Survival analysis models time-to-event distributions with censorship.\nRecently, deep survival models using neural networks have dominated due to\ntheir representational power and state-of-the-art performance. However, their\n\"black-box\" nature hinders interpretability, which is crucial in real-world\napplications. In contrast, \"white-box\" tree-based survival models offer better\ninterpretability but struggle to converge to global optima due to greedy\nexpansion. In this paper, we bridge the gap between previous deep survival\nmodels and traditional tree-based survival models through deep rectified linear\nunit (ReLU) networks. We show that a deliberately constructed deep ReLU network\n(SurvReLU) can harness the interpretability of tree-based structures with the\nrepresentational power of deep survival models. Empirical studies on both\nsimulated and real survival benchmark datasets show the effectiveness of the\nproposed SurvReLU in terms of performance and interpretability. The code is\navailable at \\href{https://github.com/xs018/SurvReLU}{\\color{magenta}{\nhttps://github.com/xs018/SurvReLU}}.\n","authors":["Xiaotong Sun","Peijie Qiu","Shengfan Zhang"],"pdf_url":"https://arxiv.org/pdf/2407.14463v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.07633v4","updated":"2024-08-15T03:49:55Z","published":"2023-11-13T13:19:34Z","title":"There is No Silver Bullet: Benchmarking Methods in Predictive\n Combinatorial Optimization","summary":" Predictive combinatorial optimization, where the parameters of combinatorial\noptimization (CO) are unknown at the decision-making time, is the precise\nmodeling of many real-world applications, including energy cost-aware\nscheduling and budget allocation on advertising. Tackling such a problem\nusually involves a prediction model and a CO solver. These two modules are\nintegrated into the predictive CO pipeline following two design principles:\n``Predict-then-Optimize (PtO)'', which learns predictions by supervised\ntraining and subsequently solves CO using predicted coefficients, while the\nother, named ``Predict-and-Optimize (PnO)'', directly optimizes towards the\nultimate decision quality and claims to yield better decisions than traditional\nPtO approaches. However, there is no systematic benchmark of both approaches,\nincluding the specific design choices at the module level, as well as an\nevaluation dataset that covers representative real-world scenarios. To this\nend, we develop a modular framework to benchmark 11 existing PtO/PnO methods on\n8 problems, including a new industrial dataset for combinatorial advertising\nthat will be released. Our study shows that PnO approaches are better than PtO\non 7 out of 8 benchmarks, but there is no silver bullet found for the specific\ndesign choices of PnO. 
A comprehensive categorization of current approaches and\nintegration of typical scenarios are provided under a unified benchmark.\nTherefore, this paper could serve as a comprehensive benchmark for future PnO\napproach development and also offer fast prototyping for application-focused\ndevelopment.\n","authors":["Haoyu Geng","Hang Ruan","Runzhong Wang","Yang Li","Yang Wang","Lei Chen","Junchi Yan"],"pdf_url":"https://arxiv.org/pdf/2311.07633v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.07911v1","updated":"2024-08-15T03:34:53Z","published":"2024-08-15T03:34:53Z","title":"CEGRL-TKGR: A Causal Enhanced Graph Representation Learning Framework\n for Improving Temporal Knowledge Graph Extrapolation Reasoning","summary":" Temporal knowledge graph reasoning (TKGR) is increasingly gaining attention\nfor its ability to extrapolate new events from historical data, thereby\nenriching the inherently incomplete temporal knowledge graphs. Existing\ngraph-based representation learning frameworks have made significant strides in\ndeveloping evolving representations for both entities and relational\nembeddings. Despite these achievements, there's a notable tendency in these\nmodels to inadvertently learn biased data representations and mine spurious\ncorrelations, consequently failing to discern the causal relationships between\nevents. This often leads to incorrect predictions based on these false\ncorrelations. To address this, we propose an innovative causal enhanced graph\nrepresentation learning framework for TKGR (named CEGRL-TKGR). This framework\nintroduces causal structures in graph-based representation learning to unveil\nthe essential causal relationships between events, ultimately enhancing task\nperformance. Specifically, we first disentangle the evolutionary\nrepresentations of entities and relations in a temporal graph sequence into two\ndistinct components, namely causal representations and confounding\nrepresentations. Then, drawing on causal intervention theory, we advocate the\nutilization of causal representations for predictions, aiming to mitigate the\neffects of erroneous correlations caused by confounding features, thus\nachieving more robust and accurate predictions. Finally, extensive experimental\nresults on six benchmark datasets demonstrate the superior performance of our\nmodel in the link prediction task.\n","authors":["Jinze Sun","Yongpan Sheng","Lirong He"],"pdf_url":"https://arxiv.org/pdf/2408.07911v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.07906v1","updated":"2024-08-15T03:24:07Z","published":"2024-08-15T03:24:07Z","title":"KAN versus MLP on Irregular or Noisy Functions","summary":" In this paper, we compare the performance of Kolmogorov-Arnold Networks (KAN)\nand Multi-Layer Perceptron (MLP) networks on irregular or noisy functions. We\ncontrol the number of parameters and the size of the training samples to ensure\na fair comparison. For clarity, we categorize the functions into six types:\nregular functions, continuous functions with local non-differentiable points,\nfunctions with jump discontinuities, functions with singularities, functions\nwith coherent oscillations, and noisy functions. Our experimental results\nindicate that KAN does not always perform best. For some types of functions,\nMLP outperforms or performs comparably to KAN. Furthermore, increasing the size\nof training samples can improve performance to some extent. 
When noise is added\nto functions, the irregular features are often obscured by the noise, making it\nchallenging for both MLP and KAN to extract these features effectively. We hope\nthese experiments provide valuable insights for future neural network research\nand encourage further investigations to overcome these challenges.\n","authors":["Chen Zeng","Jiahui Wang","Haoran Shen","Qiao Wang"],"pdf_url":"https://arxiv.org/pdf/2408.07906v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.07897v1","updated":"2024-08-15T03:01:02Z","published":"2024-08-15T03:01:02Z","title":"The Nah Bandit: Modeling User Non-compliance in Recommendation Systems","summary":" Recommendation systems now pervade the digital world, ranging from\nadvertising to entertainment. However, it remains challenging to implement\neffective recommendation systems in the physical world, such as in mobility or\nhealth. This work focuses on a key challenge: in the physical world, it is\noften easy for the user to opt out of taking any recommendation if they are not\nto her liking, and to fall back to her baseline behavior. It is thus crucial in\ncyber-physical recommendation systems to operate with an interaction model that\nis aware of such user behavior, lest the user abandon the recommendations\naltogether. This paper thus introduces the Nah Bandit, a tongue-in-cheek\nreference to describe a Bandit problem where users can say `nah' to the\nrecommendation and opt for their preferred option instead. As such, this\nproblem lies in between a typical bandit setup and supervised learning. We\nmodel the user non-compliance by parameterizing an anchoring effect of\nrecommendations on users. We then propose the Expert with Clustering (EWC)\nalgorithm, a hierarchical approach that incorporates feedback from both\nrecommended and non-recommended options to accelerate user preference learning.\nIn a recommendation scenario with $N$ users, $T$ rounds per user, and $K$\nclusters, EWC achieves a regret bound of $O(N\\sqrt{T\\log K} + NT)$, achieving\nsuperior theoretical performance in the short term compared to LinUCB\nalgorithm. Experimental results also highlight that EWC outperforms both\nsupervised learning and traditional contextual bandit approaches. This\nadvancement reveals that effective use of non-compliance feedback can\naccelerate preference learning and improve recommendation accuracy. This work\nlays the foundation for future research in Nah Bandit, providing a robust\nframework for more effective recommendation systems.\n","authors":["Tianyue Zhou","Jung-Hoon Cho","Cathy Wu"],"pdf_url":"https://arxiv.org/pdf/2408.07897v1.pdf","comment":"12 pages, 8 figures, under review"},{"id":"http://arxiv.org/abs/2407.21294v2","updated":"2024-08-15T02:57:09Z","published":"2024-07-31T02:36:14Z","title":"Decentralized and Uncoordinated Learning of Stable Matchings: A\n Game-Theoretic Approach","summary":" We consider the problem of learning stable matchings with unknown preferences\nin a decentralized and uncoordinated manner, where \"decentralized\" means that\nplayers make decisions individually without the influence of a central\nplatform, and \"uncoordinated\" means that players do not need to synchronize\ntheir decisions using pre-specified rules. First, we provide a game formulation\nfor this problem with known preferences, where the set of pure Nash equilibria\n(NE) coincides with the set of stable matchings, and mixed NE can be rounded to\na stable matching. 
Then, we show that for hierarchical markets, applying the\nexponential weight (EXP) learning algorithm to the stable matching game\nachieves logarithmic regret in a fully decentralized and uncoordinated fashion.\nMoreover, we show that EXP converges locally and exponentially fast to a stable\nmatching in general markets. We also introduce another decentralized and\nuncoordinated learning algorithm that globally converges to a stable matching\nwith arbitrarily high probability. Finally, we provide stronger feedback\nconditions under which it is possible to drive the market faster toward an\napproximate stable matching. Our proposed game-theoretic framework bridges the\ndiscrete problem of learning stable matchings with the problem of learning NE\nin continuous-action games.\n","authors":["S. Rasoul Etesami","R. Srikant"],"pdf_url":"https://arxiv.org/pdf/2407.21294v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.07894v1","updated":"2024-08-15T02:52:02Z","published":"2024-08-15T02:52:02Z","title":"System States Forecasting of Microservices with Dynamic Spatio-Temporal\n Data","summary":" In the AIOps (Artificial Intelligence for IT Operations) era, accurately\nforecasting system states is crucial. In microservices systems, this task\nencounters the challenge of dynamic and complex spatio-temporal relationships\namong microservice instances, primarily due to dynamic deployments, diverse\ncall paths, and cascading effects among instances. Current time-series\nforecasting methods, which focus mainly on intrinsic patterns, are insufficient\nin environments where spatial relationships are critical. Similarly,\nspatio-temporal graph approaches often neglect the nature of temporal trend,\nconcentrating mostly on message passing between nodes. Moreover, current\nresearch in microservices domain frequently underestimates the importance of\nnetwork metrics and topological structures in capturing the evolving dynamics\nof systems. This paper introduces STMformer, a model tailored for forecasting\nsystem states in microservices environments, capable of handling multi-node and\nmultivariate time series. Our method leverages dynamic network connection data\nand topological information to assist in modeling the intricate spatio-temporal\nrelationships within the system. Additionally, we integrate the\nPatchCrossAttention module to compute the impact of cascading effects globally.\nWe have developed a dataset based on a microservices system and conducted\ncomprehensive experiments with STMformer against leading methods. In both\nshort-term and long-term forecasting tasks, our model consistently achieved a\n8.6% reduction in MAE(Mean Absolute Error) and a 2.2% reduction in MSE (Mean\nSquared Error). The source code is available at\nhttps://github.com/xuyifeiiie/STMformer.\n","authors":["Yifei Xu","Jingguo Ge","Haina Tang","Shuai Ding","Tong Li","Hui Li"],"pdf_url":"https://arxiv.org/pdf/2408.07894v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.17900v5","updated":"2024-08-15T02:33:22Z","published":"2024-07-25T09:42:24Z","title":"The Power of Combining Data and Knowledge: GPT-4o is an Effective\n Interpreter of Machine Learning Models in Predicting Lymph Node Metastasis of\n Lung Cancer","summary":" Lymph node metastasis (LNM) is a crucial factor in determining the initial\ntreatment for patients with lung cancer, yet accurate preoperative diagnosis of\nLNM remains challenging. 
Recently, large language models (LLMs) have garnered\nsignificant attention due to their remarkable text generation capabilities.\nLeveraging the extensive medical knowledge learned from vast corpora, LLMs can\nestimate probabilities for clinical problems, though their performance has\nhistorically been inferior to data-driven machine learning models. In this\npaper, we propose a novel ensemble method that combines the medical knowledge\nacquired by LLMs with the latent patterns identified by machine learning models\nto enhance LNM prediction performance. Initially, we developed machine learning\nmodels using patient data. We then designed a prompt template to integrate the\npatient data with the predicted probability from the machine learning model.\nSubsequently, we instructed GPT-4o, the most advanced LLM developed by OpenAI,\nto estimate the likelihood of LNM based on patient data and then adjust the\nestimate using the machine learning output. Finally, we collected three outputs\nfrom the GPT-4o using the same prompt and ensembled these results as the final\nprediction. Using the proposed method, our models achieved an AUC value of\n0.778 and an AP value of 0.426 for LNM prediction, significantly improving\npredictive performance compared to baseline machine learning models. The\nexperimental results indicate that GPT-4o can effectively leverage its medical\nknowledge and the probabilities predicted by machine learning models to achieve\nmore accurate LNM predictions. These findings demonstrate that LLMs can perform\nwell in clinical risk prediction tasks, offering a new paradigm for integrating\nmedical knowledge and patient data in clinical predictions.\n","authors":["Danqing Hu","Bing Liu","Xiaofeng Zhu","Nan Wu"],"pdf_url":"https://arxiv.org/pdf/2407.17900v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.07891v1","updated":"2024-08-15T02:32:50Z","published":"2024-08-15T02:32:50Z","title":"Quantum-inspired Interpretable Deep Learning Architecture for Text\n Sentiment Analysis","summary":" Text has become the predominant form of communication on social media,\nembedding a wealth of emotional nuances. Consequently, the extraction of\nemotional information from text is of paramount importance. Despite previous\nresearch making some progress, existing text sentiment analysis models still\nface challenges in integrating diverse semantic information and lack\ninterpretability. To address these issues, we propose a quantum-inspired deep\nlearning architecture that combines fundamental principles of quantum mechanics\n(QM principles) with deep learning models for text sentiment analysis.\nSpecifically, we analyze the commonalities between text representation and QM\nprinciples to design a quantum-inspired text representation method and further\ndevelop a quantum-inspired text embedding layer. Additionally, we design a\nfeature extraction layer based on long short-term memory (LSTM) networks and\nself-attention mechanisms (SAMs). Finally, we calculate the text density matrix\nusing the quantum complex numbers principle and apply 2D-convolution neural\nnetworks (CNNs) for feature condensation and dimensionality reduction. Through\na series of visualization, comparative, and ablation experiments, we\ndemonstrate that our model not only shows significant advantages in accuracy\nand efficiency compared to previous related models but also achieves a certain\nlevel of interpretability by integrating QM principles. 
Our code is available\nat QISA.\n","authors":["Bingyu Li","Da Zhang","Zhiyuan Zhao","Junyu Gao","Yuan Yuan"],"pdf_url":"https://arxiv.org/pdf/2408.07891v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.07890v1","updated":"2024-08-15T02:31:48Z","published":"2024-08-15T02:31:48Z","title":"Local Causal Discovery with Background Knowledge","summary":" Causality plays a pivotal role in various fields of study. Based on the\nframework of causal graphical models, previous works have proposed identifying\nwhether a variable is a cause or non-cause of a target in every Markov\nequivalent graph solely by learning a local structure. However, the presence of\nprior knowledge, often represented as a partially known causal graph, is common\nin many causal modeling applications. Leveraging this prior knowledge allows\nfor the further identification of causal relationships. In this paper, we first\npropose a method for learning the local structure using all types of causal\nbackground knowledge, including direct causal information, non-ancestral\ninformation and ancestral information. Then we introduce criteria for\nidentifying causal relationships based solely on the local structure in the\npresence of prior knowledge. We also apply our method to fair machine learning,\nand experiments involving local structure learning, causal relationship\nidentification, and fair machine learning demonstrate that our method is both\neffective and efficient.\n","authors":["Qingyuan Zheng","Yue Liu","Yangbo He"],"pdf_url":"https://arxiv.org/pdf/2408.07890v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.07955v2","updated":"2024-08-15T02:18:08Z","published":"2024-01-15T20:42:16Z","title":"A Study on Large Language Models' Limitations in Multiple-Choice\n Question Answering","summary":" The widespread adoption of Large Language Models (LLMs) has become\ncommonplace, particularly with the emergence of open-source models. More\nimportantly, smaller models are well-suited for integration into consumer\ndevices and are frequently employed either as standalone solutions or as\nsubroutines in various AI tasks. Despite their ubiquitous use, there is no\nsystematic analysis of their specific capabilities and limitations. In this\nstudy, we tackle one of the most widely used tasks - answering Multiple Choice\nQuestion (MCQ). We analyze 26 small open-source models and find that 65% of the\nmodels do not understand the task, only 4 models properly select an answer from\nthe given choices, and only 5 of these models are choice order independent.\nThese results are rather alarming given the extensive use of MCQ tests with\nthese models. We recommend exercising caution and testing task understanding\nbefore using MCQ to evaluate LLMs in any field whatsoever.\n","authors":["Aisha Khatun","Daniel G. Brown"],"pdf_url":"https://arxiv.org/pdf/2401.07955v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.09039v2","updated":"2024-08-15T02:08:06Z","published":"2024-03-14T02:26:10Z","title":"Detecting Anomalies in Dynamic Graphs via Memory enhanced Normality","summary":" Anomaly detection in dynamic graphs presents a significant challenge due to\nthe temporal evolution of graph structures and attributes. The conventional\napproaches that tackle this problem typically employ an unsupervised learning\nframework, capturing normality patterns with exclusive normal data during\ntraining and identifying deviations as anomalies during testing. 
However, these\nmethods face critical drawbacks: they either only depend on proxy tasks for\nrepresentation without directly pinpointing normal patterns, or they neglect to\ndifferentiate between spatial and temporal normality patterns. More recent\nmethods that use contrastive learning with negative sampling also face high\ncomputational costs, limiting their scalability to large graphs. To address\nthese challenges, we introduce a novel Spatial-Temporal memories-enhanced graph\nautoencoder (STRIPE). Initially, STRIPE employs Graph Neural Networks (GNNs)\nand gated temporal convolution layers to extract spatial and temporal features.\nThen STRIPE incorporates separate spatial and temporal memory networks to\ncapture and store prototypes of normal patterns, respectively. These stored\npatterns are retrieved and integrated with encoded graph embeddings through a\nmutual attention mechanism. Finally, the integrated features are fed into the\ndecoder to reconstruct the graph streams which serve as the proxy task for\nanomaly detection. This comprehensive approach not only minimizes\nreconstruction errors but also emphasizes the compactness and distinctiveness\nof the embeddings w.r.t. the nearest memory prototypes. Extensive experiments\non six benchmark datasets demonstrate the effectiveness and efficiency of\nSTRIPE, where STRIPE significantly outperforms existing methods with 5.8%\nimprovement in AUC scores and 4.62X faster in training time.\n","authors":["Jie Liu","Xuequn Shang","Xiaolin Han","Kai Zheng","Hongzhi Yin"],"pdf_url":"https://arxiv.org/pdf/2403.09039v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.07666v2","updated":"2024-08-15T01:49:29Z","published":"2024-08-14T16:58:48Z","title":"Model Merging in LLMs, MLLMs, and Beyond: Methods, Theories,\n Applications and Opportunities","summary":" Model merging is an efficient empowerment technique in the machine learning\ncommunity that does not require the collection of raw training data and does\nnot require expensive computation. As model merging becomes increasingly\nprevalent across various fields, it is crucial to understand the available\nmodel merging techniques comprehensively. However, there is a significant gap\nin the literature regarding a systematic and thorough review of these\ntechniques. This survey provides a comprehensive overview of model merging\nmethods and theories, their applications in various domains and settings, and\nfuture research directions. Specifically, we first propose a new taxonomic\napproach that exhaustively discusses existing model merging methods. Secondly,\nwe discuss the application of model merging techniques in large language\nmodels, multimodal large language models, and 10+ machine learning subfields,\nincluding continual learning, multi-task learning, few-shot learning, etc.\nFinally, we highlight the remaining challenges of model merging and discuss\nfuture research directions. 
A comprehensive list of papers about model merging\nis available at\n\\url{https://github.com/EnnengYang/Awesome-Model-Merging-Methods-Theories-Applications}.\n","authors":["Enneng Yang","Li Shen","Guibing Guo","Xingwei Wang","Xiaochun Cao","Jie Zhang","Dacheng Tao"],"pdf_url":"https://arxiv.org/pdf/2408.07666v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.07877v1","updated":"2024-08-15T01:33:06Z","published":"2024-08-15T01:33:06Z","title":"IReCa: Intrinsic Reward-enhanced Context-aware Reinforcement Learning\n for Human-AI Coordination","summary":" In human-AI coordination scenarios, human agents usually exhibit asymmetric\nbehaviors that are significantly sparse and unpredictable compared to those of\nAI agents. These characteristics introduce two primary challenges to human-AI\ncoordination: the effectiveness of obtaining sparse rewards and the efficiency\nof training the AI agents. To tackle these challenges, we propose an Intrinsic\nReward-enhanced Context-aware (IReCa) reinforcement learning (RL) algorithm,\nwhich leverages intrinsic rewards to facilitate the acquisition of sparse\nrewards and utilizes environmental context to enhance training efficiency. Our\nIReCa RL algorithm introduces three unique features: (i) it encourages the\nexploration of sparse rewards by incorporating intrinsic rewards that\nsupplement traditional extrinsic rewards from the environment; (ii) it improves\nthe acquisition of sparse rewards by prioritizing the corresponding sparse\nstate-action pairs; and (iii) it enhances the training efficiency by optimizing\nthe exploration and exploitation through innovative context-aware weights of\nextrinsic and intrinsic rewards. Extensive simulations executed in the\nOvercooked layouts demonstrate that our IReCa RL algorithm can increase the\naccumulated rewards by approximately 20% and reduce the epochs required for\nconvergence by approximately 67% compared to state-of-the-art baselines.\n","authors":["Xin Hao","Bahareh Nakisa","Mohmmad Naim Rastgoo","Richard Dazeley"],"pdf_url":"https://arxiv.org/pdf/2408.07877v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.07875v1","updated":"2024-08-15T01:23:49Z","published":"2024-08-15T01:23:49Z","title":"Incremental Structure Discovery of Classification via Sequential Monte\n Carlo","summary":" Gaussian Processes (GPs) provide a powerful framework for making predictions\nand understanding uncertainty for classification with kernels and Bayesian\nnon-parametric learning. Building such models typically requires strong prior\nknowledge to define preselect kernels, which could be ineffective for online\napplications of classification that sequentially process data because features\nof data may shift during the process. To alleviate the requirement of prior\nknowledge used in GPs and learn new features from data that arrive\nsuccessively, this paper presents a novel method to automatically discover\nmodels of classification on complex data with little prior knowledge. Our\nmethod adapts a recently proposed technique for GP-based time-series structure\ndiscovery, which integrates GPs and Sequential Monte Carlo (SMC). We extend the\ntechnique to handle extra latent variables in GP classification, such that our\nmethod can effectively and adaptively learn a-priori unknown structures of\nclassification from continuous input. In addition, our method adapts new batch\nof data with updated structures of models. 
Our experiments show that our method\nis able to automatically incorporate various features of kernels on synthesized\ndata and real-world data for classification. In the experiments on real-world\ndata, our method outperforms various classification methods in both online and\noffline settings, achieving a 10\% accuracy improvement on one benchmark.\n","authors":["Changze Huang","Di Wang"],"pdf_url":"https://arxiv.org/pdf/2408.07875v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.07364v2","updated":"2024-08-15T01:05:23Z","published":"2024-08-14T08:22:13Z","title":"Robust Active Learning (RoAL): Countering Dynamic Adversaries in Active\n Learning with Elastic Weight Consolidation","summary":" Despite significant advancements in active learning and adversarial attacks,\nthe intersection of these two fields remains underexplored, particularly in\ndeveloping robust active learning frameworks against dynamic adversarial\nthreats. The challenge of developing robust active learning frameworks under\ndynamic adversarial attacks is critical, as these attacks can lead to\ncatastrophic forgetting within the active learning cycle. This paper introduces\nRobust Active Learning (RoAL), a novel approach designed to address this issue\nby integrating Elastic Weight Consolidation (EWC) into the active learning\nprocess. Our contributions are threefold: First, we propose a new dynamic\nadversarial attack that poses significant threats to active learning\nframeworks. Second, we introduce a novel method that combines EWC with active\nlearning to mitigate catastrophic forgetting caused by dynamic adversarial\nattacks. Finally, we conduct extensive experimental evaluations to demonstrate\nthe efficacy of our approach. The results show that RoAL not only effectively\ncounters dynamic adversarial threats but also significantly reduces the impact\nof catastrophic forgetting, thereby enhancing the robustness and performance of\nactive learning systems in adversarial environments.\n","authors":["Ricky Maulana Fajri","Yulong Pei","Lu Yin","Mykola Pechenizkiy"],"pdf_url":"https://arxiv.org/pdf/2408.07364v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.07869v1","updated":"2024-08-15T00:53:09Z","published":"2024-08-15T00:53:09Z","title":"A Systematic Evaluation of Generated Time Series and Their Effects in\n Self-Supervised Pretraining","summary":" Self-supervised Pretrained Models (PTMs) have demonstrated remarkable\nperformance in computer vision and natural language processing tasks. These\nsuccesses have prompted researchers to design PTMs for time series data. In our\nexperiments, most self-supervised time series PTMs were surpassed by simple\nsupervised models. We hypothesize this undesired phenomenon may be caused by\ndata scarcity. In response, we test six time series generation methods, use the\ngenerated data in pretraining in lieu of the real data, and examine the effects\non classification performance. 
Our results indicate that replacing a real-data\npretraining set with a greater volume of only generated samples produces\nnoticeable improvement.\n","authors":["Audrey Der","Chin-Chia Michael Yeh","Xin Dai","Huiyuan Chen","Yan Zheng","Yujie Fan","Zhongfang Zhuang","Vivian Lai","Junpeng Wang","Liang Wang","Wei Zhang","Eamonn Keogh"],"pdf_url":"https://arxiv.org/pdf/2408.07869v1.pdf","comment":"To appear in CIKM 2024 as a short paper; the version here is the\n self-contained version that includes the non-mandatory supplementary material\n available on the paper's companion website"},{"id":"http://arxiv.org/abs/2408.07865v1","updated":"2024-08-15T00:39:42Z","published":"2024-08-15T00:39:42Z","title":"Capturing the Complexity of Human Strategic Decision-Making with Machine\n Learning","summary":" Understanding how people behave in strategic settings--where they make\ndecisions based on their expectations about the behavior of others--is a\nlong-standing problem in the behavioral sciences. We conduct the largest study\nto date of strategic decision-making in the context of initial play in\ntwo-player matrix games, analyzing over 90,000 human decisions across more than\n2,400 procedurally generated games that span a much wider space than previous\ndatasets. We show that a deep neural network trained on these data predicts\npeople's choices better than leading theories of strategic behavior, indicating\nthat there is systematic variation that is not explained by those theories. We\nthen modify the network to produce a new, interpretable behavioral model,\nrevealing what the original network learned about people: their ability to\noptimally respond and their capacity to reason about others are dependent on\nthe complexity of individual games. This context-dependence is critical in\nexplaining deviations from the rational Nash equilibrium, response times, and\nuncertainty in strategic decisions. More broadly, our results demonstrate how\nmachine learning can be applied beyond prediction to further help generate\nnovel explanations of complex human behavior.\n","authors":["Jian-Qiao Zhu","Joshua C. Peterson","Benjamin Enke","Thomas L. Griffiths"],"pdf_url":"https://arxiv.org/pdf/2408.07865v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08459v1","updated":"2024-08-15T23:57:02Z","published":"2024-08-15T23:57:02Z","title":"JPEG-LM: LLMs as Image Generators with Canonical Codec Representations","summary":" Recent work in image and video generation has been adopting the\nautoregressive LLM architecture due to its generality and potentially easy\nintegration into multi-modal systems. The crux of applying autoregressive\ntraining in language generation to visual generation is discretization --\nrepresenting continuous data like images and videos as discrete tokens. Common\nmethods of discretizing images and videos include modeling raw pixel values,\nwhich are prohibitively lengthy, or vector quantization, which requires\nconvoluted pre-hoc training. In this work, we propose to directly model images\nand videos as compressed files saved on computers via canonical codecs (e.g.,\nJPEG, AVC/H.264). Using the default Llama architecture without any\nvision-specific modifications, we pretrain JPEG-LM from scratch to generate\nimages (and AVC-LM to generate videos as a proof of concept), by directly\noutputting compressed file bytes in JPEG and AVC formats. 
Evaluation of image\ngeneration shows that this simple and straightforward approach is more\neffective than pixel-based modeling and sophisticated vector quantization\nbaselines (on which our method yields a 31% reduction in FID). Our analysis\nshows that JPEG-LM has an especial advantage over vector quantization models in\ngenerating long-tail visual elements. Overall, we show that using canonical\ncodec representations can help lower the barriers between language generation\nand visual generation, facilitating future research on multi-modal\nlanguage/image/video LLMs.\n","authors":["Xiaochuang Han","Marjan Ghazvininejad","Pang Wei Koh","Yulia Tsvetkov"],"pdf_url":"https://arxiv.org/pdf/2408.08459v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08456v1","updated":"2024-08-15T23:46:37Z","published":"2024-08-15T23:46:37Z","title":"Efficient Data-Sketches and Fine-Tuning for Early Detection of\n Distributional Drift in Medical Imaging","summary":" Distributional drift detection is important in medical applications as it\nhelps ensure the accuracy and reliability of models by identifying changes in\nthe underlying data distribution that could affect diagnostic or treatment\ndecisions. However, current methods have limitations in detecting drift; for\nexample, the inclusion of abnormal datasets can lead to unfair comparisons.\nThis paper presents an accurate and sensitive approach to detect distributional\ndrift in CT-scan medical images by leveraging data-sketching and fine-tuning\ntechniques. We developed a robust baseline library model for real-time anomaly\ndetection, allowing for efficient comparison of incoming images and\nidentification of anomalies. Additionally, we fine-tuned a vision transformer\npre-trained model to extract relevant features using breast cancer images as an\nexample, significantly enhancing model accuracy to 99.11\\%. By combining\ndata-sketches and fine-tuning, our feature extraction evaluation demonstrated\nthat cosine similarity scores between similar datasets improved substantially,\nincreasing from around 50\% to 100\%. Finally, the sensitivity\nevaluation shows that our solutions are highly sensitive to even 1\%\nsalt-and-pepper and speckle noise, but are not sensitive to lighting noise\n(e.g., lighting conditions have no impact on data drift). The proposed methods\noffer a scalable and reliable solution for maintaining the accuracy of\ndiagnostic models in dynamic clinical environments.\n","authors":["Yusen Wu","Hao Chen","Alex Pissinou Makki","Phuong Nguyen","Yelena Yesha"],"pdf_url":"https://arxiv.org/pdf/2408.08456v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.15742v3","updated":"2024-08-15T23:39:00Z","published":"2024-01-28T20:01:44Z","title":"Efficient Data-Driven MPC for Demand Response of Commercial Buildings","summary":" Model predictive control (MPC) has been shown to significantly improve the\nenergy efficiency of buildings while maintaining thermal comfort. Data-driven\napproaches based on neural networks have been proposed to facilitate system\nmodelling. However, such approaches are generally nonconvex and result in\ncomputationally intractable optimization problems. 
In this work, we design a\nreadily implementable energy management method for small commercial buildings.\nWe then leverage our approach to formulate a real-time demand bidding strategy.\nWe propose a data-driven and mixed-integer convex MPC which is solved via\nderivative-free optimization given a limited computational time of 5 minutes to\nrespect operational constraints. We consider rooftop unit heating, ventilation,\nand air conditioning systems with discrete controls to accurately model the\noperation of most commercial buildings. Our approach uses an input convex\nrecurrent neural network to model the thermal dynamics. We apply our approach\nin several demand response (DR) settings, including a demand bidding, a\ntime-of-use, and a critical peak rebate program. Controller performance is\nevaluated on a state-of-the-art building simulation. The proposed approach\nimproves thermal comfort while reducing energy consumption and cost through DR\nparticipation, when compared to other data-driven approaches or a set-point\ncontroller.\n","authors":["Marie-Christine Paré","Vasken Dermardiros","Antoine Lesage-Landry"],"pdf_url":"https://arxiv.org/pdf/2401.15742v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.01787v3","updated":"2024-08-15T23:36:42Z","published":"2024-02-01T23:12:57Z","title":"Harm Amplification in Text-to-Image Models","summary":" Text-to-image (T2I) models have emerged as a significant advancement in\ngenerative AI; however, there exist safety concerns regarding their potential\nto produce harmful image outputs even when users input seemingly safe prompts.\nThis phenomenon, where T2I models generate harmful representations that were\nnot explicit in the input prompt, poses a potentially greater risk than\nadversarial prompts, leaving users unintentionally exposed to harms. Our paper\naddresses this issue by formalizing a definition for this phenomenon which we\nterm harm amplification. We further contribute to the field by developing a\nframework of methodologies to quantify harm amplification in which we consider\nthe harm of the model output in the context of user input. We then empirically\nexamine how to apply these different methodologies to simulate real-world\ndeployment scenarios including a quantification of disparate impacts across\ngenders resulting from harm amplification. Together, our work aims to offer\nresearchers tools to comprehensively address safety challenges in T2I systems\nand contribute to the responsible deployment of generative AI models.\n","authors":["Susan Hao","Renee Shelby","Yuchi Liu","Hansa Srinivasan","Mukul Bhutani","Burcu Karagol Ayan","Ryan Poplin","Shivani Poddar","Sarah Laszlo"],"pdf_url":"https://arxiv.org/pdf/2402.01787v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08454v1","updated":"2024-08-15T23:34:04Z","published":"2024-08-15T23:34:04Z","title":"Beyond Uniform Query Distribution: Key-Driven Grouped Query Attention","summary":" The Transformer architecture has revolutionized deep learning through its\nSelf-Attention mechanism, which effectively captures contextual information.\nHowever, the memory footprint of Self-Attention presents significant challenges\nfor long-sequence tasks. Grouped Query Attention (GQA) addresses this issue by\ngrouping queries and mean-pooling the corresponding key-value heads - reducing\nthe number of overall parameters and memory requirements in a flexible manner\nwithout adversely compromising model accuracy. 
In this work, we introduce\nenhancements to GQA, focusing on two novel approaches that deviate from the\nstatic nature of grouping: Key-Distributed GQA (KDGQA) and Dynamic\nKey-Distributed GQA (DGQA), which leverage information from the norms of the\nkey heads to inform query allocation. Specifically, KDGQA looks at the ratios\nof the norms of the key heads during each forward pass, while DGQA examines the\nratios of the norms as they evolve through training. Additionally, we present\nPerturbed GQA (PGQA) as a case-study, which introduces variability in (static)\ngroup formation via subtracting noise from the attention maps. Our experiments\nwith up-trained Vision Transformers, for Image Classification on datasets such\nas CIFAR-10, CIFAR-100, Food101, and Tiny ImageNet, demonstrate the promise of\nthese variants in improving upon the original GQA through more informed and\nadaptive grouping mechanisms: specifically ViT-L experiences accuracy gains of\nup to 8% when utilizing DGQA in comparison to GQA and other variants. We\nfurther analyze the impact of the number of Key-Value Heads on performance,\nunderscoring the importance of utilizing query-key affinities.\n","authors":["Zohaib Khan","Muhammad Khaquan","Omer Tafveez","Agha Ali Raza"],"pdf_url":"https://arxiv.org/pdf/2408.08454v1.pdf","comment":"11 pages, 9 figures"},{"id":"http://arxiv.org/abs/2310.17072v4","updated":"2024-08-15T23:22:20Z","published":"2023-10-26T00:28:37Z","title":"MMP++: Motion Manifold Primitives with Parametric Curve Models","summary":" Motion Manifold Primitives (MMP), a manifold-based approach for encoding\nbasic motion skills, can produce diverse trajectories, enabling the system to\nadapt to unseen constraints. Nonetheless, we argue that current MMP models lack\ncrucial functionalities of movement primitives, such as temporal and via-points\nmodulation, found in traditional approaches. This shortfall primarily stems\nfrom MMP's reliance on discrete-time trajectories. To overcome these\nlimitations, we introduce Motion Manifold Primitives++ (MMP++), a new model\nthat integrates the strengths of both MMP and traditional methods by\nincorporating parametric curve representations into the MMP framework.\nFurthermore, we identify a significant challenge with MMP++: performance\ndegradation due to geometric distortions in the latent space, meaning that\nsimilar motions are not closely positioned. To address this, Isometric Motion\nManifold Primitives++ (IMMP++) is proposed to ensure the latent space\naccurately preserves the manifold's geometry. Our experimental results across\nvarious applications, including 2-DoF planar motions, 7-DoF robot arm motions,\nand SE(3) trajectory planning, show that MMP++ and IMMP++ outperform existing\nmethods in trajectory generation tasks, achieving substantial improvements in\nsome cases. Moreover, they enable the modulation of latent coordinates and\nvia-points, thereby allowing efficient online adaptation to dynamic\nenvironments.\n","authors":["Yonghyeon Lee"],"pdf_url":"https://arxiv.org/pdf/2310.17072v4.pdf","comment":"15 pages. 
The paper will appear in the IEEE Transactions on Robotics"},{"id":"http://arxiv.org/abs/2404.06599v2","updated":"2024-08-15T23:16:11Z","published":"2024-04-09T20:06:25Z","title":"CMDA-OT: Collaborative Multi-source Domain Adaptation Through Optimal\n Transport","summary":" Multi-source Domain Adaptation (MDA) seeks to adapt models trained on data\nfrom multiple labeled source domains to perform effectively on unlabeled\ntarget domain data, assuming access to the source data. To address the challenges\nof model adaptation and data privacy, we introduce Collaborative MDA Through\nOptimal Transport (CMDA-OT), a novel framework consisting of two key phases. In\nthe first phase, each source domain is independently adapted to the target\ndomain using optimal transport methods. In the second phase, a centralized\ncollaborative learning architecture is employed, which aggregates the N models\nfrom the N sources without accessing their data, thereby safeguarding privacy.\nDuring this process, the server leverages a small set of pseudo-labeled samples\nfrom the target domain, known as the target validation subset, to refine and\nguide the adaptation. This dual-phase approach not only improves model\nperformance on the target domain but also addresses vital privacy challenges\ninherent in domain adaptation.\n","authors":["Omar Ghannou","Younès Bennani"],"pdf_url":"https://arxiv.org/pdf/2404.06599v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08448v1","updated":"2024-08-15T22:57:39Z","published":"2024-08-15T22:57:39Z","title":"Exploring Cross-model Neuronal Correlations in the Context of Predicting\n Model Performance and Generalizability","summary":" As Artificial Intelligence (AI) models are increasingly integrated into\ncritical systems, the need for a robust framework to establish the\ntrustworthiness of AI is increasingly paramount. While collaborative efforts\nhave established conceptual foundations for such a framework, there remains a\nsignificant gap in developing concrete, technically robust methods for\nassessing AI model quality and performance. A critical drawback in the\ntraditional methods for assessing the validity and generalizability of models\nis their dependence on internal developer datasets, rendering it challenging to\nindependently assess and verify their performance claims. This paper introduces\na novel approach for assessing a newly trained model's performance based on\nanother known model by calculating correlation between neural networks. The\nproposed method evaluates correlations by determining if, for each neuron in\none network, there exists a neuron in the other network that produces similar\noutput. This approach has implications for memory efficiency, allowing for the\nuse of smaller networks when high correlation exists between networks of\ndifferent sizes. Additionally, the method provides insights into robustness,\nsuggesting that if two highly correlated networks are compared and one\ndemonstrates robustness when operating in production environments, the other is\nlikely to exhibit similar robustness. 
This contribution advances the technical\ntoolkit for responsible AI, supporting more comprehensive and nuanced\nevaluations of AI models to ensure their safe and effective deployment.\n","authors":["Haniyeh Ehsani Oskouie","Lionel Levine","Majid Sarrafzadeh"],"pdf_url":"https://arxiv.org/pdf/2408.08448v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.11087v3","updated":"2024-08-15T22:57:08Z","published":"2024-06-16T22:11:41Z","title":"DP-MemArc: Differential Privacy Transfer Learning for Memory Efficient\n Language Models","summary":" Large language models have repeatedly shown outstanding performance across\ndiverse applications. However, deploying these models can inadvertently risk\nuser privacy. The significant memory demands during training pose a major\nchallenge in terms of resource consumption. This substantial size places a\nheavy load on memory resources, raising considerable practical concerns. In\nthis paper, we introduce DP-MemArc, a novel training framework aimed at\nreducing the memory costs of large language models while emphasizing the\nprotection of user data privacy. DP-MemArc incorporates side network or\nreversible network designs to support a variety of differential privacy\nmemory-efficient fine-tuning schemes. Our approach not only achieves memory\noptimization but also ensures robust privacy protection, keeping user data\nsecure and confidential. Extensive experiments have demonstrated that DP-MemArc\neffectively provides differential privacy-efficient fine-tuning across\ndifferent task scenarios.\n","authors":["Yanming Liu","Xinyue Peng","Yuwei Zhang","Xiaolan Ke","Songhang Deng","Jiannan Cao","Chen Ma","Mengchen Fu","Xuhong Zhang","Sheng Cheng","Xun Wang","Jianwei Yin","Tianyu Du"],"pdf_url":"https://arxiv.org/pdf/2406.11087v3.pdf","comment":"9 pages second version"},{"id":"http://arxiv.org/abs/2408.08446v1","updated":"2024-08-15T22:53:35Z","published":"2024-08-15T22:53:35Z","title":"Lifelong Reinforcement Learning via Neuromodulation","summary":" Navigating multiple tasks$\unicode{x2014}$for instance in succession as in\ncontinual or lifelong learning, or in distributions as in meta or multi-task\nlearning$\unicode{x2014}$requires some notion of adaptation. Evolution over\ntimescales of millennia has imbued humans and other animals with highly\neffective adaptive learning and decision-making strategies. Central to these\nfunctions are so-called neuromodulatory systems. In this work we introduce an\nabstract framework for integrating theories and evidence from neuroscience and\nthe cognitive sciences into the design of adaptive artificial reinforcement\nlearning algorithms. We give a concrete instance of this framework built on\nliterature surrounding the neuromodulators Acetylcholine (ACh) and\nNoradrenaline (NA), and empirically validate the effectiveness of the resulting\nadaptive algorithm in a non-stationary multi-armed bandit problem. 
We conclude\nwith a theory-based experiment proposal providing an avenue to link our\nframework back to efforts in experimental neuroscience.\n","authors":["Sebastian Lee","Samuel Liebana Garcia","Claudia Clopath","Will Dabney"],"pdf_url":"https://arxiv.org/pdf/2408.08446v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08444v1","updated":"2024-08-15T22:34:44Z","published":"2024-08-15T22:34:44Z","title":"W-RAG: Weakly Supervised Dense Retrieval in RAG for Open-domain Question\n Answering","summary":" In knowledge-intensive tasks such as open-domain question answering (OpenQA),\nLarge Language Models (LLMs) often struggle to generate factual answers relying\nsolely on their internal (parametric) knowledge. To address this limitation,\nRetrieval-Augmented Generation (RAG) systems enhance LLMs by retrieving\nrelevant information from external sources, thereby positioning the retriever\nas a pivotal component. Although dense retrieval demonstrates state-of-the-art\nperformance, its training poses challenges due to the scarcity of ground-truth\nevidence, largely attributed to the high costs of human annotation. In this\npaper, we propose W-RAG by utilizing the ranking capabilities of LLMs to create\nweakly labeled data for training dense retrievers. Specifically, we rerank the\ntop-$K$ passages retrieved via BM25 by assessing the probability that LLMs will\ngenerate the correct answer based on the question and each passage. The\nhighest-ranking passages are then used as positive training examples for dense\nretrieval. Our comprehensive experiments across four publicly available OpenQA\ndatasets demonstrate that our approach enhances both retrieval and OpenQA\nperformance compared to baseline models.\n","authors":["Jinming Nian","Zhiyuan Peng","Qifan Wang","Yi Fang"],"pdf_url":"https://arxiv.org/pdf/2408.08444v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2008.07588v3","updated":"2024-08-15T22:33:25Z","published":"2020-08-12T20:08:04Z","title":"Uncertainty Quantification using Variational Inference for Biomedical\n Image Segmentation","summary":" Deep learning motivated by convolutional neural networks has been highly\nsuccessful in a range of medical imaging problems like image classification,\nimage segmentation, image synthesis etc. However for validation and\ninterpretability, not only do we need the predictions made by the model but\nalso how confident it is while making those predictions. This is important in\nsafety critical applications for the people to accept it. In this work, we used\nan encoder decoder architecture based on variational inference techniques for\nsegmenting brain tumour images. We evaluate our work on the publicly available\nBRATS dataset using Dice Similarity Coefficient (DSC) and Intersection Over\nUnion (IOU) as the evaluation metrics. Our model is able to segment brain\ntumours while taking into account both aleatoric uncertainty and epistemic\nuncertainty in a principled bayesian manner.\n","authors":["Abhinav Sagar"],"pdf_url":"https://arxiv.org/pdf/2008.07588v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.17587v2","updated":"2024-08-15T22:28:55Z","published":"2024-05-27T18:40:49Z","title":"RAGSys: Item-Cold-Start Recommender as RAG System","summary":" Large Language Models (LLM) hold immense promise for real-world applications,\nbut their generic knowledge often falls short of domain-specific needs.\nFine-tuning, a common approach, can suffer from catastrophic forgetting and\nhinder generalizability. 
In-Context Learning (ICL) offers an alternative, which\ncan leverage Retrieval-Augmented Generation (RAG) to provide LLMs with relevant\ndemonstrations for few-shot learning tasks. This paper explores the desired\nqualities of a demonstration retrieval system for ICL. We argue that ICL\nretrieval in this context resembles item-cold-start recommender systems,\nprioritizing discovery and maximizing information gain over strict relevance.\nWe propose a novel evaluation method that measures the LLM's subsequent\nperformance on NLP tasks, eliminating the need for subjective diversity scores.\nOur findings demonstrate the critical role of diversity and quality bias in\nretrieved demonstrations for effective ICL, and highlight the potential of\nrecommender system techniques in this domain.\n","authors":["Emile Contal","Garrin McGoldrick"],"pdf_url":"https://arxiv.org/pdf/2405.17587v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08442v1","updated":"2024-08-15T22:27:32Z","published":"2024-08-15T22:27:32Z","title":"A semi-centralized multi-agent RL framework for efficient irrigation\n scheduling","summary":" This paper proposes a Semi-Centralized Multi-Agent Reinforcement Learning\n(SCMARL) approach for irrigation scheduling in spatially variable agricultural\nfields, where management zones address spatial variability. The SCMARL\nframework is hierarchical in nature, with a centralized coordinator agent at\nthe top level and decentralized local agents at the second level. The\ncoordinator agent makes daily binary irrigation decisions based on field-wide\nconditions, which are communicated to the local agents. Local agents determine\nappropriate irrigation amounts for specific management zones using local\nconditions. The framework employs state augmentation approach to handle\nnon-stationarity in the local agents' environments. An extensive evaluation on\na large-scale field in Lethbridge, Canada, compares the SCMARL approach with a\nlearning-based multi-agent model predictive control scheduling approach,\nhighlighting its enhanced performance, resulting in water conservation and\nimproved Irrigation Water Use Efficiency (IWUE). Notably, the proposed approach\nachieved a 4.0% savings in irrigation water while enhancing the IWUE by 6.3%.\n","authors":["Bernard T. Agyeman","Benjamin Decard-Nelson","Jinfeng Liu","Sirish L. Shah"],"pdf_url":"https://arxiv.org/pdf/2408.08442v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.13193v2","updated":"2024-08-15T22:27:19Z","published":"2024-05-21T20:53:18Z","title":"Efficient Imitation Learning with Conservative World Models","summary":" We tackle the problem of policy learning from expert demonstrations without a\nreward function. A central challenge in this space is that these policies fail\nupon deployment due to issues of distributional shift, environment\nstochasticity, or compounding errors. Adversarial imitation learning alleviates\nthis issue but requires additional on-policy training samples for stability,\nwhich presents a challenge in realistic domains due to inefficient learning and\nhigh sample complexity. One approach to this issue is to learn a world model of\nthe environment, and use synthetic data for policy training. While successful\nin prior works, we argue that this is sub-optimal due to additional\ndistribution shifts between the learned model and the real environment.\nInstead, we re-frame imitation learning as a fine-tuning problem, rather than a\npure reinforcement learning one. 
Drawing theoretical connections to offline RL\nand fine-tuning algorithms, we argue that standard online world model\nalgorithms are not well suited to the imitation learning problem. We derive a\nprincipled conservative optimization bound and demonstrate empirically that it\nleads to improved performance on two very challenging manipulation environments\nfrom high-dimensional raw pixel observations. We set a new state-of-the-art\nperformance on the Franka Kitchen environment from images, requiring only 10\ndemos and no reward labels, as well as solving a complex dexterity manipulation\ntask.\n","authors":["Victor Kolev","Rafael Rafailov","Kyle Hatch","Jiajun Wu","Chelsea Finn"],"pdf_url":"https://arxiv.org/pdf/2405.13193v2.pdf","comment":"Oral presentation, L4DC 2024"},{"id":"http://arxiv.org/abs/2408.08441v1","updated":"2024-08-15T22:27:00Z","published":"2024-08-15T22:27:00Z","title":"D5RL: Diverse Datasets for Data-Driven Deep Reinforcement Learning","summary":" Offline reinforcement learning algorithms hold the promise of enabling\ndata-driven RL methods that do not require costly or dangerous real-world\nexploration and benefit from large pre-collected datasets. This in turn can\nfacilitate real-world applications, as well as a more standardized approach to\nRL research. Furthermore, offline RL methods can provide effective\ninitializations for online finetuning to overcome challenges with exploration.\nHowever, evaluating progress on offline RL algorithms requires effective and\nchallenging benchmarks that capture properties of real-world tasks, provide a\nrange of task difficulties, and cover a range of challenges both in terms of\nthe parameters of the domain (e.g., length of the horizon, sparsity of rewards)\nand the parameters of the data (e.g., narrow demonstration data or broad\nexploratory data). While considerable progress in offline RL in recent years\nhas been enabled by simpler benchmark tasks, the most widely used datasets are\nincreasingly saturating in performance and may fail to reflect properties of\nrealistic tasks. We propose a new benchmark for offline RL that focuses on\nrealistic simulations of robotic manipulation and locomotion environments,\nbased on models of real-world robotic systems, and comprising a variety of data\nsources, including scripted data, play-style data collected by human\nteleoperators, and other data sources. Our proposed benchmark covers\nstate-based and image-based domains, and supports both offline RL and online\nfine-tuning evaluation, with some of the tasks specifically designed to require\nboth pre-training and fine-tuning. We hope that our proposed benchmark will\nfacilitate further progress on both offline RL and fine-tuning algorithms.\nWebsite with code, examples, tasks, and data is available at\n\url{https://sites.google.com/view/d5rl/}\n","authors":["Rafael Rafailov","Kyle Hatch","Anikait Singh","Laura Smith","Aviral Kumar","Ilya Kostrikov","Philippe Hansen-Estruch","Victor Kolev","Philip Ball","Jiajun Wu","Chelsea Finn","Sergey Levine"],"pdf_url":"https://arxiv.org/pdf/2408.08441v1.pdf","comment":"RLC 2024"},{"id":"http://arxiv.org/abs/2408.08432v1","updated":"2024-08-15T21:49:43Z","published":"2024-08-15T21:49:43Z","title":"Predictive uncertainty estimation in deep learning for lung carcinoma\n classification in digital pathology under real dataset shifts","summary":" Deep learning has shown tremendous progress in a wide range of digital\npathology and medical image classification tasks. 
Its integration into safe\nclinical decision-making support requires robust and reliable models. However,\nreal-world data comes with diversities that often lie outside the intended\nsource distribution. Moreover, when test samples are dramatically different,\nclinical decision-making is greatly affected. Quantifying predictive\nuncertainty in models is crucial for well-calibrated predictions and\ndetermining when (or not) to trust a model. Unfortunately, many works have\noverlooked the importance of predictive uncertainty estimation. This paper\nevaluates whether predictive uncertainty estimation adds robustness to deep\nlearning-based diagnostic decision-making systems. We investigate the effect of\nvarious carcinoma distribution shift scenarios on predictive performance and\ncalibration. We first systematically investigate three popular methods for\nimproving predictive uncertainty: Monte Carlo dropout, deep ensemble, and\nfew-shot learning on lung adenocarcinoma classification as a primary disease in\nwhole slide images. Secondly, we compare the effectiveness of the methods in\nterms of performance and calibration under clinically relevant distribution\nshifts such as in-distribution shifts comprising primary disease sub-types and\nother characterization analysis data; out-of-distribution shifts comprising\nwell-differentiated cases, different organ origin, and imaging modality shifts.\nWhile studies on uncertainty estimation exist, to our best knowledge, no\nrigorous large-scale benchmark compares predictive uncertainty estimation\nincluding these dataset shifts for lung carcinoma classification.\n","authors":["Abdur R. Fayjie","Jutika Borah","Florencia Carbone","Jan Tack","Patrick Vandewalle"],"pdf_url":"https://arxiv.org/pdf/2408.08432v1.pdf","comment":"17 pages, 2 figures, 5 tables"},{"id":"http://arxiv.org/abs/2202.07592v2","updated":"2024-08-15T21:49:15Z","published":"2022-02-15T17:28:42Z","title":"Deep Convolutional Autoencoder for Assessment of Anomalies in\n Multi-stream Sensor Data","summary":" This work investigates a practical and novel method for automated\nunsupervised fault detection in vehicles using a fully convolutional\nautoencoder. The results demonstrate the algorithm we developed can detect\nanomalies which correspond to powertrain faults by learning patterns in the\nmultivariate time-series data of hybrid-electric vehicle powertrain sensors.\nData was collected by engineers at Ford Motor Company from numerous sensors\nover several drive cycle variations. This study provides evidence of the\nanomaly detecting capability of our trained autoencoder and investigates the\nsuitability of our autoencoder relative to other unsupervised methods for\nautomatic fault detection in this data set. Preliminary results of testing the\nautoencoder on the powertrain sensor data indicate the data reconstruction\napproach availed by the autoencoder is a robust technique for identifying the\nabnormal sequences in the multivariate series. These results support that\nirregularities in hybrid-electric vehicles' powertrains are conveyed via sensor\nsignals in the embedded electronic communication system, and therefore can be\nidentified mechanistically with a trained algorithm. Additional unsupervised\nmethods are tested and show the autoencoder performs better at fault detection\nthan outlier detectors and other novel deep learning techniques.\n","authors":["Anthony Geglio","Eisa Hedayati","Mark Tascillo","Dyche Anderson","Jonathan Barker","Timothy C. 
Havens"],"pdf_url":"https://arxiv.org/pdf/2202.07592v2.pdf","comment":"SSCI2022, 7 pages, 3 Tables, 3 Figures"},{"id":"http://arxiv.org/abs/2408.08430v1","updated":"2024-08-15T21:43:26Z","published":"2024-08-15T21:43:26Z","title":"Random Gradient Masking as a Defensive Measure to Deep Leakage in\n Federated Learning","summary":" Federated Learning(FL), in theory, preserves privacy of individual clients'\ndata while producing quality machine learning models. However, attacks such as\nDeep Leakage from Gradients(DLG) severely question the practicality of FL. In\nthis paper, we empirically evaluate the efficacy of four defensive methods\nagainst DLG: Masking, Clipping, Pruning, and Noising. Masking, while only\npreviously studied as a way to compress information during parameter transfer,\nshows surprisingly robust defensive utility when compared to the other three\nestablished methods. Our experimentation is two-fold. We first evaluate the\nminimum hyperparameter threshold for each method across MNIST, CIFAR-10, and\nlfw datasets. Then, we train FL clients with each method and their minimum\nthreshold values to investigate the trade-off between DLG defense and training\nperformance. Results reveal that Masking and Clipping show near to none\ndegradation in performance while obfuscating enough information to effectively\ndefend against DLG.\n","authors":["Joon Kim","Sejin Park"],"pdf_url":"https://arxiv.org/pdf/2408.08430v1.pdf","comment":"13 pages, 5 figures, to be submitted to Applied Intelligence"}],"Multimedia":[{"id":"http://arxiv.org/abs/2408.08205v1","updated":"2024-08-15T15:13:22Z","published":"2024-08-15T15:13:22Z","title":"A Multi-task Adversarial Attack Against Face Authentication","summary":" Deep-learning-based identity management systems, such as face authentication\nsystems, are vulnerable to adversarial attacks. However, existing attacks are\ntypically designed for single-task purposes, which means they are tailored to\nexploit vulnerabilities unique to the individual target rather than being\nadaptable for multiple users or systems. This limitation makes them unsuitable\nfor certain attack scenarios, such as morphing, universal, transferable, and\ncounter attacks. In this paper, we propose a multi-task adversarial attack\nalgorithm called MTADV that are adaptable for multiple users or systems. By\ninterpreting these scenarios as multi-task attacks, MTADV is applicable to both\nsingle- and multi-task attacks, and feasible in the white- and gray-box\nsettings. Furthermore, MTADV is effective against various face datasets,\nincluding LFW, CelebA, and CelebA-HQ, and can work with different deep learning\nmodels, such as FaceNet, InsightFace, and CurricularFace. Importantly, MTADV\nretains its feasibility as a single-task attack targeting a single user/system.\nTo the best of our knowledge, MTADV is the first adversarial attack method that\ncan target all of the aforementioned scenarios in one algorithm.\n","authors":["Hanrui Wang","Shuo Wang","Cunjian Chen","Massimo Tistarelli","Zhe Jin"],"pdf_url":"https://arxiv.org/pdf/2408.08205v1.pdf","comment":"Accepted by ACM Transactions on Multimedia Computing, Communications,\n and Applications"},{"id":"http://arxiv.org/abs/2406.14176v2","updated":"2024-08-15T14:59:31Z","published":"2024-06-20T10:33:15Z","title":"A Multi-Stream Fusion Approach with One-Class Learning for Audio-Visual\n Deepfake Detection","summary":" This paper addresses the challenge of developing a robust audio-visual\ndeepfake detection model. 
In practical use cases, new generation algorithms are\ncontinually emerging, and these algorithms are not encountered during the\ndevelopment of detection methods. This calls for the generalization ability of\nthe method. Additionally, to ensure the credibility of detection methods, it is\nbeneficial for the model to interpret which cues from the video indicate it is\nfake. Motivated by these considerations, we then propose a multi-stream fusion\napproach with one-class learning as a representation-level regularization\ntechnique. We study the generalization problem of audio-visual deepfake\ndetection by creating a new benchmark by extending and re-splitting the\nexisting FakeAVCeleb dataset. The benchmark contains four categories of fake\nvideos (Real Audio-Fake Visual, Fake Audio-Fake Visual, Fake Audio-Real Visual,\nand Unsynchronized videos). The experimental results demonstrate that our\napproach surpasses the previous models by a large margin. Furthermore, our\nproposed framework offers interpretability, indicating which modality the model\nidentifies as more likely to be fake. The source code is released at\nhttps://github.com/bok-bok/MSOC.\n","authors":["Kyungbok Lee","You Zhang","Zhiyao Duan"],"pdf_url":"https://arxiv.org/pdf/2406.14176v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08093v1","updated":"2024-08-15T11:36:18Z","published":"2024-08-15T11:36:18Z","title":"When Video Coding Meets Multimodal Large Language Models: A Unified\n Paradigm for Video Coding","summary":" Existing codecs are designed to eliminate intrinsic redundancies to create a\ncompact representation for compression. However, strong external priors from\nMultimodal Large Language Models (MLLMs) have not been explicitly explored in\nvideo compression. Herein, we introduce a unified paradigm for Cross-Modality\nVideo Coding (CMVC), which is a pioneering approach to explore multimodality\nrepresentation and video generative models in video coding. Specifically, on\nthe encoder side, we disentangle a video into spatial content and motion\ncomponents, which are subsequently transformed into distinct modalities to\nachieve very compact representation by leveraging MLLMs. During decoding,\npreviously encoded components and video generation models are leveraged to\ncreate multiple encoding-decoding modes that optimize video reconstruction\nquality for specific decoding requirements, including Text-Text-to-Video (TT2V)\nmode to ensure high-quality semantic information and Image-Text-to-Video (IT2V)\nmode to achieve superb perceptual consistency. In addition, we propose an\nefficient frame interpolation model for IT2V mode via Low-Rank Adaption (LoRA)\ntuning to guarantee perceptual quality, which allows the generated motion cues\nto behave smoothly. Experiments on benchmarks indicate that TT2V achieves\neffective semantic reconstruction, while IT2V exhibits competitive perceptual\nconsistency. 
These results highlight potential directions for future research\nin video coding.\n","authors":["Pingping Zhang","Jinlong Li","Meng Wang","Nicu Sebe","Sam Kwong","Shiqi Wang"],"pdf_url":"https://arxiv.org/pdf/2408.08093v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.07957v1","updated":"2024-08-15T06:11:59Z","published":"2024-08-15T06:11:59Z","title":"Joint Optimization of Buffer Delay and HARQ for Video Communications","summary":" To improve the quality of experience (QoE) in video communication over lossy\nnetworks, this paper presents a transmission method that jointly optimizes\nbuffer delay and Hybrid Automatic Repeat request (HARQ), referred to as\nBD-HARQ. This method operates on packet group and employs dynamic buffer delay\ncombined with HARQ strategy for transmission. By defining the QoE based on\nmetrics such as buffer delay, Forward Error Correction (FEC) redundancy, and\ndata recovery rate, the proposed method derives its closed-form expression\nthrough rigorous mathematical modeling and analysis. The optimal transmission\nparameters, i.e., the buffer delay and the FEC redundancy, are then determined\nand implemented, guaranteeing the real-time performance, transmission\nefficiency, and data recovery rate of video communication. Experimental results\ndemonstrate that the proposed method aligns well with its theoretical\nexpectations, and that it can provide up to 13.7% higher QoE compared to\nexisting methods and increase the tolerance for packet loss rate from 15%-22%\nto up to 31% while maintaining a high QoE.\n","authors":["Baoping Cheng","Peng Lei","Xiaoyan Xie","Tao Fu","Yukun Zhang","Xiaoming Tao"],"pdf_url":"https://arxiv.org/pdf/2408.07957v1.pdf","comment":"6 pages, 5figures"},{"id":"http://arxiv.org/abs/2407.18552v2","updated":"2024-08-15T05:14:38Z","published":"2024-07-26T07:05:04Z","title":"Multimodal Emotion Recognition using Audio-Video Transformer Fusion with\n Cross Attention","summary":" Understanding emotions is a fundamental aspect of human communication.\nIntegrating audio and video signals offers a more comprehensive understanding\nof emotional states compared to traditional methods that rely on a single data\nsource, such as speech or facial expressions. Despite its potential, multimodal\nemotion recognition faces significant challenges, particularly in\nsynchronization, feature extraction, and fusion of diverse data sources. To\naddress these issues, this paper introduces a novel transformer-based model\nnamed Audio-Video Transformer Fusion with Cross Attention (AVT-CA). The AVT-CA\nmodel employs a transformer fusion approach to effectively capture and\nsynchronize interlinked features from both audio and video inputs, thereby\nresolving synchronization problems. Additionally, the Cross Attention mechanism\nwithin AVT-CA selectively extracts and emphasizes critical features while\ndiscarding irrelevant ones from both modalities, addressing feature extraction\nand fusion challenges. Extensive experimental analysis conducted on the\nCMU-MOSEI, RAVDESS and CREMA-D datasets demonstrates the efficacy of the\nproposed model. The results underscore the importance of AVT-CA in developing\nprecise and reliable multimodal emotion recognition systems for practical\napplications.\n","authors":["Joe Dhanith P R","Shravan Venkatraman","Modigari Narendra","Vigya Sharma","Santhosh Malarvannan","Amir H. 
Gandomi"],"pdf_url":"https://arxiv.org/pdf/2407.18552v2.pdf","comment":"38 Pages, 9 Tables, 12 Figures"}]},"2024-08-16T00:00:00Z":{"Computation and Language":[{"id":"http://arxiv.org/abs/2408.08872v1","updated":"2024-08-16T17:57:01Z","published":"2024-08-16T17:57:01Z","title":"xGen-MM (BLIP-3): A Family of Open Large Multimodal Models","summary":" This report introduces xGen-MM (also known as BLIP-3), a framework for\ndeveloping Large Multimodal Models (LMMs). The framework comprises meticulously\ncurated datasets, a training recipe, model architectures, and a resulting suite\nof LMMs. xGen-MM, short for xGen-MultiModal, expands the Salesforce xGen\ninitiative on foundation AI models. Our models undergo rigorous evaluation\nacross a range of tasks, including both single and multi-image benchmarks. Our\npre-trained base model exhibits strong in-context learning capabilities and the\ninstruction-tuned model demonstrates competitive performance among open-source\nLMMs with similar model sizes. In addition, we introduce a safety-tuned model\nwith DPO, aiming to mitigate harmful behaviors such as hallucinations and\nimprove safety. We open-source our models, curated large-scale datasets, and\nour fine-tuning codebase to facilitate further advancements in LMM research.\nAssociated resources will be available on our project page above.\n","authors":["Le Xue","Manli Shu","Anas Awadalla","Jun Wang","An Yan","Senthil Purushwalkam","Honglu Zhou","Viraj Prabhu","Yutong Dai","Michael S Ryoo","Shrikant Kendre","Jieyu Zhang","Can Qin","Shu Zhang","Chia-Chih Chen","Ning Yu","Juntao Tan","Tulika Manoj Awalgaonkar","Shelby Heinecke","Huan Wang","Yejin Choi","Ludwig Schmidt","Zeyuan Chen","Silvio Savarese","Juan Carlos Niebles","Caiming Xiong","Ran Xu"],"pdf_url":"https://arxiv.org/pdf/2408.08872v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08869v1","updated":"2024-08-16T17:54:09Z","published":"2024-08-16T17:54:09Z","title":"PEDAL: Enhancing Greedy Decoding with Large Language Models using\n Diverse Exemplars","summary":" Self-ensembling techniques with diverse reasoning paths such as\nSelf-Consistency have demonstrated remarkable gains in accuracy for Large\nLanguage Models (LLMs). However, such techniques depend on the availability of\nan accurate answer extraction process to aggregate across multiple outputs.\nMoreover, they acquire higher inference cost, in comparison to Greedy Decoding,\ndue to generation of relatively higher number of output tokens. Research has\nshown that the free form text outputs from Self-Consistency can be aggregated\nreliably using LLMs to produce the final output. Additionally, recent\nadvancements in LLM inference have demonstrated that usage of diverse exemplars\nin prompts have the ability to induce diversity in the LLM outputs. Such proven\ntechniques can be easily extended to self-ensembling based approaches to\nachieve enhanced results in text generation. In this paper, we introduce PEDAL\n(Prompts based on Exemplar Diversity Aggregated using LLMs), a hybrid\nself-ensembling approach, that combines the strengths of diverse exemplar based\nprompts and LLM based aggregation to achieve improvement in overall\nperformance. 
On the publicly available SVAMP and ARC datasets, our experiments\nreveal that PEDAL can achieve better accuracy than Greedy Decoding based\nstrategies with lower inference cost compared to Self Consistency based\napproaches.\n","authors":["Sumanth Prabhu"],"pdf_url":"https://arxiv.org/pdf/2408.08869v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.02304v3","updated":"2024-08-16T17:28:08Z","published":"2023-10-03T17:59:32Z","title":"Self-Taught Optimizer (STOP): Recursively Self-Improving Code Generation","summary":" Several recent advances in AI systems solve problems by providing a\n\"scaffolding\" program that structures multiple calls to language models (LMs)\nto generate better outputs. A scaffolding program is written in a programming\nlanguage such as Python. In this work, we use a language-model-infused\nscaffolding program to improve itself. We start with a seed \"improver\" that\nimproves an input program according to a given utility function by querying an\nLM several times and returning the best solution. We then run this seed\nimprover to improve itself. Across a small set of downstream tasks, the\nresulting improved improver generates programs with significantly better\nperformance than its seed improver. A variety of self-improvement strategies\nare proposed by the language model, including beam search, genetic algorithms,\nand simulated annealing. Since the language models themselves are not altered,\nthis is not full recursive self-improvement. Nonetheless, it demonstrates that\na modern language model, GPT-4 in our experiments, is capable of writing code\nthat can call itself to improve itself. We consider concerns around the\ndevelopment of self-improving technologies and evaluate the frequency with\nwhich the generated code bypasses a sandbox.\n","authors":["Eric Zelikman","Eliana Lorch","Lester Mackey","Adam Tauman Kalai"],"pdf_url":"https://arxiv.org/pdf/2310.02304v3.pdf","comment":"Published as a conference paper at COLM 2024"},{"id":"http://arxiv.org/abs/2408.08848v1","updated":"2024-08-16T17:19:23Z","published":"2024-08-16T17:19:23Z","title":"PsychoLex: Unveiling the Psychological Mind of Large Language Models","summary":" This paper explores the intersection of psychology and artificial\nintelligence through the development and evaluation of specialized Large\nLanguage Models (LLMs). We introduce PsychoLex, a suite of resources designed\nto enhance LLMs' proficiency in psychological tasks in both Persian and\nEnglish. Key contributions include the PsychoLexQA dataset for instructional\ncontent and the PsychoLexEval dataset for rigorous evaluation of LLMs in\ncomplex psychological scenarios. Additionally, we present the PsychoLexLLaMA\nmodel, optimized specifically for psychological applications, demonstrating\nsuperior performance compared to general-purpose models. The findings\nunderscore the potential of tailored LLMs for advancing psychological research\nand applications, while also highlighting areas for further refinement. 
This\nresearch offers a foundational step towards integrating LLMs into specialized\npsychological domains, with implications for future advancements in AI-driven\npsychological practice.\n","authors":["Mohammad Amin Abbasi","Farnaz Sadat Mirnezami","Hassan Naderi"],"pdf_url":"https://arxiv.org/pdf/2408.08848v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.15455v2","updated":"2024-08-16T17:12:27Z","published":"2024-03-18T23:41:52Z","title":"Improving Sampling Methods for Fine-tuning SentenceBERT in Text Streams","summary":" The proliferation of textual data on the Internet presents a unique\nopportunity for institutions and companies to monitor public opinion about\ntheir services and products. Given the rapid generation of such data, the text\nstream mining setting, which handles sequentially arriving, potentially\ninfinite text streams, is often more suitable than traditional batch learning.\nWhile pre-trained language models are commonly employed for their high-quality\ntext vectorization capabilities in streaming contexts, they face challenges\nadapting to concept drift - the phenomenon where the data distribution changes\nover time, adversely affecting model performance. Addressing the issue of\nconcept drift, this study explores the efficacy of seven text sampling methods\ndesigned to selectively fine-tune language models, thereby mitigating\nperformance degradation. We precisely assess the impact of these methods on\nfine-tuning the SBERT model using four different loss functions. Our\nevaluation, focused on Macro F1-score and elapsed time, employs two text stream\ndatasets and an incremental SVM classifier to benchmark performance. Our\nfindings indicate that Softmax loss and Batch All Triplets loss are\nparticularly effective for text stream classification, demonstrating that\nlarger sample sizes generally correlate with improved macro F1-scores. Notably,\nour proposed WordPieceToken ratio sampling method significantly enhances\nperformance with the identified loss functions, surpassing baseline results.\n","authors":["Cristiano Mesquita Garcia","Alessandro Lameiras Koerich","Alceu de Souza Britto Jr","Jean Paul Barddal"],"pdf_url":"https://arxiv.org/pdf/2403.15455v2.pdf","comment":"Accepted for presentation at the 27th International Conference on\n Pattern Recognition (ICPR) 2024"},{"id":"http://arxiv.org/abs/2403.03640v4","updated":"2024-08-16T17:06:39Z","published":"2024-03-06T11:56:02Z","title":"Apollo: A Lightweight Multilingual Medical LLM towards Democratizing\n Medical AI to 6B People","summary":" Despite the vast repository of global medical knowledge predominantly being\nin English, local languages are crucial for delivering tailored healthcare\nservices, particularly in areas with limited medical resources. To extend the\nreach of medical AI advancements to a broader population, we aim to develop\nmedical LLMs across the six most widely spoken languages, encompassing a global\npopulation of 6.1 billion. This effort culminates in the creation of the\nApolloCorpora multilingual medical dataset and the XMedBench benchmark. In the\nmultilingual medical benchmark, the released Apollo models, at various\nrelatively-small sizes (i.e., 0.5B, 1.8B, 2B, 6B, and 7B), achieve the best\nperformance among models of equivalent size. Especially, Apollo-7B is the\nstate-of-the-art multilingual medical LLMs up to 70B. Additionally, these lite\nmodels could be used to improve the multi-lingual medical capabilities of\nlarger models without fine-tuning in a proxy-tuning fashion. 
We will\nopen-source training corpora, code, model weights and evaluation benchmark.\n","authors":["Xidong Wang","Nuo Chen","Junyin Chen","Yidong Wang","Guorui Zhen","Yan Hu","Xiangbo Wu","Anningzhe Gao","Xiang Wan","Haizhou Li","Benyou Wang"],"pdf_url":"https://arxiv.org/pdf/2403.03640v4.pdf","comment":"Preprint"},{"id":"http://arxiv.org/abs/2401.07964v3","updated":"2024-08-16T17:01:06Z","published":"2024-01-15T21:06:20Z","title":"AI-as-exploration: Navigating intelligence space","summary":" Artificial Intelligence is a field that lives many lives, and the term has\ncome to encompass a motley collection of scientific and commercial endeavours.\nIn this paper, I articulate the contours of a rather neglected but central\nscientific role that AI has to play, which I dub `AI-as-exploration'.The basic\nthrust of AI-as-exploration is that of creating and studying systems that can\nreveal candidate building blocks of intelligence that may differ from the forms\nof human and animal intelligence we are familiar with. In other words, I\nsuggest that AI is one of the best tools we have for exploring intelligence\nspace, namely the space of possible intelligent systems. I illustrate the value\nof AI-as-exploration by focusing on a specific case study, i.e., recent work on\nthe capacity to combine novel and invented concepts in humans and Large\nLanguage Models. I show that the latter, despite showing human-level accuracy\nin such a task, probably solve it in ways radically different, but no less\nrelevant to intelligence research, to those hypothesised for humans.\n","authors":["Dimitri Coelho Mollo"],"pdf_url":"https://arxiv.org/pdf/2401.07964v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08841v1","updated":"2024-08-16T17:00:11Z","published":"2024-08-16T17:00:11Z","title":"FLEXTAF: Enhancing Table Reasoning with Flexible Tabular Formats","summary":" The table reasoning task aims to answer the question according to the given\ntable. Currently, using Large Language Models (LLMs) is the predominant method\nfor table reasoning. Most existing methods employ a fixed tabular format to\nrepresent the table, which could limit the performance. Given that each\ninstance requires different capabilities and models possess varying abilities,\nwe assert that different instances and models suit different tabular formats.\nWe prove the aforementioned claim through quantitative analysis of experimental\nresults, where different instances and models achieve different performances\nusing various tabular formats. Building on this discussion, we propose\nFLEXTAF-Single and FLEXTAF-Vote to enhance table reasoning performance by\nemploying flexible tabular formats. Specifically, (i) FLEXTAF-Single trains a\nclassifier to predict the most suitable tabular format based on the instance\nand the LLM. 
(ii) FLEXTAF-Vote integrates the results across different formats.\nOur experiments on WikiTableQuestions and TabFact reveal significant\nimprovements, with average gains of 2.3% and 4.8% compared to the best\nperformance achieved using a fixed tabular format with greedy decoding and\nself-consistency decoding, thereby validating the effectiveness of our methods.\n","authors":["Xuanliang Zhang","Dingzirui Wang","Longxu Dou","Baoxin Wang","Dayong Wu","Qingfu Zhu","Wanxiang Che"],"pdf_url":"https://arxiv.org/pdf/2408.08841v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.01008v3","updated":"2024-08-16T16:26:11Z","published":"2023-03-31T16:11:56Z","title":"Self-Supervised Multimodal Learning: A Survey","summary":" Multimodal learning, which aims to understand and analyze information from\nmultiple modalities, has achieved substantial progress in the supervised regime\nin recent years. However, the heavy dependence on data paired with expensive\nhuman annotations impedes scaling up models. Meanwhile, given the availability\nof large-scale unannotated data in the wild, self-supervised learning has\nbecome an attractive strategy to alleviate the annotation bottleneck. Building\non these two directions, self-supervised multimodal learning (SSML) provides\nways to learn from raw multimodal data. In this survey, we provide a\ncomprehensive review of the state-of-the-art in SSML, in which we elucidate\nthree major challenges intrinsic to self-supervised learning with multimodal\ndata: (1) learning representations from multimodal data without labels, (2)\nfusion of different modalities, and (3) learning with unaligned data. We then\ndetail existing solutions to these challenges. Specifically, we consider (1)\nobjectives for learning from multimodal unlabeled data via self-supervision,\n(2) model architectures from the perspective of different multimodal fusion\nstrategies, and (3) pair-free learning strategies for coarse-grained and\nfine-grained alignment. We also review real-world applications of SSML\nalgorithms in diverse fields such as healthcare, remote sensing, and machine\ntranslation. Finally, we discuss challenges and future directions for SSML. A\ncollection of related resources can be found at:\nhttps://github.com/ys-zong/awesome-self-supervised-multimodal-learning.\n","authors":["Yongshuo Zong","Oisin Mac Aodha","Timothy Hospedales"],"pdf_url":"https://arxiv.org/pdf/2304.01008v3.pdf","comment":"Accepted to IEEE T-PAMI"},{"id":"http://arxiv.org/abs/2402.10666v3","updated":"2024-08-16T15:37:25Z","published":"2024-02-16T13:14:35Z","title":"Multi-Hop Table Retrieval for Open-Domain Text-to-SQL","summary":" Open-domain text-to-SQL is an important task that retrieves question-relevant\ntables from massive databases and then generates SQL. However, existing\nretrieval methods that retrieve in a single hop do not pay attention to the\ntext-to-SQL challenge of schema linking, which is aligning the entities in the\nquestion with table entities, reflected in two aspects: similar irrelevant\nentity and domain mismatch entity. Therefore, we propose our method, the\nmulti-hop table retrieval with rewrite and beam search (Murre). To reduce the\neffect of the similar irrelevant entity, our method focuses on unretrieved\nentities at each hop and considers the low-ranked tables by beam search. To\nalleviate the limitation of domain mismatch entity, Murre rewrites the question\nbased on retrieved tables in multiple hops, decreasing the domain gap with\nrelevant tables. 
We conduct experiments on SpiderUnion and BirdUnion+, reaching\nnew state-of-the-art results with an average improvement of 6.38%.\n","authors":["Xuanliang Zhang","Dingzirui Wang","Longxu Dou","Qingfu Zhu","Wanxiang Che"],"pdf_url":"https://arxiv.org/pdf/2402.10666v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.08660v2","updated":"2024-08-16T15:33:23Z","published":"2024-06-12T21:46:13Z","title":"Fine-Tuned 'Small' LLMs (Still) Significantly Outperform Zero-Shot\n Generative AI Models in Text Classification","summary":" Generative AI offers a simple, prompt-based alternative to fine-tuning\nsmaller BERT-style LLMs for text classification tasks. This promises to\neliminate the need for manually labeled training data and task-specific model\ntraining. However, it remains an open question whether tools like ChatGPT can\ndeliver on this promise. In this paper, we show that smaller, fine-tuned LLMs\n(still) consistently and significantly outperform larger, zero-shot prompted\nmodels in text classification. We compare three major generative AI models\n(ChatGPT with GPT-3.5/GPT-4 and Claude Opus) with several fine-tuned LLMs\nacross a diverse set of classification tasks (sentiment, approval/disapproval,\nemotions, party positions) and text categories (news, tweets, speeches). We\nfind that fine-tuning with application-specific training data achieves superior\nperformance in all cases. To make this approach more accessible to a broader\naudience, we provide an easy-to-use toolkit alongside this paper. Our toolkit,\naccompanied by non-technical step-by-step guidance, enables users to select and\nfine-tune BERT-like LLMs for any classification task with minimal technical and\ncomputational effort.\n","authors":["Martin Juan José Bucher","Marco Martini"],"pdf_url":"https://arxiv.org/pdf/2406.08660v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08805v1","updated":"2024-08-16T15:29:54Z","published":"2024-08-16T15:29:54Z","title":"CIKMar: A Dual-Encoder Approach to Prompt-Based Reranking in Educational\n Dialogue Systems","summary":" In this study, we introduce CIKMar, an efficient approach to educational\ndialogue systems powered by the Gemma Language model. By leveraging a\nDual-Encoder ranking system that incorporates both BERT and SBERT model, we\nhave designed CIKMar to deliver highly relevant and accurate responses, even\nwith the constraints of a smaller language model size. Our evaluation reveals\nthat CIKMar achieves a robust recall and F1-score of 0.70 using BERTScore\nmetrics. However, we have identified a significant challenge: the Dual-Encoder\ntends to prioritize theoretical responses over practical ones. 
These findings\nunderscore the potential of compact and efficient models like Gemma in\ndemocratizing access to advanced educational AI systems, ensuring effective and\ncontextually appropriate responses.\n","authors":["Joanito Agili Lopo","Marina Indah Prasasti","Alma Permatasari"],"pdf_url":"https://arxiv.org/pdf/2408.08805v1.pdf","comment":"This paper is the result of the final project of the Natural Language\n Processing course, Master of Artificial Intelligence, Universitas Gadjah Mada"},{"id":"http://arxiv.org/abs/2408.08803v1","updated":"2024-08-16T15:28:02Z","published":"2024-08-16T15:28:02Z","title":"Leveraging FourierKAN Classification Head for Pre-Trained\n Transformer-based Text Classification","summary":" For many years, transformer-based pre-trained models with Multi-layer\nPerceptron (MLP) heads have been the standard for text classification tasks.\nHowever, the fixed non-linear functions employed by MLPs often fall short of\ncapturing the intricacies of the contextualized embeddings produced by\npre-trained encoders. Furthermore, MLPs usually require a significant number of\ntraining parameters, which can be computationally expensive. In this work, we\nintroduce FourierKAN (FR-KAN), a variant of the promising MLP alternative\ncalled Kolmogorov-Arnold Networks (KANs), as classification heads for\ntransformer-based encoders. Our studies reveal an average increase of 10% in\naccuracy and 11% in F1-score when incorporating FR-KAN heads instead of\ntraditional MLP heads for several transformer-based pre-trained models across\nmultiple text classification tasks. Beyond improving model accuracy, FR-KAN\nheads train faster and require fewer parameters. Our research opens new grounds\nfor broader applications of KAN across several Natural Language Processing\n(NLP) tasks.\n","authors":["Abdullah Al Imran","Md Farhan Ishmam"],"pdf_url":"https://arxiv.org/pdf/2408.08803v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.14322v3","updated":"2024-08-16T15:02:45Z","published":"2024-06-20T13:54:32Z","title":"Mind the Privacy Unit! User-Level Differential Privacy for Language\n Model Fine-Tuning","summary":" Large language models (LLMs) have emerged as powerful tools for tackling\ncomplex tasks across diverse domains, but they also raise privacy concerns when\nfine-tuned on sensitive data due to potential memorization. While differential\nprivacy (DP) offers a promising solution by ensuring models are 'almost\nindistinguishable' with or without any particular privacy unit, current\nevaluations on LLMs mostly treat each example (text record) as the privacy\nunit. This leads to uneven user privacy guarantees when contributions per user\nvary. We therefore study user-level DP motivated by applications where it is\nnecessary to ensure uniform privacy protection across users. We present a\nsystematic evaluation of user-level DP for LLM fine-tuning on natural language\ngeneration tasks. 
Focusing on two mechanisms for achieving user-level DP\nguarantees, Group Privacy and User-wise DP-SGD, we investigate design choices\nlike data selection strategies and parameter tuning for the best\nprivacy-utility tradeoff.\n","authors":["Lynn Chua","Badih Ghazi","Yangsibo Huang","Pritish Kamath","Ravi Kumar","Daogao Liu","Pasin Manurangsi","Amer Sinha","Chiyuan Zhang"],"pdf_url":"https://arxiv.org/pdf/2406.14322v3.pdf","comment":"Published as a conference paper at COLM 2024"},{"id":"http://arxiv.org/abs/2401.14267v3","updated":"2024-08-16T14:56:36Z","published":"2024-01-25T16:01:49Z","title":"Transformers and Cortical Waves: Encoders for Pulling In Context Across\n Time","summary":" The capabilities of transformer networks such as ChatGPT and other Large\nLanguage Models (LLMs) have captured the world's attention. The crucial\ncomputational mechanism underlying their performance relies on transforming a\ncomplete input sequence - for example, all the words in a sentence - into a\nlong \"encoding vector\" that allows transformers to learn long-range temporal\ndependencies in naturalistic sequences. Specifically, \"self-attention\" applied\nto this encoding vector enhances temporal context in transformers by computing\nassociations between pairs of words in the input sequence. We suggest that\nwaves of neural activity traveling across single cortical areas or multiple\nregions at the whole-brain scale could implement a similar encoding principle.\nBy encapsulating recent input history into a single spatial pattern at each\nmoment in time, cortical waves may enable temporal context to be extracted from\nsequences of sensory inputs, the same computational principle used in\ntransformers.\n","authors":["Lyle Muller","Patricia S. Churchland","Terrence J. Sejnowski"],"pdf_url":"https://arxiv.org/pdf/2401.14267v3.pdf","comment":"27 pages, 5 figures"},{"id":"http://arxiv.org/abs/2408.08782v1","updated":"2024-08-16T14:54:41Z","published":"2024-08-16T14:54:41Z","title":"EmoDynamiX: Emotional Support Dialogue Strategy Prediction by Modelling\n MiXed Emotions and Discourse Dynamics","summary":" Designing emotionally intelligent conversational systems to provide comfort\nand advice to people experiencing distress is a compelling area of research.\nPrevious efforts have focused on developing modular dialogue systems that treat\nsocio-emotional strategy prediction as an auxiliary task and generate\nstrategy-conditioned responses with customized decoders. Recently, with\nadvancements in large language models (LLMs), end-to-end dialogue agents\nwithout explicit socio-emotional strategy prediction steps have become\nprevalent. However, despite their excellence in language generation, recent\nstudies show that LLMs' inherent preference bias towards certain\nsocio-emotional strategies hinders the delivery of high-quality emotional\nsupport. To address this challenge, we propose decoupling strategy prediction\nfrom language generation, and introduce a novel dialogue strategy predictor,\nEmoDynamiX, which models the discourse dynamics between user emotions and\nsystem strategies using a heterogeneous graph. 
Additionally, we make use of the\nEmotion Recognition in Conversations (ERC) task and design a flexible\nmixed-emotion module to capture fine-grained emotional states of the user.\nExperimental results on two ESC datasets show EmoDynamiX outperforms previous\nstate-of-the-art methods by a significant margin.\n","authors":["Chenwei Wan","Matthieu Labeau","Chloé Clavel"],"pdf_url":"https://arxiv.org/pdf/2408.08782v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08781v1","updated":"2024-08-16T14:49:35Z","published":"2024-08-16T14:49:35Z","title":"Evaluating the Evaluator: Measuring LLMs' Adherence to Task Evaluation\n Instructions","summary":" LLMs-as-a-judge is a recently popularized method which replaces human\njudgements in task evaluation (Zheng et al. 2024) with automatic evaluation\nusing LLMs. Due to the widespread use of RLHF (Reinforcement Learning from Human\nFeedback), state-of-the-art LLMs like GPT4 and Llama3 are expected to have\nstrong alignment with human preferences when prompted for a quality judgement,\nsuch as the coherence of a text. While this seems beneficial, it is not clear\nwhether the assessments by an LLM-as-a-judge constitute only an evaluation\nbased on the instructions in the prompts, or reflect its preference for\nhigh-quality data similar to its fine-tuning data. To investigate how much\ninfluence prompting the LLMs-as-a-judge has on the alignment of AI judgements\nto human judgements, we analyze prompts with increasing levels of instructions\nabout the target quality of an evaluation, for several LLMs-as-a-judge.\nFurther, we compare to a prompt-free method using model perplexity as a quality\nmeasure instead. We aggregate a taxonomy of quality criteria commonly used\nacross state-of-the-art evaluations with LLMs and provide this as a rigorous\nbenchmark of models as judges. Overall, we show that the LLMs-as-a-judge\nbenefit only slightly from highly detailed instructions in prompts and that\nperplexity can sometimes align better with human judgements than prompting,\nespecially on textual quality.\n","authors":["Bhuvanashree Murugadoss","Christian Poelitz","Ian Drosos","Vu Le","Nick McKenna","Carina Suzana Negreanu","Chris Parnin","Advait Sarkar"],"pdf_url":"https://arxiv.org/pdf/2408.08781v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08780v1","updated":"2024-08-16T14:49:04Z","published":"2024-08-16T14:49:04Z","title":"Large Language Models Might Not Care What You Are Saying: Prompt Format\n Beats Descriptions","summary":" With the help of in-context learning (ICL), large language models (LLMs) have\nachieved impressive performance across various tasks. However, the function of\ndescriptive instructions during ICL remains under-explored. In this work, we\npropose an ensemble prompt framework to describe the selection criteria of\nmultiple in-context examples, and preliminary experiments on machine\ntranslation (MT) across six translation directions confirm that this framework\nboosts ICL performance. But to our surprise, LLMs might not necessarily care\nwhat the descriptions actually say, and the performance gain is primarily\ncaused by the ensemble format, since the framework could lead to improvement\neven with random descriptive nouns. We further apply this new ensemble prompt\nto a range of commonsense, math, logical reasoning and hallucination tasks with\nthree LLMs and achieve promising results, suggesting again that designing a\nproper prompt format would be much more effective and efficient than devoting\neffort to specific descriptions. 
Our code will be publicly available once\nthis paper is published.\n","authors":["Chenming Tang","Zhixiang Wang","Yunfang Wu"],"pdf_url":"https://arxiv.org/pdf/2408.08780v1.pdf","comment":"10 pages, 6 figures, 3 tables"},{"id":"http://arxiv.org/abs/2408.08779v1","updated":"2024-08-16T14:43:15Z","published":"2024-08-16T14:43:15Z","title":"DAC: Decomposed Automation Correction for Text-to-SQL","summary":" Text-to-SQL is an important task that helps people obtain information from\ndatabases by automatically generating SQL queries. Given their brilliant\nperformance, approaches based on Large Language Models (LLMs) have become the\nmainstream for text-to-SQL. Among these approaches, automated correction is an\neffective approach that further enhances performance by correcting the mistakes\nin the generated results. The existing correction methods require LLMs to\ndirectly correct the generated SQL, while previous research shows that LLMs do\nnot know how to detect mistakes, leading to poor performance. Therefore, in\nthis paper, we propose to employ decomposed correction to enhance\ntext-to-SQL performance. We first demonstrate that decomposed correction\noutperforms direct correction since detecting and fixing mistakes with the\nresults of the decomposed sub-tasks is easier than with SQL. Based on this\nanalysis, we introduce Decomposed Automation Correction (DAC), which corrects\nSQL by decomposing text-to-SQL into entity linking and skeleton parsing. DAC\nfirst generates the entity and skeleton corresponding to the question and then\ncompares the differences between the initial SQL and the generated entities and\nskeleton as feedback for correction. Experimental results show that our method\nimproves performance by $3.7\\%$ on average across Spider, Bird, and KaggleDBQA\ncompared with the baseline method, demonstrating the effectiveness of DAC.\n","authors":["Dingzirui Wang","Longxu Dou","Xuanliang Zhang","Qingfu Zhu","Wanxiang Che"],"pdf_url":"https://arxiv.org/pdf/2408.08779v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08769v1","updated":"2024-08-16T14:23:59Z","published":"2024-08-16T14:23:59Z","title":"Lower Layer Matters: Alleviating Hallucination via Multi-Layer Fusion\n Contrastive Decoding with Truthfulness Refocused","summary":" Large Language Models (LLMs) have demonstrated exceptional performance across\nvarious natural language processing tasks, yet they occasionally tend to yield\ncontent that is factually inaccurate or discordant with the expected output, a\nphenomenon empirically referred to as \"hallucination\". To tackle this issue,\nrecent works have investigated contrastive decoding between the original model\nand an amateur model with induced hallucination, which has shown promising\nresults. Nonetheless, this method may undermine the output distribution of the\noriginal LLM due to its coarse contrast and simplistic subtraction\noperation, potentially leading to errors in certain cases. In this paper, we\nintroduce a novel contrastive decoding framework termed LOL (LOwer Layer\nMatters). Our approach involves concatenating the contrastive decoding of both\nthe final and lower layers between the original model and the amateur model,\nthereby achieving multi-layer fusion to aid in the mitigation of hallucination.\nAdditionally, we incorporate a truthfulness refocused module that leverages\ncontextual guidance to enhance factual encoding, further capturing truthfulness\nduring contrastive decoding. 
Extensive experiments conducted on two publicly\navailable datasets illustrate that our proposed LOL framework can substantially\nalleviate hallucination while surpassing existing baselines in most cases.\nCompared with the best baseline, we improve by an average of 4.5 points on all\nmetrics of TruthfulQA. The source code is coming soon.\n","authors":["Dingwei Chen","Feiteng Fang","Shiwen Ni","Feng Liang","Ruifeng Xu","Min Yang","Chengming Li"],"pdf_url":"https://arxiv.org/pdf/2408.08769v1.pdf","comment":"9 pages, 4 figures, 5 tables"},{"id":"http://arxiv.org/abs/2408.08729v1","updated":"2024-08-16T13:22:55Z","published":"2024-08-16T13:22:55Z","title":"ConcateNet: Dialogue Separation Using Local And Global Feature\n Concatenation","summary":" Dialogue separation involves isolating a dialogue signal from a mixture, such\nas a movie or a TV program. This can be a necessary step to enable dialogue\nenhancement for broadcast-related applications. In this paper, ConcateNet for\ndialogue separation is proposed, which is based on a novel approach for\nprocessing local and global features aimed at better generalization for\nout-of-domain signals. ConcateNet is trained using a noise reduction-focused,\npublicly available dataset and evaluated using three datasets: two noise\nreduction-focused datasets (in-domain), which show competitive performance for\nConcateNet, and a broadcast-focused dataset (out-of-domain), which verifies the\nbetter generalization performance of the proposed architecture compared to the\nconsidered state-of-the-art noise-reduction methods.\n","authors":["Mhd Modar Halimeh","Matteo Torcoli","Emanuël Habets"],"pdf_url":"https://arxiv.org/pdf/2408.08729v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08724v1","updated":"2024-08-16T13:11:53Z","published":"2024-08-16T13:11:53Z","title":"ChatZero:Zero-shot Cross-Lingual Dialogue Generation via Pseudo-Target\n Language","summary":" Although large language models (LLMs) show amazing capabilities, many of the\nexciting applications discovered for LLMs fall short in low-resource\nlanguages. Besides, most existing methods depend on large-scale dialogue\ncorpora, and thus building systems for dialogue generation in a zero-shot\nscenario remains a considerable challenge. To address this challenge, we\npropose a novel end-to-end zero-shot dialogue generation model ChatZero based\non a cross-lingual code-switching method. First, we construct a code-switching\nlanguage and a pseudo-target language with placeholders. Then, for cross-lingual\nsemantic transfer, we employ unsupervised contrastive learning to minimize the\nsemantic gap among the source language, code-switching language, and\npseudo-target language, which are mutually positive examples in the high\ndimensional semantic space. 
Experiments on the multilingual DailyDialog and\nDSTC7-AVSD datasets demonstrate that ChatZero can achieve more than 90\\% of the\noriginal performance under the zero-shot case compared to supervised learning,\nand achieve state-of-the-art performance compared with other baselines.\n","authors":["Yongkang Liu","Feng Shi","Daling Wang","Yifei Zhang","Hinrich Schütze"],"pdf_url":"https://arxiv.org/pdf/2408.08724v1.pdf","comment":"ECAI2024"},{"id":"http://arxiv.org/abs/2403.06249v3","updated":"2024-08-16T12:30:07Z","published":"2024-03-10T16:22:20Z","title":"No Language is an Island: Unifying Chinese and English in Financial\n Large Language Models, Instruction Data, and Benchmarks","summary":" While the progression of Large Language Models (LLMs) has notably propelled\nfinancial analysis, their application has largely been confined to singular\nlanguage realms, leaving untapped the potential of bilingual Chinese-English\ncapacity. To bridge this chasm, we introduce ICE-PIXIU, seamlessly amalgamating\nthe ICE-INTENT model and ICE-FLARE benchmark for bilingual financial analysis.\nICE-PIXIU uniquely integrates a spectrum of Chinese tasks, alongside translated\nand original English datasets, enriching the breadth and depth of bilingual\nfinancial modeling. It provides unrestricted access to diverse model variants,\na substantial compilation of diverse cross-lingual and multi-modal instruction\ndata, and an evaluation benchmark with expert annotations, comprising 10 NLP\ntasks, 20 bilingual specific tasks, totaling 95k datasets. Our thorough\nevaluation emphasizes the advantages of incorporating these bilingual datasets,\nespecially in translation tasks and utilizing original English data, enhancing\nboth linguistic flexibility and analytical acuity in financial contexts.\nNotably, ICE-INTENT distinguishes itself by showcasing significant enhancements\nover conventional LLMs and existing financial LLMs in bilingual milieus,\nunderscoring the profound impact of robust bilingual data on the accuracy and\nefficacy of financial NLP.\n","authors":["Gang Hu","Ke Qin","Chenhan Yuan","Min Peng","Alejandro Lopez-Lira","Benyou Wang","Sophia Ananiadou","Jimin Huang","Qianqian Xie"],"pdf_url":"https://arxiv.org/pdf/2403.06249v3.pdf","comment":"19 pages, 3 figures, 12 tables, including Appendix"},{"id":"http://arxiv.org/abs/2408.08696v1","updated":"2024-08-16T12:20:56Z","published":"2024-08-16T12:20:56Z","title":"Turning Trash into Treasure: Accelerating Inference of Large Language\n Models with Token Recycling","summary":" The rapid growth in the parameters of large language models (LLMs) has made\ninference latency a fundamental bottleneck, limiting broader application of\nLLMs. Speculative decoding represents a lossless approach to accelerate\ninference through a guess-and-verify paradigm, leveraging the parallel\ncapabilities of modern hardware. Some speculative decoding methods rely on\nadditional structures to guess draft tokens, such as small models or\nparameter-efficient architectures, which need extra training before use.\nAlternatively, retrieval-based train-free techniques build libraries from\npre-existing corpora or by n-gram generation. However, they face challenges\nlike large storage requirements, time-consuming retrieval, and limited\nadaptability. 
Observing that candidate tokens generated during the decoding\nprocess are likely to reoccur in future sequences, we propose Token Recycling.\nThis approach stores candidate tokens in an adjacency matrix and employs a\nbreadth-first search (BFS)-like algorithm on the matrix to construct a draft\ntree. The tree is then validated through tree attention. New candidate tokens\nfrom the decoding process are then used to update the matrix. Token Recycling\nrequires \\textless2MB of additional storage and achieves approximately 2x\nspeedup across all sizes of LLMs. It significantly outperforms existing\ntrain-free methods by 30\\% and even a training method by 25\\%. It can be\ndirectly applied to any existing LLMs and tasks without the need for\nadaptation.\n","authors":["Xianzhen Luo","Yixuan Wang","Qingfu Zhu","Zhiming Zhang","Xuanyu Zhang","Qing Yang","Dongliang Xu","Wanxiang Che"],"pdf_url":"https://arxiv.org/pdf/2408.08696v1.pdf","comment":"under review"},{"id":"http://arxiv.org/abs/2408.08694v1","updated":"2024-08-16T12:16:59Z","published":"2024-08-16T12:16:59Z","title":"Quantifying the Effectiveness of Student Organization Activities using\n Natural Language Processing","summary":" Student extracurricular activities play an important role in enriching the\nstudents' educational experiences. With the increasing popularity of Machine\nLearning and Natural Language Processing, it becomes a logical step that\nincorporating ML-NLP in improving extracurricular activities is a potential\nfocus of study in Artificial Intelligence (AI). This research study aims to\ndevelop a machine learning workflow that will quantify the effectiveness of\nstudent-organized activities based on student emotional responses using\nsentiment analysis. The study uses the Bidirectional Encoder Representations\nfrom Transformers (BERT) Large Language Model (LLM) called via the\npysentimiento toolkit, as a Transformer pipeline in Hugging Face. A sample data\nset from Organization C, a Recognized Student Organization (RSO) of a higher\neducational institute in the Philippines, College X, was used to develop the\nworkflow. The workflow consisted of data preprocessing, key feature selection,\nLLM feature processing, and score aggregation, resulting in an Event Score for\neach data set. The results show that the BERT LLM can also be used effectively\nin analyzing sentiment beyond product reviews and post comments. For the\nstudent affairs offices of educational institutions, this study can provide a\npractical example of how NLP can be applied to real-world scenarios, showcasing\nthe potential impact of data-driven decision making.\n","authors":["Lyberius Ennio F. Taruc","Arvin R. De La Cruz"],"pdf_url":"https://arxiv.org/pdf/2408.08694v1.pdf","comment":"11 pages, 4 figures, presented in International Conference on\n Generative Al and its Applications (ICGAIA-24) last 22nd - 23rd, July, 2024\n at Jakarta, Indonesia"},{"id":"http://arxiv.org/abs/2408.08693v1","updated":"2024-08-16T12:14:55Z","published":"2024-08-16T12:14:55Z","title":"Med-PMC: Medical Personalized Multi-modal Consultation with a Proactive\n Ask-First-Observe-Next Paradigm","summary":" The application of the Multi-modal Large Language Models (MLLMs) in medical\nclinical scenarios remains underexplored. Previous benchmarks only focus on the\ncapacity of the MLLMs in medical visual question-answering (VQA) or report\ngeneration and fail to assess the performance of the MLLMs on complex clinical\nmulti-modal tasks. 
In this paper, we propose a novel Medical Personalized\nMulti-modal Consultation (Med-PMC) paradigm to evaluate the clinical capacity\nof the MLLMs. Med-PMC builds a simulated clinical environment where the MLLMs\nare required to interact with a patient simulator to complete the multi-modal\ninformation-gathering and decision-making task. Specifically, the patient\nsimulator is decorated with personalized actors to simulate diverse patients in\nreal scenarios. We conduct extensive experiments to assess 12 types of MLLMs,\nproviding a comprehensive view of the MLLMs' clinical performance. We found\nthat current MLLMs fail to gather multimodal information and show potential\nbias in the decision-making task when consulting with the personalized patient\nsimulators. Further analysis demonstrates the effectiveness of Med-PMC, showing\nthe potential to guide the development of robust and reliable clinical MLLMs.\nCode and data are available at https://github.com/LiuHC0428/Med-PMC.\n","authors":["Hongcheng Liu","Yusheng Liao","Siqv Ou","Yuhao Wang","Heyang Liu","Yanfeng Wang","Yu Wang"],"pdf_url":"https://arxiv.org/pdf/2408.08693v1.pdf","comment":"26 pages, 5 figures"},{"id":"http://arxiv.org/abs/2408.08688v1","updated":"2024-08-16T12:01:55Z","published":"2024-08-16T12:01:55Z","title":"The Fellowship of the LLMs: Multi-Agent Workflows for Synthetic\n Preference Optimization Dataset Generation","summary":" This paper presents and evaluates multi-agent workflows for synthetic\nPreference Optimization (PO) dataset generation. PO dataset generation requires\ntwo modules: (1) response evaluation, and (2) response generation. In the\nresponse evaluation module, the responses from Large Language Models (LLMs) are\nevaluated and ranked - a task typically carried out by human annotators that we\nautomate using LLMs. We assess the response evaluation module in a two-step\nprocess. In step 1, we assess LLMs as evaluators using three distinct prompting\nstrategies. In step 2, we apply the winning prompting strategy to compare the\nperformance of LLM-as-a-Judge, LLMs-as-a-Jury, and LLM Debate. In each step, we\nmeasure inter-rater agreement between human annotators and\nLLMs using Cohen's Kappa. For the response generation module, we compare different configurations\nfor the LLM Feedback Loop using the identified LLM evaluator configuration. We\nuse the win rate (the fraction of times a generation framework is selected as\nthe best by an LLM evaluator) to determine the best multi-agent configuration\nfor generation. After identifying the best configurations for both modules, we\nuse models from the GPT, Gemma, and Llama families to generate our PO datasets\nusing the above pipeline. We generate two types of PO datasets, one to improve\nthe generation capabilities of individual LLMs and the other to improve the\nmulti-agent workflow. Our evaluation shows that GPT-4o-as-a-Judge is more\nconsistent across datasets when the candidate responses do not include\nresponses from the GPT family. 
Additionally, we find that the LLM Feedback\nLoop, with Llama as the generator and Gemma as the reviewer, achieves a notable\n71.8% and 73.8% win rate over single-agent Llama and Gemma, respectively.\n","authors":["Samee Arif","Sualeha Farid","Abdul Hameed Azeemi","Awais Athar","Agha Ali Raza"],"pdf_url":"https://arxiv.org/pdf/2408.08688v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08212v2","updated":"2024-08-16T11:57:53Z","published":"2024-08-15T15:23:00Z","title":"Covert Bias: The Severity of Social Views' Unalignment in Language\n Models Towards Implicit and Explicit Opinion","summary":" While various approaches have recently been studied for bias identification,\nlittle is known about how implicit language that does not explicitly convey a\nviewpoint affects bias amplification in large language models. To examine the\nseverity of bias toward a view, we evaluated the performance of two downstream\ntasks where the implicit and explicit knowledge of social groups were used.\nFirst, we present a stress test evaluation by using a biased model in edge\ncases of excessive bias scenarios. Then, we evaluate how LLMs calibrate\nlinguistically in response to both implicit and explicit opinions when they are\naligned with conflicting viewpoints. Our findings reveal a discrepancy in LLM\nperformance in identifying implicit and explicit opinions, with a general\ntendency of bias toward explicit opinions of opposing stances. Moreover, the\nbias-aligned models generate more cautious responses using uncertainty phrases\ncompared to the unaligned (zero-shot) base models. The direct, incautious\nresponses of the unaligned models suggest a need for further refinement of\ndecisiveness by incorporating uncertainty markers to enhance their reliability,\nespecially on socially nuanced topics with high subjectivity.\n","authors":["Abeer Aldayel","Areej Alokaili","Rehab Alahmadi"],"pdf_url":"https://arxiv.org/pdf/2408.08212v2.pdf","comment":"This work is under-review"},{"id":"http://arxiv.org/abs/2408.08682v1","updated":"2024-08-16T11:55:44Z","published":"2024-08-16T11:55:44Z","title":"LLM-PCGC: Large Language Model-based Point Cloud Geometry Compression","summary":" The key to effective point cloud compression is to obtain a robust context\nmodel consistent with complex 3D data structures. Recently, the advancement of\nlarge language models (LLMs) has highlighted their capabilities not only as\npowerful generators for in-context learning and generation but also as\neffective compressors. These dual attributes of LLMs make them particularly\nwell-suited to meet the demands of data compression. Therefore, this paper\nexplores the potential of using LLM for compression tasks, focusing on lossless\npoint cloud geometry compression (PCGC) experiments. However, applying LLM\ndirectly to PCGC tasks presents some significant challenges, i.e., LLM does not\nunderstand the structure of the point cloud well, and it is a difficult task to\nfill the gap between text and point cloud through text description, especially\nfor large complicated and small shapeless point clouds. To address these\nproblems, we introduce a novel architecture, namely the Large Language\nModel-based Point Cloud Geometry Compression (LLM-PCGC) method, using LLM to\ncompress point cloud geometry information without any text description or\naligning operation. 
By utilizing different adaptation techniques for\ncross-modality representation alignment and semantic consistency, including\nclustering, K-tree, token mapping invariance, and Low Rank Adaptation (LoRA),\nthe proposed method can turn the LLM into a compressor/generator for point\nclouds. To the best of our knowledge, this is the first structure to employ an LLM\nas a compressor for point cloud data. Experiments demonstrate that the LLM-PCGC\noutperforms the other existing methods significantly, by achieving a -40.213% bit\nrate reduction compared to the reference software of the MPEG Geometry-based Point\nCloud Compression (G-PCC) standard, and by achieving a -2.267% bit rate reduction\ncompared to the state-of-the-art learning-based method.\n","authors":["Yuqi Ye","Wei Gao"],"pdf_url":"https://arxiv.org/pdf/2408.08682v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08661v1","updated":"2024-08-16T11:09:56Z","published":"2024-08-16T11:09:56Z","title":"MIA-Tuner: Adapting Large Language Models as Pre-training Text Detector","summary":" The increasing parameters and expansive datasets of large language models\n(LLMs) highlight the urgent demand for a technical solution to audit the\nunderlying privacy risks and copyright issues associated with LLMs. Existing\nstudies have partially addressed this need through an exploration of the\npre-training data detection problem, which is an instance of a membership\ninference attack (MIA). This problem involves determining whether a given piece\nof text has been used during the pre-training phase of the target LLM. Although\nexisting methods have designed various sophisticated MIA score functions to\nachieve considerable detection performance in pre-trained LLMs, how to achieve\nhigh-confidence detection and how to perform MIA on aligned LLMs remain\nchallenging. In this paper, we propose MIA-Tuner, a novel instruction-based MIA\nmethod, which instructs LLMs themselves to serve as a more precise pre-training\ndata detector internally, rather than designing an external MIA score function.\nFurthermore, we design two instruction-based safeguards to respectively\nmitigate the privacy risks brought by the existing methods and MIA-Tuner. To\ncomprehensively evaluate the most recent state-of-the-art LLMs, we collect a\nmore up-to-date MIA benchmark dataset, named WIKIMIA-24, to replace the widely\nadopted benchmark WIKIMIA. We conduct extensive experiments across various\naligned and unaligned LLMs over the two benchmark datasets. The results\ndemonstrate that MIA-Tuner increases the AUC of MIAs from 0.7 to a\nsignificantly high level of 0.9.\n","authors":["Wenjie Fu","Huandong Wang","Chen Gao","Guanghua Liu","Yong Li","Tao Jiang"],"pdf_url":"https://arxiv.org/pdf/2408.08661v1.pdf","comment":"code and dataset: https://github.com/wjfu99/MIA-Tuner"},{"id":"http://arxiv.org/abs/2406.08068v2","updated":"2024-08-16T10:50:45Z","published":"2024-06-12T10:36:27Z","title":"Large Language Models Meet Text-Centric Multimodal Sentiment Analysis: A\n Survey","summary":" Compared to traditional sentiment analysis, which only considers text,\nmultimodal sentiment analysis needs to consider emotional signals from\nmultimodal sources simultaneously and is therefore more consistent with the way\nhumans process sentiment in real-world scenarios. It involves processing\nemotional information from various sources such as natural language, images,\nvideos, audio, physiological signals, etc. 
However, although other modalities\nalso contain diverse emotional cues, natural language usually contains richer\ncontextual information and therefore always occupies a crucial position in\nmultimodal sentiment analysis. The emergence of ChatGPT has opened up immense\npotential for applying large language models (LLMs) to text-centric multimodal\ntasks. However, it is still unclear how existing LLMs can adapt better to\ntext-centric multimodal sentiment analysis tasks. This survey aims to (1)\npresent a comprehensive review of recent research in text-centric multimodal\nsentiment analysis tasks, (2) examine the potential of LLMs for text-centric\nmultimodal sentiment analysis, outlining their approaches, advantages, and\nlimitations, (3) summarize the application scenarios of LLM-based multimodal\nsentiment analysis technology, and (4) explore the challenges and potential\nresearch directions for multimodal sentiment analysis in the future.\n","authors":["Hao Yang","Yanyan Zhao","Yang Wu","Shilong Wang","Tian Zheng","Hongbo Zhang","Zongyang Ma","Wanxiang Che","Bing Qin"],"pdf_url":"https://arxiv.org/pdf/2406.08068v2.pdf","comment":"arXiv admin note: text overlap with arXiv:2210.14556 by other authors"},{"id":"http://arxiv.org/abs/2408.08656v1","updated":"2024-08-16T10:45:45Z","published":"2024-08-16T10:45:45Z","title":"LLMs Are Biased Towards Output Formats! Systematically Evaluating and\n Mitigating Output Format Bias of LLMs","summary":" We present the first systematic evaluation examining format bias in\nperformance of large language models (LLMs). Our approach distinguishes between\ntwo categories of an evaluation metric under format constraints to reliably and\naccurately assess performance: one measures performance when format constraints\nare adhered to, while the other evaluates performance regardless of constraint\nadherence. We then define a metric for measuring the format bias of LLMs and\nestablish effective strategies to reduce it. Subsequently, we present our\nempirical format bias evaluation spanning four commonly used categories --\nmultiple-choice question-answer, wrapping, list, and mapping -- covering 15\nwidely-used formats. Our evaluation on eight generation tasks uncovers\nsignificant format bias across state-of-the-art LLMs. We further discover that\nimproving the format-instruction following capabilities of LLMs across formats\npotentially reduces format bias. Based on our evaluation findings, we study\nprompting and fine-tuning with synthesized format data techniques to mitigate\nformat bias. Our methods successfully reduce the variance in ChatGPT's\nperformance among wrapping formats from 235.33 to 0.71 (%$^2$).\n","authors":["Do Xuan Long","Hai Nguyen Ngoc","Tiviatis Sim","Hieu Dao","Shafiq Joty","Kenji Kawaguchi","Nancy F. Chen","Min-Yen Kan"],"pdf_url":"https://arxiv.org/pdf/2408.08656v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.03544v2","updated":"2024-08-16T10:42:38Z","published":"2024-08-07T04:49:38Z","title":"Unlocking the Non-Native Language Context Limitation: Native Language\n Prompting Facilitates Knowledge Elicitation","summary":" Multilingual large language models (MLLMs) struggle to answer questions posed\nin non-dominant languages, even though they have acquired the relevant\nknowledge from their dominant language corpus. In contrast, human multilinguals\ncan overcome such non-native language context limitations through Positive\nNative Language Transfer (PNLT). 
Inspired by the process of PNLT, we analogize\nthe dominant language of MLLMs to the native language of human multilinguals,\nand propose Native Language Prompting (NatLan) to simulate the PNLT observed in\nhuman multilinguals. It explicitly creates native language contexts for MLLMs\nto facilitate the elicitation of the rich native language knowledge during\nquestion-answering, unlocking the limitations imposed by non-native language\ncontexts. By employing multi-MLLM collaboration, NatLan reduces the workload on\neach MLLM in simulating PNLT and refines semantic transfer. On the C-Eval\nbenchmark, NatLan provides up to a 10.1% average accuracy improvement and up to\na 5.0% increase in the hard-level subset across five MLLMs, surpassing all\ntop-notch related methods. Our code is available at\nhttps://github.com/AnonyNLP/NatLan.\n","authors":["Baixuan Li","Yunlong Fan","Zhiqiang Gao"],"pdf_url":"https://arxiv.org/pdf/2408.03544v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08651v1","updated":"2024-08-16T10:34:50Z","published":"2024-08-16T10:34:50Z","title":"Reasoning Beyond Bias: A Study on Counterfactual Prompting and Chain of\n Thought Reasoning","summary":" Language models are known to absorb biases from their training data, leading\nto predictions driven by statistical regularities rather than semantic\nrelevance. We investigate the impact of these biases on answer choice\npreferences in the Massive Multi-Task Language Understanding (MMLU) task. Our\nfindings reveal that differences in learned regularities across answer options\nare predictive of model preferences and mirror human test-taking strategies. To\naddress this issue, we introduce two novel methods: Counterfactual Prompting\nwith Chain of Thought (CoT) and Counterfactual Prompting with Agnostically\nPrimed CoT (APriCoT). We demonstrate that while Counterfactual Prompting with\nCoT alone is insufficient to mitigate bias, our novel Primed Counterfactual\nPrompting with CoT approach effectively reduces the influence of base-rate\nprobabilities while improving overall accuracy. Our results suggest that\nmitigating bias requires a \"System-2\"-like process and that CoT reasoning is\nsusceptible to confirmation bias under some prompting methodologies. Our\ncontributions offer practical solutions for developing more robust and fair\nlanguage models.\n","authors":["Kyle Moore","Jesse Roberts","Thao Pham","Douglas Fisher"],"pdf_url":"https://arxiv.org/pdf/2408.08651v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08650v1","updated":"2024-08-16T10:33:19Z","published":"2024-08-16T10:33:19Z","title":"An End-to-End Model for Photo-Sharing Multi-modal Dialogue Generation","summary":" Photo-Sharing Multi-modal dialogue generation requires a dialogue agent not\nonly to generate text responses but also to share photos at the proper moment.\nUsing image text captions as the bridge, a pipeline model integrates an image\ncaption model, a text generation model, and an image generation model to handle\nthis complex multi-modal task. However, representing the images with text\ncaptions may lose important visual details and information and cause error\npropagation in the complex dialogue system. Besides, the pipeline model\nisolates the three models from one another because discrete image text captions\nhinder end-to-end gradient propagation. We propose the first end-to-end model\nfor photo-sharing multi-modal dialogue generation, which integrates an image\nperceptron and an image generator with a large language model. 
The large\nlanguage model employs the Q-Former to perceive visual images at the input end.\nFor image generation at the output end, we propose a dynamic vocabulary\ntransformation matrix and use straight-through and Gumbel-Softmax techniques to\nalign the large language model and stable diffusion model and achieve\nend-to-end gradient propagation. We perform experiments on the PhotoChat and\nDialogCC datasets to evaluate our end-to-end model. Compared with pipeline\nmodels, the end-to-end model achieves state-of-the-art performance on various\nmetrics of text and image generation. Further analysis experiments also verify the\neffectiveness of the end-to-end model for photo-sharing multi-modal dialogue\ngeneration.\n","authors":["Peiming Guo","Sinuo Liu","Yanzhao Zhang","Dingkun Long","Pengjun Xie","Meishan Zhang","Min Zhang"],"pdf_url":"https://arxiv.org/pdf/2408.08650v1.pdf","comment":"Work in progress"},{"id":"http://arxiv.org/abs/2408.08648v1","updated":"2024-08-16T10:30:30Z","published":"2024-08-16T10:30:30Z","title":"Understanding Enthymemes in Argument Maps: Bridging Argument Mining and\n Logic-based Argumentation","summary":" Argument mining is a natural language processing technology aimed at\nidentifying arguments in text. Furthermore, the approach is being developed to\nidentify the premises and claims of those arguments, and to identify the\nrelationships between arguments including support and attack relationships. In\nthis paper, we assume that an argument map contains the premises and claims of\narguments, and support and attack relationships between them, that have been\nidentified by argument mining. So from a piece of text, we assume an argument\nmap is obtained automatically by natural language processing. However, to\nunderstand and to automatically analyse that argument map, it would be\ndesirable to instantiate that argument map with logical arguments. Once we have\nthe logical representation of the arguments in an argument map, we can use\nautomated reasoning to analyze the argumentation (e.g. check consistency of\npremises, check validity of claims, and check that the labelling on each arc\ncorresponds with the logical arguments). We address this need by using\nclassical logic for representing the explicit information in the text, and\nusing default logic for representing the implicit information in the text. In\norder to investigate our proposal, we consider some specific options for\ninstantiation.\n","authors":["Jonathan Ben-Naim","Victor David","Anthony Hunter"],"pdf_url":"https://arxiv.org/pdf/2408.08648v1.pdf","comment":"Research note"},{"id":"http://arxiv.org/abs/2408.08640v1","updated":"2024-08-16T10:11:05Z","published":"2024-08-16T10:11:05Z","title":"Math-PUMA: Progressive Upward Multimodal Alignment to Enhance\n Mathematical Reasoning","summary":" Multimodal Large Language Models (MLLMs) excel in solving text-based\nmathematical problems, but they struggle with mathematical diagrams since they\nare primarily trained on natural scene images. For humans, visual aids\ngenerally enhance problem-solving, but MLLMs perform worse as information\nshifts from the textual to the visual modality. This decline is mainly due to their\nshortcomings in aligning images and text. To tackle the aforementioned challenges,\nwe propose Math-PUMA, a methodology focused on Progressive Upward Multimodal\nAlignment. This approach is designed to improve the mathematical reasoning\nskills of MLLMs through a three-stage training process, with the second stage\nbeing the critical alignment stage. 
We first enhance the language model's\nmathematical reasoning capabilities with an extensive set of textual mathematical\nproblems. We then construct a multimodal dataset with varying degrees of\ntextual and visual information, creating data pairs by presenting each problem\nin at least two forms. By leveraging the Kullback-Leibler (KL) divergence of\nnext-token prediction distributions to align the visual and textual modalities,\nwe ensure consistent problem-solving abilities. Finally, we utilize\nmultimodal instruction tuning for MLLMs with high-quality multimodal data.\nExperimental results on multiple mathematical reasoning benchmarks demonstrate\nthat the MLLMs trained with Math-PUMA surpass most open-source MLLMs. Our\napproach effectively narrows the performance gap for problems presented in\ndifferent modalities.\n","authors":["Wenwen Zhuang","Xin Huang","Xiantao Zhang","Jin Zeng"],"pdf_url":"https://arxiv.org/pdf/2408.08640v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.01129v3","updated":"2024-08-16T10:03:53Z","published":"2024-04-01T14:11:45Z","title":"Emphasising Structured Information: Integrating Abstract Meaning\n Representation into LLMs for Enhanced Open-Domain Dialogue Evaluation","summary":" Automatic open-domain dialogue evaluation has attracted increasing attention.\nTrainable evaluation metrics, typically trained with true positive and randomly\nselected negative responses, tend to assign higher scores to responses that\nshare greater content similarity with a given context. However, adversarial\nnegative responses, despite possessing high content similarity with the\ncontexts, are semantically different. Consequently, existing evaluation metrics\nare not robust enough to evaluate such responses, resulting in low correlations\nwith human judgments. While recent studies have demonstrated the effectiveness\nof Large Language Models (LLMs) for open-domain dialogue evaluation, they still\nface challenges in effectively handling adversarial negative examples. In this\npaper, we propose an effective framework for open-domain dialogue evaluation,\nwhich combines domain-specific language models (SLMs) enhanced with Abstract\nMeaning Representation (AMR) knowledge with LLMs. The SLMs can explicitly\nincorporate AMR graph information of the dialogue through a gating mechanism\nfor enhanced dialogue semantic representation learning. Both the evaluation\nresult from the SLMs and the AMR graph information are incorporated into the\nLLM's prompt for enhanced evaluation performance. Experimental results on\nopen-domain dialogue evaluation tasks demonstrate the superiority of our method\ncompared to a wide range of state-of-the-art baselines, especially in\ndiscriminating adversarial negative responses. Our code and data are publicly\navailable at https://github.com/Bernard-Yang/SIMAMR.\n","authors":["Bohao Yang","Kun Zhao","Chen Tang","Dong Liu","Liang Zhan","Chenghua Lin"],"pdf_url":"https://arxiv.org/pdf/2404.01129v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.07081v3","updated":"2024-08-16T09:54:23Z","published":"2024-08-07T18:07:15Z","title":"MathBridge: A Large Corpus Dataset for Translating Spoken Mathematical\n Expressions into $LaTeX$ Formulas for Improved Readability","summary":" Improving the readability of mathematical expressions in text-based documents,\nsuch as the subtitles of mathematical videos, is a significant task. To achieve\nthis, mathematical expressions should be converted to compiled formulas. 
For\ninstance, the spoken expression ``x equals minus b plus or minus the square\nroot of b squared minus four a c, all over two a'' from automatic speech\nrecognition is more readily comprehensible when displayed as a compiled formula\n$x = \\frac{-b \\pm \\sqrt{b^2 - 4ac}}{2a}$. To convert mathematical spoken\nsentences to compiled formulas, two processes are required: spoken sentences\nare converted into LaTeX formulas, and LaTeX formulas are converted into\ncompiled formulas. The latter can be managed by using LaTeX engines. However,\nthere is no way to do the former effectively. Even if we try to solve this\nusing language models, there is no paired data between spoken sentences and\nLaTeX formulas to train them. In this paper, we introduce MathBridge, the first\nextensive dataset for translating mathematical spoken sentences into LaTeX\nformulas. MathBridge comprises approximately 23 million LaTeX formulas paired\nwith the corresponding mathematical spoken sentences. Through comprehensive\nevaluations, including fine-tuning with the proposed data, we discovered that\nMathBridge significantly enhances the capabilities of pretrained language\nmodels for converting mathematical spoken sentences to LaTeX formulas.\nSpecifically, for the T5-large model, the sacreBLEU score increased from 4.77\nto 46.8, demonstrating substantial enhancement.\n","authors":["Kyudan Jung","Sieun Hyeon","Jeong Youn Kwon","Nam-Joon Kim","Hyun Gon Ryu","Hyuk-Jae Lee","Jaeyoung Do"],"pdf_url":"https://arxiv.org/pdf/2408.07081v3.pdf","comment":"9 pages, 6 figures"},{"id":"http://arxiv.org/abs/2408.08632v1","updated":"2024-08-16T09:52:02Z","published":"2024-08-16T09:52:02Z","title":"A Survey on Benchmarks of Multimodal Large Language Models","summary":" Multimodal Large Language Models (MLLMs) are gaining increasing popularity in\nboth academia and industry due to their remarkable performance in various\napplications such as visual question answering, visual perception,\nunderstanding, and reasoning. Over the past few years, significant efforts have\nbeen made to examine MLLMs from multiple perspectives. This paper presents a\ncomprehensive review of \textbf{180 benchmarks} and evaluation for MLLMs,\nfocusing on (1) perception and understanding, (2) cognition and reasoning,\n(3) specific domains, (4) key capabilities, and (5) other modalities. Finally, we\ndiscuss the limitations of the current evaluation methods for MLLMs and explore\npromising future directions. Our key argument is that evaluation should be\nregarded as a crucial discipline to better support the development of MLLMs.\nFor more details, please visit our GitHub repository:\nhttps://github.com/swordlidev/Evaluation-Multimodal-LLMs-Survey.\n","authors":["Jian Li","Weiheng Lu"],"pdf_url":"https://arxiv.org/pdf/2408.08632v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08631v1","updated":"2024-08-16T09:49:51Z","published":"2024-08-16T09:49:51Z","title":"Persona is a Double-edged Sword: Enhancing the Zero-shot Reasoning by\n Ensembling the Role-playing and Neutral Prompts","summary":" Recent studies demonstrate that prompting an appropriate role-playing persona\nto an LLM improves its reasoning capability. However, assigning a proper\npersona is difficult since an LLM's performance is extremely sensitive to\nassigned prompts; therefore, personas sometimes hinder LLMs and degrade their\nreasoning capabilities. 
In this paper, we propose a novel framework, Jekyll \\&\nHyde, which ensembles the results of role-playing and neutral prompts to\neradicate the performance degradation caused by the unilateral use of role-playing\nprompts and to enhance the robustness of an LLM's reasoning ability. Specifically,\nJekyll \\& Hyde collects two potential solutions from both role-playing and\nneutral prompts and selects a better solution after cross-checking via an LLM\nevaluator. However, LLM-based evaluators tend to be affected by the order of\nthose potential solutions within the prompt when selecting the proper solution;\nthus, we also propose a robust LLM evaluator to mitigate the position bias. The\nexperimental analysis demonstrates that role-playing prompts distract LLMs and\ndegrade their reasoning abilities in 4 out of 12 datasets, even when using\nGPT-4. In addition, we reveal that Jekyll \\& Hyde improves reasoning\ncapabilities by selecting better choices among the potential solutions on\ntwelve widely-used reasoning datasets. We further show that our proposed LLM\nevaluator outperforms other baselines, showing that the LLMs' position bias is\nsuccessfully mitigated.\n","authors":["Junseok Kim","Nakyeong Yang","Kyomin Jung"],"pdf_url":"https://arxiv.org/pdf/2408.08631v1.pdf","comment":"13 pages, 4 figures"},{"id":"http://arxiv.org/abs/2408.08624v1","updated":"2024-08-16T09:32:43Z","published":"2024-08-16T09:32:43Z","title":"RealMedQA: A pilot biomedical question answering dataset containing\n realistic clinical questions","summary":" Clinical question answering systems have the potential to provide clinicians\nwith relevant and timely answers to their questions. Nonetheless, despite the\nadvances that have been made, adoption of these systems in clinical settings\nhas been slow. One issue is a lack of question-answering datasets that reflect\nthe real-world needs of health professionals. In this work, we present\nRealMedQA, a dataset of realistic clinical questions generated by humans and an\nLLM. We describe the process for generating and verifying the QA pairs and\nassess several QA models on BioASQ and RealMedQA to assess the relative\ndifficulty of matching answers to questions. We show that the LLM is more\ncost-efficient for generating \"ideal\" QA pairs. Additionally, we achieve a\nlower lexical similarity between questions and answers than BioASQ, which\nprovides an additional challenge to the top two QA models, as per the results.\nWe release our code and our dataset publicly to encourage further research.\n","authors":["Gregory Kell","Angus Roberts","Serge Umansky","Yuti Khare","Najma Ahmed","Nikhil Patel","Chloe Simela","Jack Coumbe","Julian Rozario","Ryan-Rhys Griffiths","Iain J. Marshall"],"pdf_url":"https://arxiv.org/pdf/2408.08624v1.pdf","comment":"Accepted at AMIA Annual Symposium 2024"},{"id":"http://arxiv.org/abs/2406.17962v3","updated":"2024-08-16T08:48:26Z","published":"2024-06-25T22:44:17Z","title":"Crafting Customisable Characters with LLMs: Introducing SimsChat, a\n Persona-Driven Role-Playing Agent Framework","summary":" Large Language Models (LLMs) demonstrate a remarkable ability to comprehend\nhuman instructions and generate high-quality text. This capability allows LLMs\nto function as agents that can emulate human beings at a more sophisticated\nlevel, beyond the mere replication of basic human behaviours. However, leveraging\nLLMs to craft characters from diverse aspects remains underexplored. 
In this work, we introduce the Customisable Conversation Agent\nFramework, which leverages LLMs to simulate real-world characters that can be\nfreely customised according to various user preferences. This adaptable\nframework is beneficial for the design of customisable characters and\nrole-playing agents aligned with human preferences. We propose the SimsConv\ndataset, which encompasses 68 different customised characters, 1,360 multi-turn\nrole-playing dialogues, and a total of 13,971 interaction dialogues. The\ncharacters are created from several real-world elements, such as career,\naspiration, trait, and skill. Building upon these foundations, we present\nSimsChat, a freely customisable role-playing agent. It incorporates diverse\nreal-world scenes and topic-specific character interaction dialogues, thereby\nsimulating characters' life experiences in various scenarios and topic-specific\ninteractions with specific emotions. Experimental results indicate that our\nproposed framework achieves desirable performance and provides a valuable\nguideline for the construction of more accurate human simulacra in the future.\nOur data and code are publicly available at\nhttps://github.com/Bernard-Yang/SimsChat.\n","authors":["Bohao Yang","Dong Liu","Chen Tang","Chenghao Xiao","Kun Zhao","Chao Li","Lin Yuan","Guang Yang","Lanxiao Huang","Chenghua Lin"],"pdf_url":"https://arxiv.org/pdf/2406.17962v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.11550v3","updated":"2024-08-16T08:46:33Z","published":"2024-07-16T09:53:32Z","title":"Ada-KV: Optimizing KV Cache Eviction by Adaptive Budget Allocation for\n Efficient LLM Inference","summary":" Large Language Models have excelled in various fields but encounter\nchallenges in memory and time efficiency due to the expanding Key-Value (KV)\ncache required for long-sequence inference. Recent efforts try to reduce KV\ncache size to a given memory budget by evicting vast non-critical cache\nelements during runtime, while preserving generation quality. Our revisiting of\ncurrent eviction methods reveals that they fundamentally minimize an upper\nbound of the $L_1$ eviction loss between the pre- and post-eviction outputs of\nmulti-head self-attention mechanisms. Moreover, our analysis indicates that the\ncommon practices of uniformly assigning budgets across attention heads harm\ntheir post-eviction generation quality. In light of these findings, we propose\na simple yet effective adaptive budget allocation algorithm. This algorithm not\nonly optimizes the theoretical loss upper bound but also reduces the $L_1$\neviction loss in practice by aligning with the varied characteristics across\ndifferent heads. By integrating this algorithm into two state-of-the-art\nmethods, we demonstrate the effectiveness of using adaptive budget allocation\nto optimize KV cache eviction. Extensive evaluations on 16 datasets and the\nNeedle-in-a-Haystack test confirm significant performance improvements across\nvarious tasks.\n","authors":["Yuan Feng","Junlin Lv","Yukun Cao","Xike Xie","S. Kevin Zhou"],"pdf_url":"https://arxiv.org/pdf/2407.11550v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.06992v2","updated":"2024-08-16T08:18:19Z","published":"2024-07-09T16:07:01Z","title":"Robust Neural Information Retrieval: An Adversarial and\n Out-of-distribution Perspective","summary":" Recent advances in neural information retrieval (IR) models have\nsignificantly enhanced their effectiveness over various IR tasks. 
The\nrobustness of these models, essential for ensuring their reliability in\npractice, has also garnered significant attention. With a wide array of\nresearch on robust IR being proposed, we believe it is the opportune moment to\nconsolidate the current status, glean insights from existing methodologies, and\nlay the groundwork for future development. We view the robustness of IR to be a\nmultifaceted concept, emphasizing its necessity against adversarial attacks,\nout-of-distribution (OOD) scenarios and performance variance. With a focus on\nadversarial and OOD robustness, we dissect robustness solutions for dense\nretrieval models (DRMs) and neural ranking models (NRMs), respectively,\nrecognizing them as pivotal components of the neural IR pipeline. We provide an\nin-depth discussion of existing methods, datasets, and evaluation metrics,\nshedding light on challenges and future directions in the era of large language\nmodels. To the best of our knowledge, this is the first comprehensive survey on\nthe robustness of neural IR models, and we will also be giving our first\ntutorial presentation at SIGIR 2024\n\\url{https://sigir2024-robust-information-retrieval.github.io}. Along with the\norganization of existing work, we introduce a Benchmark for robust IR (BestIR),\na heterogeneous evaluation benchmark for robust neural information retrieval,\nwhich is publicly available at \\url{https://github.com/Davion-Liu/BestIR}. We\nhope that this study provides useful clues for future research on the\nrobustness of IR models and helps to develop trustworthy search engines\n\\url{https://github.com/Davion-Liu/Awesome-Robustness-in-Information-Retrieval}.\n","authors":["Yu-An Liu","Ruqing Zhang","Jiafeng Guo","Maarten de Rijke","Yixing Fan","Xueqi Cheng"],"pdf_url":"https://arxiv.org/pdf/2407.06992v2.pdf","comment":"Survey paper"},{"id":"http://arxiv.org/abs/2402.16998v2","updated":"2024-08-16T08:13:38Z","published":"2024-02-26T20:13:58Z","title":"What Do Language Models Hear? Probing for Auditory Representations in\n Language Models","summary":" This work explores whether language models encode meaningfully grounded\nrepresentations of sounds of objects. We learn a linear probe that retrieves\nthe correct text representation of an object given a snippet of audio related\nto that object, where the sound representation is given by a pretrained audio\nmodel. This probe is trained via a contrastive loss that pushes the language\nrepresentations and sound representations of an object to be close to one\nanother. After training, the probe is tested on its ability to generalize to\nobjects that were not seen during training. Across different language models\nand audio models, we find that the probe generalization is above chance in many\ncases, indicating that despite being trained only on raw text, language models\nencode grounded knowledge of sounds for some objects.\n","authors":["Jerry Ngo","Yoon Kim"],"pdf_url":"https://arxiv.org/pdf/2402.16998v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08590v1","updated":"2024-08-16T07:47:39Z","published":"2024-08-16T07:47:39Z","title":"A Mechanistic Interpretation of Syllogistic Reasoning in Auto-Regressive\n Language Models","summary":" Recent studies on logical reasoning in auto-regressive Language Models (LMs)\nhave sparked a debate on whether such models can learn systematic reasoning\nprinciples during pre-training or merely exploit superficial patterns in the\ntraining data. 
This paper presents a mechanistic interpretation of syllogistic\nreasoning in LMs to further enhance our understanding of internal dynamics.\nSpecifically, we present a methodology for circuit discovery aimed at\ndisentangling content-independent reasoning mechanisms from world knowledge\nacquired during pre-training. Through two distinct intervention methods, we\nuncover a sufficient and necessary circuit involving middle-term suppression\nthat elucidates how LMs transfer information to derive valid conclusions from\npremises. Furthermore, we investigate how belief biases manifest in syllogistic\nreasoning, finding evidence of partial contamination from additional attention\nheads responsible for encoding commonsense and contextualized knowledge.\nFinally, we explore the generalization of the discovered mechanisms across\nvarious syllogistic schemes and model sizes, finding that the identified\ncircuit is sufficient and necessary for all the schemes on which the model\nachieves high downstream accuracy ($\\geq$ 60\\%). Overall, our findings suggest\nthat LMs indeed learn transferable content-independent reasoning mechanisms,\nbut that, at the same time, such mechanisms do not involve generalisable and\nabstract logical primitives, being susceptible to contamination by the same\nworld knowledge acquired during pre-training.\n","authors":["Geonhee Kim","Marco Valentino","André Freitas"],"pdf_url":"https://arxiv.org/pdf/2408.08590v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.06566v4","updated":"2024-08-16T07:43:55Z","published":"2024-06-03T07:44:32Z","title":"Natural Language Interaction with a Household Electricity\n Knowledge-based Digital Twin","summary":" Domain specific digital twins, representing a digital replica of various\nsegments of the smart grid, are foreseen as able to model, simulate, and\ncontrol the respective segments. At the same time, knowledge-based digital\ntwins, coupled with AI, may also empower humans to understand aspects of the\nsystem through natural language interaction in view of planning and policy\nmaking. This paper is the first to assess and report on the potential of\nRetrieval Augmented Generation (RAG) question answers related to household\nelectrical energy measurement aspects leveraging a knowledge-based energy\ndigital twin. Relying on the recently published electricity consumption\nknowledge graph that actually represents a knowledge-based digital twin, we\nstudy the capabilities of ChatGPT, Gemini and Llama in answering electricity\nrelated questions. Furthermore, we compare the answers with the ones generated\nthrough a RAG techniques that leverages an existing electricity knowledge-based\ndigital twin. Our findings illustrate that the RAG approach not only reduces\nthe incidence of incorrect information typically generated by LLMs but also\nsignificantly improves the quality of the output by grounding responses in\nverifiable data. 
This paper details our methodology, presents a comparative\nanalysis of responses with and without RAG, and discusses the implications of\nour findings for future applications of AI in specialized sectors like energy\ndata analysis.\n","authors":["Carolina Fortuna","Vid Hanžel","Blaž Bertalanič"],"pdf_url":"https://arxiv.org/pdf/2406.06566v4.pdf","comment":"Accepted at IEEE SmartGridComm'24"},{"id":"http://arxiv.org/abs/2408.08566v1","updated":"2024-08-16T07:00:08Z","published":"2024-08-16T07:00:08Z","title":"Overview of the BioLaySumm 2024 Shared Task on the Lay Summarization of\n Biomedical Research Articles","summary":" This paper presents the setup and results of the second edition of the\nBioLaySumm shared task on the Lay Summarisation of Biomedical Research\nArticles, hosted at the BioNLP Workshop at ACL 2024. In this task edition, we\naim to build on the first edition's success by further increasing research\ninterest in this important task and encouraging participants to explore novel\napproaches that will help advance the state-of-the-art. Encouragingly, we found\nresearch interest in the task to be high, with this edition of the task\nattracting a total of 53 participating teams, a significant increase in\nengagement from the previous edition. Overall, our results show that a broad\nrange of innovative approaches were adopted by task participants, with a\npredictable shift towards the use of Large Language Models (LLMs).\n","authors":["Tomas Goldsack","Carolina Scarton","Matthew Shardlow","Chenghua Lin"],"pdf_url":"https://arxiv.org/pdf/2408.08566v1.pdf","comment":"Published in: Proceedings of the 23rd Workshop on Biomedical Natural\n Language Processing"},{"id":"http://arxiv.org/abs/2408.08564v1","updated":"2024-08-16T06:54:10Z","published":"2024-08-16T06:54:10Z","title":"Collaborative Cross-modal Fusion with Large Language Model for\n Recommendation","summary":" Despite the success of conventional collaborative filtering (CF) approaches\nfor recommendation systems, they exhibit limitations in leveraging semantic\nknowledge within the textual attributes of users and items. Recent focus on the\napplication of large language models for recommendation (LLM4Rec) has\nhighlighted their capability for effective semantic knowledge capture. However,\nthese methods often overlook the collaborative signals in user behaviors. Some\nsimply instruct-tune a language model, while others directly inject the\nembeddings of a CF-based model, lacking a synergistic fusion of different\nmodalities. To address these issues, we propose a framework of Collaborative\nCross-modal Fusion with Large Language Models, termed CCF-LLM, for\nrecommendation. In this framework, we translate the user-item interactions into\na hybrid prompt to encode both semantic knowledge and collaborative signals,\nand then employ an attentive cross-modal fusion strategy to effectively fuse\nlatent embeddings of both modalities. 
Extensive experiments demonstrate that\nCCF-LLM outperforms existing methods by effectively utilizing semantic and\ncollaborative signals in the LLM4Rec context.\n","authors":["Zhongzhou Liu","Hao Zhang","Kuicai Dong","Yuan Fang"],"pdf_url":"https://arxiv.org/pdf/2408.08564v1.pdf","comment":"10 pages, 4 figures, accepted by CIKM 2024"},{"id":"http://arxiv.org/abs/2408.08551v1","updated":"2024-08-16T06:35:31Z","published":"2024-08-16T06:35:31Z","title":"Integrating Multi-view Analysis: Multi-view Mixture-of-Expert for\n Textual Personality Detection","summary":" Textual personality detection aims to identify personality traits by\nanalyzing user-generated content. To achieve this effectively, it is essential\nto thoroughly examine user-generated content from various perspectives.\nHowever, previous studies have struggled with automatically extracting and\neffectively integrating information from multiple perspectives, thereby\nlimiting their performance on personality detection. To address these\nchallenges, we propose the Multi-view Mixture-of-Experts Model for Textual\nPersonality Detection (MvP). MvP introduces a Multi-view Mixture-of-Experts\n(MoE) network to automatically analyze user posts from various perspectives.\nAdditionally, it employs User Consistency Regularization to mitigate conflicts\namong different perspectives and learn a multi-view generic user\nrepresentation. The model's training is optimized via a multi-task joint\nlearning strategy that balances supervised personality detection with\nself-supervised user consistency constraints. Experimental results on two\nwidely-used personality detection datasets demonstrate the effectiveness of the\nMvP model and the benefits of automatically analyzing user posts from diverse\nperspectives for textual personality detection.\n","authors":["Haohao Zhu","Xiaokun Zhang","Junyu Lu","Liang Yang","Hongfei Lin"],"pdf_url":"https://arxiv.org/pdf/2408.08551v1.pdf","comment":"Accepted by NLPCC 2024"},{"id":"http://arxiv.org/abs/2408.08545v1","updated":"2024-08-16T06:11:21Z","published":"2024-08-16T06:11:21Z","title":"SelectLLM: Query-Aware Efficient Selection Algorithm for Large Language\n Models","summary":" Large language models (LLMs) have gained increased popularity due to their\nremarkable success across various tasks, which has led to the active\ndevelopment of a large set of diverse LLMs. However, individual LLMs have\nlimitations when applied to complex tasks because of such factors as training\nbiases, model sizes, and the datasets used. A promising approach is to\nefficiently harness the diverse capabilities of LLMs to overcome these\nindividual limitations. Towards this goal, we introduce a novel LLM selection\nalgorithm called SelectLLM. This algorithm directs input queries to the most\nsuitable subset of LLMs from a large pool, ensuring they collectively provide\nthe correct response efficiently. SelectLLM uses a multi-label classifier,\nutilizing the classifier's predictions and confidence scores to design optimal\npolicies for selecting an optimal, query-aware, and lightweight subset of LLMs.\nOur findings show that the proposed model outperforms individual LLMs and\nachieves competitive performance compared to similarly sized, computationally\nexpensive top-performing LLM subsets. Specifically, with a similarly sized\ntop-performing LLM subset, we achieve a significant reduction in latency on two\nstandard reasoning benchmarks: 13% lower latency for GSM8K and 70% lower\nlatency for MMLU. 
Additionally, we conduct comprehensive analyses and ablation\nstudies, which validate the robustness of the proposed model.\n","authors":["Kaushal Kumar Maurya","KV Aditya Srivatsa","Ekaterina Kochmar"],"pdf_url":"https://arxiv.org/pdf/2408.08545v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.05074v2","updated":"2024-08-16T06:04:31Z","published":"2024-08-09T14:02:24Z","title":"RT-Surv: Improving Mortality Prediction After Radiotherapy with Large\n Language Model Structuring of Large-Scale Unstructured Electronic Health\n Records","summary":" Accurate patient selection is critical in radiotherapy (RT) to prevent\nineffective treatments. Traditional survival prediction models, relying on\nstructured data, often lack precision. This study explores the potential of\nlarge language models (LLMs) to structure unstructured electronic health record\n(EHR) data, thereby improving survival prediction accuracy through\ncomprehensive clinical information integration. Data from 34,276 patients\ntreated with RT at Yonsei Cancer Center between 2013 and 2023 were analyzed,\nencompassing both structured and unstructured data. An open-source LLM was used\nto structure the unstructured EHR data via single-shot learning, with its\nperformance compared against a domain-specific medical LLM and a smaller\nvariant. Survival prediction models were developed using statistical, machine\nlearning, and deep learning approaches, incorporating both structured and\nLLM-structured data. Clinical experts evaluated the accuracy of the\nLLM-structured data. The open-source LLM achieved 87.5% accuracy in structuring\nunstructured EHR data without additional training, significantly outperforming\nthe domain-specific medical LLM, which reached only 35.8% accuracy. Larger LLMs\nwere more effective, particularly in extracting clinically relevant features\nlike general condition and disease extent, which closely correlated with\npatient survival. Incorporating LLM-structured clinical features into survival\nprediction models significantly improved accuracy, with the C-index of deep\nlearning models increasing from 0.737 to 0.820. These models also became more\ninterpretable by emphasizing clinically significant factors. This study shows\nthat general-domain LLMs, even without specific medical training, can\neffectively structure large-scale unstructured EHR data, substantially\nenhancing the accuracy and interpretability of clinical predictive models.\n","authors":["Sangjoon Park","Chan Woo Wee","Seo Hee Choi","Kyung Hwan Kim","Jee Suk Chang","Hong In Yoon","Ik Jae Lee","Yong Bae Kim","Jaeho Cho","Ki Chang Keum","Chang Geol Lee","Hwa Kyung Byun","Woong Sub Koom"],"pdf_url":"https://arxiv.org/pdf/2408.05074v2.pdf","comment":"23 pages, 2 tables, 4 figures"},{"id":"http://arxiv.org/abs/2408.04575v2","updated":"2024-08-16T06:01:15Z","published":"2024-08-08T16:36:24Z","title":"SCENE: Evaluating Explainable AI Techniques Using Soft Counterfactuals","summary":" Explainable Artificial Intelligence (XAI) plays a crucial role in enhancing\nthe transparency and accountability of AI models, particularly in natural\nlanguage processing (NLP) tasks. However, popular XAI methods such as LIME and\nSHAP have been found to be unstable and potentially misleading, underscoring\nthe need for a standardized evaluation approach. 
This paper introduces SCENE\n(Soft Counterfactual Evaluation for Natural language Explainability), a novel\nevaluation method that leverages large language models (LLMs) to generate Soft\nCounterfactual explanations in a zero-shot manner. By focusing on token-based\nsubstitutions, SCENE creates contextually appropriate and semantically\nmeaningful Soft Counterfactuals without extensive fine-tuning. SCENE adopts\nValidity_soft and C_soft metrics to assess the effectiveness of model-agnostic\nXAI methods in text classification tasks. Applied to CNN, RNN, and Transformer\narchitectures, SCENE provides valuable insights into the strengths and\nlimitations of various XAI techniques.\n","authors":["Haoran Zheng","Utku Pamuksuz"],"pdf_url":"https://arxiv.org/pdf/2408.04575v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08541v1","updated":"2024-08-16T05:56:10Z","published":"2024-08-16T05:56:10Z","title":"Where is the signal in tokenization space?","summary":" Large Language Models (LLMs) are typically shipped with tokenizers that\ndeterministically encode text into so-called canonical token sequences, to\nwhich the LLMs assign probability values. One common assumption is that the\nprobability of a piece of text is the probability of its canonical token\nsequence. However, the tokenization of a string is not unique: e.g., the Llama2\ntokenizer encodes Tokens as [Tok,ens], but [Tok,en,s] also represents the same\ntext. In this paper, we study non-canonical tokenizations. We prove that, given\na string, it is computationally hard to find the most likely tokenization for\nan autoregressive LLM, as well as to compute the marginal probability over all\npossible tokenizations. We then show how the marginal is, in most cases,\nindistinguishable from the canonical probability. Surprisingly, we then\nempirically demonstrate the existence of a significant amount of signal hidden\nwithin tokenization space. Notably, by simply aggregating the probabilities of\nnon-canonical tokenizations, we achieve improvements across a range of LLM\nevaluation benchmarks for a variety of architectures, including transformers\nand state space models.\n","authors":["Renato Lui Geh","Honghua Zhang","Kareem Ahmed","Benjie Wang","Guy Van den Broeck"],"pdf_url":"https://arxiv.org/pdf/2408.08541v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.02817v2","updated":"2024-08-16T05:52:17Z","published":"2024-05-05T05:43:20Z","title":"Labeling supervised fine-tuning data with the scaling law","summary":" This paper introduces a multi-stage manual annotation calibrated by the\nscaling law, offering a high-quality Supervised Fine-Tuning data acquisition\nmethod for environments with constrained resources, such as limited GPU availability, limited GPT\naccess, and funding restrictions. We have preprocessed 58k authentic chat data entries\nand manually annotated 2.3k questions. After this, we conducted fine-tuning on\nQwen models, ranging from 0.5B to 32B parameters. The optimal version improved\nthe F1 score by 29.07. This confirms the viability of fine-tuning Large Language\nModels (LLMs) for downstream Natural Language Processing (NLP) tasks. Our\ncontributions are: 1) Created Supervised Fine-Tuning (SFT) training data in\nalpaca format, along with a set of Low-Rank Adaptation (LoRA) weights, and 2)\nDeveloped a method for acquiring high-quality data leveraging the scaling law\nprinciple. 
The script, raw data in alpaca format, and experiment tracking records are\nopen-sourced on Github\n(https://github.com/InternLM/HuixiangDou/tree/main/web/tools), HuggingFace\n(https://huggingface.co/tpoisonooo) and WandB\n(https://wandb.ai/tpoisonooo/huixiangdou-cr/table?nw=nwusertpoisonooo). The\nuse of the data involved has been authorized by the users. The SFT data and license\ncome from the ncnn contributors group.\n","authors":["Huanjun Kong"],"pdf_url":"https://arxiv.org/pdf/2405.02817v2.pdf","comment":"5 pages, 3 tables, 3 figures"},{"id":"http://arxiv.org/abs/2408.08535v1","updated":"2024-08-16T05:15:12Z","published":"2024-08-16T05:15:12Z","title":"CommunityKG-RAG: Leveraging Community Structures in Knowledge Graphs for\n Advanced Retrieval-Augmented Generation in Fact-Checking","summary":" Despite advancements in Large Language Models (LLMs) and Retrieval-Augmented\nGeneration (RAG) systems, their effectiveness is often hindered by a lack of\nintegration with entity relationships and community structures, limiting their\nability to provide contextually rich and accurate information retrieval for\nfact-checking. We introduce CommunityKG-RAG (Community Knowledge\nGraph-Retrieval Augmented Generation), a novel zero-shot framework that\nintegrates community structures within Knowledge Graphs (KGs) with RAG systems\nto enhance the fact-checking process. Capable of adapting to new domains and\nqueries without additional training, CommunityKG-RAG utilizes the multi-hop\nnature of community structures within KGs to significantly improve the accuracy\nand relevance of information retrieval. Our experimental results demonstrate\nthat CommunityKG-RAG outperforms traditional methods, representing a\nsignificant advancement in fact-checking by offering a robust, scalable, and\nefficient solution.\n","authors":["Rong-Ching Chang","Jiawei Zhang"],"pdf_url":"https://arxiv.org/pdf/2408.08535v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08521v1","updated":"2024-08-16T04:32:10Z","published":"2024-08-16T04:32:10Z","title":"MuRAR: A Simple and Effective Multimodal Retrieval and Answer Refinement\n Framework for Multimodal Question Answering","summary":" Recent advancements in retrieval-augmented generation (RAG) have demonstrated\nimpressive performance in the question-answering (QA) task. However, most\nprevious works predominantly focus on text-based answers. While some studies\naddress multimodal data, they still fall short in generating comprehensive\nmultimodal answers, particularly for explaining concepts or providing\nstep-by-step tutorials on how to accomplish specific goals. This capability is\nespecially valuable for applications such as enterprise chatbots and settings\nsuch as customer service and educational systems, where the answers are sourced\nfrom multimodal data. In this paper, we introduce a simple and effective\nframework named MuRAR (Multimodal Retrieval and Answer Refinement). MuRAR\nenhances text-based answers by retrieving relevant multimodal data and refining\nthe responses to create coherent multimodal answers. This framework can be\neasily extended to support multimodal answers in enterprise chatbots with\nminimal modifications. 
Human evaluation results indicate that multimodal\nanswers generated by MuRAR are more useful and readable compared to plain text\nanswers.\n","authors":["Zhengyuan Zhu","Daniel Lee","Hong Zhang","Sai Sree Harsha","Loic Feujio","Akash Maharaj","Yunyao Li"],"pdf_url":"https://arxiv.org/pdf/2408.08521v1.pdf","comment":"Preprint"},{"id":"http://arxiv.org/abs/2402.10753v2","updated":"2024-08-16T04:12:00Z","published":"2024-02-16T15:19:46Z","title":"ToolSword: Unveiling Safety Issues of Large Language Models in Tool\n Learning Across Three Stages","summary":" Tool learning is widely acknowledged as a foundational approach for deploying\nlarge language models (LLMs) in real-world scenarios. While current research\nprimarily emphasizes leveraging tools to augment LLMs, it frequently neglects\nemerging safety considerations tied to their application. To fill this gap, we\npresent *ToolSword*, a comprehensive framework dedicated to meticulously\ninvestigating safety issues linked to LLMs in tool learning. Specifically,\nToolSword delineates six safety scenarios for LLMs in tool learning,\nencompassing **malicious queries** and **jailbreak attacks** in the input\nstage, **noisy misdirection** and **risky cues** in the execution stage, and\n**harmful feedback** and **error conflicts** in the output stage. Experiments\nconducted on 11 open-source and closed-source LLMs reveal enduring safety\nchallenges in tool learning, such as handling harmful queries, employing risky\ntools, and delivering detrimental feedback, which even GPT-4 is susceptible to.\nMoreover, we conduct further studies with the aim of fostering research on tool\nlearning safety. The data is released at\nhttps://github.com/Junjie-Ye/ToolSword.\n","authors":["Junjie Ye","Sixian Li","Guanyu Li","Caishuang Huang","Songyang Gao","Yilong Wu","Qi Zhang","Tao Gui","Xuanjing Huang"],"pdf_url":"https://arxiv.org/pdf/2402.10753v2.pdf","comment":"Accepted by ACL 2024 Main Conference"},{"id":"http://arxiv.org/abs/2408.08506v1","updated":"2024-08-16T03:06:57Z","published":"2024-08-16T03:06:57Z","title":"Ex3: Automatic Novel Writing by Extracting, Excelsior and Expanding","summary":" Generating long-form texts such as novels using artificial intelligence has\nalways been a challenge. A common approach is to use large language models\n(LLMs) to construct a hierarchical framework that first plans and then writes.\nDespite the fact that the generated novels reach a sufficient length, they\nexhibit poor logical coherence and appeal in their plots and deficiencies in\ncharacter and event depiction, ultimately compromising the overall narrative\nquality. In this paper, we propose a method named Extracting, Excelsior and\nExpanding (Ex3). Ex3 initially extracts structure information from raw novel data. By\ncombining this structure information with the novel data, an\ninstruction-following dataset is meticulously crafted. This dataset is then\nutilized to fine-tune the LLM, aiming for excelsior generation performance. In\nthe final stage, a tree-like expansion method is deployed to facilitate the\ngeneration of arbitrarily long novels. 
Evaluation against previous methods\nshowcases Ex3's ability to produce higher-quality long-form novels.\n","authors":["Huang Lei","Jiaming Guo","Guanhua He","Xishan Zhang","Rui Zhang","Shaohui Peng","Shaoli Liu","Tianshi Chen"],"pdf_url":"https://arxiv.org/pdf/2408.08506v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.07930v2","updated":"2024-08-16T02:55:45Z","published":"2024-08-15T04:57:55Z","title":"MAG-SQL: Multi-Agent Generative Approach with Soft Schema Linking and\n Iterative Sub-SQL Refinement for Text-to-SQL","summary":" Recent In-Context Learning based methods have achieved remarkable success in\nthe Text-to-SQL task. However, there is still a large gap between the performance\nof these models and human performance on datasets with complex database schema\nand difficult questions, such as BIRD. Besides, existing work has neglected to\nsupervise intermediate steps when solving questions iteratively with question\ndecomposition methods, and the schema linking methods used in these works are\nvery rudimentary. To address these issues, we propose MAG-SQL, a multi-agent\ngenerative approach with soft schema linking and iterative Sub-SQL refinement.\nIn our framework, an entity-based method with table summaries is used to select\nthe columns in the database, and a novel targets-conditions decomposition method is\nintroduced to decompose those complex questions. Additionally, we build an\niterative generating module which includes a Sub-SQL Generator and Sub-SQL\nRefiner, introducing external oversight for each step of generation. Through a\nseries of ablation studies, the effectiveness of each agent in our framework\nhas been demonstrated. When evaluated on the BIRD benchmark with GPT-4, MAG-SQL\nachieves an execution accuracy of 61.08%, compared to the baseline accuracy of\n46.35% for vanilla GPT-4 and the baseline accuracy of 57.56% for MAC-SQL.\nBesides, our approach makes similar progress on Spider.\n","authors":["Wenxuan Xie","Gaochen Wu","Bowen Zhou"],"pdf_url":"https://arxiv.org/pdf/2408.07930v2.pdf","comment":"22 pages, 14 figures"},{"id":"http://arxiv.org/abs/2404.09170v5","updated":"2024-08-16T02:21:13Z","published":"2024-04-14T07:19:27Z","title":"Distilling Reasoning Ability from Large Language Models with Adaptive\n Thinking","summary":" Chain of thought finetuning (cot-finetuning) aims to endow small language\nmodels (SLM) with reasoning ability to improve their performance on\nspecific tasks by allowing them to imitate the reasoning procedure of large\nlanguage models (LLM) beyond simply predicting the answers. Most existing\ncot-finetuning methods adopt a pre-thinking mechanism, allowing the SLM to\ngenerate a rationale before providing an answer. This mechanism enables SLM to\nanalyze and think about complex questions, but it also makes answer correctness\nhighly sensitive to minor errors in the rationale. Therefore, we propose a robust\npost-thinking mechanism to generate answers before the rationale. 
Thanks to this\nanswer-first setting, 1) the answer can escape from the adverse effects caused\nby minor errors in the rationale; 2) the rationale serves as an error amplifier\nto the answer, which makes the SLM focus on learning hard samples; 3) the\ninference efficiency can also benefit from this setting, since users can stop the\ngeneration right after the answer is produced when inference is conducted.\nHowever, although the post-thinking mechanism brings many advantages and\nimproves the overall performance of the SLM on specific tasks, it may lose the\nability to think about the questions and decompose complex questions into\nsimple sub-questions compared to the pre-thinking mechanism. Therefore, a\nplug-and-play adaptive-thinking mechanism is proposed with the aid of soft\nprompt tuning to integrate the merits of the pre-thinking mechanism and\npost-thinking mechanism, in which a perception module is introduced to\nadaptively prompt the SLM to answer or think first based on the perceived complexity\nof the questions. Extensive experiments are conducted across 12 reasoning tasks\nand 2 representative language models to demonstrate the effectiveness of the\nproposed mechanism.\n","authors":["Xiaoshu Chen","Sihang Zhou","Ke Liang","Xinwang Liu"],"pdf_url":"https://arxiv.org/pdf/2404.09170v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.16035v3","updated":"2024-08-16T01:30:12Z","published":"2023-09-27T21:26:03Z","title":"MKRAG: Medical Knowledge Retrieval Augmented Generation for Medical\n Question Answering","summary":" Large Language Models (LLMs), although powerful in general domains, often\nperform poorly on domain-specific tasks such as medical question answering\n(QA). In addition, LLMs tend to function as \"black-boxes\", making it\nchallenging to modify their behavior. To address the problem, our work employs\na transparent process of retrieval augmented generation (RAG), aiming to\nimprove LLM responses without the need for fine-tuning or retraining.\nSpecifically, we propose a comprehensive retrieval strategy to extract medical\nfacts from an external knowledge base, and then inject them into the LLM's\nquery prompt. Focusing on medical QA, we evaluate the impact of different\nretrieval models and the number of facts on LLM performance using the\nMedQA-SMILE dataset. Notably, our retrieval-augmented Vicuna-7B model exhibited\nan accuracy improvement from 44.46% to 48.54%. This work underscores the\npotential of RAG to enhance LLM performance, offering a practical approach to\nmitigate the challenges posed by black-box LLMs.\n","authors":["Yucheng Shi","Shaochen Xu","Tianze Yang","Zhengliang Liu","Tianming Liu","Quanzheng Li","Xiang Li","Ninghao Liu"],"pdf_url":"https://arxiv.org/pdf/2309.16035v3.pdf","comment":"Accepted by AMIA 2024 Annual Symposium"},{"id":"http://arxiv.org/abs/2402.02212v2","updated":"2024-08-16T01:16:20Z","published":"2024-02-03T17:13:03Z","title":"A Data Generation Perspective to the Mechanism of In-Context Learning","summary":" In-Context Learning (ICL) empowers Large Language Models (LLMs) with the\ncapacity to learn in context, achieving downstream generalization without\ngradient updates but with a few in-context examples. Despite the encouraging\nempirical success, the underlying mechanism of ICL remains unclear, and\nexisting research offers various viewpoints of understanding. These studies\npropose intuition-driven and ad-hoc technical solutions for interpreting ICL,\nillustrating an ambiguous road map. 
In this paper, we leverage a data\ngeneration perspective to reinterpret recent efforts and demonstrate the\npotential broader usage of popular technical solutions, approaching a\nsystematic angle. For a conceptual definition, we rigorously adopt the terms of\nskill learning and skill recognition. The difference between them is skill\nlearning can learn new data generation functions from in-context data. We also\nprovide a comprehensive study on the merits and weaknesses of different\nsolutions, and highlight the uniformity among them given the perspective of\ndata generation, establishing a technical foundation for future research to\nincorporate the strengths of different lines of research.\n","authors":["Haitao Mao","Guangliang Liu","Yao Ma","Rongrong Wang","Kristen Johnson","Jiliang Tang"],"pdf_url":"https://arxiv.org/pdf/2402.02212v2.pdf","comment":"11 pages, 1 figure"}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2408.08872v1","updated":"2024-08-16T17:57:01Z","published":"2024-08-16T17:57:01Z","title":"xGen-MM (BLIP-3): A Family of Open Large Multimodal Models","summary":" This report introduces xGen-MM (also known as BLIP-3), a framework for\ndeveloping Large Multimodal Models (LMMs). The framework comprises meticulously\ncurated datasets, a training recipe, model architectures, and a resulting suite\nof LMMs. xGen-MM, short for xGen-MultiModal, expands the Salesforce xGen\ninitiative on foundation AI models. Our models undergo rigorous evaluation\nacross a range of tasks, including both single and multi-image benchmarks. Our\npre-trained base model exhibits strong in-context learning capabilities and the\ninstruction-tuned model demonstrates competitive performance among open-source\nLMMs with similar model sizes. In addition, we introduce a safety-tuned model\nwith DPO, aiming to mitigate harmful behaviors such as hallucinations and\nimprove safety. We open-source our models, curated large-scale datasets, and\nour fine-tuning codebase to facilitate further advancements in LMM research.\nAssociated resources will be available on our project page above.\n","authors":["Le Xue","Manli Shu","Anas Awadalla","Jun Wang","An Yan","Senthil Purushwalkam","Honglu Zhou","Viraj Prabhu","Yutong Dai","Michael S Ryoo","Shrikant Kendre","Jieyu Zhang","Can Qin","Shu Zhang","Chia-Chih Chen","Ning Yu","Juntao Tan","Tulika Manoj Awalgaonkar","Shelby Heinecke","Huan Wang","Yejin Choi","Ludwig Schmidt","Zeyuan Chen","Silvio Savarese","Juan Carlos Niebles","Caiming Xiong","Ran Xu"],"pdf_url":"https://arxiv.org/pdf/2408.08872v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08870v1","updated":"2024-08-16T17:55:38Z","published":"2024-08-16T17:55:38Z","title":"SAM2-UNet: Segment Anything 2 Makes Strong Encoder for Natural and\n Medical Image Segmentation","summary":" Image segmentation plays an important role in vision understanding. Recently,\nthe emerging vision foundation models continuously achieved superior\nperformance on various tasks. Following such success, in this paper, we prove\nthat the Segment Anything Model 2 (SAM2) can be a strong encoder for U-shaped\nsegmentation models. We propose a simple but effective framework, termed\nSAM2-UNet, for versatile image segmentation. Specifically, SAM2-UNet adopts the\nHiera backbone of SAM2 as the encoder, while the decoder uses the classic\nU-shaped design. Additionally, adapters are inserted into the encoder to allow\nparameter-efficient fine-tuning. 
Preliminary experiments on various downstream\ntasks, such as camouflaged object detection, salient object detection, marine\nanimal segmentation, mirror detection, and polyp segmentation, demonstrate that\nour SAM2-UNet can simply beat existing specialized state-of-the-art methods\nwithout bells and whistles. Project page:\n\\url{https://github.com/WZH0120/SAM2-UNet}.\n","authors":["Xinyu Xiong","Zihuang Wu","Shuangyi Tan","Wenxue Li","Feilong Tang","Ying Chen","Siying Li","Jie Ma","Guanbin Li"],"pdf_url":"https://arxiv.org/pdf/2408.08870v1.pdf","comment":"Technical Report"},{"id":"http://arxiv.org/abs/2408.08855v1","updated":"2024-08-16T17:30:27Z","published":"2024-08-16T17:30:27Z","title":"DPA: Dual Prototypes Alignment for Unsupervised Adaptation of\n Vision-Language Models","summary":" Vision-language models (VLMs), e.g., CLIP, have shown remarkable potential in\nzero-shot image classification. However, adapting these models to new domains\nremains challenging, especially in unsupervised settings where labelled data is\nunavailable. Recent research has proposed pseudo-labelling approaches to adapt\nCLIP in an unsupervised manner using unlabelled target data. Nonetheless, these\nmethods struggle due to noisy pseudo-labels resulting from the misalignment\nbetween CLIP's visual and textual representations. This study introduces DPA,\nan unsupervised domain adaptation method for VLMs. DPA introduces the concept\nof dual prototypes, acting as distinct classifiers, along with the convex\ncombination of their outputs, thereby leading to accurate pseudo-label\nconstruction. Next, it ranks pseudo-labels to facilitate robust self-training,\nparticularly during early training. Finally, it addresses visual-textual\nmisalignment by aligning textual prototypes with image prototypes to further\nimprove the adaptation performance. Experiments on 13 downstream vision tasks\ndemonstrate that DPA significantly outperforms zero-shot CLIP and the\nstate-of-the-art unsupervised adaptation baselines.\n","authors":["Eman Ali","Sathira Silva","Muhammad Haris Khan"],"pdf_url":"https://arxiv.org/pdf/2408.08855v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08847v1","updated":"2024-08-16T17:19:07Z","published":"2024-08-16T17:19:07Z","title":"HistoGym: A Reinforcement Learning Environment for Histopathological\n Image Analysis","summary":" In pathological research, education, and clinical practice, the\ndecision-making process based on pathological images is critically important.\nThis significance extends to digital pathology image analysis: its adequacy is\ndemonstrated by the extensive information contained within tissue structures,\nwhich is essential for accurate cancer classification and grading.\nAdditionally, its necessity is highlighted by the inherent requirement for\ninterpretability in the conclusions generated by algorithms. For humans,\ndetermining tumor type and grade typically involves multi-scale analysis, which\npresents a significant challenge for AI algorithms. Traditional patch-based\nmethods are inadequate for modeling such complex structures, as they fail to\ncapture the intricate, multi-scale information inherent in whole slide images.\nConsequently, there is a pressing need for advanced AI techniques capable of\nefficiently and accurately replicating this complex analytical process. To\naddress this issue, we introduce HistoGym, an open-source reinforcement\nlearning environment for histopathological image analysis. 
Following OpenAI Gym\nAPIs, HistoGym aims to foster whole slide image diagnosis by mimicking the\nreal-life processes of doctors. Leveraging the pyramid feature of WSIs and the\nOpenSlide API, HistoGym provides a unified framework for various clinical\ntasks, including tumor detection and classification. We detail the observation,\naction, and reward specifications tailored for the histopathological image\nanalysis domain and provide an open-source Python-based interface for both\nclinicians and researchers. To accommodate different clinical demands, we offer\nvarious scenarios for different organs and cancers, including both WSI-based\nand selected region-based scenarios, showcasing several noteworthy results.\n","authors":["Zhi-Bo Liu","Xiaobo Pang","Jizhao Wang","Shuai Liu","Chen Li"],"pdf_url":"https://arxiv.org/pdf/2408.08847v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.06400v2","updated":"2024-08-16T17:13:02Z","published":"2024-03-11T03:24:44Z","title":"DivCon: Divide and Conquer for Progressive Text-to-Image Generation","summary":" Diffusion-driven text-to-image (T2I) generation has achieved remarkable\nadvancements. To further improve T2I models' capability in numerical and\nspatial reasoning, the layout is employed as an intermedium to bridge large\nlanguage models and layout-based diffusion models. However, these methods still\nstruggle with generating images from textural prompts with multiple objects and\ncomplicated spatial relationships. To tackle this challenge, we introduce a\ndivide-and-conquer approach which decouples the T2I generation task into simple\nsubtasks. Our approach divides the layout prediction stage into numerical &\nspatial reasoning and bounding box prediction. Then, the layout-to-image\ngeneration stage is conducted in an iterative manner to reconstruct objects\nfrom easy ones to difficult ones. We conduct experiments on the HRS and NSR-1K\nbenchmarks and our approach outperforms previous state-of-the-art models with\nnotable margins. In addition, visual results demonstrate that our approach\nsignificantly improves the controllability and consistency in generating\nmultiple objects from complex textural prompts.\n","authors":["Yuhao Jia","Wenhan Tan"],"pdf_url":"https://arxiv.org/pdf/2403.06400v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.03291v2","updated":"2024-08-16T17:10:27Z","published":"2024-08-06T16:40:04Z","title":"DopQ-ViT: Towards Distribution-Friendly and Outlier-Aware Post-Training\n Quantization for Vision Transformers","summary":" Vision transformers (ViTs) have garnered significant attention for their\nperformance in vision tasks, but the high computational cost and significant\nlatency issues have hindered widespread adoption. Post-training quantization\n(PTQ), a promising method for model compression, still faces accuracy\ndegradation challenges with ViTs. There are two reasons for this: the existing\nquantization paradigm does not fit the power-law distribution of post-Softmax\nactivations well, and accuracy inevitably decreases after reparameterizing\npost-LayerNorm activations. We propose a Distribution-Friendly and\nOutlier-Aware Post-training Quantization method for Vision Transformers, named\nDopQ-ViT. DopQ-ViT analyzes the inefficiencies of current quantizers and\nintroduces a distribution-friendly Tan Quantizer called TanQ. TanQ focuses more\non values near 1, more accurately preserving the power-law distribution of\npost-Softmax activations, and achieves favorable results. 
Besides, during the\nreparameterization of post-LayerNorm activations from channel-wise to\nlayer-wise quantization, the accuracy degradation is mainly due to the\nsignificant impact of outliers in the scaling factors. Therefore, DopQ-ViT\nproposes a method to select Median as the Optimal Scaling Factor, denoted as\nMOSF, which compensates for the influence of outliers and preserves the\nperformance of the quantization model. DopQ-ViT has been extensively validated\nand significantly improves the performance of quantization models, especially\nin low-bit settings.\n","authors":["Lianwei Yang","Haisong Gong","Qingyi Gu"],"pdf_url":"https://arxiv.org/pdf/2408.03291v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.07246v2","updated":"2024-08-16T16:46:32Z","published":"2024-08-14T01:16:40Z","title":"ChemVLM: Exploring the Power of Multimodal Large Language Models in\n Chemistry Area","summary":" Large Language Models (LLMs) have achieved remarkable success and have been\napplied across various scientific fields, including chemistry. However, many\nchemical tasks require the processing of visual information, which cannot be\nsuccessfully handled by existing chemical LLMs. This brings a growing need for\nmodels capable of integrating multimodal information in the chemical domain. In\nthis paper, we introduce \\textbf{ChemVLM}, an open-source chemical multimodal\nlarge language model specifically designed for chemical applications. ChemVLM\nis trained on a carefully curated bilingual multimodal dataset that enhances\nits ability to understand both textual and visual chemical information,\nincluding molecular structures, reactions, and chemistry examination questions.\nWe develop three datasets for comprehensive evaluation, tailored to Chemical\nOptical Character Recognition (OCR), Multimodal Chemical Reasoning (MMCR), and\nMultimodal Molecule Understanding tasks. We benchmark ChemVLM against a range\nof open-source and proprietary multimodal large language models on various\ntasks. Experimental results demonstrate that ChemVLM achieves competitive\nperformance across all evaluated tasks. Our model can be found at\nhttps://huggingface.co/AI4Chem/ChemVLM-26B.\n","authors":["Junxian Li","Di Zhang","Xunzhi Wang","Zeying Hao","Jingdi Lei","Qian Tan","Cai Zhou","Wei Liu","Yaotian Yang","Xinrui Xiong","Weiyun Wang","Zhe Chen","Wenhai Wang","Wei Li","Shufei Zhang","Mao Su","Wanli Ouyang","Yuqiang Li","Dongzhan Zhou"],"pdf_url":"https://arxiv.org/pdf/2408.07246v2.pdf","comment":"11 pages, updated version"},{"id":"http://arxiv.org/abs/2408.08827v1","updated":"2024-08-16T16:22:34Z","published":"2024-08-16T16:22:34Z","title":"RGBT Tracking via All-layer Multimodal Interactions with Progressive\n Fusion Mamba","summary":" Existing RGBT tracking methods often design various interaction models to\nperform cross-modal fusion of each layer, but can not execute the feature\ninteractions among all layers, which plays a critical role in robust multimodal\nrepresentation, due to large computational burden. To address this issue, this\npaper presents a novel All-layer multimodal Interaction Network, named AINet,\nwhich performs efficient and effective feature interactions of all modalities\nand layers in a progressive fusion Mamba, for robust RGBT tracking. Even though\nmodality features in different layers are known to contain different cues, it\nis always challenging to build multimodal interactions in each layer due to\nstruggling in balancing interaction capabilities and efficiency. 
Meanwhile,\nconsidering that the feature discrepancy between RGB and thermal modalities\nreflects their complementary information to some extent, we design a\nDifference-based Fusion Mamba (DFM) to achieve enhanced fusion of different\nmodalities with linear complexity. When interacting with features from all\nlayers, a huge number of token sequences (3840 tokens in this work) are\ninvolved and the computational burden is thus large. To handle this problem, we\ndesign an Order-dynamic Fusion Mamba (OFM) to execute efficient and effective\nfeature interactions of all layers by dynamically adjusting the scan order of\ndifferent layers in Mamba. Extensive experiments on four public RGBT tracking\ndatasets show that AINet achieves leading performance against existing\nstate-of-the-art methods.\n","authors":["Andong Lu","Wanyu Wang","Chenglong Li","Jin Tang","Bin Luo"],"pdf_url":"https://arxiv.org/pdf/2408.08827v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08822v1","updated":"2024-08-16T16:12:44Z","published":"2024-08-16T16:12:44Z","title":"PFDiff: Training-free Acceleration of Diffusion Models through the\n Gradient Guidance of Past and Future","summary":" Diffusion Probabilistic Models (DPMs) have shown remarkable potential in\nimage generation, but their sampling efficiency is hindered by the need for\nnumerous denoising steps. Most existing solutions accelerate the sampling\nprocess by proposing fast ODE solvers. However, the inevitable discretization\nerrors of the ODE solvers are significantly magnified when the number of\nfunction evaluations (NFE) is fewer. In this work, we propose PFDiff, a novel\ntraining-free and orthogonal timestep-skipping strategy, which enables existing\nfast ODE solvers to operate with fewer NFE. Based on two key observations: a\nsignificant similarity in the model's outputs at time step size that is not\nexcessively large during the denoising process of existing ODE solvers, and a\nhigh resemblance between the denoising process and SGD. PFDiff, by employing\ngradient replacement from past time steps and foresight updates inspired by\nNesterov momentum, rapidly updates intermediate states, thereby reducing\nunnecessary NFE while correcting for discretization errors inherent in\nfirst-order ODE solvers. Experimental results demonstrate that PFDiff exhibits\nflexible applicability across various pre-trained DPMs, particularly excelling\nin conditional DPMs and surpassing previous state-of-the-art training-free\nmethods. For instance, using DDIM as a baseline, we achieved 16.46 FID (4 NFE)\ncompared to 138.81 FID with DDIM on ImageNet 64x64 with classifier guidance,\nand 13.06 FID (10 NFE) on Stable Diffusion with 7.5 guidance scale.\n","authors":["Guangyi Wang","Yuren Cai","Lijiang Li","Wei Peng","Songzhi Su"],"pdf_url":"https://arxiv.org/pdf/2408.08822v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2212.04745v6","updated":"2024-08-16T15:49:24Z","published":"2022-12-09T09:45:43Z","title":"SLAM for Visually Impaired People: a Survey","summary":" In recent decades, several assistive technologies have been developed to\nimprove the ability of blind and visually impaired (BVI) individuals to\nnavigate independently and safely. At the same time, simultaneous localization\nand mapping (SLAM) techniques have become sufficiently robust and efficient to\nbe adopted in developing these assistive technologies. 
We present the first\nsystematic literature review of 54 recent studies on SLAM-based solutions for\nblind and visually impaired people, focusing on literature published from 2017\nonward. This review explores various localization and mapping techniques\nemployed in this context. We systematically identified and categorized diverse\nSLAM approaches and analyzed their localization and mapping techniques, sensor\ntypes, computing resources, and machine-learning methods. We discuss the\nadvantages and limitations of these techniques for blind and visually impaired\nnavigation. Moreover, we examine the major challenges described across studies,\nincluding practical challenges and considerations that affect usability and\nadoption. Our analysis also evaluates the effectiveness of these SLAM-based\nsolutions in real-world scenarios and user satisfaction, providing insights\ninto their practical impact on BVI mobility. The insights derived from this\nreview identify critical gaps and opportunities for future research activities,\nparticularly in addressing the challenges presented by dynamic and complex\nenvironments. We explain how SLAM technology offers the potential to improve\nthe ability of visually impaired individuals to navigate effectively. Finally,\nwe present future opportunities and challenges in this domain.\n","authors":["Marziyeh Bamdad","Davide Scaramuzza","Alireza Darvishy"],"pdf_url":"https://arxiv.org/pdf/2212.04745v6.pdf","comment":"47 pages, 42 tables, 6 figures"},{"id":"http://arxiv.org/abs/2408.08813v1","updated":"2024-08-16T15:48:07Z","published":"2024-08-16T15:48:07Z","title":"Retrieval-augmented Few-shot Medical Image Segmentation with Foundation\n Models","summary":" Medical image segmentation is crucial for clinical decision-making, but the\nscarcity of annotated data presents significant challenges. Few-shot\nsegmentation (FSS) methods show promise but often require retraining on the\ntarget domain and struggle to generalize across different modalities.\nSimilarly, adapting foundation models like the Segment Anything Model (SAM) for\nmedical imaging has limitations, including the need for finetuning and\ndomain-specific adaptation. To address these issues, we propose a novel method\nthat adapts DINOv2 and Segment Anything Model 2 (SAM 2) for retrieval-augmented\nfew-shot medical image segmentation. Our approach uses DINOv2's feature as\nquery to retrieve similar samples from limited annotated data, which are then\nencoded as memories and stored in memory bank. With the memory attention\nmechanism of SAM 2, the model leverages these memories as conditions to\ngenerate accurate segmentation of the target image. We evaluated our framework\non three medical image segmentation tasks, demonstrating superior performance\nand generalizability across various modalities without the need for any\nretraining or finetuning. Overall, this method offers a practical and effective\nsolution for few-shot medical image segmentation and holds significant\npotential as a valuable annotation tool in clinical applications.\n","authors":["Lin Zhao","Xiao Chen","Eric Z. 
Chen","Yikang Liu","Terrence Chen","Shanhui Sun"],"pdf_url":"https://arxiv.org/pdf/2408.08813v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08802v1","updated":"2024-08-16T15:26:23Z","published":"2024-08-16T15:26:23Z","title":"PriorMapNet: Enhancing Online Vectorized HD Map Construction with Priors","summary":" Online vectorized High-Definition (HD) map construction is crucial for\nsubsequent prediction and planning tasks in autonomous driving. Following MapTR\nparadigm, recent works have made noteworthy achievements. However, reference\npoints are randomly initialized in mainstream methods, leading to unstable\nmatching between predictions and ground truth. To address this issue, we\nintroduce PriorMapNet to enhance online vectorized HD map construction with\npriors. We propose the PPS-Decoder, which provides reference points with\nposition and structure priors. Fitted from the map elements in the dataset,\nprior reference points lower the learning difficulty and achieve stable\nmatching. Furthermore, we propose the PF-Encoder to enhance the image-to-BEV\ntransformation with BEV feature priors. Besides, we propose the DMD\ncross-attention, which decouples cross-attention along multi-scale and\nmulti-sample respectively to achieve efficiency. Our proposed PriorMapNet\nachieves state-of-the-art performance in the online vectorized HD map\nconstruction task on nuScenes and Argoverse2 datasets. The code will be\nreleased publicly soon.\n","authors":["Rongxuan Wang","Xin Lu","Xiaoyang Liu","Xiaoyi Zou","Tongyi Cao","Ying Li"],"pdf_url":"https://arxiv.org/pdf/2408.08802v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.03967v4","updated":"2024-08-16T15:18:05Z","published":"2023-11-07T13:06:50Z","title":"CeCNN: Copula-enhanced convolutional neural networks in joint prediction\n of refraction error and axial length based on ultra-widefield fundus images","summary":" The ultra-widefield (UWF) fundus image is an attractive 3D biomarker in\nAI-aided myopia screening because it provides much richer myopia-related\ninformation. Though axial length (AL) has been acknowledged to be highly\nrelated to the two key targets of myopia screening, Spherical Equivalence (SE)\nmeasurement and high myopia diagnosis, its prediction based on the UWF fundus\nimage is rarely considered. To save the high expense and time costs of\nmeasuring SE and AL, we propose the Copula-enhanced Convolutional Neural\nNetwork (CeCNN), a one-stop UWF-based ophthalmic AI framework to jointly\npredict SE, AL, and myopia status. The CeCNN formulates a multiresponse\nregression that relates multiple dependent discrete-continuous responses and\nthe image covariate, where the nonlinearity of the association is modeled by a\nbackbone CNN. To thoroughly describe the dependence structure among the\nresponses, we model and incorporate the conditional dependence among responses\nin a CNN through a new copula-likelihood loss. We provide statistical\ninterpretations of the conditional dependence among responses, and reveal that\nsuch dependence is beyond the dependence explained by the image covariate. We\nheuristically justify that the proposed loss can enhance the estimation\nefficiency of the CNN weights. We apply the CeCNN to the UWF dataset collected\nby us and demonstrate that the CeCNN sharply enhances the predictive capability\nof various backbone CNNs. 
Our study evidences the ophthalmology view that\nbesides SE, AL is also an important measure to myopia.\n","authors":["Chong Zhong","Yang Li","Danjuan Yang","Meiyan Li","Xingyao Zhou","Bo Fu","Catherine C. Liu","A. H. Welsh"],"pdf_url":"https://arxiv.org/pdf/2311.03967v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08793v1","updated":"2024-08-16T15:05:28Z","published":"2024-08-16T15:05:28Z","title":"Backward-Compatible Aligned Representations via an Orthogonal\n Transformation Layer","summary":" Visual retrieval systems face significant challenges when updating models\nwith improved representations due to misalignment between the old and new\nrepresentations. The costly and resource-intensive backfilling process involves\nrecalculating feature vectors for images in the gallery set whenever a new\nmodel is introduced. To address this, prior research has explored\nbackward-compatible training methods that enable direct comparisons between new\nand old representations without backfilling. Despite these advancements,\nachieving a balance between backward compatibility and the performance of\nindependently trained models remains an open problem. In this paper, we address\nit by expanding the representation space with additional dimensions and\nlearning an orthogonal transformation to achieve compatibility with old models\nand, at the same time, integrate new information. This transformation preserves\nthe original feature space's geometry, ensuring that our model aligns with\nprevious versions while also learning new data. Our Orthogonal Compatible\nAligned (OCA) approach eliminates the need for re-indexing during model updates\nand ensures that features can be compared directly across different model\nupdates without additional mapping functions. Experimental results on CIFAR-100\nand ImageNet-1k demonstrate that our method not only maintains compatibility\nwith previous models but also achieves state-of-the-art accuracy, outperforming\nseveral existing methods.\n","authors":["Simone Ricci","Niccolò Biondi","Federico Pernici","Alberto Del Bimbo"],"pdf_url":"https://arxiv.org/pdf/2408.08793v1.pdf","comment":"Accepted at BEW2024 Workshop at ECCV2024"},{"id":"http://arxiv.org/abs/2408.08792v1","updated":"2024-08-16T15:04:13Z","published":"2024-08-16T15:04:13Z","title":"Assessing Generalization Capabilities of Malaria Diagnostic Models from\n Thin Blood Smears","summary":" Malaria remains a significant global health challenge, necessitating rapid\nand accurate diagnostic methods. While computer-aided diagnosis (CAD) tools\nutilizing deep learning have shown promise, their generalization to diverse\nclinical settings remains poorly assessed. This study evaluates the\ngeneralization capabilities of a CAD model for malaria diagnosis from thin\nblood smear images across four sites. We explore strategies to enhance\ngeneralization, including fine-tuning and incremental learning. Our results\ndemonstrate that incorporating site-specific data significantly improves model\nperformance, paving the way for broader clinical application.\n","authors":["Louise Guillon","Soheib Biga","Axel Puyo","Grégoire Pasquier","Valentin Foucher","Yendoubé E. Kantchire","Stéphane E. Sossou","Ameyo M. 
Dorkenoo","Laurent Bonnardot","Marc Thellier","Laurence Lachaud","Renaud Piarroux"],"pdf_url":"https://arxiv.org/pdf/2408.08792v1.pdf","comment":"MICCAI 2024 AMAI Workshop, Accepted for presentation, Submitted\n Manuscript Version, 10 pages"},{"id":"http://arxiv.org/abs/2408.08790v1","updated":"2024-08-16T15:03:06Z","published":"2024-08-16T15:03:06Z","title":"A Disease-Specific Foundation Model Using Over 100K Fundus Images:\n Release and Validation for Abnormality and Multi-Disease Classification on\n Downstream Tasks","summary":" Artificial intelligence applied to retinal images offers significant\npotential for recognizing signs and symptoms of retinal conditions and\nexpediting the diagnosis of eye diseases and systemic disorders. However,\ndeveloping generalized artificial intelligence models for medical data often\nrequires a large number of labeled images representing various disease signs,\nand most models are typically task-specific, focusing on major retinal\ndiseases. In this study, we developed a Fundus-Specific Pretrained Model\n(Image+Fundus), a supervised artificial intelligence model trained to detect\nabnormalities in fundus images. A total of 57,803 images were used to develop\nthis pretrained model, which achieved superior performance across various\ndownstream tasks, indicating that our proposed model outperforms other general\nmethods. Our Image+Fundus model offers a generalized approach to improve model\nperformance while reducing the number of labeled datasets required.\nAdditionally, it provides more disease-specific insights into fundus images,\nwith visualizations generated by our model. These disease-specific foundation\nmodels are invaluable in enhancing the performance and efficiency of deep\nlearning models in the field of fundus imaging.\n","authors":["Boa Jang","Youngbin Ahn","Eun Kyung Choe","Chang Ki Yoon","Hyuk Jin Choi","Young-Gon Kim"],"pdf_url":"https://arxiv.org/pdf/2408.08790v1.pdf","comment":"10 pages, 4 figures"},{"id":"http://arxiv.org/abs/2408.08784v1","updated":"2024-08-16T14:56:17Z","published":"2024-08-16T14:56:17Z","title":"Multi-task Learning Approach for Intracranial Hemorrhage Prognosis","summary":" Prognosis after intracranial hemorrhage (ICH) is influenced by a complex\ninterplay between imaging and tabular data. Rapid and reliable prognosis are\ncrucial for effective patient stratification and informed treatment\ndecision-making. In this study, we aim to enhance image-based prognosis by\nlearning a robust feature representation shared between prognosis and the\nclinical and demographic variables most highly correlated with it. Our approach\nmimics clinical decision-making by reinforcing the model to learn valuable\nprognostic data embedded in the image. We propose a 3D multi-task image model\nto predict prognosis, Glasgow Coma Scale and age, improving accuracy and\ninterpretability. Our method outperforms current state-of-the-art baseline\nimage models, and demonstrates superior performance in ICH prognosis compared\nto four board-certified neuroradiologists using only CT scans as input. We\nfurther validate our model with interpretability saliency maps. 
Code is\navailable at https://github.com/MiriamCobo/MultitaskLearning_ICH_Prognosis.git.\n","authors":["Miriam Cobo","Amaia Pérez del Barrio","Pablo Menéndez Fernández-Miranda","Pablo Sanz Bellón","Lara Lloret Iglesias","Wilson Silva"],"pdf_url":"https://arxiv.org/pdf/2408.08784v1.pdf","comment":"16 pages"},{"id":"http://arxiv.org/abs/2312.01677v3","updated":"2024-08-16T14:53:09Z","published":"2023-12-04T06:59:55Z","title":"Multi-task Image Restoration Guided By Robust DINO Features","summary":" Multi-task image restoration has gained significant interest due to its\ninherent versatility and efficiency compared to its single-task counterpart.\nHowever, performance decline is observed with an increase in the number of\ntasks, primarily attributed to the restoration model's challenge in handling\ndifferent tasks with distinct natures at the same time. Thus, a perspective\nemerged aiming to explore the degradation-insensitive semantic commonalities\namong different degradation tasks. In this paper, we observe that the features\nof DINOv2 can effectively model semantic information and are independent of\ndegradation factors. Motivated by this observation, we propose\n\\mbox{\\textbf{DINO-IR}}, a multi-task image restoration approach leveraging\nrobust features extracted from DINOv2 to solve multi-task image restoration\nsimultaneously. We first propose a pixel-semantic fusion (PSF) module to\ndynamically fuse DINOV2's shallow features containing pixel-level information\nand deep features containing degradation-independent semantic information. To\nguide the restoration model with the features of DINOv2, we develop a\nDINO-Restore adaption and fusion module to adjust the channel of fused features\nfrom PSF and then integrate them with the features from the restoration model.\nBy formulating these modules into a unified deep model, we propose a DINO\nperception contrastive loss to constrain the model training. Extensive\nexperimental results demonstrate that our DINO-IR performs favorably against\nexisting multi-task image restoration approaches in various tasks by a large\nmargin. The source codes and trained models will be made available.\n","authors":["Xin Lin","Jingtong Yue","Kelvin C. K. Chan","Lu Qi","Chao Ren","Jinshan Pan","Ming-Hsuan Yang"],"pdf_url":"https://arxiv.org/pdf/2312.01677v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.01826v2","updated":"2024-08-16T14:45:14Z","published":"2024-08-03T17:18:26Z","title":"GLDiTalker: Speech-Driven 3D Facial Animation with Graph Latent\n Diffusion Transformer","summary":" Speech-driven talking head generation is an important but challenging task\nfor many downstream applications such as augmented reality. Existing methods\nhave achieved remarkable performance by utilizing autoregressive models or\ndiffusion models. However, most still suffer from modality inconsistencies,\nspecifically the misalignment between audio and mesh modalities, which causes\ninconsistencies in motion diversity and lip-sync accuracy. To address this\nissue, this paper introduces GLDiTalker, a novel speech-driven 3D facial\nanimation model that employs a Graph Latent Diffusion Transformer. The core\nidea behind GLDiTalker is that the audio-mesh modality misalignment can be\nresolved by diffusing the signal in a latent quantilized spatial-temporal\nspace. To achieve this, GLDiTalker builds upon a quantilized space-time\ndiffusion training pipeline, which consists of a Graph Enhanced Quantilized\nSpace Learning Stage and a Space-Time Powered Latent Diffusion Stage. 
The first\nstage ensures lip-sync accuracy, while the second stage enhances motion\ndiversity. Together, these stages enable GLDiTalker to generate temporally and\nspatially stable, realistic models. Extensive evaluations on several widely\nused benchmarks demonstrate that our method achieves superior performance\ncompared to existing methods.\n","authors":["Yihong Lin","Zhaoxin Fan","Lingyu Xiong","Liang Peng","Xiandong Li","Wenxiong Kang","Xianjia Wu","Songju Lei","Huang Xu"],"pdf_url":"https://arxiv.org/pdf/2408.01826v2.pdf","comment":"9 pages, 5 figures"},{"id":"http://arxiv.org/abs/2404.06128v2","updated":"2024-08-16T14:36:20Z","published":"2024-04-09T08:51:44Z","title":"Gaussian Pancakes: Geometrically-Regularized 3D Gaussian Splatting for\n Realistic Endoscopic Reconstruction","summary":" Within colorectal cancer diagnostics, conventional colonoscopy techniques\nface critical limitations, including a limited field of view and a lack of\ndepth information, which can impede the detection of precancerous lesions.\nCurrent methods struggle to provide comprehensive and accurate 3D\nreconstructions of the colonic surface, which can help minimize the missing\nregions and reinspection for pre-cancerous polyps. Addressing this, we\nintroduce 'Gaussian Pancakes', a method that leverages 3D Gaussian Splatting\n(3D GS) combined with a Recurrent Neural Network-based Simultaneous\nLocalization and Mapping (RNNSLAM) system. By introducing geometric and depth\nregularization into the 3D GS framework, our approach ensures more accurate\nalignment of Gaussians with the colon surface, resulting in smoother 3D\nreconstructions with novel viewing of detailed textures and structures.\nEvaluations across three diverse datasets show that Gaussian Pancakes enhances\nnovel view synthesis quality, surpassing current leading methods with an 18%\nboost in PSNR and a 16% improvement in SSIM. It also delivers over 100X faster\nrendering and more than 10X shorter training times, making it a practical tool\nfor real-time applications. Hence, this holds promise for achieving clinical\ntranslation for better detection and diagnosis of colorectal cancer.\n","authors":["Sierra Bonilla","Shuai Zhang","Dimitrios Psychogyios","Danail Stoyanov","Francisco Vasconcelos","Sophia Bano"],"pdf_url":"https://arxiv.org/pdf/2404.06128v2.pdf","comment":"12 pages, 5 figures"},{"id":"http://arxiv.org/abs/2408.08766v1","updated":"2024-08-16T14:22:02Z","published":"2024-08-16T14:22:02Z","title":"VF-NeRF: Learning Neural Vector Fields for Indoor Scene Reconstruction","summary":" Implicit surfaces via neural radiance fields (NeRF) have shown surprising\naccuracy in surface reconstruction. Despite their success in reconstructing\nrichly textured surfaces, existing methods struggle with planar regions with\nweak textures, which account for the majority of indoor scenes. In this paper,\nwe address indoor dense surface reconstruction by revisiting key aspects of\nNeRF in order to use the recently proposed Vector Field (VF) as the implicit\nrepresentation. VF is defined by the unit vector directed to the nearest\nsurface point. It therefore flips direction at the surface and equals the\nexplicit surface normals. Except for this flip, VF remains constant along\nplanar surfaces and provides a strong inductive bias in representing planar\nsurfaces. 
Concretely, we develop a novel density-VF relationship and a training\nscheme that allows us to learn VF via volume rendering. By doing this, VF-NeRF\ncan model large planar surfaces and sharp corners accurately. We show that,\nwhen depth cues are available, our method further improves and achieves\nstate-of-the-art results in reconstructing indoor scenes and rendering novel\nviews. We extensively evaluate VF-NeRF on indoor datasets and run ablations of\nits components.\n","authors":["Albert Gassol Puigjaner","Edoardo Mello Rella","Erik Sandström","Ajad Chhatkuli","Luc Van Gool"],"pdf_url":"https://arxiv.org/pdf/2408.08766v1.pdf","comment":"15 pages"},{"id":"http://arxiv.org/abs/2408.08753v1","updated":"2024-08-16T13:53:53Z","published":"2024-08-16T13:53:53Z","title":"PCP-MAE: Learning to Predict Centers for Point Masked Autoencoders","summary":" Masked autoencoder has been widely explored in point cloud self-supervised\nlearning, whereby the point cloud is generally divided into visible and masked\nparts. These methods typically include an encoder accepting visible patches\n(normalized) and corresponding patch centers (position) as input, with the\ndecoder accepting the output of the encoder and the centers (position) of the\nmasked parts to reconstruct each point in the masked patches. Then, the\npre-trained encoders are used for downstream tasks. In this paper, we show a\nmotivating empirical result that when directly feeding the centers of masked\npatches to the decoder without information from the encoder, it still\nreconstructs well. In other words, the centers of patches are important and the\nreconstruction objective does not necessarily rely on representations of the\nencoder, thus preventing the encoder from learning semantic representations.\nBased on this key observation, we propose a simple yet effective method, i.e.,\nlearning to Predict Centers for Point Masked AutoEncoders (PCP-MAE), which\nguides the model to learn to predict the significant centers and use the\npredicted centers to replace the directly provided centers. Specifically, we\npropose a Predicting Center Module (PCM) that shares parameters with the\noriginal encoder with extra cross-attention to predict centers. Our method is\nof high pre-training efficiency compared to other alternatives and achieves\ngreat improvement over Point-MAE, particularly outperforming it by 5.50%,\n6.03%, and 5.17% on three variants of ScanObjectNN. The code will be made\npublicly available.\n","authors":["Xiangdong Zhang","Shaofeng Zhang","Junchi Yan"],"pdf_url":"https://arxiv.org/pdf/2408.08753v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08751v1","updated":"2024-08-16T13:50:50Z","published":"2024-08-16T13:50:50Z","title":"Comparative Analysis of Generative Models: Enhancing Image Synthesis\n with VAEs, GANs, and Stable Diffusion","summary":" This paper examines three major generative modelling frameworks: Variational\nAutoencoders (VAEs), Generative Adversarial Networks (GANs), and Stable\nDiffusion models. VAEs are effective at learning latent representations but\nfrequently yield blurry results. GANs can generate realistic images but face\nissues such as mode collapse. Stable Diffusion models, while producing\nhigh-quality images with strong semantic coherence, are demanding in terms of\ncomputational resources. Additionally, the paper explores how incorporating\nGrounding DINO and Grounded SAM with Stable Diffusion improves image accuracy\nby utilising sophisticated segmentation and inpainting techniques. 
The analysis\nprovides guidance on selecting suitable models for various applications and highlights\nareas for further research.\n","authors":["Sanchayan Vivekananthan"],"pdf_url":"https://arxiv.org/pdf/2408.08751v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08747v1","updated":"2024-08-16T13:49:18Z","published":"2024-08-16T13:49:18Z","title":"MicroSSIM: Improved Structural Similarity for Comparing Microscopy Data","summary":" Microscopy is routinely used to image biological structures of interest. Due\nto imaging constraints, acquired images are typically low-SNR and contain\nnoise. Over the last few years, regression-based tasks like unsupervised\ndenoising and splitting have found utility in working with such noisy\nmicrographs. For evaluation, Structural Similarity (SSIM) is one of the most\npopular measures used in the field. For such tasks, the best evaluation would\nbe when both low-SNR noisy images and corresponding high-SNR clean images are\nobtained directly from a microscope. However, due to the following three\npeculiar properties of the microscopy data, we observe that SSIM is not well\nsuited to this data regime: (a) high-SNR micrographs have higher intensity\npixels as compared to low-SNR micrographs, (b) high-SNR micrographs have higher\nintensity pixels than found in natural images, images for which SSIM was\ndeveloped, and (c) a digitally configurable offset is added by the detector\npresent inside the microscope. We show that SSIM components behave unexpectedly\nwhen the prediction generated from low-SNR input is compared with the\ncorresponding high-SNR data. We explain this behavior by introducing the\nphenomenon of saturation, where the value of SSIM components becomes less\nsensitive to (dis)similarity between the images. We introduce microSSIM, a\nvariant of SSIM, which overcomes the above-discussed issues. We justify the\nsoundness and utility of microSSIM using theoretical and empirical arguments\nand show the utility of microSSIM on two tasks: unsupervised denoising and\njoint image splitting with unsupervised denoising. Since our formulation can be\napplied to a broad family of SSIM-based measures, we also introduce MicroMS3IM,\na microscopy-specific variation of MS-SSIM. The source code and Python package\nare available at https://github.com/juglab/MicroSSIM.\n","authors":["Ashesh Ashesh","Joran Deschamps","Florian Jug"],"pdf_url":"https://arxiv.org/pdf/2408.08747v1.pdf","comment":"Accepted at BIC workshop, ECCV 24"},{"id":"http://arxiv.org/abs/2408.08742v1","updated":"2024-08-16T13:41:34Z","published":"2024-08-16T13:41:34Z","title":"A lifted Bregman strategy for training unfolded proximal neural network\n Gaussian denoisers","summary":" Unfolded proximal neural networks (PNNs) form a family of methods that\ncombines deep learning and proximal optimization approaches. They consist in\ndesigning a neural network for a specific task by unrolling a proximal\nalgorithm for a fixed number of iterations, where linearities can be learned\nfrom a prior training procedure. PNNs have been shown to be more robust than\ntraditional deep learning approaches while reaching at least as good\nperformance, in particular in computational imaging. However, training PNNs\nstill depends on the efficiency of available training algorithms. In this work,\nwe propose a lifted training formulation based on Bregman distances for\nunfolded PNNs. 
Leveraging the deterministic mini-batch block-coordinate\nforward-backward method, we design a bespoke computational strategy beyond\ntraditional back-propagation methods for solving the resulting learning problem\nefficiently. We assess the behaviour of the proposed training approach for PNNs\nthrough numerical simulations on image denoising, considering a denoising PNN\nwhose structure is based on dual proximal-gradient iterations.\n","authors":["Xiaoyu Wang","Martin Benning","Audrey Repetti"],"pdf_url":"https://arxiv.org/pdf/2408.08742v1.pdf","comment":"2024 IEEE International Workshop on Machine Learning for Signal\n Processing, Sept. 22--25, 2024, London, UK"},{"id":"http://arxiv.org/abs/2408.08736v1","updated":"2024-08-16T13:35:52Z","published":"2024-08-16T13:35:52Z","title":"Task-Aware Dynamic Transformer for Efficient Arbitrary-Scale Image\n Super-Resolution","summary":" Arbitrary-scale super-resolution (ASSR) aims to learn a single model for\nimage super-resolution at arbitrary magnifying scales. Existing ASSR networks\ntypically comprise an off-the-shelf scale-agnostic feature extractor and an\narbitrary scale upsampler. These feature extractors often use fixed network\narchitectures to address different ASSR inference tasks, each of which is\ncharacterized by an input image and an upsampling scale. However, this\noverlooks the difficulty variance of super-resolution on different inference\nscenarios, where simple images or small SR scales could be resolved with less\ncomputational effort than difficult images or large SR scales. To tackle this\ndifficulty variability, in this paper, we propose a Task-Aware Dynamic\nTransformer (TADT) as an input-adaptive feature extractor for efficient image\nASSR. Our TADT consists of a multi-scale feature extraction backbone built upon\ngroups of Multi-Scale Transformer Blocks (MSTBs) and a Task-Aware Routing\nController (TARC). The TARC predicts the inference paths within feature\nextraction backbone, specifically selecting MSTBs based on the input images and\nSR scales. The prediction of inference path is guided by a new loss function to\ntrade-off the SR accuracy and efficiency. Experiments demonstrate that, when\nworking with three popular arbitrary-scale upsamplers, our TADT achieves\nstate-of-the-art ASSR performance when compared with mainstream feature\nextractors, but with relatively fewer computational costs. The code will be\npublicly released.\n","authors":["Tianyi Xu","Yiji Zhou","Xiaotao Hu","Kai Zhang","Anran Zhang","Xingye Qiu","Jun Xu"],"pdf_url":"https://arxiv.org/pdf/2408.08736v1.pdf","comment":"ECAI 2024"},{"id":"http://arxiv.org/abs/2408.08723v1","updated":"2024-08-16T13:11:22Z","published":"2024-08-16T13:11:22Z","title":"Correspondence-Guided SfM-Free 3D Gaussian Splatting for NVS","summary":" Novel View Synthesis (NVS) without Structure-from-Motion (SfM) pre-processed\ncamera poses--referred to as SfM-free methods--is crucial for promoting rapid\nresponse capabilities and enhancing robustness against variable operating\nconditions. Recent SfM-free methods have integrated pose optimization,\ndesigning end-to-end frameworks for joint camera pose estimation and NVS.\nHowever, most existing works rely on per-pixel image loss functions, such as L2\nloss. In SfM-free methods, inaccurate initial poses lead to misalignment issue,\nwhich, under the constraints of per-pixel image loss functions, results in\nexcessive gradients, causing unstable optimization and poor convergence for\nNVS. 
In this study, we propose a correspondence-guided SfM-free 3D Gaussian\nsplatting for NVS. We use correspondences between the target and the rendered\nresult to achieve better pixel alignment, facilitating the optimization of\nrelative poses between frames. We then apply the learned poses to optimize the\nentire scene. Each 2D screen-space pixel is associated with its corresponding\n3D Gaussians through approximated surface rendering to facilitate gradient back\npropagation. Experimental results underline the superior performance and time\nefficiency of the proposed approach compared to the state-of-the-art baselines.\n","authors":["Wei Sun","Xiaosong Zhang","Fang Wan","Yanzhao Zhou","Yuan Li","Qixiang Ye","Jianbin Jiao"],"pdf_url":"https://arxiv.org/pdf/2408.08723v1.pdf","comment":"arXiv admin note: text overlap with arXiv:2312.07504 by other authors"},{"id":"http://arxiv.org/abs/2401.17542v3","updated":"2024-08-16T12:46:03Z","published":"2024-01-31T02:09:21Z","title":"A Medical Data-Effective Learning Benchmark for Highly Efficient\n Pre-training of Foundation Models","summary":" Foundation models, pre-trained on massive datasets, have achieved\nunprecedented generalizability. However, is it truly necessary to involve such\nvast amounts of data in pre-training, consuming extensive computational\nresources? This paper introduces data-effective learning, aiming to use data in\nthe most impactful way to pre-train foundation models. This involves strategies\nthat focus on data quality rather than quantity, ensuring the data used for\ntraining has high informational value. Data-effective learning plays a profound\nrole in accelerating foundation model training, reducing computational costs,\nand saving data storage, which is very important as the volume of medical data\nin recent years has grown beyond many people's expectations. However, due to\nthe lack of standards and comprehensive benchmarks, research on medical\ndata-effective learning is poorly studied. To address this gap, our paper\nintroduces a comprehensive benchmark specifically for evaluating data-effective\nlearning in the medical field. This benchmark includes a dataset with millions\nof data samples from 31 medical centers (DataDEL), a baseline method for\ncomparison (MedDEL), and a new evaluation metric (NormDEL) to objectively\nmeasure data-effective learning performance. Our extensive experimental results\nshow the baseline MedDEL can achieve performance comparable to the original\nlarge dataset with only 5% of the data. Establishing such an open\ndata-effective learning benchmark is crucial for the medical foundation model\nresearch community because it facilitates efficient data use, promotes\ncollaborative breakthroughs, and fosters the development of cost-effective,\nscalable, and impactful healthcare solutions.\n","authors":["Wenxuan Yang","Weimin Tan","Yuqi Sun","Bo Yan"],"pdf_url":"https://arxiv.org/pdf/2401.17542v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08708v1","updated":"2024-08-16T12:43:11Z","published":"2024-08-16T12:43:11Z","title":"Decoupling Feature Representations of Ego and Other Modalities for\n Incomplete Multi-modal Brain Tumor Segmentation","summary":" Multi-modal brain tumor segmentation typically involves four magnetic\nresonance imaging (MRI) modalities, while incomplete modalities significantly\ndegrade performance. Existing solutions employ explicit or implicit modality\nadaptation, aligning features across modalities or learning a fused feature\nrobust to modality incompleteness. 
They share a common goal of encouraging each\nmodality to express both itself and the others. However, the two expression\nabilities are entangled as a whole in a seamless feature space, resulting in\nprohibitive learning burdens. In this paper, we propose DeMoSeg to enhance the\nmodality adaptation by Decoupling the task of representing the ego and other\nModalities for robust incomplete multi-modal Segmentation. The decoupling is\nextremely lightweight, simply using two convolutions to map each modality onto\nfour feature sub-spaces. The first sub-space expresses the modality itself (Self-feature),\nwhile the remaining sub-spaces substitute for other modalities\n(Mutual-features). The Self- and Mutual-features interactively guide each other\nthrough a carefully designed Channel-wise Sparse Self-Attention (CSSA). After\nthat, Radiologist-mimic Cross-modality expression Relationships (RCR) are\nintroduced to have the available modalities provide their Self-features and also `lend'\ntheir Mutual-features to compensate for the absent ones by exploiting\nclinical prior knowledge. The benchmark results on BraTS2020, BraTS2018 and\nBraTS2015 verify DeMoSeg's superiority thanks to the alleviated modality\nadaptation difficulty. Concretely, for BraTS2020, DeMoSeg increases Dice by at\nleast 0.92%, 2.95% and 4.95% on whole tumor, tumor core and enhanced tumor\nregions, respectively, compared to other state-of-the-art methods. Codes are at\nhttps://github.com/kk42yy/DeMoSeg\n","authors":["Kaixiang Yang","Wenqi Shan","Xudong Li","Xuan Wang","Xikai Yang","Xi Wang","Pheng-Ann Heng","Qiang Li","Zhiwei Wang"],"pdf_url":"https://arxiv.org/pdf/2408.08708v1.pdf","comment":"8 pages, 4 figures"},{"id":"http://arxiv.org/abs/2408.08704v1","updated":"2024-08-16T12:32:44Z","published":"2024-08-16T12:32:44Z","title":"Beyond the Hype: A dispassionate look at vision-language models in\n medical scenario","summary":" Recent advancements in Large Vision-Language Models (LVLMs) have demonstrated\nremarkable capabilities across diverse tasks, garnering significant attention\nin AI communities. However, their performance and reliability in specialized\ndomains such as medicine remain insufficiently assessed. In particular, most\nassessments over-concentrate on evaluating VLMs based on simple Visual Question\nAnswering (VQA) on multi-modality data, while ignoring the in-depth\ncharacteristics of LVLMs. In this study, we introduce RadVUQA, a novel\nRadiological Visual Understanding and Question Answering benchmark, to\ncomprehensively evaluate existing LVLMs. RadVUQA mainly validates LVLMs across\nfive dimensions: 1) Anatomical understanding, assessing the models' ability to\nvisually identify biological structures; 2) Multimodal comprehension, which\ninvolves the capability of interpreting linguistic and visual instructions to\nproduce desired outcomes; 3) Quantitative and spatial reasoning, evaluating the\nmodels' spatial awareness and proficiency in combining quantitative analysis\nwith visual and linguistic information; 4) Physiological knowledge, measuring\nthe models' capability to comprehend functions and mechanisms of organs and\nsystems; and 5) Robustness, which assesses the models' capabilities against\nunharmonised and synthetic data. The results indicate that both generalized\nLVLMs and medical-specific LVLMs have critical deficiencies with weak\nmultimodal comprehension and quantitative reasoning capabilities. 
Our findings\nreveal the large gap between existing LVLMs and clinicians, highlighting the\nurgent need for more robust and intelligent LVLMs. The code and dataset will be\navailable after the acceptance of this paper.\n","authors":["Yang Nan","Huichi Zhou","Xiaodan Xing","Guang Yang"],"pdf_url":"https://arxiv.org/pdf/2408.08704v1.pdf","comment":"10 pages"},{"id":"http://arxiv.org/abs/2312.04960v3","updated":"2024-08-16T12:31:38Z","published":"2023-12-08T10:50:02Z","title":"MIMIR: Masked Image Modeling for Mutual Information-based Adversarial\n Robustness","summary":" Vision Transformers (ViTs) achieve excellent performance in various tasks,\nbut they are also vulnerable to adversarial attacks. Building robust ViTs is\nhighly dependent on dedicated Adversarial Training (AT) strategies. However,\ncurrent ViTs' adversarial training only employs well-established training\napproaches from convolutional neural network (CNN) training, where pre-training\nprovides the basis for AT fine-tuning with the additional help of tailored data\naugmentations. In this paper, we take a closer look at the adversarial\nrobustness of ViTs by providing a novel theoretical Mutual Information (MI)\nanalysis in its autoencoder-based self-supervised pre-training. Specifically,\nwe show that MI between the adversarial example and its latent representation\nin ViT-based autoencoders should be constrained by utilizing the MI bounds.\nBased on this finding, we propose a masked autoencoder-based pre-training\nmethod, MIMIR, that employs an MI penalty to facilitate the adversarial\ntraining of ViTs. Extensive experiments show that MIMIR outperforms\nstate-of-the-art adversarially trained ViTs on benchmark datasets with higher\nnatural and robust accuracy, indicating that ViTs can substantially benefit\nfrom exploiting MI. In addition, we consider two adaptive attacks by assuming\nthat the adversary is aware of the MIMIR design, which further verifies the\nprovided robustness.\n","authors":["Xiaoyun Xu","Shujian Yu","Zhuoran Liu","Stjepan Picek"],"pdf_url":"https://arxiv.org/pdf/2312.04960v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08703v1","updated":"2024-08-16T12:30:29Z","published":"2024-08-16T12:30:29Z","title":"TsCA: On the Semantic Consistency Alignment via Conditional Transport\n for Compositional Zero-Shot Learning","summary":" Compositional Zero-Shot Learning (CZSL) aims to recognize novel\n\\textit{state-object} compositions by leveraging the shared knowledge of their\nprimitive components. Despite considerable progress, effectively calibrating\nthe bias between semantically similar multimodal representations, as well as\ngeneralizing pre-trained knowledge to novel compositional contexts, remains an\nenduring challenge. In this paper, our interest is to revisit the conditional\ntransport (CT) theory and its homology to the visual-semantics interaction in\nCZSL and further, propose a novel Trisets Consistency Alignment framework\n(dubbed TsCA) that well-addresses these issues. Concretely, we utilize three\ndistinct yet semantically homologous sets, i.e., patches, primitives, and\ncompositions, to construct pairwise CT costs to minimize their semantic\ndiscrepancies. To further ensure the consistency transfer within these sets, we\nimplement a cycle-consistency constraint that refines the learning by\nguaranteeing the feature consistency of the self-mapping during transport flow,\nregardless of modality. 
Moreover, we extend the CT plans to an open-world\nsetting, which enables the model to effectively filter out unfeasible pairs,\nthereby speeding up the inference as well as increasing the accuracy. Extensive\nexperiments are conducted to verify the effectiveness of the proposed method.\n","authors":["Miaoge Li","Jingcai Guo","Richard Yi Da Xu","Dongsheng Wang","Xiaofeng Cao","Song Guo"],"pdf_url":"https://arxiv.org/pdf/2408.08703v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08700v1","updated":"2024-08-16T12:27:46Z","published":"2024-08-16T12:27:46Z","title":"HyCoT: Hyperspectral Compression Transformer with an Efficient Training\n Strategy","summary":" The development of learning-based hyperspectral image (HSI) compression\nmodels has recently attracted significant interest. Existing models\npredominantly utilize convolutional filters, which capture only local\ndependencies. Furthermore, they often incur high training costs and exhibit\nsubstantial computational complexity. To address these limitations, in this\npaper we propose Hyperspectral Compression Transformer (HyCoT) that is a\ntransformer-based autoencoder for pixelwise HSI compression. Additionally, we\nintroduce an efficient training strategy to accelerate the training process.\nExperimental results on the HySpecNet-11k dataset demonstrate that HyCoT\nsurpasses the state-of-the-art across various compression ratios by over 1 dB\nwith significantly reduced computational requirements. Our code and pre-trained\nweights are publicly available at https://git.tu-berlin.de/rsim/hycot .\n","authors":["Martin Hermann Paul Fuchs","Behnood Rasti","Begüm Demir"],"pdf_url":"https://arxiv.org/pdf/2408.08700v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.02504v2","updated":"2024-08-16T12:24:54Z","published":"2023-02-05T22:51:27Z","title":"Motion-compensated MR CINE reconstruction with reconstruction-driven\n motion estimation","summary":" In cardiac CINE, motion-compensated MR reconstruction (MCMR) is an effective\napproach to address highly undersampled acquisitions by incorporating motion\ninformation between frames. In this work, we propose a novel perspective for\naddressing the MCMR problem and a more integrated and efficient solution to the\nMCMR field. Contrary to state-of-the-art (SOTA) MCMR methods which break the\noriginal problem into two sub-optimization problems, i.e. motion estimation and\nreconstruction, we formulate this problem as a single entity with one single\noptimization. Our approach is unique in that the motion estimation is directly\ndriven by the ultimate goal, reconstruction, but not by the canonical\nmotion-warping loss (similarity measurement between motion-warped images and\ntarget images). We align the objectives of motion estimation and\nreconstruction, eliminating the drawbacks of artifacts-affected motion\nestimation and therefore error-propagated reconstruction. Further, we can\ndeliver high-quality reconstruction and realistic motion without applying any\nregularization/smoothness loss terms, circumventing the non-trivial weighting\nfactor tuning. We evaluate our method on two datasets: 1) an in-house acquired\n2D CINE dataset for the retrospective study and 2) the public OCMR cardiac\ndataset for the prospective study. 
The conducted experiments indicate that the\nproposed MCMR framework can deliver artifact-free motion estimation and\nhigh-quality MR images even for imaging accelerations up to 20x, outperforming\nSOTA non-MCMR and MCMR methods in both qualitative and quantitative evaluation\nacross all experiments. The code is available at\nhttps://github.com/JZPeterPan/MCMR-Recon-Driven-Motion.\n","authors":["Jiazhen Pan","Wenqi Huang","Daniel Rueckert","Thomas Küstner","Kerstin Hammernik"],"pdf_url":"https://arxiv.org/pdf/2302.02504v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08682v1","updated":"2024-08-16T11:55:44Z","published":"2024-08-16T11:55:44Z","title":"LLM-PCGC: Large Language Model-based Point Cloud Geometry Compression","summary":" The key to effective point cloud compression is to obtain a robust context\nmodel consistent with complex 3D data structures. Recently, the advancement of\nlarge language models (LLMs) has highlighted their capabilities not only as\npowerful generators for in-context learning and generation but also as\neffective compressors. These dual attributes of LLMs make them particularly\nwell-suited to meet the demands of data compression. Therefore, this paper\nexplores the potential of using LLM for compression tasks, focusing on lossless\npoint cloud geometry compression (PCGC) experiments. However, applying LLM\ndirectly to PCGC tasks presents some significant challenges, i.e., LLM does not\nunderstand the structure of the point cloud well, and it is a difficult task to\nfill the gap between text and point cloud through text description, especially\nfor large complicated and small shapeless point clouds. To address these\nproblems, we introduce a novel architecture, namely the Large Language\nModel-based Point Cloud Geometry Compression (LLM-PCGC) method, using LLM to\ncompress point cloud geometry information without any text description or\naligning operation. By utilizing different adaptation techniques for\ncross-modality representation alignment and semantic consistency, including\nclustering, K-tree, token mapping invariance, and Low Rank Adaptation (LoRA),\nthe proposed method can translate LLM to a compressor/generator for point\ncloud. To the best of our knowledge, this is the first structure to employ LLM\nas a compressor for point cloud data. Experiments demonstrate that the LLM-PCGC\noutperforms the other existing methods significantly, by achieving -40.213% bit\nrate reduction compared to the reference software of MPEG Geometry-based Point\nCloud Compression (G-PCC) standard, and by achieving -2.267% bit rate reduction\ncompared to the state-of-the-art learning-based method.\n","authors":["Yuqi Ye","Wei Gao"],"pdf_url":"https://arxiv.org/pdf/2408.08682v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08191v2","updated":"2024-08-16T11:54:53Z","published":"2024-08-15T14:49:12Z","title":"Beyond Full Label: Single-Point Prompt for Infrared Small Target Label\n Generation","summary":" In this work, we make the first attempt to construct a learning-based\nsingle-point annotation paradigm for infrared small target label generation\n(IRSTLG). Our intuition is that label generation requires just one more point\nprompt than target detection: IRSTLG can be regarded as an infrared small\ntarget detection (IRSTD) task with the target location hint. Based on this\ninsight, we introduce an energy double guided single-point prompt (EDGSP)\nframework, which adeptly transforms the target detection network into a refined\nlabel generation method. 
Specifically, the proposed EDGSP includes: 1) target\nenergy initialization (TEI) to create a foundational outline for sufficient\nshape evolution of the pseudo label, 2) double prompt embedding (DPE) for rapid\nlocalization of regions of interest and reinforcement of individual differences\nto avoid label adhesion, and 3) bounding box-based matching (BBM) to eliminate\nfalse alarms. Experimental results show that pseudo labels generated by three\nbaselines equipped with EDGSP achieve 100% object-level probability of\ndetection (Pd) and 0% false-alarm rate (Fa) on the SIRST, NUDT-SIRST, and IRSTD-1k\ndatasets, with a pixel-level intersection over union (IoU) improvement of\n13.28% over state-of-the-art (SOTA) label generation methods. In the practical\napplication of downstream IRSTD, EDGSP realizes, for the first time, a\nsingle-point generated pseudo mask beyond the full label. Even with coarse\nsingle-point annotations, it still achieves 99.5% of the performance of full labeling.\n","authors":["Shuai Yuan","Hanlin Qin","Renke Kou","Xiang Yan","Zechuan Li","Chenxu Peng","Abd-Krim Seghouane"],"pdf_url":"https://arxiv.org/pdf/2408.08191v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08671v1","updated":"2024-08-16T11:29:33Z","published":"2024-08-16T11:29:33Z","title":"Towards Physical World Backdoor Attacks against Skeleton Action\n Recognition","summary":" Skeleton Action Recognition (SAR) has attracted significant interest for its\nefficient representation of the human skeletal structure. Despite its\nadvancements, recent studies have raised security concerns in SAR models,\nparticularly their vulnerability to adversarial attacks. However, such\nstrategies are limited to digital scenarios and ineffective in physical\nattacks, limiting their real-world applicability. To investigate the\nvulnerabilities of SAR in the physical world, we introduce the Physical\nSkeleton Backdoor Attacks (PSBA), the first exploration of physical backdoor\nattacks against SAR. Considering the practicalities of physical execution, we\nintroduce a novel trigger implantation method that integrates infrequent and\nimperceptible actions as triggers into the original skeleton data. By\nincorporating a minimal amount of this manipulated data into the training set,\nPSBA enables the system to misclassify any skeleton sequence into the target\nclass when the trigger action is present. We examine the resilience of PSBA in\nboth poisoned and clean-label scenarios, demonstrating its efficacy across a\nrange of datasets, poisoning ratios, and model architectures. Additionally, we\nintroduce a trigger-enhancing strategy to strengthen attack performance in the\nclean-label setting. The robustness of PSBA is tested against three distinct\nbackdoor defenses, and the stealthiness of PSBA is evaluated using two\nquantitative metrics. Furthermore, by employing a Kinect V2 camera, we compile\na dataset of human actions from the real world to mimic physical attack\nsituations, with our findings confirming the effectiveness of our proposed\nattacks. 
Our project website can be found at\nhttps://qichenzheng.github.io/psba-website.\n","authors":["Qichen Zheng","Yi Yu","Siyuan Yang","Jun Liu","Kwok-Yan Lam","Alex Kot"],"pdf_url":"https://arxiv.org/pdf/2408.08671v1.pdf","comment":"Accepted by ECCV 2024"},{"id":"http://arxiv.org/abs/2408.08670v1","updated":"2024-08-16T11:27:52Z","published":"2024-08-16T11:27:52Z","title":"Adaptive Layer Selection for Efficient Vision Transformer Fine-Tuning","summary":" Recently, foundation models based on Vision Transformers (ViTs) have become\nwidely available. However, their fine-tuning process is highly\nresource-intensive, and it hinders their adoption in several edge or low-energy\napplications. To this end, in this paper we introduce an efficient fine-tuning\nmethod for ViTs called $\\textbf{ALaST}$ ($\\textit{Adaptive Layer Selection\nFine-Tuning for Vision Transformers}$) to speed up the fine-tuning process\nwhile reducing computational cost, memory load, and training time. Our approach\nis based on the observation that not all layers are equally critical during\nfine-tuning, and their importance varies depending on the current mini-batch.\nTherefore, at each fine-tuning step, we adaptively estimate the importance of\nall layers and we assign what we call ``compute budgets'' accordingly. Layers\nthat were allocated lower budgets are either trained with a reduced number of\ninput tokens or kept frozen. Freezing a layer reduces the computational cost\nand memory usage by preventing updates to its weights, while discarding tokens\nremoves redundant data, speeding up processing and reducing memory\nrequirements. We show that this adaptive compute allocation enables a\nnearly-optimal schedule for distributing computational resources across layers,\nresulting in substantial reductions in training time (up to 1.5x), FLOPs (up to\n2x), and memory load (up to 2x) compared to traditional full fine-tuning\napproaches. Additionally, it can be successfully combined with other\nparameter-efficient fine-tuning methods, such as LoRA.\n","authors":["Alessio Devoto","Federico Alvetreti","Jary Pomponi","Paolo Di Lorenzo","Pasquale Minervini","Simone Scardapane"],"pdf_url":"https://arxiv.org/pdf/2408.08670v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08665v1","updated":"2024-08-16T11:15:29Z","published":"2024-08-16T11:15:29Z","title":"QMambaBSR: Burst Image Super-Resolution with Query State Space Model","summary":" Burst super-resolution aims to reconstruct high-resolution images with higher\nquality and richer details by fusing the sub-pixel information from multiple\nburst low-resolution frames. In BurstSR, the key challenge lies in extracting\nsub-pixel details complementary to the base frame's content while simultaneously\nsuppressing high-frequency noise disturbance. Existing methods attempt to\nextract sub-pixels by modeling inter-frame relationships frame by frame while\noverlooking the mutual correlations among multi-current frames and neglecting\nthe intra-frame interactions, leading to inaccurate and noisy sub-pixels for\nbase frame super-resolution. Further, existing methods mainly employ static\nupsampling with fixed parameters to improve spatial resolution for all scenes,\nfailing to perceive the sub-pixel distribution difference across multiple\nframes and to balance the fusion weights of different frames, resulting in\nover-smoothed details and artifacts. 
To address these limitations, we introduce\na novel Query Mamba Burst Super-Resolution (QMambaBSR) network, which\nincorporates a Query State Space Model (QSSM) and Adaptive Up-sampling module\n(AdaUp). Specifically, based on the observation that sub-pixels have consistent\nspatial distribution while random noise is inconsistently distributed, a novel\nQSSM is proposed to efficiently extract sub-pixels through inter-frame querying\nand intra-frame scanning while mitigating noise interference in a single step.\nMoreover, AdaUp is designed to dynamically adjust the upsampling kernel based\non the spatial distribution of multi-frame sub-pixel information in the\ndifferent burst scenes, thereby facilitating the reconstruction of the spatial\narrangement of high-resolution details. Extensive experiments on four popular\nsynthetic and real-world benchmarks demonstrate that our method achieves a new\nstate-of-the-art performance.\n","authors":["Xin Di","Long Peng","Peizhe Xia","Wenbo Li","Renjing Pei","Yang Cao","Yang Wang","Zheng-Jun Zha"],"pdf_url":"https://arxiv.org/pdf/2408.08665v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.09281v2","updated":"2024-08-16T11:10:24Z","published":"2024-03-14T11:08:33Z","title":"CLIP-EBC: CLIP Can Count Accurately through Enhanced Blockwise\n Classification","summary":" We propose CLIP-EBC, the first fully CLIP-based model for accurate crowd\ndensity estimation. While the CLIP model has demonstrated remarkable success in\naddressing recognition tasks such as zero-shot image classification, its\npotential for counting has been largely unexplored due to the inherent\nchallenges in transforming a regression problem, such as counting, into a\nrecognition task. In this work, we investigate and enhance CLIP's ability to\ncount, focusing specifically on the task of estimating crowd sizes from images.\nExisting classification-based crowd-counting frameworks have significant\nlimitations, including the quantization of count values into bordering\nreal-valued bins and the sole focus on classification errors. These practices\nresult in label ambiguity near the shared borders and inaccurate prediction of\ncount values. Hence, directly applying CLIP within these frameworks may yield\nsuboptimal performance.\n To address these challenges, we first propose the Enhanced Blockwise\nClassification (EBC) framework. Unlike previous methods, EBC utilizes\ninteger-valued bins, effectively reducing ambiguity near bin boundaries.\nAdditionally, it incorporates a regression loss based on density maps to\nimprove the prediction of count values. Within our backbone-agnostic EBC\nframework, we then introduce CLIP-EBC to fully leverage CLIP's recognition\ncapabilities for this task. Extensive experiments demonstrate the effectiveness\nof EBC and the competitive performance of CLIP-EBC. Specifically, our EBC\nframework can improve existing classification-based methods by up to 44.5% on\nthe UCF-QNRF dataset, and CLIP-EBC achieves state-of-the-art performance on the\nNWPU-Crowd test set, with an MAE of 58.2 and an RMSE of 268.5, representing\nimprovements of 8.6% and 13.3% over the previous best method, STEERER. 
The code\nand weights are available at https://github.com/Yiming-M/CLIP-EBC.\n","authors":["Yiming Ma","Victor Sanchez","Tanaya Guha"],"pdf_url":"https://arxiv.org/pdf/2403.09281v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.07459v2","updated":"2024-08-16T10:53:01Z","published":"2024-05-13T04:21:00Z","title":"DualFocus: Integrating Plausible Descriptions in Text-based Person\n Re-identification","summary":" Text-based Person Re-identification (TPR) aims to retrieve specific\nindividual images from datasets based on textual descriptions. Existing TPR\nmethods primarily focus on recognizing explicit and positive characteristics,\noften overlooking the role of negative descriptions. This oversight can lead to\nfalse positives: images that meet positive criteria but should be excluded based\non negative descriptions. To address these limitations, we introduce DualFocus,\na unified framework that integrates plausible descriptions to enhance the\ninterpretative accuracy of vision-language models in TPR tasks. DualFocus\nleverages Dual (Positive/Negative) Attribute Prompt Learning (DAPL), which\nincorporates Dual Image-Attribute Contrastive (DIAC) Learning and Sensitive\nImage-Attributes Matching (SIAM) Learning, enabling the detection of\nnon-existent attributes and reducing false positives. To achieve a balance\nbetween coarse and fine-grained alignment of visual and textual embeddings, we\npropose the Dynamic Tokenwise Similarity (DTS) loss, which refines the\nrepresentation of both matching and non-matching descriptions, thereby\nimproving the matching process through detailed and adaptable similarity\nassessments. In comprehensive experiments on CUHK-PEDES, ICFG-PEDES, and\nRSTPReid, DualFocus demonstrates superior performance over state-of-the-art\nmethods, significantly enhancing both precision and robustness in TPR.\n","authors":["Yuchuan Deng","Zhanpeng Hu","Jiakun Han","Chuang Deng","Qijun Zhao"],"pdf_url":"https://arxiv.org/pdf/2405.07459v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.19821v2","updated":"2024-08-16T10:23:55Z","published":"2024-07-29T09:14:21Z","title":"Distilling High Diagnostic Value Patches for Whole Slide Image\n Classification Using Attention Mechanism","summary":" Multiple Instance Learning (MIL) has garnered widespread attention in the\nfield of Whole Slide Image (WSI) classification as it replaces pixel-level\nmanual annotation with diagnostic reports as labels, significantly reducing\nlabor costs. Recent research has shown that bag-level MIL methods often yield\nbetter results because they can consider all patches of the WSI as a whole.\nHowever, a drawback of such methods is the incorporation of more redundant\npatches, leading to interference. To extract patches with high diagnostic value\nwhile excluding interfering patches to address this issue, we developed an\nattention-based feature distillation multi-instance learning (AFD-MIL)\napproach. This approach proposes the exclusion of redundant patches as a\npreprocessing operation in weakly supervised learning, directly mitigating\ninterference from extensive noise. It also pioneers the use of attention\nmechanisms to distill features with high diagnostic value, as opposed to the\ntraditional practice of indiscriminately and forcibly integrating all patches.\nAdditionally, we introduced global loss optimization to finely control the\nfeature distillation module. AFD-MIL is orthogonal to many existing MIL\nmethods, leading to consistent performance improvements. 
This approach has\nsurpassed the current state-of-the-art method, achieving 91.47% ACC (accuracy)\nand 94.29% AUC (area under the curve) on Camelyon16 (Camelyon Challenge\n2016, breast cancer), and 93.33% ACC and 98.17% AUC on TCGA-NSCLC (The\nCancer Genome Atlas Program: non-small cell lung cancer). Different feature\ndistillation methods were used for the two datasets, tailored to the specific\ndiseases, thereby improving performance and interpretability.\n","authors":["Tianhang Nan","Hao Quan","Yong Ding","Xingyu Li","Kai Yang","Xiaoyu Cui"],"pdf_url":"https://arxiv.org/pdf/2407.19821v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08647v1","updated":"2024-08-16T10:22:54Z","published":"2024-08-16T10:22:54Z","title":"Modeling the Neonatal Brain Development Using Implicit Neural\n Representations","summary":" The human brain undergoes rapid development during the third trimester of\npregnancy. In this work, we model the neonatal development of the infant brain\nin this age range. As a basis, we use MR images of preterm- and term-birth\nneonates from the developing human connectome project (dHCP). We propose a\nneural network, specifically an implicit neural representation (INR), to\npredict 2D- and 3D images of varying time points. In order to model a\nsubject-specific development process, it is necessary to disentangle the age\nfrom the subjects' identity in the latent space of the INR. We propose two\nmethods, Subject Specific Latent Vectors (SSL) and Stochastic Global Latent\nAugmentation (SGLA), enabling this disentanglement. We perform an analysis of\nthe results and compare our proposed model to an age-conditioned denoising\ndiffusion model as a baseline. We also show that our method can be applied in a\nmemory-efficient way, which is especially important for 3D data.\n","authors":["Florentin Bieder","Paul Friedrich","Hélène Corbaz","Alicia Durrer","Julia Wolleb","Philippe C. Cattin"],"pdf_url":"https://arxiv.org/pdf/2408.08647v1.pdf","comment":"Preprint, Accepted for PRIME MICCAI 2024"},{"id":"http://arxiv.org/abs/2408.08645v1","updated":"2024-08-16T10:21:13Z","published":"2024-08-16T10:21:13Z","title":"Extracting polygonal footprints in off-nadir images with Segment\n Anything Model","summary":" Building Footprint Extraction (BFE) in off-nadir aerial images often relies\non roof segmentation and roof-to-footprint offset prediction, then dragging\nthe roof to the footprint via the offset. However, the results from this multi-stage\ninference are not applicable in data production, because of the low quality of\nthe predicted masks. To solve this problem, we propose OBMv2 in this\npaper, which supports both end-to-end and promptable polygonal footprint\nprediction. Different from OBM, OBMv2 uses a newly proposed Self Offset\nAttention (SOFA) to bridge the performance gap between bungalows and skyscrapers,\nrealizing truly end-to-end footprint polygon prediction without\npostprocessing. To fully use the information contained in roof masks, building masks and\noffsets, we propose a Multi-level Information SyStem (MISS) for footprint\nprediction, with which OBMv2 can predict footprints even with insufficient\npredictions. Additionally, to squeeze information from the same model, we were\ninspired by Retrieval-Augmented Generation (RAG) in Natural Language Processing\nand propose the \"RAG in BFE\" problem. 
To verify the effectiveness of the proposed\nmethod, experiments were conducted on open datasets BONAI and OmniCity-view3. A\ngeneralization test was also conducted on Huizhou test set. The code will be\navailable at \\url{https://github.com/likaiucas/OBM}.\n","authors":["Kai Li","Jingbo Chen","Yupeng Deng","Yu Meng","Diyou Liu","Junxian Ma","Chenhao Wang"],"pdf_url":"https://arxiv.org/pdf/2408.08645v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08633v1","updated":"2024-08-16T09:54:12Z","published":"2024-08-16T09:54:12Z","title":"Historical Printed Ornaments: Dataset and Tasks","summary":" This paper aims to develop the study of historical printed ornaments with\nmodern unsupervised computer vision. We highlight three complex tasks that are\nof critical interest to book historians: clustering, element discovery, and\nunsupervised change localization. For each of these tasks, we introduce an\nevaluation benchmark, and we adapt and evaluate state-of-the-art models. Our\nRey's Ornaments dataset is designed to be a representative example of a set of\nornaments historians would be interested in. It focuses on an XVIIIth century\nbookseller, Marc-Michel Rey, providing a consistent set of ornaments with a\nwide diversity and representative challenges. Our results highlight the\nlimitations of state-of-the-art models when faced with real data and show\nsimple baselines such as k-means or congealing can outperform more\nsophisticated approaches on such data. Our dataset and code can be found at\nhttps://printed-ornaments.github.io/.\n","authors":["Sayan Kumar Chaki","Zeynep Sonat Baltaci","Elliot Vincent","Remi Emonet","Fabienne Vial-Bonacci","Christelle Bahier-Porte","Mathieu Aubry","Thierry Fournel"],"pdf_url":"https://arxiv.org/pdf/2408.08633v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08632v1","updated":"2024-08-16T09:52:02Z","published":"2024-08-16T09:52:02Z","title":"A Survey on Benchmarks of Multimodal Large Language Models","summary":" Multimodal Large Language Models (MLLMs) are gaining increasing popularity in\nboth academia and industry due to their remarkable performance in various\napplications such as visual question answering, visual perception,\nunderstanding, and reasoning. Over the past few years, significant efforts have\nbeen made to examine MLLMs from multiple perspectives. This paper presents a\ncomprehensive review of \\textbf{180 benchmarks} and evaluation for MLLMs,\nfocusing on (1)perception and understanding, (2)cognition and reasoning,\n(3)specific domains, (4)key capabilities, and (5)other modalities. Finally, we\ndiscuss the limitations of the current evaluation methods for MLLMs and explore\npromising future directions. Our key argument is that evaluation should be\nregarded as a crucial discipline to better support the development of MLLMs.\nFor more details, please visit our GitHub repository:\nhttps://github.com/swordlidev/Evaluation-Multimodal-LLMs-Survey.\n","authors":["Jian Li","Weiheng Lu"],"pdf_url":"https://arxiv.org/pdf/2408.08632v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08013v2","updated":"2024-08-16T09:48:44Z","published":"2024-08-15T08:22:10Z","title":"Adaptive Learning of Consistency and Inconsistency Information for Fake\n News Detection","summary":" The rapid advancement of social media platforms has significantly reduced the\ncost of information dissemination, yet it has also led to a proliferation of\nfake news, posing a threat to societal trust and credibility. 
Most fake news\ndetection research has focused on integrating text and image information to\nrepresent the consistency of multiple modalities in news content, while paying less\nattention to inconsistent information. Besides, existing methods that leveraged\ninconsistent information often caused one modality to overshadow another, leading\nto ineffective use of inconsistency cues. To address these issues, we propose an\nadaptive multi-modal feature fusion network (MFF-Net). Inspired by human\njudgment processes for determining truth and falsity in news, MFF-Net focuses\non inconsistent parts when news content is generally consistent and consistent\nparts when it is generally inconsistent. Specifically, MFF-Net extracts\nsemantic and global features from images and texts respectively, and learns\nconsistency information between modalities through a multiple feature fusion module.\nTo deal with the problem of modal information being easily masked, we design a\nsingle modal feature filtering strategy to capture inconsistent information\nfrom the corresponding modalities separately. Finally, similarity scores are calculated\nbased on global features with adaptive adjustments made to achieve weighted\nfusion of consistent and inconsistent features. Extensive experimental results\ndemonstrate that MFF-Net outperforms state-of-the-art methods across three\npublic news datasets derived from real social media platforms.\n","authors":["Aohan Li","Jiaxin Chen","Xin Liao","Dengyong Zhang"],"pdf_url":"https://arxiv.org/pdf/2408.08013v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.07343v2","updated":"2024-08-16T09:44:49Z","published":"2024-08-14T07:37:07Z","title":"Gradient Alignment Improves Test-Time Adaptation for Medical Image\n Segmentation","summary":" Although recent years have witnessed significant advancements in medical\nimage segmentation, the pervasive issue of domain shift among medical images\nfrom diverse centres hinders the effective deployment of pre-trained models.\nMany Test-time Adaptation (TTA) methods have been proposed to address this\nissue by fine-tuning pre-trained models with test data during inference. These\nmethods, however, often suffer from less-satisfactory optimization due to\nsuboptimal optimization direction (dictated by the gradient) and fixed\nstep-size (predicated on the learning rate). In this paper, we propose the\nGradient alignment-based Test-time adaptation (GraTa) method to improve both\nthe gradient direction and learning rate in the optimization procedure. Unlike\nconventional TTA methods, which primarily optimize the pseudo gradient derived\nfrom a self-supervised objective, our method incorporates an auxiliary gradient\nwith the pseudo one to facilitate gradient alignment. Such gradient alignment\nenables the model to excavate the similarities between different gradients and\ncorrect the gradient direction to approximate the empirical gradient related to\nthe current segmentation task. Additionally, we design a dynamic learning rate\nbased on the cosine similarity between the pseudo and auxiliary gradients,\nthereby empowering the adaptive fine-tuning of pre-trained models on diverse\ntest data. Extensive experiments establish the effectiveness of the proposed\ngradient alignment and dynamic learning rate and substantiate the superiority\nof our GraTa method over other state-of-the-art TTA methods on a benchmark\nmedical image segmentation task. 
The code and weights of pre-trained source\nmodels will be available.\n","authors":["Ziyang Chen","Yiwen Ye","Yongsheng Pan","Yong Xia"],"pdf_url":"https://arxiv.org/pdf/2408.07343v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08623v1","updated":"2024-08-16T09:32:26Z","published":"2024-08-16T09:32:26Z","title":"SketchRef: A Benchmark Dataset and Evaluation Metrics for Automated\n Sketch Synthesis","summary":" Sketch, a powerful artistic technique to capture essential visual information\nabout real-world objects, is increasingly gaining attention in the image\nsynthesis field. However, evaluating the quality of synthesized sketches\npresents unique unsolved challenges. Current evaluation methods for sketch\nsynthesis are inadequate due to the lack of a unified benchmark dataset,\nover-reliance on classification accuracy for recognizability, and unfair\nevaluation of sketches with different levels of simplification. To address\nthese issues, we introduce SketchRef, a benchmark dataset comprising 4\ncategories of reference photos--animals, human faces, human bodies, and common\nobjects--alongside novel evaluation metrics. Considering that classification\naccuracy is insufficient to measure the structural consistency between a sketch\nand its reference photo, we propose the mean Object Keypoint Similarity (mOKS)\nmetric, utilizing pose estimation to assess structure-level recognizability. To\nensure fair evaluation of sketches with different simplification levels, we\npropose a recognizability calculation method constrained by simplicity. We also\ncollect 8K responses from art enthusiasts, validating the effectiveness of our\nproposed evaluation methods. We hope this work can provide a comprehensive\nevaluation of sketch synthesis algorithms, thereby aligning their performance\nmore closely with human understanding.\n","authors":["Xingyue Lin","Xingjian Hu","Shuai Peng","Jianhua Zhu","Liangcai Gao"],"pdf_url":"https://arxiv.org/pdf/2408.08623v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08616v1","updated":"2024-08-16T09:14:12Z","published":"2024-08-16T09:14:12Z","title":"Reference-free Axial Super-resolution of 3D Microscopy Images using\n Implicit Neural Representation with a 2D Diffusion Prior","summary":" Analysis and visualization of 3D microscopy images pose challenges due to\nanisotropic axial resolution, demanding volumetric super-resolution along the\naxial direction. While training a learning-based 3D super-resolution model\nseems to be a straightforward solution, it requires ground truth isotropic\nvolumes and suffers from the curse of dimensionality. Therefore, existing\nmethods utilize 2D neural networks to reconstruct each axial slice, eventually\npiecing together the entire volume. However, reconstructing each slice in the\npixel domain fails to give consistent reconstruction in all directions, leading\nto misalignment artifacts. In this work, we present a reconstruction framework\nbased on implicit neural representation (INR), which allows 3D coherency even\nwhen optimized by independent axial slices in a batch-wise manner. Our method\noptimizes a continuous volumetric representation from low-resolution axial\nslices, using a 2D diffusion prior trained on high-resolution lateral slices\nwithout requiring isotropic volumes. Through experiments on real and synthetic\nanisotropic microscopy images, we demonstrate that our method surpasses other\nstate-of-the-art reconstruction methods. 
The source code is available on\nGitHub: https://github.com/hvcl/INR-diffusion.\n","authors":["Kyungryun Lee","Won-Ki Jeong"],"pdf_url":"https://arxiv.org/pdf/2408.08616v1.pdf","comment":"MICCAI2024 accepted"},{"id":"http://arxiv.org/abs/2408.08610v1","updated":"2024-08-16T08:52:02Z","published":"2024-08-16T08:52:02Z","title":"Generative Dataset Distillation Based on Diffusion Model","summary":" This paper presents our method for the generative track of The First Dataset\nDistillation Challenge at ECCV 2024. Since the diffusion model has become the\nmainstay of generative models because of its high-quality generative effects,\nwe focus on distillation methods based on the diffusion model. Considering that\nthe track can only generate a fixed number of images in 10 minutes using a\ngenerative model for CIFAR-100 and Tiny-ImageNet datasets, we need to use a\ngenerative model that can generate images at high speed. In this study, we\npropose a novel generative dataset distillation method based on Stable\nDiffusion. Specifically, we use the SDXL-Turbo model which can generate images\nat high speed and quality. Compared to other diffusion models that can only\nachieve images per class (IPC) = 1, our method can achieve an IPC = 10 for\nTiny-ImageNet and an IPC = 20 for CIFAR-100, respectively. Additionally, to\ngenerate high-quality distilled datasets for CIFAR-100 and Tiny-ImageNet, we\nuse the class information as text prompts and post data augmentation for the\nSDXL-Turbo model. Experimental results show the effectiveness of the proposed\nmethod, and we achieved third place in the generative track of the ECCV 2024 DD\nChallenge. Codes are available at https://github.com/Guang000/BANKO.\n","authors":["Duo Su","Junjie Hou","Guang Li","Ren Togo","Rui Song","Takahiro Ogawa","Miki Haseyama"],"pdf_url":"https://arxiv.org/pdf/2408.08610v1.pdf","comment":"The Third Place Winner in Generative Track of the ECCV 2024 DD\n Challenge"},{"id":"http://arxiv.org/abs/2408.08604v1","updated":"2024-08-16T08:45:25Z","published":"2024-08-16T08:45:25Z","title":"Bi-Directional Deep Contextual Video Compression","summary":" Deep video compression has made remarkable progress in recent years, with the\nmajority of advancements concentrated on P-frame coding. Although efforts to\nenhance B-frame coding are ongoing, their compression performance is still far\nbehind that of traditional bi-directional video codecs. In this paper, we\nintroduce a bi-directional deep contextual video compression scheme tailored\nfor B-frames, termed DCVC-B, to improve the compression performance of deep\nB-frame coding. Our scheme mainly has three key innovations. First, we develop\na bi-directional motion difference context propagation method for effective\nmotion difference coding, which significantly reduces the bit cost of\nbi-directional motions. Second, we propose a bi-directional contextual\ncompression model and a corresponding bi-directional temporal entropy model, to\nmake better use of the multi-scale temporal contexts. Third, we propose a\nhierarchical quality structure-based training strategy, leading to an effective\nbit allocation across large groups of pictures (GOP). 
Experimental results show\nthat our DCVC-B achieves an average reduction of 26.6% in BD-Rate compared to\nthe reference software for H.265/HEVC under random access conditions.\nRemarkably, it surpasses the performance of the H.266/VVC reference software on\ncertain test datasets under the same configuration.\n","authors":["Xihua Sheng","Li Li","Dong Liu","Shiqi Wang"],"pdf_url":"https://arxiv.org/pdf/2408.08604v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08601v1","updated":"2024-08-16T08:37:56Z","published":"2024-08-16T08:37:56Z","title":"Learning A Low-Level Vision Generalist via Visual Task Prompt","summary":" Building a unified model for general low-level vision tasks holds significant\nresearch and practical value. Current methods encounter several critical\nissues. Multi-task restoration approaches can address multiple\ndegradation-to-clean restoration tasks, while their applicability to tasks with\ndifferent target domains (e.g., image stylization) is limited. Methods like\nPromptGIP can handle multiple input-target domains but rely on the Masked\nAutoencoder (MAE) paradigm. Consequently, they are tied to the ViT\narchitecture, resulting in suboptimal image reconstruction quality. In\naddition, these methods are sensitive to prompt image content and often\nstruggle with low-frequency information processing. In this paper, we propose a\nVisual task Prompt-based Image Processing (VPIP) framework to overcome these\nchallenges. VPIP employs visual task prompts to manage tasks with different\ninput-target domains and allows flexible selection of backbone network suitable\nfor general tasks. Besides, a new prompt cross-attention is introduced to\nfacilitate interaction between the input and prompt information. Based on the\nVPIP framework, we train a low-level vision generalist model, namely GenLV, on\n30 diverse tasks. Experimental results show that GenLV can successfully address\na variety of low-level tasks, significantly outperforming existing methods both\nquantitatively and qualitatively. Codes are available at\nhttps://github.com/chxy95/GenLV.\n","authors":["Xiangyu Chen","Yihao Liu","Yuandong Pu","Wenlong Zhang","Jiantao Zhou","Yu Qiao","Chao Dong"],"pdf_url":"https://arxiv.org/pdf/2408.08601v1.pdf","comment":"Accepted to ACMMM24"},{"id":"http://arxiv.org/abs/2408.08600v1","updated":"2024-08-16T08:34:50Z","published":"2024-08-16T08:34:50Z","title":"MM-UNet: A Mixed MLP Architecture for Improved Ophthalmic Image\n Segmentation","summary":" Ophthalmic image segmentation serves as a critical foundation for ocular\ndisease diagnosis. Although fully convolutional neural networks (CNNs) are\ncommonly employed for segmentation, they are constrained by inductive biases\nand face challenges in establishing long-range dependencies. Transformer-based\nmodels address these limitations but introduce substantial computational\noverhead. Recently, a simple yet efficient Multilayer Perceptron (MLP)\narchitecture was proposed for image classification, achieving competitive\nperformance relative to advanced transformers. However, its effectiveness for\nophthalmic image segmentation remains unexplored. In this paper, we introduce\nMM-UNet, an efficient Mixed MLP model tailored for ophthalmic image\nsegmentation. Within MM-UNet, we propose a multi-scale MLP (MMLP) module that\nfacilitates the interaction of features at various depths through a grouping\nstrategy, enabling simultaneous capture of global and local information. 
We\nconducted extensive experiments on both a private anterior segment optical\ncoherence tomography (AS-OCT) image dataset and a public fundus image dataset.\nThe results demonstrated the superiority of our MM-UNet model in comparison to\nstate-of-the-art deep segmentation networks.\n","authors":["Zunjie Xiao","Xiaoqing Zhang","Risa Higashita","Jiang Liu"],"pdf_url":"https://arxiv.org/pdf/2408.08600v1.pdf","comment":"OMIA2024"},{"id":"http://arxiv.org/abs/2407.04203v2","updated":"2024-08-16T08:27:14Z","published":"2024-07-05T01:02:12Z","title":"HCS-TNAS: Hybrid Constraint-driven Semi-supervised Transformer-NAS for\n Ultrasound Image Segmentation","summary":" Precise ultrasound segmentation is vital for clinicians to provide\ncomprehensive diagnoses. However, developing a model that accurately segments\nultrasound images is challenging due to the images' low quality and the\nscarcity of extensive labeled data. This results in two main solutions: (1)\noptimizing multi-scale feature representations, and (2) increasing resistance\nto data dependency. The first approach necessitates an advanced network\narchitecture, but a handcrafted network is knowledge-intensive and often yields\nlimited improvement. In contrast, neural architecture search (NAS) can more\neasily attain optimal performance, albeit with significant computational costs.\nRegarding the second issue, semi-supervised learning (SSL) is an established\nmethod, but combining it with complex NAS faces the risk of overfitting to a\nfew labeled samples without extra constraints. Therefore, we introduce a hybrid\nconstraint-driven semi-supervised Transformer-NAS (HCS-TNAS), balancing both\nsolutions for segmentation. HCS-TNAS includes an Efficient NAS-ViT module for\nmulti-scale token search before ViT's attention calculation, effectively\ncapturing contextual and local information with lower computational costs, and\na hybrid SSL framework that adds network independence and contrastive learning\nto the optimization for solving data dependency. By further developing a\nstage-wise optimization strategy, a rational network structure is identified.\nExperiments on public datasets show that HCS-TNAS achieves state-of-the-art\nperformance, pushing the limit of ultrasound segmentation.\n","authors":["Renqi Chen","Xinzhe Zheng","Haoyang Su","Kehan Wu"],"pdf_url":"https://arxiv.org/pdf/2407.04203v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08591v1","updated":"2024-08-16T07:52:00Z","published":"2024-08-16T07:52:00Z","title":"Zero-Shot Dual-Path Integration Framework for Open-Vocabulary 3D\n Instance Segmentation","summary":" Open-vocabulary 3D instance segmentation transcends traditional\nclosed-vocabulary methods by enabling the identification of both previously\nseen and unseen objects in real-world scenarios. It leverages a dual-modality\napproach, utilizing both 3D point clouds and 2D multi-view images to generate\nclass-agnostic object mask proposals. Previous efforts predominantly focused on\nenhancing 3D mask proposal models; consequently, the information that could\ncome from 2D association to 3D was not fully exploited. This bias towards 3D\ndata, while effective for familiar indoor objects, limits the system's\nadaptability to new and varied object types, where 2D models offer greater\nutility. 
Addressing this gap, we introduce Zero-Shot Dual-Path Integration\nFramework that equally values the contributions of both 3D and 2D modalities.\nOur framework comprises three components: 3D pathway, 2D pathway, and Dual-Path\nIntegration. 3D pathway generates spatially accurate class-agnostic mask\nproposals of common indoor objects from 3D point cloud data using a pre-trained\n3D model, while 2D pathway utilizes pre-trained open-vocabulary instance\nsegmentation model to identify a diverse array of object proposals from\nmulti-view RGB-D images. In Dual-Path Integration, our Conditional Integration\nprocess, which operates in two stages, filters and merges the proposals from\nboth pathways adaptively. This process harmonizes output proposals to enhance\nsegmentation capabilities. Our framework, utilizing pre-trained models in a\nzero-shot manner, is model-agnostic and demonstrates superior performance on\nboth seen and unseen data, as evidenced by comprehensive evaluations on the\nScanNet200 and qualitative results on ARKitScenes datasets.\n","authors":["Tri Ton","Ji Woo Hong","SooHwan Eom","Jun Yeop Shim","Junyeong Kim","Chang D. Yoo"],"pdf_url":"https://arxiv.org/pdf/2408.08591v1.pdf","comment":"OpenSUN 3D: 2nd Workshop on Open-Vocabulary 3D Scene Understanding\n (CVPR 2024)"},{"id":"http://arxiv.org/abs/2408.08584v1","updated":"2024-08-16T07:37:05Z","published":"2024-08-16T07:37:05Z","title":"S-RAF: A Simulation-Based Robustness Assessment Framework for\n Responsible Autonomous Driving","summary":" As artificial intelligence (AI) technology advances, ensuring the robustness\nand safety of AI-driven systems has become paramount. However, varying\nperceptions of robustness among AI developers create misaligned evaluation\nmetrics, complicating the assessment and certification of safety-critical and\ncomplex AI systems such as autonomous driving (AD) agents. To address this\nchallenge, we introduce Simulation-Based Robustness Assessment Framework\n(S-RAF) for autonomous driving. S-RAF leverages the CARLA Driving simulator to\nrigorously assess AD agents across diverse conditions, including faulty\nsensors, environmental changes, and complex traffic situations. By quantifying\nrobustness and its relationship with other safety-critical factors, such as\ncarbon emissions, S-RAF aids developers and stakeholders in building safe and\nresponsible driving agents, and streamlining safety certification processes.\nFurthermore, S-RAF offers significant advantages, such as reduced testing\ncosts, and the ability to explore edge cases that may be unsafe to test in the\nreal world. The code for this framework is available here:\nhttps://github.com/cognitive-robots/rai-leaderboard\n","authors":["Daniel Omeiza","Pratik Somaiya","Jo-Ann Pattinson","Carolyn Ten-Holter","Jack Stilgoe","Marina Jirotka","Lars Kunze"],"pdf_url":"https://arxiv.org/pdf/2408.08584v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.09032v2","updated":"2024-08-16T07:26:54Z","published":"2024-05-15T02:03:44Z","title":"ICAL: Implicit Character-Aided Learning for Enhanced Handwritten\n Mathematical Expression Recognition","summary":" Significant progress has been made in the field of handwritten mathematical\nexpression recognition, while existing encoder-decoder methods are usually\ndifficult to model global information in $LaTeX$. Therefore, this paper\nintroduces a novel approach, Implicit Character-Aided Learning (ICAL), to mine\nthe global expression information and enhance handwritten mathematical\nexpression recognition. 
Specifically, we propose the Implicit Character\nConstruction Module (ICCM) to predict implicit character sequences and use a\nFusion Module to merge the outputs of the ICCM and the decoder, thereby\nproducing corrected predictions. By modeling and utilizing implicit character\ninformation, ICAL achieves a more accurate and context-aware interpretation of\nhandwritten mathematical expressions. Experimental results demonstrate that\nICAL notably surpasses the state-of-the-art(SOTA) models, improving the\nexpression recognition rate (ExpRate) by 2.25\\%/1.81\\%/1.39\\% on the CROHME\n2014/2016/2019 datasets respectively, and achieves a remarkable 69.06\\% on the\nchallenging HME100k test set. We make our code available on the GitHub:\nhttps://github.com/qingzhenduyu/ICAL\n","authors":["Jianhua Zhu","Liangcai Gao","Wenqi Zhao"],"pdf_url":"https://arxiv.org/pdf/2405.09032v2.pdf","comment":"ICDAR 2024 Oral Paper"},{"id":"http://arxiv.org/abs/2408.08578v1","updated":"2024-08-16T07:24:19Z","published":"2024-08-16T07:24:19Z","title":"TAMER: Tree-Aware Transformer for Handwritten Mathematical Expression\n Recognition","summary":" Handwritten Mathematical Expression Recognition (HMER) has extensive\napplications in automated grading and office automation. However, existing\nsequence-based decoding methods, which directly predict $\\LaTeX$ sequences,\nstruggle to understand and model the inherent tree structure of $\\LaTeX$ and\noften fail to ensure syntactic correctness in the decoded results. To address\nthese challenges, we propose a novel model named TAMER (Tree-Aware Transformer)\nfor handwritten mathematical expression recognition. TAMER introduces an\ninnovative Tree-aware Module while maintaining the flexibility and efficient\ntraining of Transformer. TAMER combines the advantages of both sequence\ndecoding and tree decoding models by jointly optimizing sequence prediction and\ntree structure prediction tasks, which enhances the model's understanding and\ngeneralization of complex mathematical expression structures. During inference,\nTAMER employs a Tree Structure Prediction Scoring Mechanism to improve the\nstructural validity of the generated $\\LaTeX$ sequences. Experimental results\non CROHME datasets demonstrate that TAMER outperforms traditional sequence\ndecoding and tree decoding models, especially in handling complex mathematical\nstructures, achieving state-of-the-art (SOTA) performance.\n","authors":["Jianhua Zhu","Wenqi Zhao","Yu Li","Xingjian Hu","Liangcai Gao"],"pdf_url":"https://arxiv.org/pdf/2408.08578v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08576v1","updated":"2024-08-16T07:23:22Z","published":"2024-08-16T07:23:22Z","title":"Tuning a SAM-Based Model with Multi-Cognitive Visual Adapter to Remote\n Sensing Instance Segmentation","summary":" The Segment Anything Model (SAM), a foundational model designed for\npromptable segmentation tasks, demonstrates exceptional generalization\ncapabilities, making it highly promising for natural scene image segmentation.\nHowever, SAM's lack of pretraining on massive remote sensing images and its\ninteractive structure limit its automatic mask prediction capabilities. In this\npaper, a Multi-Cognitive SAM-Based Instance Segmentation Model (MC-SAM SEG) is\nintroduced to employ SAM on remote sensing domain. The SAM-Mona encoder\nutilizing the Multi-cognitive Visual Adapter (Mona) is conducted to facilitate\nSAM's transfer learning in remote sensing applications. 
The proposed method\nnamed MC-SAM SEG extracts high-quality features by fine-tuning the SAM-Mona\nencoder along with a feature aggregator. Subsequently, a pixel decoder and\ntransformer decoder are designed for prompt-free mask generation and instance\nclassification. The comprehensive experiments are conducted on the HRSID and\nWHU datasets for instance segmentation tasks on Synthetic Aperture Radar (SAR)\nimages and optical remote sensing images respectively. The evaluation results\nindicate the proposed method surpasses other deep learning algorithms and\nverify its effectiveness and generalization.\n","authors":["Linghao Zheng","Xinyang Pu","Feng Xu"],"pdf_url":"https://arxiv.org/pdf/2408.08576v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08575v1","updated":"2024-08-16T07:23:18Z","published":"2024-08-16T07:23:18Z","title":"Tell Codec What Worth Compressing: Semantically Disentangled Image\n Coding for Machine with LMMs","summary":" We present a new image compression paradigm to achieve ``intelligently coding\nfor machine'' by cleverly leveraging the common sense of Large Multimodal\nModels (LMMs). We are motivated by the evidence that large language/multimodal\nmodels are powerful general-purpose semantics predictors for understanding the\nreal world. Different from traditional image compression typically optimized\nfor human eyes, the image coding for machines (ICM) framework we focus on\nrequires the compressed bitstream to more comply with different downstream\nintelligent analysis tasks. To this end, we employ LMM to \\textcolor{red}{tell\ncodec what to compress}: 1) first utilize the powerful semantic understanding\ncapability of LMMs w.r.t object grounding, identification, and importance\nranking via prompts, to disentangle image content before compression, 2) and\nthen based on these semantic priors we accordingly encode and transmit objects\nof the image in order with a structured bitstream. In this way, diverse vision\nbenchmarks including image classification, object detection, instance\nsegmentation, etc., can be well supported with such a semantically structured\nbitstream. We dub our method ``\\textit{SDComp}'' for ``\\textit{S}emantically\n\\textit{D}isentangled \\textit{Comp}ression'', and compare it with\nstate-of-the-art codecs on a wide variety of different vision tasks. SDComp\ncodec leads to more flexible reconstruction results, promised decoded visual\nquality, and a more generic/satisfactory intelligent task-supporting ability.\n","authors":["Jinming Liu","Yuntao Wei","Junyan Lin","Shengyang Zhao","Heming Sun","Zhibo Chen","Wenjun Zeng","Xin Jin"],"pdf_url":"https://arxiv.org/pdf/2408.08575v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08570v1","updated":"2024-08-16T07:12:47Z","published":"2024-08-16T07:12:47Z","title":"EraW-Net: Enhance-Refine-Align W-Net for Scene-Associated Driver\n Attention Estimation","summary":" Associating driver attention with driving scene across two fields of views\n(FOVs) is a hard cross-domain perception problem, which requires comprehensive\nconsideration of cross-view mapping, dynamic driving scene analysis, and driver\nstatus tracking. Previous methods typically focus on a single view or map\nattention to the scene via estimated gaze, failing to exploit the implicit\nconnection between them. Moreover, simple fusion modules are insufficient for\nmodeling the complex relationships between the two views, making information\nintegration challenging. 
To address these issues, we propose a novel method for\nend-to-end scene-associated driver attention estimation, called EraW-Net. This\nmethod enhances the most discriminative dynamic cues, refines feature\nrepresentations, and facilitates semantically aligned cross-domain integration\nthrough a W-shaped architecture, termed W-Net. Specifically, a Dynamic Adaptive\nFilter Module (DAF-Module) is proposed to address the challenges of frequently\nchanging driving environments by extracting vital regions. It suppresses the\nindiscriminately recorded dynamics and highlights crucial ones by innovative\njoint frequency-spatial analysis, enhancing the model's ability to parse\ncomplex dynamics. Additionally, to track driver states during non-fixed facial\nposes, we propose a Global Context Sharing Module (GCS-Module) to construct\nrefined feature representations by capturing hierarchical features that adapt\nto various scales of head and eye movements. Finally, W-Net achieves systematic\ncross-view information integration through its \"Encoding-Independent Partial\nDecoding-Fusion Decoding\" structure, addressing semantic misalignment in\nheterogeneous data integration. Experiments demonstrate that the proposed\nmethod robustly and accurately estimates the mapping of driver attention in\nscene on large public datasets.\n","authors":["Jun Zhou","Chunsheng Liu","Faliang Chang","Wenqian Wang","Penghui Hao","Yiming Huang","Zhiqiang Yang"],"pdf_url":"https://arxiv.org/pdf/2408.08570v1.pdf","comment":"13pages, 9 figures,"},{"id":"http://arxiv.org/abs/2408.08568v1","updated":"2024-08-16T07:02:19Z","published":"2024-08-16T07:02:19Z","title":"Unsupervised Non-Rigid Point Cloud Matching through Large Vision Models","summary":" In this paper, we propose a novel learning-based framework for non-rigid\npoint cloud matching, which can be trained purely on point clouds without any\ncorrespondence annotation but also be extended naturally to partial-to-full\nmatching. Our key insight is to incorporate semantic features derived from\nlarge vision models (LVMs) to geometry-based shape feature learning. Our\nframework effectively leverages the structural information contained in the\nsemantic features to address ambiguities arise from self-similarities among\nlocal geometries. Furthermore, our framework also enjoys the strong\ngeneralizability and robustness regarding partial observations of LVMs, leading\nto improvements in the regarding point cloud matching tasks. In order to\nachieve the above, we propose a pixel-to-point feature aggregation module, a\nlocal and global attention network as well as a geometrical similarity loss\nfunction. Experimental results show that our method achieves state-of-the-art\nresults in matching non-rigid point clouds in both near-isometric and\nheterogeneous shape collection as well as more realistic partial and noisy\ndata.\n","authors":["Zhangquan Chen","Puhua Jiang","Ruqi Huang"],"pdf_url":"https://arxiv.org/pdf/2408.08568v1.pdf","comment":"12 pages, 4 figures"},{"id":"http://arxiv.org/abs/2408.08567v1","updated":"2024-08-16T07:01:46Z","published":"2024-08-16T07:01:46Z","title":"S$^3$Attention: Improving Long Sequence Attention with Smoothed Skeleton\n Sketching","summary":" Attention based models have achieved many remarkable breakthroughs in\nnumerous applications. However, the quadratic complexity of Attention makes the\nvanilla Attention based models hard to apply to long sequence tasks. 
Various\nimproved Attention structures are proposed to reduce the computation cost by\ninducing low rankness and approximating the whole sequence by sub-sequences.\nThe most challenging part of those approaches is maintaining the proper balance\nbetween information preservation and computation reduction: the longer\nsub-sequences used, the better information is preserved, but at the price of\nintroducing more noise and computational costs. In this paper, we propose a\nsmoothed skeleton sketching based Attention structure, coined S$^3$Attention,\nwhich significantly improves upon the previous attempts to negotiate this\ntrade-off. S$^3$Attention has two mechanisms to effectively minimize the impact\nof noise while keeping the linear complexity to the sequence length: a\nsmoothing block to mix information over long sequences and a matrix sketching\nmethod that simultaneously selects columns and rows from the input matrix. We\nverify the effectiveness of S$^3$Attention both theoretically and empirically.\nExtensive studies over Long Range Arena (LRA) datasets and six time-series\nforecasting show that S$^3$Attention significantly outperforms both vanilla\nAttention and other state-of-the-art variants of Attention structures.\n","authors":["Xue Wang","Tian Zhou","Jianqing Zhu","Jialin Liu","Kun Yuan","Tao Yao","Wotao Yin","Rong Jin","HanQin Cai"],"pdf_url":"https://arxiv.org/pdf/2408.08567v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08561v1","updated":"2024-08-16T06:52:38Z","published":"2024-08-16T06:52:38Z","title":"A New Chinese Landscape Paintings Generation Model based on Stable\n Diffusion using DreamBooth","summary":" This study mainly introduces a method combining the Stable Diffusion Model\n(SDM) and Parameter-Efficient Fine-Tuning method for generating Chinese\nLandscape Paintings. This training process is accelerated by combining LoRA\nwith pre-trained SDM and DreamBooth with pre-trained SDM, respectively. On the\nChinese Landscape Paintings Internet dataset used in this paper, this study\nfinds that SDM combined with DreamBooth exhibits superior performance,\noutperforming other models, including the generic pre-trained SDM and\nLoRA-based fine-tuning SDM. The SDM combined with DreamBooth achieves a FID of\n12.75 on the dataset and outperforms all other models in terms of expert\nevaluation, highlighting the model's versatility in the field of Chinese\nLandscape Paintings given the unique identifier, high fidelity and high\nquality. This study illustrates the potential of specialised fine-tuning method\nto improve the performance of SDM on domain-specific tasks, particularly in the\ndomain of Landscape Paintings.\n","authors":["Yujia Gu","Xinyu Fang","Xueyuan Deng"],"pdf_url":"https://arxiv.org/pdf/2408.08561v1.pdf","comment":"accepted by AHPCAI"},{"id":"http://arxiv.org/abs/2408.08560v1","updated":"2024-08-16T06:52:06Z","published":"2024-08-16T06:52:06Z","title":"A training regime to learn unified representations from complementary\n breast imaging modalities","summary":" Full Field Digital Mammograms (FFDMs) and Digital Breast Tomosynthesis (DBT)\nare the two most widely used imaging modalities for breast cancer screening.\nAlthough DBT has increased cancer detection compared to FFDM, its widespread\nadoption in clinical practice has been slowed by increased interpretation times\nand a perceived decrease in the conspicuity of specific lesion types.\nSpecifically, the non-inferiority of DBT for microcalcifications remains under\ndebate. 
Due to concerns about the decrease in visual acuity, combined DBT-FFDM\nacquisitions remain popular, leading to overall increased exam times and\nradiation dosage. Enabling DBT to provide diagnostic information present in\nboth FFDM and DBT would reduce reliance on FFDM, resulting in a reduction in\nboth quantities. We propose a machine learning methodology that learns\nhigh-level representations leveraging the complementary diagnostic signal from\nboth DBT and FFDM. Experiments on a large-scale data set validate our claims\nand show that our representations enable more accurate breast lesion detection\nthan any DBT- or FFDM-based model.\n","authors":["Umang Sharma","Jungkyu Park","Laura Heacock","Sumit Chopra","Krzysztof Geras"],"pdf_url":"https://arxiv.org/pdf/2408.08560v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08189v2","updated":"2024-08-16T06:51:05Z","published":"2024-08-15T14:47:44Z","title":"FancyVideo: Towards Dynamic and Consistent Video Generation via\n Cross-frame Textual Guidance","summary":" Synthesizing motion-rich and temporally consistent videos remains a challenge\nin artificial intelligence, especially when dealing with extended durations.\nExisting text-to-video (T2V) models commonly employ spatial cross-attention for\ntext control, equivalently guiding different frame generations without\nframe-specific textual guidance. Thus, the model's capacity to comprehend the\ntemporal logic conveyed in prompts and generate videos with coherent motion is\nrestricted. To tackle this limitation, we introduce FancyVideo, an innovative\nvideo generator that improves the existing text-control mechanism with the\nwell-designed Cross-frame Textual Guidance Module (CTGM). Specifically, CTGM\nincorporates the Temporal Information Injector (TII), Temporal Affinity Refiner\n(TAR), and Temporal Feature Booster (TFB) at the beginning, middle, and end of\ncross-attention, respectively, to achieve frame-specific textual guidance.\nFirstly, TII injects frame-specific information from latent features into text\nconditions, thereby obtaining cross-frame textual conditions. Then, TAR refines\nthe correlation matrix between cross-frame textual conditions and latent\nfeatures along the time dimension. Lastly, TFB boosts the temporal consistency\nof latent features. Extensive experiments comprising both quantitative and\nqualitative evaluations demonstrate the effectiveness of FancyVideo. Our video\ndemo, code and model are available at https://360cvgroup.github.io/FancyVideo/.\n","authors":["Jiasong Feng","Ao Ma","Jing Wang","Bo Cheng","Xiaodan Liang","Dawei Leng","Yuhui Yin"],"pdf_url":"https://arxiv.org/pdf/2408.08189v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.14854v2","updated":"2024-08-16T06:47:53Z","published":"2024-06-21T03:54:10Z","title":"PEANO-ViT: Power-Efficient Approximations of Non-Linearities in Vision\n Transformers","summary":" The deployment of Vision Transformers (ViTs) on hardware platforms, specially\nField-Programmable Gate Arrays (FPGAs), presents many challenges, which are\nmainly due to the substantial computational and power requirements of their\nnon-linear functions, notably layer normalization, softmax, and Gaussian Error\nLinear Unit (GELU). 
These critical functions pose significant obstacles to\nefficient hardware implementation due to their complex mathematical operations\nand the inherent resource count and architectural limitations of FPGAs.\nPEANO-ViT offers a novel approach to streamlining the implementation of the\nlayer normalization layer by introducing a division-free technique that\nsimultaneously approximates the division and square root function.\nAdditionally, PEANO-ViT provides a multi-scale division strategy to eliminate\ndivision operations in the softmax layer, aided by a Pade-based approximation\nfor the exponential function. Finally, PEANO-ViT introduces a piece-wise linear\napproximation for the GELU function, carefully designed to bypass the\ncomputationally intensive operations associated with GELU. In our comprehensive\nevaluations, PEANO-ViT exhibits minimal accuracy degradation (<= 0.5% for\nDeiT-B) while significantly enhancing power efficiency, achieving improvements\nof 1.91x, 1.39x, 8.01x for layer normalization, softmax, and GELU,\nrespectively. This improvement is achieved through substantial reductions in\nDSP, LUT, and register counts for these non-linear operations. Consequently,\nPEANO-ViT enables efficient deployment of Vision Transformers on resource- and\npower-constrained FPGA platforms.\n","authors":["Mohammad Erfan Sadeghi","Arash Fayyazi","Seyedarmin Azizi","Massoud Pedram"],"pdf_url":"https://arxiv.org/pdf/2406.14854v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08555v1","updated":"2024-08-16T06:40:20Z","published":"2024-08-16T06:40:20Z","title":"Detection and tracking of MAVs using a LiDAR with rosette scanning\n pattern","summary":" The usage of commercial Micro Aerial Vehicles (MAVs) has increased\ndrastically during the last decade. While the added value of MAVs to society is\napparent, their growing use is also coming with increasing risks like violating\npublic airspace at airports or committing privacy violations. To mitigate these\nissues it is becoming critical to develop solutions that incorporate the\ndetection and tracking of MAVs with autonomous systems. This work presents a\nmethod for the detection and tracking of MAVs using a novel, low-cost rosette\nscanning LiDAR on a pan-tilt turret. Once the static background is captured, a\nparticle filter is utilized to detect a possible target and track its position\nwith a physical, programmable pan-tilt system. The tracking makes it possible\nto keep the MAV in the center, maximizing the density of 3D points measured on\nthe target by the LiDAR sensor. The developed algorithm was evaluated within\nthe indoor MIcro aerial vehicle and MOtion capture (MIMO) arena and has\nstate-of-the-art tracking accuracy, stability, and fast re-detection time in\ncase of tracking loss. Based on the outdoor tests, it was possible to\nsignificantly increase the detection distance and number of returned points\ncompared to other similar methods using LiDAR.\n","authors":["Sándor Gazdag","Tom Möller","Tamás Filep","Anita Keszler","András L. 
Majdik"],"pdf_url":"https://arxiv.org/pdf/2408.08555v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.19524v3","updated":"2024-08-16T06:24:24Z","published":"2024-07-28T16:24:07Z","title":"VersusDebias: Universal Zero-Shot Debiasing for Text-to-Image Models via\n SLM-Based Prompt Engineering and Generative Adversary","summary":" With the rapid development of Text-to-Image (T2I) models, biases in human\nimage generation against demographic social groups become a significant\nconcern, impacting fairness and ethical standards in AI. Some researchers\npropose their methods to tackle with the issue. However, existing methods are\ndesigned for specific models with fixed prompts, limiting their adaptability to\nthe fast-evolving models and diverse practical scenarios. Moreover, they\nneglect the impact of hallucinations, leading to discrepancies between expected\nand actual results. To address these issues, we introduce VersusDebias, a novel\nand universal debiasing framework for biases in arbitrary T2I models,\nconsisting of an array generation (AG) module and an image generation (IG)\nmodule. The self-adaptive AG module generates specialized attribute arrays to\npost-process hallucinations and debias multiple attributes simultaneously. The\nIG module employs a small language model to modify prompts according to the\narrays and drives the T2I model to generate debiased images, enabling zero-shot\ndebiasing. Extensive experiments demonstrate VersusDebias's capability to\ndebias any models across gender, race, and age simultaneously. In both\nzero-shot and few-shot scenarios, VersusDebias outperforms existing methods,\nshowcasing its exceptional utility. Our work is accessible at\nhttps://github.com/VersusDebias/VersusDebias to ensure reproducibility and\nfacilitate further research.\n","authors":["Hanjun Luo","Ziye Deng","Haoyu Huang","Xuecheng Liu","Ruizhe Chen","Zuozhu Liu"],"pdf_url":"https://arxiv.org/pdf/2407.19524v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.07307v3","updated":"2024-08-16T06:14:18Z","published":"2023-05-12T08:27:03Z","title":"Self-Learning Symmetric Multi-view Probabilistic Clustering","summary":" Multi-view Clustering (MVC) has achieved significant progress, with many\nefforts dedicated to learn knowledge from multiple views. However, most\nexisting methods are either not applicable or require additional steps for\nincomplete MVC. Such a limitation results in poor-quality clustering\nperformance and poor missing view adaptation. Besides, noise or outliers might\nsignificantly degrade the overall clustering performance, which are not handled\nwell by most existing methods. In this paper, we propose a novel unified\nframework for incomplete and complete MVC named self-learning symmetric\nmulti-view probabilistic clustering (SLS-MPC). SLS-MPC proposes a novel\nsymmetric multi-view probability estimation and equivalently transforms\nmulti-view pairwise posterior matching probability into composition of each\nview's individual distribution, which tolerates data missing and might extend\nto any number of views. Then, SLS-MPC proposes a novel self-learning\nprobability function without any prior knowledge and hyper-parameters to learn\neach view's individual distribution. Next, graph-context-aware refinement with\npath propagation and co-neighbor propagation is used to refine pairwise\nprobability, which alleviates the impact of noise and outliers. 
Finally,\nSLS-MPC proposes a probabilistic clustering algorithm to adjust clustering\nassignments by maximizing the joint probability iteratively without category\ninformation. Extensive experiments on multiple benchmarks show that SLS-MPC\noutperforms previous state-of-the-art methods.\n","authors":["Junjie Liu","Junlong Liu","Rongxin Jiang","Yaowu Chen","Chen Shen","Jieping Ye"],"pdf_url":"https://arxiv.org/pdf/2305.07307v3.pdf","comment":"accepted by IEEE Transactions on Knowledge and Data Engineering(TKDE)"},{"id":"http://arxiv.org/abs/2408.08544v1","updated":"2024-08-16T06:04:25Z","published":"2024-08-16T06:04:25Z","title":"Scaling up Multimodal Pre-training for Sign Language Understanding","summary":" Sign language serves as the primary means of communication for the\ndeaf-mute community. Different from spoken language, it commonly conveys\ninformation by the collaboration of manual features, i.e., hand gestures and\nbody movements, and non-manual features, i.e., facial expressions and mouth\ncues. To facilitate communication between the deaf-mute and hearing people, a\nseries of sign language understanding (SLU) tasks have been studied in recent\nyears, including isolated/continuous sign language recognition (ISLR/CSLR),\ngloss-free sign language translation (GF-SLT) and sign language retrieval\n(SL-RT). Sign language recognition and translation aim to understand the\nsemantic meaning conveyed by sign languages at the gloss level and\nsentence level, respectively. In contrast, SL-RT focuses on retrieving sign\nvideos or corresponding texts from a closed-set under the query-by-example\nsearch paradigm. These tasks investigate sign language topics from diverse\nperspectives and raise challenges in learning effective representations of sign\nlanguage videos. To advance the development of sign language understanding,\nexploring a generalized model that is applicable across various SLU tasks is a\nprofound research direction.\n","authors":["Wengang Zhou","Weichao Zhao","Hezhen Hu","Zecheng Li","Houqiang Li"],"pdf_url":"https://arxiv.org/pdf/2408.08544v1.pdf","comment":"Sign language recognition; Sign language translation; Sign language\n retrieval"},{"id":"http://arxiv.org/abs/2408.08543v1","updated":"2024-08-16T05:57:22Z","published":"2024-08-16T05:57:22Z","title":"Language-Driven Interactive Shadow Detection","summary":" Traditional shadow detectors often identify all shadow regions of static\nimages or video sequences. This work presents the Referring Video Shadow\nDetection (RVSD), which is an innovative task that rejuvenates the classic\nparadigm by facilitating the segmentation of particular shadows in videos based\non descriptive natural language prompts. This novel RVSD not only achieves\nsegmentation of arbitrary shadow areas of interest based on descriptions\n(flexibility) but also allows users to interact with visual content more\ndirectly and naturally by using natural language prompts (interactivity),\npaving the way for abundant applications ranging from advanced video editing to\nvirtual reality experiences. To pioneer the RVSD research, we curated a\nwell-annotated RVSD dataset, which encompasses 86 videos and a rich set of\n15,011 paired textual descriptions with corresponding shadows. To the best of\nour knowledge, this dataset is the first one for addressing RVSD. Based on this\ndataset, we propose a Referring Shadow-Track Memory Network (RSM-Net) for\naddressing the RVSD task. 
In our RSM-Net, we devise a Twin-Track Synergistic\nMemory (TSM) to store intra-clip memory features and hierarchical inter-clip\nmemory features, and then pass these memory features into a memory read module\nto refine features of the current video frame for referring shadow detection.\nWe also develop a Mixed-Prior Shadow Attention (MSA) to utilize physical priors\nto obtain a coarse shadow map for learning more visual features by weighting it\nwith the input video frame. Experimental results show that our RSM-Net achieves\nstate-of-the-art performance for RVSD with a notable Overall IOU increase of\n4.4\\%. Our code and dataset are available at https://github.com/whq-xxh/RVSD.\n","authors":["Hongqiu Wang","Wei Wang","Haipeng Zhou","Huihui Xu","Shaozhi Wu","Lei Zhu"],"pdf_url":"https://arxiv.org/pdf/2408.08543v1.pdf","comment":"ACM MM 2024"},{"id":"http://arxiv.org/abs/2407.15240v3","updated":"2024-08-16T05:53:16Z","published":"2024-07-21T18:09:40Z","title":"BIGbench: A Unified Benchmark for Social Bias in Text-to-Image\n Generative Models Based on Multi-modal LLM","summary":" Text-to-Image (T2I) generative models are becoming increasingly crucial due\nto their ability to generate high-quality images, which also raises concerns\nabout the social biases in their outputs, especially in human generation.\nSociological research has established systematic classifications of bias.\nHowever, existing bias research about T2I models conflates different types of\nbias, impeding methodological progress. In this paper, we introduce BIGbench, a\nunified benchmark for Biases of Image Generation, featuring a meticulously\ndesigned dataset. Unlike existing benchmarks, BIGbench classifies and evaluates\nbiases across four dimensions: manifestation of bias, visibility of bias,\nacquired attributes, and protected attributes, which ensures exceptional\naccuracy for analysis. Furthermore, BIGbench applies advanced multi-modal large\nlanguage models to achieve fully automated and highly accurate evaluations. We\napply BIGbench to evaluate eight representative general T2I models and three\ndebiased methods. Our human evaluation results underscore BIGbench's\neffectiveness in aligning images and identifying various biases. Besides, our\nstudy also reveals new research directions about biases, such as the effect of\ndistillation and irrelevant protected attributes. Our benchmark is openly\naccessible at https://github.com/BIGbench2024/BIGbench2024/ to ensure\nreproducibility.\n","authors":["Hanjun Luo","Haoyu Huang","Ziye Deng","Xuecheng Liu","Ruizhe Chen","Zuozhu Liu"],"pdf_url":"https://arxiv.org/pdf/2407.15240v3.pdf","comment":"arXiv admin note: substantial text overlap with arXiv:2405.17814"},{"id":"http://arxiv.org/abs/2404.15506v2","updated":"2024-08-16T05:36:12Z","published":"2024-03-22T02:30:46Z","title":"Metric3D v2: A Versatile Monocular Geometric Foundation Model for\n Zero-shot Metric Depth and Surface Normal Estimation","summary":" We introduce Metric3D v2, a geometric foundation model for zero-shot metric\ndepth and surface normal estimation from a single image, which is crucial for\nmetric 3D recovery. While depth and normal are geometrically related and highly\ncomplementary, they present distinct challenges. SoTA monocular depth methods\nachieve zero-shot generalization by learning affine-invariant depths, which\ncannot recover real-world metrics. 
Meanwhile, SoTA normal estimation methods\nhave limited zero-shot performance due to the lack of large-scale labeled data.\nTo tackle these issues, we propose solutions for both metric depth estimation\nand surface normal estimation. For metric depth estimation, we show that the\nkey to a zero-shot single-view model lies in resolving the metric ambiguity\nfrom various camera models and large-scale data training. We propose a\ncanonical camera space transformation module, which explicitly addresses the\nambiguity problem and can be effortlessly plugged into existing monocular\nmodels. For surface normal estimation, we propose a joint depth-normal\noptimization module to distill diverse data knowledge from metric depth,\nenabling normal estimators to learn beyond normal labels. Equipped with these\nmodules, our depth-normal models can be stably trained with over 16 million\nimages from thousands of camera models with different types of annotations,\nresulting in zero-shot generalization to in-the-wild images with unseen camera\nsettings. Our method enables the accurate recovery of metric 3D structures on\nrandomly collected internet images, paving the way for plausible single-image\nmetrology. Our project page is at https://JUGGHM.github.io/Metric3Dv2.\n","authors":["Mu Hu","Wei Yin","Chi Zhang","Zhipeng Cai","Xiaoxiao Long","Hao Chen","Kaixuan Wang","Gang Yu","Chunhua Shen","Shaojie Shen"],"pdf_url":"https://arxiv.org/pdf/2404.15506v2.pdf","comment":"Our project page is at https://JUGGHM.github.io/Metric3Dv2. Accepted\n to TPAMI. arXiv admin note: text overlap with arXiv:2307.10984"},{"id":"http://arxiv.org/abs/2311.17957v2","updated":"2024-08-16T05:35:21Z","published":"2023-11-29T08:52:08Z","title":"HandRefiner: Refining Malformed Hands in Generated Images by\n Diffusion-based Conditional Inpainting","summary":" Diffusion models have achieved remarkable success in generating realistic\nimages but suffer from generating accurate human hands, such as incorrect\nfinger counts or irregular shapes. This difficulty arises from the complex task\nof learning the physical structure and pose of hands from training images,\nwhich involves extensive deformations and occlusions. For correct hand\ngeneration, our paper introduces a lightweight post-processing solution called\n$\\textbf{HandRefiner}$. HandRefiner employs a conditional inpainting approach\nto rectify malformed hands while leaving other parts of the image untouched. We\nleverage the hand mesh reconstruction model that consistently adheres to the\ncorrect number of fingers and hand shape, while also being capable of fitting\nthe desired hand pose in the generated image. Given a generated failed image\ndue to malformed hands, we utilize ControlNet modules to re-inject such correct\nhand information. Additionally, we uncover a phase transition phenomenon within\nControlNet as we vary the control strength. It enables us to take advantage of\nmore readily available synthetic data without suffering from the domain gap\nbetween realistic and synthetic hands. Experiments demonstrate that HandRefiner\ncan significantly improve the generation quality quantitatively and\nqualitatively. 
The code is available at\nhttps://github.com/wenquanlu/HandRefiner .\n","authors":["Wenquan Lu","Yufei Xu","Jing Zhang","Chaoyue Wang","Dacheng Tao"],"pdf_url":"https://arxiv.org/pdf/2311.17957v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08529v1","updated":"2024-08-16T04:57:21Z","published":"2024-08-16T04:57:21Z","title":"Privacy-Preserving Vision Transformer Using Images Encrypted with\n Restricted Random Permutation Matrices","summary":" We propose a novel method for privacy-preserving fine-tuning vision\ntransformers (ViTs) with encrypted images. Conventional methods using encrypted\nimages degrade model performance compared with that of using plain images due\nto the influence of image encryption. In contrast, the proposed encryption\nmethod using restricted random permutation matrices can provide a higher\nperformance than the conventional ones.\n","authors":["Kouki Horio","Kiyoshi Nishikawa","Hitoshi Kiya"],"pdf_url":"https://arxiv.org/pdf/2408.08529v1.pdf","comment":"4 pages, 9 figures"},{"id":"http://arxiv.org/abs/2408.08527v1","updated":"2024-08-16T04:54:10Z","published":"2024-08-16T04:54:10Z","title":"Focus on Focus: Focus-oriented Representation Learning and Multi-view\n Cross-modal Alignment for Glioma Grading","summary":" Recently, multimodal deep learning, which integrates histopathology slides\nand molecular biomarkers, has achieved a promising performance in glioma\ngrading. Despite great progress, due to the intra-modality complexity and\ninter-modality heterogeneity, existing studies suffer from inadequate\nhistopathology representation learning and inefficient molecular-pathology\nknowledge alignment. These two issues hinder existing methods to precisely\ninterpret diagnostic molecular-pathology features, thereby limiting their\ngrading performance. Moreover, the real-world applicability of existing\nmultimodal approaches is significantly restricted as molecular biomarkers are\nnot always available during clinical deployment. To address these problems, we\nintroduce a novel Focus on Focus (FoF) framework with paired pathology-genomic\ntraining and applicable pathology-only inference, enhancing molecular-pathology\nrepresentation effectively. Specifically, we propose a Focus-oriented\nRepresentation Learning (FRL) module to encourage the model to identify regions\npositively or negatively related to glioma grading and guide it to focus on the\ndiagnostic areas with a consistency constraint. To effectively link the\nmolecular biomarkers to morphological features, we propose a Multi-view\nCross-modal Alignment (MCA) module that projects histopathology representations\ninto molecular subspaces, aligning morphological features with corresponding\nmolecular biomarker status by supervised contrastive learning. Experiments on\nthe TCGA GBM-LGG dataset demonstrate that our FoF framework significantly\nimproves the glioma grading. Remarkably, our FoF achieves superior performance\nusing only histopathology slides compared to existing multimodal methods. The\nsource code is available at https://github.com/peterlipan/FoF.\n","authors":["Li Pan","Yupei Zhang","Qiushi Yang","Tan Li","Xiaohan Xing","Maximus C. F. 
Yeung","Zhen Chen"],"pdf_url":"https://arxiv.org/pdf/2408.08527v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.17074v3","updated":"2024-08-16T04:46:19Z","published":"2023-09-29T09:10:04Z","title":"AdaDiff: Accelerating Diffusion Models through Step-Wise Adaptive\n Computation","summary":" Diffusion models achieve great success in generating diverse and\nhigh-fidelity images, yet their widespread application, especially in real-time\nscenarios, is hampered by their inherently slow generation speed. The slow\ngeneration stems from the necessity of multi-step network inference. While some\ncertain predictions benefit from the full computation of the model in each\nsampling iteration, not every iteration requires the same amount of\ncomputation, potentially leading to inefficient computation. Unlike typical\nadaptive computation challenges that deal with single-step generation problems,\ndiffusion processes with a multi-step generation need to dynamically adjust\ntheir computational resource allocation based on the ongoing assessment of each\nstep's importance to the final image output, presenting a unique set of\nchallenges. In this work, we propose AdaDiff, an adaptive framework that\ndynamically allocates computation resources in each sampling step to improve\nthe generation efficiency of diffusion models. To assess the effects of changes\nin computational effort on image quality, we present a timestep-aware\nuncertainty estimation module (UEM). Integrated at each intermediate layer, the\nUEM evaluates the predictive uncertainty. This uncertainty measurement serves\nas an indicator for determining whether to terminate the inference process.\nAdditionally, we introduce an uncertainty-aware layer-wise loss aimed at\nbridging the performance gap between full models and their adaptive\ncounterparts.\n","authors":["Shengkun Tang","Yaqing Wang","Caiwen Ding","Yi Liang","Yao Li","Dongkuan Xu"],"pdf_url":"https://arxiv.org/pdf/2309.17074v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08524v1","updated":"2024-08-16T04:38:31Z","published":"2024-08-16T04:38:31Z","title":"GS-ID: Illumination Decomposition on Gaussian Splatting via Diffusion\n Prior and Parametric Light Source Optimization","summary":" We present GS-ID, a novel framework for illumination decomposition on\nGaussian Splatting, achieving photorealistic novel view synthesis and intuitive\nlight editing. Illumination decomposition is an ill-posed problem facing three\nmain challenges: 1) priors for geometry and material are often lacking; 2)\ncomplex illumination conditions involve multiple unknown light sources; and 3)\ncalculating surface shading with numerous light sources is computationally\nexpensive. To address these challenges, we first introduce intrinsic diffusion\npriors to estimate the attributes for physically based rendering. Then we\ndivide the illumination into environmental and direct components for joint\noptimization. Last, we employ deferred rendering to reduce the computational\nload. Our framework uses a learnable environment map and Spherical Gaussians\n(SGs) to represent light sources parametrically, therefore enabling\ncontrollable and photorealistic relighting on Gaussian Splatting. 
Extensive\nexperiments and applications demonstrate that GS-ID produces state-of-the-art\nillumination decomposition results while achieving better geometry\nreconstruction and rendering performance.\n","authors":["Kang Du","Zhihao Liang","Zeyu Wang"],"pdf_url":"https://arxiv.org/pdf/2408.08524v1.pdf","comment":"15 pages, 13 figures"},{"id":"http://arxiv.org/abs/2408.08518v1","updated":"2024-08-16T04:14:28Z","published":"2024-08-16T04:14:28Z","title":"Visual-Friendly Concept Protection via Selective Adversarial\n Perturbations","summary":" Personalized concept generation by tuning diffusion models with a few images\nraises potential legal and ethical concerns regarding privacy and intellectual\nproperty rights. Researchers attempt to prevent malicious personalization using\nadversarial perturbations. However, previous efforts have mainly focused on the\neffectiveness of protection while neglecting the visibility of perturbations.\nThey utilize global adversarial perturbations, which introduce noticeable\nalterations to original images and significantly degrade visual quality. In\nthis work, we propose the Visual-Friendly Concept Protection (VCPro) framework,\nwhich prioritizes the protection of key concepts chosen by the image owner\nthrough adversarial perturbations with lower perceptibility. To ensure these\nperturbations are as inconspicuous as possible, we introduce a relaxed\noptimization objective to identify the least perceptible yet effective\nadversarial perturbations, solved using the Lagrangian multiplier method.\nQualitative and quantitative experiments validate that VCPro achieves a better\ntrade-off between the visibility of perturbations and protection effectiveness,\neffectively prioritizing the protection of target concepts in images with less\nperceptible perturbations.\n","authors":["Xiaoyue Mi","Fan Tang","Juan Cao","Peng Li","Yang Liu"],"pdf_url":"https://arxiv.org/pdf/2408.08518v1.pdf","comment":"Under Review"},{"id":"http://arxiv.org/abs/2405.13571v2","updated":"2024-08-16T03:33:25Z","published":"2024-05-22T12:08:56Z","title":"Incomplete Multimodal Industrial Anomaly Detection via Cross-Modal\n Distillation","summary":" Recent studies of multimodal industrial anomaly detection (IAD) based on 3D\npoint clouds and RGB images have highlighted the importance of exploiting the\nredundancy and complementarity among modalities for accurate classification and\nsegmentation. However, achieving multimodal IAD in practical production lines\nremains a work in progress. It is essential to consider the trade-offs between\nthe costs and benefits associated with the introduction of new modalities while\nensuring compatibility with current processes. Existing quality control\nprocesses combine rapid in-line inspections, such as optical and infrared\nimaging with high-resolution but time-consuming near-line characterization\ntechniques, including industrial CT and electron microscopy to manually or\nsemi-automatically locate and analyze defects in the production of Li-ion\nbatteries and composite materials. Given the cost and time limitations, only a\nsubset of the samples can be inspected by all in-line and near-line methods,\nand the remaining samples are only evaluated through one or two forms of\nin-line inspection. To fully exploit data for deep learning-driven automatic\ndefect detection, the models must have the ability to leverage multimodal\ntraining and handle incomplete modalities during inference. 
In this paper, we\npropose CMDIAD, a Cross-Modal Distillation framework for IAD to demonstrate the\nfeasibility of a Multi-modal Training, Few-modal Inference (MTFI) pipeline. Our\nfindings show that the MTFI pipeline can more effectively utilize incomplete\nmultimodal information compared to applying only a single modality for training\nand inference. Moreover, we investigate the reasons behind the asymmetric\nperformance improvement using point clouds or RGB images as the main modality\nof inference. This provides a foundation for our future multimodal dataset\nconstruction with additional modalities from manufacturing scenarios.\n","authors":["Wenbo Sui","Daniel Lichau","Josselin Lefèvre","Harold Phelippeau"],"pdf_url":"https://arxiv.org/pdf/2405.13571v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08502v1","updated":"2024-08-16T03:01:07Z","published":"2024-08-16T03:01:07Z","title":"Efficient Image-to-Image Diffusion Classifier for Adversarial Robustness","summary":" Diffusion models (DMs) have demonstrated great potential in the field of\nadversarial robustness, where DM-based defense methods can achieve superior\ndefense capability without adversarial training. However, they all require huge\ncomputational costs due to the usage of large-scale pre-trained DMs, making it\ndifficult to conduct full evaluation under strong attacks and compare with\ntraditional CNN-based methods. Simply reducing the network size and timesteps\nin DMs could significantly harm the image generation quality, which invalidates\nprevious frameworks. To alleviate this issue, we redesign the diffusion\nframework from generating high-quality images to predicting distinguishable\nimage labels. Specifically, we employ an image translation framework to learn\nmany-to-one mapping from input samples to designed orthogonal image labels.\nBased on this framework, we introduce an efficient Image-to-Image diffusion\nclassifier with a pruned U-Net structure and reduced diffusion timesteps.\nBesides the framework, we redesign the optimization objective of DMs to fit the\ntarget of image classification, where a new classification loss is incorporated\nin the DM-based image translation framework to distinguish the generated label\nfrom those of other classes. We conduct sufficient evaluations of the proposed\nclassifier under various attacks on popular benchmarks. Extensive experiments\nshow that our method achieves better adversarial robustness with fewer\ncomputational costs than DM-based and CNN-based methods. The code is available\nat https://github.com/hfmei/IDC.\n","authors":["Hefei Mei","Minjing Dong","Chang Xu"],"pdf_url":"https://arxiv.org/pdf/2408.08502v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08500v1","updated":"2024-08-16T02:55:10Z","published":"2024-08-16T02:55:10Z","title":"CoSEC: A Coaxial Stereo Event Camera Dataset for Autonomous Driving","summary":" Conventional frame camera is the mainstream sensor of the autonomous driving\nscene perception, while it is limited in adverse conditions, such as low light.\nEvent camera with high dynamic range has been applied in assisting frame camera\nfor the multimodal fusion, which relies heavily on the pixel-level spatial\nalignment between various modalities. Typically, existing multimodal datasets\nmainly place event and frame cameras in parallel and directly align them\nspatially via warping operation. 
However, this parallel strategy is less\neffective for multimodal fusion, since the large disparity exacerbates spatial\nmisalignment due to the large event-frame baseline. We argue that baseline\nminimization can reduce alignment error between event and frame cameras. In\nthis work, we introduce hybrid coaxial event-frame devices to build the\nmultimodal system, and propose a coaxial stereo event camera (CoSEC) dataset\nfor autonomous driving. As for the multimodal system, we first utilize the\nmicrocontroller to achieve time synchronization, and then spatially calibrate\ndifferent sensors, where we perform intra- and inter-calibration of stereo\ncoaxial devices. As for the multimodal dataset, we filter LiDAR point clouds to\ngenerate depth and optical flow labels using reference depth, which is further\nimproved by fusing aligned event and frame data in nighttime conditions. With\nthe help of the coaxial device, the proposed dataset can promote the all-day\npixel-level multimodal fusion. Moreover, we also conduct experiments to\ndemonstrate that the proposed dataset can improve the performance and\ngeneralization of the multimodal fusion.\n","authors":["Shihan Peng","Hanyu Zhou","Hao Dong","Zhiwei Shi","Haoyue Liu","Yuxing Duan","Yi Chang","Luxin Yan"],"pdf_url":"https://arxiv.org/pdf/2408.08500v1.pdf","comment":"This work has been submitted to the IEEE for possible publication"},{"id":"http://arxiv.org/abs/2408.08495v1","updated":"2024-08-16T02:33:55Z","published":"2024-08-16T02:33:55Z","title":"Achieving Complex Image Edits via Function Aggregation with Diffusion\n Models","summary":" Diffusion models have demonstrated strong performance in generative tasks,\nmaking them ideal candidates for image editing. Recent studies highlight their\nability to apply desired edits effectively by following textual instructions,\nyet two key challenges persist. First, these models struggle to apply multiple\nedits simultaneously, resulting in computational inefficiencies due to their\nreliance on sequential processing. Second, relying on textual prompts to\ndetermine the editing region can lead to unintended alterations in other parts\nof the image. In this work, we introduce FunEditor, an efficient diffusion\nmodel designed to learn atomic editing functions and perform complex edits by\naggregating simpler functions. This approach enables complex editing tasks,\nsuch as object movement, by aggregating multiple functions and applying them\nsimultaneously to specific areas. FunEditor achieves 5 to 24 times faster inference\nthan existing methods on complex tasks like object movement. Our experiments\ndemonstrate that FunEditor significantly outperforms recent baselines,\nincluding both inference-time optimization methods and fine-tuned models,\nacross various metrics, such as image quality assessment (IQA) and\nobject-background consistency.\n","authors":["Mohammadreza Samadi","Fred X. Han","Mohammad Salameh","Hao Wu","Fengyu Sun","Chunhua Zhou","Di Niu"],"pdf_url":"https://arxiv.org/pdf/2408.08495v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08092v2","updated":"2024-08-16T02:28:12Z","published":"2024-08-15T11:34:53Z","title":"OC3D: Weakly Supervised Outdoor 3D Object Detection with Only Coarse\n Click Annotation","summary":" LiDAR-based outdoor 3D object detection has received widespread attention.\nHowever, training 3D detectors from the LiDAR point cloud typically relies on\nexpensive bounding box annotations. 
This paper presents OC3D, an innovative\nweakly supervised method requiring only coarse clicks on the bird's eye view of\nthe 3D point cloud. A key challenge here is the absence of complete geometric\ndescriptions of the target objects from such simple click annotations. To\naddress this problem, our proposed OC3D adopts a two-stage strategy. In the\nfirst stage, we initially design a novel dynamic and static classification\nstrategy and then propose the Click2Box and Click2Mask modules to generate\nbox-level and mask-level pseudo-labels for static and dynamic instances,\nrespectively. In the second stage, we design a Mask2Box module, leveraging the\nlearning capabilities of neural networks to update mask-level pseudo-labels,\nwhich contain less information, to box-level pseudo-labels. Experimental\nresults on the widely used KITTI and nuScenes datasets demonstrate that our\nOC3D with only coarse clicks achieves state-of-the-art performance compared to\nweakly-supervised 3D detection methods. Combining OC3D with a missing click\nmining strategy, we propose an OC3D++ pipeline, which requires only 0.2%\nannotation cost in the KITTI dataset to achieve performance comparable to fully\nsupervised methods. The code will be made publicly available.\n","authors":["Qiming Xia","Hongwei Lin","Wei Ye","Hai Wu","Yadan Luo","Shijia Zhao","Xin Li","Chenglu Wen"],"pdf_url":"https://arxiv.org/pdf/2408.08092v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.21308v2","updated":"2024-08-16T02:28:07Z","published":"2024-07-31T03:20:11Z","title":"Enhanced Self-Checkout System for Retail Based on Improved YOLOv10","summary":" With the rapid advancement of deep learning technologies, computer vision has\nshown immense potential in retail automation. This paper presents a novel\nself-checkout system for retail based on an improved YOLOv10 network, aimed at\nenhancing checkout efficiency and reducing labor costs. We propose targeted\noptimizations to the YOLOv10 model, by incorporating the detection head\nstructure from YOLOv8, which significantly improves product recognition\naccuracy. Additionally, we develop a post-processing algorithm tailored for\nself-checkout scenarios, to further enhance the application of the system.\nExperimental results demonstrate that our system outperforms existing methods\nin both product recognition accuracy and checkout speed. This research not only\nprovides a new technical solution for retail automation but also offers valuable\ninsights into optimizing deep learning models for real-world applications.\n","authors":["Lianghao Tan","Shubing Liu","Jing Gao","Xiaoyi Liu","Linyue Chu","Huangqi Jiang"],"pdf_url":"https://arxiv.org/pdf/2407.21308v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08489v1","updated":"2024-08-16T02:18:23Z","published":"2024-08-16T02:18:23Z","title":"DFT-Based Adversarial Attack Detection in MRI Brain Imaging: Enhancing\n Diagnostic Accuracy in Alzheimer's Case Studies","summary":" Recent advancements in deep learning, particularly in medical imaging, have\nsignificantly propelled the progress of healthcare systems. However, examining\nthe robustness of medical images against adversarial attacks is crucial due to\ntheir real-world applications and profound impact on individuals' health. These\nattacks can result in misclassifications in disease diagnosis, potentially\nleading to severe consequences. 
Numerous studies have explored both the\nimplementation of adversarial attacks on medical images and the development of\ndefense mechanisms against these threats, highlighting the vulnerabilities of\ndeep neural networks to such adversarial activities. In this study, we\ninvestigate adversarial attacks on images associated with Alzheimer's disease\nand propose a defensive method to counteract these attacks. Specifically, we\nexamine adversarial attacks that employ frequency domain transformations on\nAlzheimer's disease images, along with other well-known adversarial attacks.\nOur approach utilizes a convolutional neural network (CNN)-based autoencoder\narchitecture in conjunction with the two-dimensional Fourier transform of\nimages for detection purposes. The simulation results demonstrate that our\ndetection and defense mechanism effectively mitigates several adversarial\nattacks, thereby enhancing the robustness of deep neural networks against such\nvulnerabilities.\n","authors":["Mohammad Hossein Najafi","Mohammad Morsali","Mohammadmahdi Vahediahmar","Saeed Bagheri Shouraki"],"pdf_url":"https://arxiv.org/pdf/2408.08489v1.pdf","comment":"10 pages, 4 figures, conference"},{"id":"http://arxiv.org/abs/2408.07988v2","updated":"2024-08-16T01:50:47Z","published":"2024-08-15T07:30:21Z","title":"Exploring learning environments for label-efficient cancer diagnosis","summary":" Despite significant research efforts and advancements, cancer remains a\nleading cause of mortality. Early cancer prediction has become a crucial focus\nin cancer research to streamline patient care and improve treatment outcomes.\nManual tumor detection by histopathologists can be time-consuming, prompting\nthe need for computerized methods to expedite treatment planning. Traditional\napproaches to tumor detection rely on supervised learning, necessitating a large\namount of annotated data for model training. However, acquiring such extensive\nlabeled data can be laborious and time-intensive. This research examines\nthree learning environments, supervised learning (SL), semi-supervised\nlearning (Semi-SL), and self-supervised learning (Self-SL), to predict\nkidney, lung, and breast cancer. Three pre-trained deep learning models\n(Residual Network-50, Visual Geometry Group-16, and EfficientNetB0) are\nevaluated based on these learning settings using seven carefully curated\ntraining sets. To create the first training set (TS1), SL is applied to all\nannotated image samples. Five training sets (TS2-TS6) with different ratios of\nlabeled and unlabeled cancer images are used to evaluate Semi-SL. Unlabeled\ncancer images from the final training set (TS7) are utilized for Self-SL\nassessment. Among different learning environments, outcomes from the Semi-SL\nsetting show a strong degree of agreement with the outcomes achieved in the SL\nsetting. The uniform pattern of observations from the pre-trained models\nacross all three datasets validates the methodology and techniques of the\nresearch. 
Based on a modest number of labeled samples and minimal computing cost,\nour study suggests that the Semi-SL option can be a highly viable replacement\nfor the SL option under label annotation constraint scenarios.\n","authors":["Samta Rani","Tanvir Ahmad","Sarfaraz Masood","Chandni Saxena"],"pdf_url":"https://arxiv.org/pdf/2408.07988v2.pdf","comment":"Submitted to the journal"},{"id":"http://arxiv.org/abs/2402.17987v3","updated":"2024-08-16T01:37:41Z","published":"2024-02-28T02:11:47Z","title":"Multistatic-Radar RCS-Signature Recognition of Aerial Vehicles: A\n Bayesian Fusion Approach","summary":" Radar Automated Target Recognition (RATR) for Unmanned Aerial Vehicles (UAVs)\ninvolves transmitting Electromagnetic Waves (EMWs) and performing target type\nrecognition on the received radar echo, crucial for defense and aerospace\napplications. Previous studies highlighted the advantages of multistatic radar\nconfigurations over monostatic ones in RATR. However, fusion methods in\nmultistatic radar configurations often suboptimally combine classification\nvectors from individual radars probabilistically. To address this, we propose a\nfully Bayesian RATR framework employing Optimal Bayesian Fusion (OBF) to\naggregate classification probability vectors from multiple radars. OBF, based\non expected 0-1 loss, updates a Recursive Bayesian Classification (RBC)\nposterior distribution for target UAV type, conditioned on historical\nobservations across multiple time steps. We evaluate the approach using\nsimulated random walk trajectories for seven drones, correlating target aspect\nangles to Radar Cross Section (RCS) measurements in an anechoic chamber.\nComparing against single radar Automated Target Recognition (ATR) systems and\nsuboptimal fusion methods, our empirical results demonstrate that the OBF\nmethod integrated with RBC significantly enhances classification accuracy\ncompared to other fusion methods and single radar configurations.\n","authors":["Michael Potter","Murat Akcakaya","Marius Necsoiu","Gunar Schirner","Deniz Erdogmus","Tales Imbiriba"],"pdf_url":"https://arxiv.org/pdf/2402.17987v3.pdf","comment":"Accepted to IEEE Transactions on Aerospace and Electronic Systems"},{"id":"http://arxiv.org/abs/2310.15130v2","updated":"2024-08-16T01:35:52Z","published":"2023-10-23T17:34:31Z","title":"Novel-View Acoustic Synthesis from 3D Reconstructed Rooms","summary":" We investigate the benefit of combining blind audio recordings with 3D scene\ninformation for novel-view acoustic synthesis. Given audio recordings from 2-4\nmicrophones and the 3D geometry and material of a scene containing multiple\nunknown sound sources, we estimate the sound anywhere in the scene. We identify\nthe main challenges of novel-view acoustic synthesis as sound source\nlocalization, separation, and dereverberation. While naively training an\nend-to-end network fails to produce high-quality results, we show that\nincorporating room impulse responses (RIRs) derived from 3D reconstructed rooms\nenables the same network to jointly tackle these tasks. Our method outperforms\nexisting methods designed for the individual tasks, demonstrating its\neffectiveness at utilizing 3D visual information. In a simulated study on the\nMatterport3D-NVAS dataset, our model achieves near-perfect accuracy on source\nlocalization, a PSNR of 26.44dB and a SDR of 14.23dB for source separation and\ndereverberation, resulting in a PSNR of 25.55 dB and a SDR of 14.20 dB on\nnovel-view acoustic synthesis. 
We release our code and model on our project\nwebsite at https://github.com/apple/ml-nvas3d. Please wear headphones when\nlistening to the results.\n","authors":["Byeongjoo Ahn","Karren Yang","Brian Hamilton","Jonathan Sheaffer","Anurag Ranjan","Miguel Sarabia","Oncel Tuzel","Jen-Hao Rick Chang"],"pdf_url":"https://arxiv.org/pdf/2310.15130v2.pdf","comment":"Interspeech 2024"},{"id":"http://arxiv.org/abs/2407.14001v2","updated":"2024-08-16T01:27:59Z","published":"2024-07-19T03:22:04Z","title":"Component Selection for Craft Assembly Tasks","summary":" Inspired by traditional handmade crafts, where a person improvises assemblies\nbased on the available objects, we formally introduce the Craft Assembly Task.\nIt is a robotic assembly task that involves building an accurate representation\nof a given target object using the available objects, which do not directly\ncorrespond to its parts. In this work, we focus on selecting the subset of\navailable objects for the final craft, when the given input is an RGB image of\nthe target in the wild. We use a mask segmentation neural network to identify\nvisible parts, followed by retrieving labelled template meshes. These meshes\nundergo pose optimization to determine the most suitable template. Then, we\npropose to simplify the parts of the transformed template mesh to primitive\nshapes like cuboids or cylinders. Finally, we design a search algorithm to find\ncorrespondences in the scene based on local and global proportions. We develop\nbaselines for comparison that consider all possible combinations, and choose\nthe highest scoring combination for common metrics used in foreground maps and\nmask accuracy. Our approach achieves comparable results to the baselines for\ntwo different scenes, and we show qualitative results for an implementation in\na real-world scenario.\n","authors":["Vitor Hideyo Isume","Takuya Kiyokawa","Natsuki Yamanobe","Yukiyasu Domae","Weiwei Wan","Kensuke Harada"],"pdf_url":"https://arxiv.org/pdf/2407.14001v2.pdf","comment":"Published on IEEE RA-L"},{"id":"http://arxiv.org/abs/2306.00416v4","updated":"2024-08-16T01:07:21Z","published":"2023-06-01T07:48:34Z","title":"Interactive Character Control with Auto-Regressive Motion Diffusion\n Models","summary":" Real-time character control is an essential component for interactive\nexperiences, with a broad range of applications, including physics simulations,\nvideo games, and virtual reality. The success of diffusion models for image\nsynthesis has led to the use of these models for motion synthesis. However, the\nmajority of these motion diffusion models are primarily designed for offline\napplications, where space-time models are used to synthesize an entire sequence\nof frames simultaneously with a pre-specified length. To enable real-time\nmotion synthesis with diffusion model that allows time-varying controls, we\npropose A-MDM (Auto-regressive Motion Diffusion Model). Our conditional\ndiffusion model takes an initial pose as input, and auto-regressively generates\nsuccessive motion frames conditioned on the previous frame. Despite its\nstreamlined network architecture, which uses simple MLPs, our framework is\ncapable of generating diverse, long-horizon, and high-fidelity motion\nsequences. Furthermore, we introduce a suite of techniques for incorporating\ninteractive controls into A-MDM, such as task-oriented sampling, in-painting,\nand hierarchical reinforcement learning. 
These techniques enable a pre-trained\nA-MDM to be efficiently adapted for a variety of new downstream tasks. We\nconduct a comprehensive suite of experiments to demonstrate the effectiveness\nof A-MDM, and compare its performance against state-of-the-art auto-regressive\nmethods.\n","authors":["Yi Shi","Jingbo Wang","Xuekun Jiang","Bingkun Lin","Bo Dai","Xue Bin Peng"],"pdf_url":"https://arxiv.org/pdf/2306.00416v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08461v1","updated":"2024-08-16T00:05:16Z","published":"2024-08-16T00:05:16Z","title":"TEXTOC: Text-driven Object-Centric Style Transfer","summary":" We present Text-driven Object-Centric Style Transfer (TEXTOC), a novel method\nthat guides style transfer at an object-centric level using textual inputs. The\ncore of TEXTOC is our Patch-wise Co-Directional (PCD) loss, meticulously\ndesigned for precise object-centric transformations that are closely aligned\nwith the input text. This loss combines a patch directional loss for\ntext-guided style direction and a patch distribution consistency loss for even\nCLIP embedding distribution across object regions. It ensures a seamless and\nharmonious style transfer across object regions. Key to our method are the\nText-Matched Patch Selection (TMPS) and Pre-fixed Region Selection (PRS)\nmodules for identifying object locations via text, eliminating the need for\nsegmentation masks. Lastly, we introduce an Adaptive Background Preservation\n(ABP) loss to maintain the original style and structural essence of the image's\nbackground. This loss is applied to dynamically identified background areas.\nExtensive experiments underline the effectiveness of our approach in creating\nvisually coherent and textually aligned style transfers.\n","authors":["Jihun Park","Jongmin Gim","Kyoungmin Lee","Seunghun Lee","Sunghoon Im"],"pdf_url":"https://arxiv.org/pdf/2408.08461v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2301.11201v3","updated":"2024-08-16T00:00:28Z","published":"2023-01-26T16:22:01Z","title":"Relative-Interior Solution for the (Incomplete) Linear Assignment\n Problem with Applications to the Quadratic Assignment Problem","summary":" We study the set of optimal solutions of the dual linear programming\nformulation of the linear assignment problem (LAP) to propose a method for\ncomputing a solution from the relative interior of this set. Assuming that an\narbitrary dual-optimal solution and an optimal assignment are available (for\nwhich many efficient algorithms already exist), our method computes a\nrelative-interior solution in linear time. Since the LAP occurs as a subproblem\nin the linear programming (LP) relaxation of the quadratic assignment problem\n(QAP), we employ our method as a new component in the family of dual-ascent\nalgorithms that provide bounds on the optimal value of the QAP. To make our\nresults applicable to the incomplete QAP, which is of interest in practical\nuse-cases, we also provide a linear-time reduction from the incomplete LAP to\nthe complete LAP along with a mapping that preserves optimality and membership\nin the relative interior. 
Our experiments on publicly available benchmarks\nindicate that our approach with relative-interior solution can frequently\nprovide bounds near the optimum of the LP relaxation and its runtime is much\nlower when compared to a commercial LP solver.\n","authors":["Tomáš Dlask","Bogdan Savchynskyy"],"pdf_url":"https://arxiv.org/pdf/2301.11201v3.pdf","comment":null}],"Information Retrieval":[{"id":"http://arxiv.org/abs/2408.08821v1","updated":"2024-08-16T16:09:59Z","published":"2024-08-16T16:09:59Z","title":"EasyRec: Simple yet Effective Language Models for Recommendation","summary":" Deep neural networks have become a powerful technique for learning\nrepresentations from user-item interaction data in collaborative filtering (CF)\nfor recommender systems. However, many existing methods heavily rely on unique\nuser and item IDs, which limits their ability to perform well in practical\nzero-shot learning scenarios where sufficient training data may be unavailable.\nInspired by the success of language models (LMs) and their strong\ngeneralization capabilities, a crucial question arises: How can we harness the\npotential of language models to empower recommender systems and elevate its\ngeneralization capabilities to new heights? In this study, we propose EasyRec -\nan effective and easy-to-use approach that seamlessly integrates text-based\nsemantic understanding with collaborative signals. EasyRec employs a\ntext-behavior alignment framework, which combines contrastive learning with\ncollaborative language model tuning, to ensure a strong alignment between the\ntext-enhanced semantic space and the collaborative behavior information.\nExtensive empirical evaluations across diverse real-world datasets demonstrate\nthe superior performance of EasyRec compared to state-of-the-art alternative\nmodels, particularly in the challenging text-based zero-shot recommendation\nscenarios. Furthermore, the study highlights the potential of seamlessly\nintegrating EasyRec as a plug-and-play component into text-enhanced\ncollaborative filtering frameworks, thereby empowering existing recommender\nsystems to elevate their recommendation performance and adapt to the evolving\nuser preferences in dynamic environments. For better result reproducibility of\nour EasyRec framework, the model implementation details, source code, and\ndatasets are available at the link: https://github.com/HKUDS/EasyRec.\n","authors":["Xubin Ren","Chao Huang"],"pdf_url":"https://arxiv.org/pdf/2408.08821v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.12934v2","updated":"2024-08-16T13:47:59Z","published":"2024-05-21T17:05:02Z","title":"Address-Specific Sustainable Accommodation Choice Through Real-World\n Data Integration","summary":" Consumers wish to choose sustainable accommodation for their travels, and in\nthe case of corporations, may be required to do so. Yet accommodation\nmarketplaces provide no meaningful capability for sustainable choice: typically\nCO2 estimates are provided that are identical for all accommodation of the same\ntype across an entire country. We propose a decision support system that\nenables real choice of sustainable accommodation. We develop a data-driven\naddress-specific metric called EcoGrade, which integrates government approved\ndatasets and uses interpolation where data is sparse. We validate the metric on\n10,000 UK addresses in 10 cities, showing the match of our interpolations to\nreality is statistically significant. 
We show how the metric has been embedded\ninto a decision support system for a global accommodation marketplace and\ntested by real users over several months with positive user feedback. In the\nEU, forty percent of final energy consumption is from buildings. We need to\nencourage all building owners to make their accommodation more efficient. The\nrental sector is one area where change can occur rapidly, as rented\naccommodation is renovated frequently. We anticipate our decision support\nsystem using EcoGrade will encourage this positive change.\n","authors":["Peter J. Bentley","Rajat Mathur","Soo Ling Lim","Sid Narang"],"pdf_url":"https://arxiv.org/pdf/2405.12934v2.pdf","comment":"8 pages"},{"id":"http://arxiv.org/abs/2408.08713v1","updated":"2024-08-16T12:51:52Z","published":"2024-08-16T12:51:52Z","title":"Beyond KAN: Introducing KarSein for Adaptive High-Order Feature\n Interaction Modeling in CTR Prediction","summary":" Modeling feature interactions is crucial for click-through rate (CTR)\nprediction, particularly when it comes to high-order explicit interactions.\nTraditional methods struggle with this task because they often predefine a\nmaximum interaction order, which relies heavily on prior knowledge and can\nlimit the model's effectiveness. Additionally, modeling high-order interactions\ntypically leads to increased computational costs. Therefore, the challenge lies\nin adaptively modeling high-order feature interactions while maintaining\nefficiency. To address this issue, we introduce Kolmogorov-Arnold Represented\nSparse Efficient Interaction Network (KarSein), designed to optimize both\npredictive accuracy and computational efficiency. We firstly identify\nlimitations of directly applying Kolmogorov-Arnold Networks (KAN) to CTR and\nthen introduce KarSein to overcome these issues. It features a novel\narchitecture that reduces the computational costs of KAN and supports embedding\nvectors as feature inputs. Additionally, KarSein employs guided symbolic\nregression to address the challenge of KAN in spontaneously learning\nmultiplicative relationships. Extensive experiments demonstrate KarSein's\nsuperior performance, achieving significant predictive accuracy with minimal\ncomputational overhead. Furthermore, KarSein maintains strong global\nexplainability while enabling the removal of redundant features, resulting in a\nsparse network structure. These advantages also position KarSein as a promising\nmethod for efficient inference.\n","authors":["Yunxiao Shi","Wujiang Wu","Mingyu Jin","Haimin Zhang","Qiang Wu","Yongfeng Zhang","Min Xu"],"pdf_url":"https://arxiv.org/pdf/2408.08713v1.pdf","comment":"KarSein for CTR"},{"id":"http://arxiv.org/abs/2408.08709v1","updated":"2024-08-16T12:43:38Z","published":"2024-08-16T12:43:38Z","title":"Multimodal Relational Triple Extraction with Query-based Entity Object\n Transformer","summary":" Multimodal Relation Extraction is crucial for constructing flexible and\nrealistic knowledge graphs. Recent studies focus on extracting the relation\ntype with entity pairs present in different modalities, such as one entity in\nthe text and another in the image. However, existing approaches require\nentities and objects given beforehand, which is costly and impractical. To\naddress the limitation, we propose a novel task, Multimodal Entity-Object\nRelational Triple Extraction, which aims to extract all triples (entity span,\nrelation, object region) from image-text pairs. 
To facilitate this study, we\nmodified a multimodal relation extraction dataset MORE, which includes 21\nrelation types, to create a new dataset containing 20,264 triples, averaging\n5.75 triples per image-text pair. Moreover, we propose QEOT, a query-based\nmodel with a selective attention mechanism, to dynamically explore the\ninteraction and fusion of textual and visual information. In particular, the\nproposed method can simultaneously accomplish entity extraction, relation\nclassification, and object detection with a set of queries. Our method is\nsuitable for downstream applications and reduces error accumulation due to the\npipeline-style approaches. Extensive experimental results demonstrate that our\nproposed method outperforms the existing baselines by 8.06% and achieves\nstate-of-the-art performance.\n","authors":["Lei Hei","Ning An","Tingjing Liao","Qi Ma","Jiaqi Wang","Feiliang Ren"],"pdf_url":"https://arxiv.org/pdf/2408.08709v1.pdf","comment":"15 pages, 7 figures, preprint"},{"id":"http://arxiv.org/abs/2408.08686v1","updated":"2024-08-16T11:59:01Z","published":"2024-08-16T11:59:01Z","title":"SC-Rec: Enhancing Generative Retrieval with Self-Consistent Reranking\n for~Sequential Recommendation","summary":" Language Models (LMs) are increasingly employed in recommendation systems due\nto their advanced language understanding and generation capabilities. Recent\nrecommender systems based on generative retrieval have leveraged the\ninferential abilities of LMs to directly generate the index tokens of the next\nitem, based on item sequences within the user's interaction history. Previous\nstudies have mostly focused on item indices based solely on textual semantic or\ncollaborative information. However, although the standalone effectiveness of\nthese aspects has been demonstrated, the integration of this information has\nremained unexplored. Our in-depth analysis finds that there is a significant\ndifference in the knowledge captured by the model from heterogeneous item\nindices and diverse input prompts, which can have a high potential for\ncomplementarity. In this paper, we propose SC-Rec, a unified recommender system\nthat learns diverse preference knowledge from two distinct item indices and\nmultiple prompt templates. Furthermore, SC-Rec adopts a novel reranking\nstrategy that aggregates a set of ranking results, inferred based on different\nindices and prompts, to achieve the self-consistency of the model. Our\nempirical evaluation on three real-world datasets demonstrates that SC-Rec\nconsiderably outperforms the state-of-the-art methods for sequential\nrecommendation, effectively incorporating complementary knowledge from varied\noutputs of the model.\n","authors":["Tongyoung Kim","Soojin Yoon","Seongku Kang","Jinyoung Yeo","Dongha Lee"],"pdf_url":"https://arxiv.org/pdf/2408.08686v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.16120v2","updated":"2024-08-16T11:15:18Z","published":"2024-05-25T08:17:19Z","title":"Guaranteeing Accuracy and Fairness under Fluctuating User Traffic: A\n Bankruptcy-Inspired Re-ranking Approach","summary":" Out of sustainable and economical considerations, two-sided recommendation\nplatforms must satisfy the needs of both users and providers. Previous studies\noften show that the two sides' needs show different urgency: providers need a\nrelatively long-term exposure demand while users want more short-term and\naccurate service. 
However, our empirical study reveals that previous methods\nfor trading off fairness-accuracy often fail to guarantee long-term fairness\nand short-term accuracy simultaneously in real applications of fluctuating user\ntraffic. Especially, when user traffic is low, the user experience often drops\na lot. Our theoretical analysis also confirms that user traffic is a key factor\nin such a trade-off problem. How to guarantee accuracy and fairness under\nfluctuating user traffic remains a problem. Inspired by the bankruptcy problem\nin economics, we propose a novel fairness-aware re-ranking approach named\nBankFair. Intuitively, BankFair employs the Talmud rule to leverage periods of\nabundant user traffic to offset periods of user traffic scarcity, ensuring\nconsistent user service at every period while upholding long-term fairness.\nSpecifically, BankFair consists of two modules: (1) employing the Talmud rule\nto determine the required fairness degree under varying periods of user\ntraffic; and (2) conducting an online re-ranking algorithm based on the\nfairness degree determined by the Talmud rule. Experiments on two real-world\nrecommendation datasets show that BankFair outperforms all baselines regarding\naccuracy and provider fairness.\n","authors":["Xiaopeng Ye","Chen Xu","Jun Xu","Xuyang Xie","Gang Wang","Zhenhua Dong"],"pdf_url":"https://arxiv.org/pdf/2405.16120v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.06992v2","updated":"2024-08-16T08:18:19Z","published":"2024-07-09T16:07:01Z","title":"Robust Neural Information Retrieval: An Adversarial and\n Out-of-distribution Perspective","summary":" Recent advances in neural information retrieval (IR) models have\nsignificantly enhanced their effectiveness over various IR tasks. The\nrobustness of these models, essential for ensuring their reliability in\npractice, has also garnered significant attention. With a wide array of\nresearch on robust IR being proposed, we believe it is the opportune moment to\nconsolidate the current status, glean insights from existing methodologies, and\nlay the groundwork for future development. We view the robustness of IR to be a\nmultifaceted concept, emphasizing its necessity against adversarial attacks,\nout-of-distribution (OOD) scenarios and performance variance. With a focus on\nadversarial and OOD robustness, we dissect robustness solutions for dense\nretrieval models (DRMs) and neural ranking models (NRMs), respectively,\nrecognizing them as pivotal components of the neural IR pipeline. We provide an\nin-depth discussion of existing methods, datasets, and evaluation metrics,\nshedding light on challenges and future directions in the era of large language\nmodels. To the best of our knowledge, this is the first comprehensive survey on\nthe robustness of neural IR models, and we will also be giving our first\ntutorial presentation at SIGIR 2024\n\\url{https://sigir2024-robust-information-retrieval.github.io}. Along with the\norganization of existing work, we introduce a Benchmark for robust IR (BestIR),\na heterogeneous evaluation benchmark for robust neural information retrieval,\nwhich is publicly available at \\url{https://github.com/Davion-Liu/BestIR}. 
We\nhope that this study provides useful clues for future research on the\nrobustness of IR models and helps to develop trustworthy search engines\n\\url{https://github.com/Davion-Liu/Awesome-Robustness-in-Information-Retrieval}.\n","authors":["Yu-An Liu","Ruqing Zhang","Jiafeng Guo","Maarten de Rijke","Yixing Fan","Xueqi Cheng"],"pdf_url":"https://arxiv.org/pdf/2407.06992v2.pdf","comment":"Survey paper"},{"id":"http://arxiv.org/abs/2408.08585v1","updated":"2024-08-16T07:39:38Z","published":"2024-08-16T07:39:38Z","title":"OptDist: Learning Optimal Distribution for Customer Lifetime Value\n Prediction","summary":" Customer Lifetime Value (CLTV) prediction is a critical task in business\napplications. Accurately predicting CLTV is challenging in real-world business\nscenarios, as the distribution of CLTV is complex and mutable. Firstly, there\nis a large number of users without any consumption consisting of a long-tailed\npart that is too complex to fit. Secondly, the small set of high-value users\nspent orders of magnitude more than a typical user leading to a wide range of\nthe CLTV distribution which is hard to capture in a single distribution.\nExisting approaches for CLTV estimation either assume a prior probability\ndistribution and fit a single group of distribution-related parameters for all\nsamples, or directly learn from the posterior distribution with manually\npredefined buckets in a heuristic manner. However, all these methods fail to\nhandle complex and mutable distributions. In this paper, we propose a novel\noptimal distribution selection model OptDist for CLTV prediction, which\nutilizes an adaptive optimal sub-distribution selection mechanism to improve\nthe accuracy of complex distribution modeling. Specifically, OptDist trains\nseveral candidate sub-distribution networks in the distribution learning module\n(DLM) for modeling the probability distribution of CLTV. Then, a distribution\nselection module (DSM) is proposed to select the sub-distribution for each\nsample, thus making the selection automatically and adaptively. Besides, we\ndesign an alignment mechanism that connects both modules, which effectively\nguides the optimization. We conduct extensive experiments on both two public\nand one private dataset to verify that OptDist outperforms state-of-the-art\nbaselines. Furthermore, OptDist has been deployed on a large-scale financial\nplatform for customer acquisition marketing campaigns and the online\nexperiments also demonstrate the effectiveness of OptDist.\n","authors":["Yunpeng Weng","Xing Tang","Zhenhao Xu","Fuyuan Lyu","Dugang Liu","Zexu Sun","Xiuqiang He"],"pdf_url":"https://arxiv.org/pdf/2408.08585v1.pdf","comment":"CIKM 2024"},{"id":"http://arxiv.org/abs/2408.08564v1","updated":"2024-08-16T06:54:10Z","published":"2024-08-16T06:54:10Z","title":"Collaborative Cross-modal Fusion with Large Language Model for\n Recommendation","summary":" Despite the success of conventional collaborative filtering (CF) approaches\nfor recommendation systems, they exhibit limitations in leveraging semantic\nknowledge within the textual attributes of users and items. Recent focus on the\napplication of large language models for recommendation (LLM4Rec) has\nhighlighted their capability for effective semantic knowledge capture. However,\nthese methods often overlook the collaborative signals in user behaviors. Some\nsimply instruct-tune a language model, while others directly inject the\nembeddings of a CF-based model, lacking a synergistic fusion of different\nmodalities. 
To address these issues, we propose a framework of Collaborative\nCross-modal Fusion with Large Language Models, termed CCF-LLM, for\nrecommendation. In this framework, we translate the user-item interactions into\na hybrid prompt to encode both semantic knowledge and collaborative signals,\nand then employ an attentive cross-modal fusion strategy to effectively fuse\nlatent embeddings of both modalities. Extensive experiments demonstrate that\nCCF-LLM outperforms existing methods by effectively utilizing semantic and\ncollaborative signals in the LLM4Rec context.\n","authors":["Zhongzhou Liu","Hao Zhang","Kuicai Dong","Yuan Fang"],"pdf_url":"https://arxiv.org/pdf/2408.08564v1.pdf","comment":"10 pages, 4 figures, accepted by CIKM 2024"},{"id":"http://arxiv.org/abs/2408.08538v1","updated":"2024-08-16T05:51:00Z","published":"2024-08-16T05:51:00Z","title":"Don't Click the Bait: Title Debiasing News Recommendation via\n Cross-Field Contrastive Learning","summary":" News recommendation emerges as a primary means for users to access content of\ninterest from the vast amount of news. The title clickbait extensively exists\nin the news domain and increases the difficulty for news recommendation to offer\nsatisfactory services for users. Fortunately, we find that news abstract, as a\ncritical field of news, aligns cohesively with the news authenticity. To this\nend, we propose a Title Debiasing News Recommendation with Cross-field\nContrastive learning (TDNR-C2) to overcome the title bias by incorporating news\nabstract. Specifically, a multi-field knowledge extraction module is devised to\nextract multi-view knowledge about news from various fields. Afterwards, we\npresent a cross-field contrastive learning module to conduct bias removal via\ncontrasting learned knowledge from title and abstract fields. Experimental\nresults on a real-world dataset demonstrate the superiority of the proposed\nTDNR-C2 over existing state-of-the-art methods. Further analysis also indicates\nthe significance of news abstract for title debiasing.\n","authors":["Yijie Shu","Xiaokun Zhang","Youlin Wu","Bo Xu","Liang Yang","Hongfei Lin"],"pdf_url":"https://arxiv.org/pdf/2408.08538v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.16357v2","updated":"2024-08-16T05:16:31Z","published":"2024-07-23T10:00:45Z","title":"TWIN V2: Scaling Ultra-Long User Behavior Sequence Modeling for Enhanced\n CTR Prediction at Kuaishou","summary":" The significance of modeling long-term user interests for CTR prediction\ntasks in large-scale recommendation systems is progressively gaining attention\namong researchers and practitioners. Existing work, such as SIM and TWIN,\ntypically employs a two-stage approach to model long-term user behavior\nsequences for efficiency concerns. The first stage rapidly retrieves a subset\nof sequences related to the target item from a long sequence using a\nsearch-based mechanism namely the General Search Unit (GSU), while the second\nstage calculates the interest scores using the Exact Search Unit (ESU) on the\nretrieved results. Given the extensive length of user behavior sequences\nspanning the entire life cycle, potentially reaching up to 10^6 in scale, there\nis currently no effective solution for fully modeling such expansive user\ninterests. To overcome this issue, we introduced TWIN-V2, an enhancement of\nTWIN, where a divide-and-conquer approach is applied to compress life-cycle\nbehaviors and uncover more accurate and diverse user interests. 
Specifically, a\nhierarchical clustering method groups items with similar characteristics in\nlife-cycle behaviors into a single cluster during the offline phase. By\nlimiting the size of clusters, we can compress behavior sequences well beyond\nthe magnitude of 10^5 to a length manageable for online inference in GSU\nretrieval. Cluster-aware target attention extracts comprehensive and\nmulti-faceted long-term interests of users, thereby making the final\nrecommendation results more accurate and diverse. Extensive offline experiments\non a multi-billion-scale industrial dataset and online A/B tests have\ndemonstrated the effectiveness of TWIN-V2. Under an efficient deployment\nframework, TWIN-V2 has been successfully deployed to the primary traffic that\nserves hundreds of millions of daily active users at Kuaishou.\n","authors":["Zihua Si","Lin Guan","ZhongXiang Sun","Xiaoxue Zang","Jing Lu","Yiqun Hui","Xingchao Cao","Zeyu Yang","Yichen Zheng","Dewei Leng","Kai Zheng","Chenbin Zhang","Yanan Niu","Yang Song","Kun Gai"],"pdf_url":"https://arxiv.org/pdf/2407.16357v2.pdf","comment":"Accepted by CIKM 2024"},{"id":"http://arxiv.org/abs/2408.08521v1","updated":"2024-08-16T04:32:10Z","published":"2024-08-16T04:32:10Z","title":"MuRAR: A Simple and Effective Multimodal Retrieval and Answer Refinement\n Framework for Multimodal Question Answering","summary":" Recent advancements in retrieval-augmented generation (RAG) have demonstrated\nimpressive performance in the question-answering (QA) task. However, most\nprevious works predominantly focus on text-based answers. While some studies\naddress multimodal data, they still fall short in generating comprehensive\nmultimodal answers, particularly for explaining concepts or providing\nstep-by-step tutorials on how to accomplish specific goals. This capability is\nespecially valuable for applications such as enterprise chatbots and settings\nsuch as customer service and educational systems, where the answers are sourced\nfrom multimodal data. In this paper, we introduce a simple and effective\nframework named MuRAR (Multimodal Retrieval and Answer Refinement). MuRAR\nenhances text-based answers by retrieving relevant multimodal data and refining\nthe responses to create coherent multimodal answers. This framework can be\neasily extended to support multimodal answers in enterprise chatbots with\nminimal modifications. Human evaluation results indicate that multimodal\nanswers generated by MuRAR are more useful and readable compared to plain text\nanswers.\n","authors":["Zhengyuan Zhu","Daniel Lee","Hong Zhang","Sai Sree Harsha","Loic Feujio","Akash Maharaj","Yunyao Li"],"pdf_url":"https://arxiv.org/pdf/2408.08521v1.pdf","comment":"Preprint"}],"Machine Learning":[{"id":"http://arxiv.org/abs/2408.08873v1","updated":"2024-08-16T17:59:46Z","published":"2024-08-16T17:59:46Z","title":"Accelerating Giant Impact Simulations with Machine Learning","summary":" Constraining planet formation models based on the observed exoplanet\npopulation requires generating large samples of synthetic planetary systems,\nwhich can be computationally prohibitive. A significant bottleneck is\nsimulating the giant impact phase, during which planetary embryos evolve\ngravitationally and combine to form planets, which may themselves experience\nlater collisions. To accelerate giant impact simulations, we present a machine\nlearning (ML) approach to predicting collisional outcomes in multiplanet\nsystems. 
Trained on more than 500,000 $N$-body simulations of three-planet\nsystems, we develop an ML model that can accurately predict which two planets\nwill experience a collision, along with the state of the post-collision\nplanets, from a short integration of the system's initial conditions. Our model\ngreatly improves on non-ML baselines that rely on metrics from dynamics theory,\nwhich struggle to accurately predict which pair of planets will experience a\ncollision. By combining with a model for predicting long-term stability, we\ncreate an efficient ML-based giant impact emulator, which can predict the\noutcomes of giant impact simulations with a speedup of up to four orders of\nmagnitude. We expect our model to enable analyses that would not otherwise be\ncomputationally feasible. As such, we release our full training code, along\nwith an easy-to-use API for our collision outcome model and giant impact\nemulator.\n","authors":["Caleb Lammers","Miles Cranmer","Sam Hadden","Shirley Ho","Norman Murray","Daniel Tamayo"],"pdf_url":"https://arxiv.org/pdf/2408.08873v1.pdf","comment":"15 pages, 7 figures, 1 table. Easy-to-use API available at\n https://github.com/dtamayo/spock"},{"id":"http://arxiv.org/abs/2405.17243v2","updated":"2024-08-16T17:55:32Z","published":"2024-05-27T14:58:24Z","title":"Surprise-Adaptive Intrinsic Motivation for Unsupervised Reinforcement\n Learning","summary":" Both entropy-minimizing and entropy-maximizing (curiosity) objectives for\nunsupervised reinforcement learning (RL) have been shown to be effective in\ndifferent environments, depending on the environment's level of natural\nentropy. However, neither method alone results in an agent that will\nconsistently learn intelligent behavior across environments. In an effort to\nfind a single entropy-based method that will encourage emergent behaviors in\nany environment, we propose an agent that can adapt its objective online,\ndepending on the entropy conditions by framing the choice as a multi-armed\nbandit problem. We devise a novel intrinsic feedback signal for the bandit,\nwhich captures the agent's ability to control the entropy in its environment.\nWe demonstrate that such agents can learn to control entropy and exhibit\nemergent behaviors in both high- and low-entropy regimes and can learn skillful\nbehaviors in benchmark tasks. Videos of the trained agents and summarized\nfindings can be found on our project page\nhttps://sites.google.com/view/surprise-adaptive-agents\n","authors":["Adriana Hugessen","Roger Creus Castanyer","Faisal Mohamed","Glen Berseth"],"pdf_url":"https://arxiv.org/pdf/2405.17243v2.pdf","comment":"Published at the Reinforcement Learning Conference 2024"},{"id":"http://arxiv.org/abs/2408.08869v1","updated":"2024-08-16T17:54:09Z","published":"2024-08-16T17:54:09Z","title":"PEDAL: Enhancing Greedy Decoding with Large Language Models using\n Diverse Exemplars","summary":" Self-ensembling techniques with diverse reasoning paths such as\nSelf-Consistency have demonstrated remarkable gains in accuracy for Large\nLanguage Models (LLMs). However, such techniques depend on the availability of\nan accurate answer extraction process to aggregate across multiple outputs.\nMoreover, they acquire higher inference cost, in comparison to Greedy Decoding,\ndue to generation of relatively higher number of output tokens. Research has\nshown that the free form text outputs from Self-Consistency can be aggregated\nreliably using LLMs to produce the final output. 
Additionally, recent\nadvancements in LLM inference have demonstrated that the usage of diverse exemplars\nin prompts has the ability to induce diversity in the LLM outputs. Such proven\ntechniques can be easily extended to self-ensembling based approaches to\nachieve enhanced results in text generation. In this paper, we introduce PEDAL\n(Prompts based on Exemplar Diversity Aggregated using LLMs), a hybrid\nself-ensembling approach, that combines the strengths of diverse exemplar based\nprompts and LLM based aggregation to achieve improvement in overall\nperformance. On the publicly available SVAMP and ARC datasets, our experiments\nreveal that PEDAL can achieve better accuracy than Greedy Decoding based\nstrategies with lower inference cost compared to Self Consistency based\napproaches.\n","authors":["Sumanth Prabhu"],"pdf_url":"https://arxiv.org/pdf/2408.08869v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08868v1","updated":"2024-08-16T17:52:22Z","published":"2024-08-16T17:52:22Z","title":"A Hassle-free Algorithm for Private Learning in Practice: Don't Use Tree\n Aggregation, Use BLTs","summary":" The state-of-the-art for training on-device language models for mobile\nkeyboard applications combines federated learning (FL) with differential\nprivacy (DP) via the DP-Follow-the-Regularized-Leader (DP-FTRL) algorithm. Two\nvariants of DP-FTRL are used in practice, tree aggregation and matrix\nfactorization. However, tree aggregation suffers from significantly suboptimal\nprivacy/utility tradeoffs, while matrix mechanisms require expensive\noptimization parameterized by hard-to-estimate-in-advance constants, and high\nruntime memory costs. This paper extends the recently introduced Buffered Linear\nToeplitz (BLT) mechanism to multi-participation scenarios. Our BLT-DP-FTRL\nmaintains the ease-of-use advantages of tree aggregation, while essentially\nmatching matrix factorization in terms of utility and privacy. We evaluate\nBLT-DP-FTRL on the StackOverflow dataset, serving as a re-producible simulation\nbenchmark, and across four on-device language model tasks in a production FL\nsystem. Our empirical results highlight the advantages of the BLT mechanism and\nelevate the practicality and effectiveness of DP in real-world scenarios.\n","authors":["H. Brendan McMahan","Zheng Xu","Yanxiang Zhang"],"pdf_url":"https://arxiv.org/pdf/2408.08868v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08862v1","updated":"2024-08-16T17:44:02Z","published":"2024-08-16T17:44:02Z","title":"Visual Agents as Fast and Slow Thinkers","summary":" Achieving human-level intelligence requires refining cognitive distinctions\nbetween System 1 and System 2 thinking. While contemporary AI, driven by large\nlanguage models, demonstrates human-like traits, it falls short of genuine\ncognition. Transitioning from structured benchmarks to real-world scenarios\npresents challenges for visual agents, often leading to inaccurate and overly\nconfident responses. To address the challenge, we introduce FaST, which\nincorporates the Fast and Slow Thinking mechanism into visual agents. FaST\nemploys a switch adapter to dynamically select between System 1/2 modes,\ntailoring the problem-solving approach to different task complexity. It tackles\nuncertain and unseen objects by adjusting model confidence and integrating new\ncontextual data. 
With this novel design, we advocate a flexible system,\nhierarchical reasoning capabilities, and a transparent decision-making\npipeline, all of which contribute to its ability to emulate human-like\ncognitive processes in visual intelligence. Empirical results demonstrate that\nFaST outperforms various well-known baselines, achieving 80.8% accuracy over\nVQA^{v2} for visual question answering and 48.7% GIoU score over ReasonSeg for\nreasoning segmentation. Extensive\ntesting validates the efficacy and robustness of FaST's core components,\nshowcasing its potential to advance the development of cognitive visual agents\nin AI systems.\n","authors":["Guangyan Sun","Mingyu Jin","Zhenting Wang","Cheng-Long Wang","Siqi Ma","Qifan Wang","Ying Nian Wu","Yongfeng Zhang","Dongfang Liu"],"pdf_url":"https://arxiv.org/pdf/2408.08862v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08859v1","updated":"2024-08-16T17:41:35Z","published":"2024-08-16T17:41:35Z","title":"Stochastic Bandits Robust to Adversarial Attacks","summary":" This paper investigates stochastic multi-armed bandit algorithms that are\nrobust to adversarial attacks, where an attacker can first observe the\nlearner's action and {then} alter their reward observation. We study two cases\nof this model, with or without the knowledge of an attack budget $C$, defined\nas an upper bound of the summation of the difference between the actual and\naltered rewards. For both cases, we devise two types of algorithms with regret\nbounds having additive or multiplicative $C$ dependence terms. For the known\nattack budget case, we prove our algorithms achieve the regret bound of\n${O}((K/\\Delta)\\log T + KC)$ and $\\tilde{O}(\\sqrt{KTC})$ for the additive and\nmultiplicative $C$ terms, respectively, where $K$ is the number of arms, $T$ is\nthe time horizon, $\\Delta$ is the gap between the expected rewards of the\noptimal arm and the second-best arm, and $\\tilde{O}$ hides the logarithmic\nfactors. For the unknown case, we prove our algorithms achieve the regret bound\nof $\\tilde{O}(\\sqrt{KT} + KC^2)$ and $\\tilde{O}(KC\\sqrt{T})$ for the additive\nand multiplicative $C$ terms, respectively. In addition to these upper bound\nresults, we provide several lower bounds showing the tightness of our bounds\nand the optimality of our algorithms. These results delineate an intrinsic\nseparation between the bandits with attacks and corruption models [Lykouris et\nal., 2018].\n","authors":["Xuchuang Wang","Jinhang Zuo","Xutong Liu","John C. S. Lui","Mohammad Hajiesmaili"],"pdf_url":"https://arxiv.org/pdf/2408.08859v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.02304v3","updated":"2024-08-16T17:28:08Z","published":"2023-10-03T17:59:32Z","title":"Self-Taught Optimizer (STOP): Recursively Self-Improving Code Generation","summary":" Several recent advances in AI systems solve problems by providing a\n\"scaffolding\" program that structures multiple calls to language models (LMs)\nto generate better outputs. A scaffolding program is written in a programming\nlanguage such as Python. In this work, we use a language-model-infused\nscaffolding program to improve itself. We start with a seed \"improver\" that\nimproves an input program according to a given utility function by querying an\nLM several times and returning the best solution. We then run this seed\nimprover to improve itself. 
Across a small set of downstream tasks, the\nresulting improved improver generates programs with significantly better\nperformance than its seed improver. A variety of self-improvement strategies\nare proposed by the language model, including beam search, genetic algorithms,\nand simulated annealing. Since the language models themselves are not altered,\nthis is not full recursive self-improvement. Nonetheless, it demonstrates that\na modern language model, GPT-4 in our experiments, is capable of writing code\nthat can call itself to improve itself. We consider concerns around the\ndevelopment of self-improving technologies and evaluate the frequency with\nwhich the generated code bypasses a sandbox.\n","authors":["Eric Zelikman","Eliana Lorch","Lester Mackey","Adam Tauman Kalai"],"pdf_url":"https://arxiv.org/pdf/2310.02304v3.pdf","comment":"Published as a conference paper at COLM 2024"},{"id":"http://arxiv.org/abs/2408.08852v1","updated":"2024-08-16T17:26:42Z","published":"2024-08-16T17:26:42Z","title":"GeoTransformer: Enhancing Urban Forecasting with Geospatial Attention\n Mechanisms","summary":" Recent advancements have focused on encoding urban spatial information into\nhigh-dimensional spaces, with notable efforts dedicated to integrating\nsociodemographic data and satellite imagery. These efforts have established\nfoundational models in this field. However, the effective utilization of these\nspatial representations for urban forecasting applications remains\nunder-explored. To address this gap, we introduce GeoTransformer, a novel\nstructure that synergizes the Transformer architecture with geospatial\nstatistics prior. GeoTransformer employs an innovative geospatial attention\nmechanism to incorporate extensive urban information and spatial dependencies\ninto a unified predictive model. Specifically, we compute geospatial weighted\nattention scores between the target region and surrounding regions and leverage\nthe integrated urban information for predictions. Extensive experiments on GDP\nand ride-share demand prediction tasks demonstrate that GeoTransformer\nsignificantly outperforms existing baseline models, showcasing its potential to\nenhance urban forecasting tasks.\n","authors":["Yuhao Jia","Zile Wu","Shengao Yi","Yifei Sun"],"pdf_url":"https://arxiv.org/pdf/2408.08852v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.09173v2","updated":"2024-08-16T17:24:54Z","published":"2024-06-13T14:35:11Z","title":"Potion: Towards Poison Unlearning","summary":" Adversarial attacks by malicious actors on machine learning systems, such as\nintroducing poison triggers into training datasets, pose significant risks. The\nchallenge in resolving such an attack arises in practice when only a subset of\nthe poisoned data can be identified. This necessitates the development of\nmethods to remove, i.e. unlearn, poison triggers from already trained models\nwith only a subset of the poison data available. The requirements for this task\nsignificantly deviate from privacy-focused unlearning where all of the data to\nbe forgotten by the model is known. Previous work has shown that the\nundiscovered poisoned samples lead to a failure of established unlearning\nmethods, with only one method, Selective Synaptic Dampening (SSD), showing\nlimited success. Even full retraining, after the removal of the identified\npoison, cannot address this challenge as the undiscovered poison samples lead\nto a reintroduction of the poison trigger in the model. 
Our work addresses two\nkey challenges to advance the state of the art in poison unlearning. First, we\nintroduce a novel outlier-resistant method, based on SSD, that significantly\nimproves model protection and unlearning performance. Second, we introduce\nPoison Trigger Neutralisation (PTN) search, a fast, parallelisable,\nhyperparameter search that utilises the characteristic \"unlearning versus model\nprotection\" trade-off to find suitable hyperparameters in settings where the\nforget set size is unknown and the retain set is contaminated. We benchmark our\ncontributions using ResNet-9 on CIFAR10 and WideResNet-28x10 on CIFAR100.\nExperimental results show that our method heals 93.72% of poison compared to\nSSD with 83.41% and full retraining with 40.68%. We achieve this while also\nlowering the average model accuracy drop caused by unlearning from 5.68% (SSD)\nto 1.41% (ours).\n","authors":["Stefan Schoepf","Jack Foster","Alexandra Brintrup"],"pdf_url":"https://arxiv.org/pdf/2406.09173v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.12041v3","updated":"2024-08-16T17:22:23Z","published":"2023-09-21T13:09:10Z","title":"S-BDT: Distributed Differentially Private Boosted Decision Trees","summary":" We introduce S-BDT: a novel $(\\varepsilon,\\delta)$-differentially private\ndistributed gradient boosted decision tree (GBDT) learner that improves the\nprotection of single training data points (privacy) while achieving meaningful\nlearning goals, such as accuracy or regression error (utility). S-BDT uses less\nnoise by relying on non-spherical multivariate Gaussian noise, for which we\nshow tight subsampling bounds for privacy amplification and incorporate that\ninto a R\\'enyi filter for individual privacy accounting. We experimentally\nreach the same utility while saving $50\\%$ in terms of epsilon for $\\varepsilon\n\\le 0.5$ on the Abalone regression dataset (dataset size $\\approx 4K$), saving\n$30\\%$ in terms of epsilon for $\\varepsilon \\le 0.08$ for the Adult\nclassification dataset (dataset size $\\approx 50K$), and saving $30\\%$ in terms\nof epsilon for $\\varepsilon\\leq0.03$ for the Spambase classification dataset\n(dataset size $\\approx 5K$). Moreover, we show that for situations where a GBDT\nis learning a stream of data that originates from different subpopulations\n(non-IID), S-BDT improves the saving of epsilon even further.\n","authors":["Thorsten Peinemann","Moritz Kirschte","Joshua Stock","Carlos Cotrini","Esfandiar Mohammadi"],"pdf_url":"https://arxiv.org/pdf/2309.12041v3.pdf","comment":"The first two authors equally contributed to this work"},{"id":"http://arxiv.org/abs/2408.08847v1","updated":"2024-08-16T17:19:07Z","published":"2024-08-16T17:19:07Z","title":"HistoGym: A Reinforcement Learning Environment for Histopathological\n Image Analysis","summary":" In pathological research, education, and clinical practice, the\ndecision-making process based on pathological images is critically important.\nThis significance extends to digital pathology image analysis: its adequacy is\ndemonstrated by the extensive information contained within tissue structures,\nwhich is essential for accurate cancer classification and grading.\nAdditionally, its necessity is highlighted by the inherent requirement for\ninterpretability in the conclusions generated by algorithms. For humans,\ndetermining tumor type and grade typically involves multi-scale analysis, which\npresents a significant challenge for AI algorithms. 
Traditional patch-based\nmethods are inadequate for modeling such complex structures, as they fail to\ncapture the intricate, multi-scale information inherent in whole slide images.\nConsequently, there is a pressing need for advanced AI techniques capable of\nefficiently and accurately replicating this complex analytical process. To\naddress this issue, we introduce HistoGym, an open-source reinforcement\nlearning environment for histopathological image analysis. Following OpenAI Gym\nAPIs, HistoGym aims to foster whole slide image diagnosis by mimicking the\nreal-life processes of doctors. Leveraging the pyramid feature of WSIs and the\nOpenSlide API, HistoGym provides a unified framework for various clinical\ntasks, including tumor detection and classification. We detail the observation,\naction, and reward specifications tailored for the histopathological image\nanalysis domain and provide an open-source Python-based interface for both\nclinicians and researchers. To accommodate different clinical demands, we offer\nvarious scenarios for different organs and cancers, including both WSI-based\nand selected region-based scenarios, showcasing several noteworthy results.\n","authors":["Zhi-Bo Liu","Xiaobo Pang","Jizhao Wang","Shuai Liu","Chen Li"],"pdf_url":"https://arxiv.org/pdf/2408.08847v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.15455v2","updated":"2024-08-16T17:12:27Z","published":"2024-03-18T23:41:52Z","title":"Improving Sampling Methods for Fine-tuning SentenceBERT in Text Streams","summary":" The proliferation of textual data on the Internet presents a unique\nopportunity for institutions and companies to monitor public opinion about\ntheir services and products. Given the rapid generation of such data, the text\nstream mining setting, which handles sequentially arriving, potentially\ninfinite text streams, is often more suitable than traditional batch learning.\nWhile pre-trained language models are commonly employed for their high-quality\ntext vectorization capabilities in streaming contexts, they face challenges\nadapting to concept drift - the phenomenon where the data distribution changes\nover time, adversely affecting model performance. Addressing the issue of\nconcept drift, this study explores the efficacy of seven text sampling methods\ndesigned to selectively fine-tune language models, thereby mitigating\nperformance degradation. We precisely assess the impact of these methods on\nfine-tuning the SBERT model using four different loss functions. Our\nevaluation, focused on Macro F1-score and elapsed time, employs two text stream\ndatasets and an incremental SVM classifier to benchmark performance. Our\nfindings indicate that Softmax loss and Batch All Triplets loss are\nparticularly effective for text stream classification, demonstrating that\nlarger sample sizes generally correlate with improved macro F1-scores. 
Notably,\nour proposed WordPieceToken ratio sampling method significantly enhances\nperformance with the identified loss functions, surpassing baseline results.\n","authors":["Cristiano Mesquita Garcia","Alessandro Lameiras Koerich","Alceu de Souza Britto Jr","Jean Paul Barddal"],"pdf_url":"https://arxiv.org/pdf/2403.15455v2.pdf","comment":"Accepted for presentation at the 27th International Conference on\n Pattern Recognition (ICPR) 2024"},{"id":"http://arxiv.org/abs/2408.08845v1","updated":"2024-08-16T17:06:07Z","published":"2024-08-16T17:06:07Z","title":"Shapley Marginal Surplus for Strong Models","summary":" Shapley values have seen widespread use in machine learning as a way to\nexplain model predictions and estimate the importance of covariates. Accurately\nexplaining models is critical in real-world models to both aid in decision\nmaking and to infer the properties of the true data-generating process (DGP).\nIn this paper, we demonstrate that while model-based Shapley values might be\naccurate explainers of model predictions, machine learning models themselves\nare often poor explainers of the DGP even if the model is highly accurate.\nParticularly in the presence of interrelated or noisy variables, the output of\na highly predictive model may fail to account for these relationships. This\nimplies explanations of a trained model's behavior may fail to provide\nmeaningful insight into the DGP. In this paper we introduce a novel variable\nimportance algorithm, Shapley Marginal Surplus for Strong Models, that samples\nthe space of possible models to come up with an inferential measure of feature\nimportance. We compare this method to other popular feature importance methods,\nboth Shapley-based and non-Shapley based, and demonstrate significant\noutperformance in inferential capabilities relative to other methods.\n","authors":["Daniel de Marchi","Michael Kosorok","Scott de Marchi"],"pdf_url":"https://arxiv.org/pdf/2408.08845v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.07246v2","updated":"2024-08-16T16:46:32Z","published":"2024-08-14T01:16:40Z","title":"ChemVLM: Exploring the Power of Multimodal Large Language Models in\n Chemistry Area","summary":" Large Language Models (LLMs) have achieved remarkable success and have been\napplied across various scientific fields, including chemistry. However, many\nchemical tasks require the processing of visual information, which cannot be\nsuccessfully handled by existing chemical LLMs. This brings a growing need for\nmodels capable of integrating multimodal information in the chemical domain. In\nthis paper, we introduce \\textbf{ChemVLM}, an open-source chemical multimodal\nlarge language model specifically designed for chemical applications. ChemVLM\nis trained on a carefully curated bilingual multimodal dataset that enhances\nits ability to understand both textual and visual chemical information,\nincluding molecular structures, reactions, and chemistry examination questions.\nWe develop three datasets for comprehensive evaluation, tailored to Chemical\nOptical Character Recognition (OCR), Multimodal Chemical Reasoning (MMCR), and\nMultimodal Molecule Understanding tasks. We benchmark ChemVLM against a range\nof open-source and proprietary multimodal large language models on various\ntasks. Experimental results demonstrate that ChemVLM achieves competitive\nperformance across all evaluated tasks. 
Our model can be found at\nhttps://huggingface.co/AI4Chem/ChemVLM-26B.\n","authors":["Junxian Li","Di Zhang","Xunzhi Wang","Zeying Hao","Jingdi Lei","Qian Tan","Cai Zhou","Wei Liu","Yaotian Yang","Xinrui Xiong","Weiyun Wang","Zhe Chen","Wenhai Wang","Wei Li","Shufei Zhang","Mao Su","Wanli Ouyang","Yuqiang Li","Dongzhan Zhou"],"pdf_url":"https://arxiv.org/pdf/2408.07246v2.pdf","comment":"11 pages, updated version"},{"id":"http://arxiv.org/abs/2408.08837v1","updated":"2024-08-16T16:41:27Z","published":"2024-08-16T16:41:27Z","title":"Entropy Coding of Unordered Data Structures","summary":" We present shuffle coding, a general method for optimal compression of\nsequences of unordered objects using bits-back coding. Data structures that can\nbe compressed using shuffle coding include multisets, graphs, hypergraphs, and\nothers. We release an implementation that can easily be adapted to different\ndata types and statistical models, and demonstrate that our implementation\nachieves state-of-the-art compression rates on a range of graph datasets\nincluding molecular data.\n","authors":["Julius Kunze","Daniel Severo","Giulio Zani","Jan-Willem van de Meent","James Townsend"],"pdf_url":"https://arxiv.org/pdf/2408.08837v1.pdf","comment":"Published at ICLR 2024"},{"id":"http://arxiv.org/abs/2204.08335v3","updated":"2024-08-16T16:40:52Z","published":"2022-04-18T14:27:31Z","title":"Active Learning with Weak Supervision for Gaussian Processes","summary":" Annotating data for supervised learning can be costly. When the annotation\nbudget is limited, active learning can be used to select and annotate those\nobservations that are likely to give the most gain in model performance. We\npropose an active learning algorithm that, in addition to selecting which\nobservation to annotate, selects the precision of the annotation that is\nacquired. Assuming that annotations with low precision are cheaper to obtain,\nthis allows the model to explore a larger part of the input space, with the\nsame annotation budget. We build our acquisition function on the previously\nproposed BALD objective for Gaussian Processes, and empirically demonstrate the\ngains of being able to adjust the annotation precision in the active learning\nloop.\n","authors":["Amanda Olmin","Jakob Lindqvist","Lennart Svensson","Fredrik Lindsten"],"pdf_url":"https://arxiv.org/pdf/2204.08335v3.pdf","comment":"This version of the contribution has been accepted for publication,\n after peer review but is not the Version of Record and does not reflect\n post-acceptance improvements, or any corrections. The Version of Record is\n available online at: http://dx.doi.org/10.1007/978-981-99-1642-9_17. Use of\n this Accepted Version is subject to the publisher's Accepted Manuscript terms\n of use"},{"id":"http://arxiv.org/abs/2407.03194v4","updated":"2024-08-16T16:37:09Z","published":"2024-07-03T15:26:02Z","title":"Prediction Instability in Machine Learning Ensembles","summary":" In machine learning ensembles predictions from multiple models are\naggregated. Despite widespread use and strong performance of ensembles in\napplied problems little is known about the mathematical properties of\naggregating models and associated consequences for safe, explainable use of\nsuch models. In this paper we prove a theorem that shows that any ensemble will\nexhibit at least one of the following forms of prediction instability. 
It will\neither ignore agreement among all underlying models, change its mind when none\nof the underlying models have done so, or be manipulable through inclusion or\nexclusion of options it would never actually predict. As a consequence,\nensemble aggregation procedures will always need to balance the benefits of\ninformation use against the risk of these prediction instabilities. This\nanalysis also sheds light on what specific forms of prediction instability to\nexpect from particular ensemble algorithms; for example popular tree ensembles\nlike random forest, or xgboost will violate basic, intuitive fairness\nproperties. Finally, we show that this can be ameliorated by using consistent\nmodels in asymptotic conditions.\n","authors":["Jeremy Kedziora"],"pdf_url":"https://arxiv.org/pdf/2407.03194v4.pdf","comment":"15 pages, uses a modified version of ICML2024.sty"},{"id":"http://arxiv.org/abs/2311.00201v2","updated":"2024-08-16T16:34:00Z","published":"2023-11-01T00:15:18Z","title":"Federated Natural Policy Gradient and Actor Critic Methods for\n Multi-task Reinforcement Learning","summary":" Federated reinforcement learning (RL) enables collaborative decision making\nof multiple distributed agents without sharing local data trajectories. In this\nwork, we consider a multi-task setting, in which each agent has its own private\nreward function corresponding to different tasks, while sharing the same\ntransition kernel of the environment. Focusing on infinite-horizon Markov\ndecision processes, the goal is to learn a globally optimal policy that\nmaximizes the sum of the discounted total rewards of all the agents in a\ndecentralized manner, where each agent only communicates with its neighbors\nover some prescribed graph topology.\n We develop federated vanilla and entropy-regularized natural policy gradient\n(NPG) methods in the tabular setting under softmax parameterization, where\ngradient tracking is applied to estimate the global Q-function to mitigate the\nimpact of imperfect information sharing. We establish non-asymptotic global\nconvergence guarantees under exact policy evaluation, where the rates are\nnearly independent of the size of the state-action space and illuminate the\nimpacts of network size and connectivity. To the best of our knowledge, this is\nthe first time that near dimension-free global convergence is established for\nfederated multi-task RL using policy optimization. We further go beyond the\ntabular setting by proposing a federated natural actor critic (NAC) method for\nmulti-task RL with function approximation, and establish its finite-time sample\ncomplexity taking the errors of function approximation into account.\n","authors":["Tong Yang","Shicong Cen","Yuting Wei","Yuxin Chen","Yuejie Chi"],"pdf_url":"https://arxiv.org/pdf/2311.00201v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.01008v3","updated":"2024-08-16T16:26:11Z","published":"2023-03-31T16:11:56Z","title":"Self-Supervised Multimodal Learning: A Survey","summary":" Multimodal learning, which aims to understand and analyze information from\nmultiple modalities, has achieved substantial progress in the supervised regime\nin recent years. However, the heavy dependence on data paired with expensive\nhuman annotations impedes scaling up models. Meanwhile, given the availability\nof large-scale unannotated data in the wild, self-supervised learning has\nbecome an attractive strategy to alleviate the annotation bottleneck. 
Building\non these two directions, self-supervised multimodal learning (SSML) provides\nways to learn from raw multimodal data. In this survey, we provide a\ncomprehensive review of the state-of-the-art in SSML, in which we elucidate\nthree major challenges intrinsic to self-supervised learning with multimodal\ndata: (1) learning representations from multimodal data without labels, (2)\nfusion of different modalities, and (3) learning with unaligned data. We then\ndetail existing solutions to these challenges. Specifically, we consider (1)\nobjectives for learning from multimodal unlabeled data via self-supervision,\n(2) model architectures from the perspective of different multimodal fusion\nstrategies, and (3) pair-free learning strategies for coarse-grained and\nfine-grained alignment. We also review real-world applications of SSML\nalgorithms in diverse fields such as healthcare, remote sensing, and machine\ntranslation. Finally, we discuss challenges and future directions for SSML. A\ncollection of related resources can be found at:\nhttps://github.com/ys-zong/awesome-self-supervised-multimodal-learning.\n","authors":["Yongshuo Zong","Oisin Mac Aodha","Timothy Hospedales"],"pdf_url":"https://arxiv.org/pdf/2304.01008v3.pdf","comment":"Accepted to IEEE T-PAMI"},{"id":"http://arxiv.org/abs/2401.11325v3","updated":"2024-08-16T16:18:28Z","published":"2024-01-20T21:09:27Z","title":"Detecting Hidden Triggers: Mapping Non-Markov Reward Functions to Markov","summary":" Many Reinforcement Learning algorithms assume a Markov reward function to\nguarantee optimality. However, not all reward functions are Markov. This paper\nproposes a framework for mapping non-Markov reward functions into equivalent\nMarkov ones by learning specialized reward automata, Reward Machines. Unlike\nthe general practice of learning Reward Machines, we do not require a set of\nhigh-level propositional symbols from which to learn. Rather, we learn hidden\ntriggers, directly from data, that construct them. We demonstrate the\nimportance of learning Reward Machines over their Deterministic Finite-State\nAutomata counterparts given their ability to model reward dependencies. We\nformalize this distinction in our learning objective. Our mapping process is\nconstructed as an Integer Linear Programming problem. We prove that our\nmappings form a suitable proxy for maximizing reward expectations. We\nempirically validate our approach by learning black-box, non-Markov reward\nfunctions in the Officeworld domain. Additionally, we demonstrate the\neffectiveness of learning reward dependencies in a new domain, Breakfastworld.\n","authors":["Gregory Hyde","Eugene Santos Jr"],"pdf_url":"https://arxiv.org/pdf/2401.11325v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08824v1","updated":"2024-08-16T16:15:57Z","published":"2024-08-16T16:15:57Z","title":"LEVIS: Large Exact Verifiable Input Spaces for Neural Networks","summary":" The robustness of neural networks is paramount in safety-critical\napplications. While most current robustness verification methods assess the\nworst-case output under the assumption that the input space is known,\nidentifying a verifiable input space $\\mathcal{C}$, where no adversarial\nexamples exist, is crucial for effective model selection, robustness\nevaluation, and the development of reliable control strategies. 
To address this\nchallenge, we introduce a novel framework, $\\texttt{LEVIS}$, comprising\n$\\texttt{LEVIS}$-$\\alpha$ and $\\texttt{LEVIS}$-$\\beta$.\n$\\texttt{LEVIS}$-$\\alpha$ locates the largest possible verifiable ball within\nthe central region of $\\mathcal{C}$ that intersects at least two boundaries. In\ncontrast, $\\texttt{LEVIS}$-$\\beta$ integrates multiple verifiable balls to\nencapsulate the entirety of the verifiable space comprehensively. Our\ncontributions are threefold: (1) We propose $\\texttt{LEVIS}$ equipped with\nthree pioneering techniques that identify the maximum verifiable ball and the\nnearest adversarial point along collinear or orthogonal directions. (2) We\noffer a theoretical analysis elucidating the properties of the verifiable balls\nacquired through $\\texttt{LEVIS}$-$\\alpha$ and $\\texttt{LEVIS}$-$\\beta$. (3) We\nvalidate our methodology across diverse applications, including electrical\npower flow regression and image classification, showcasing performance\nenhancements and visualizations of the searching characteristics.\n","authors":["Mohamad Fares El Hajj Chehade","Brian Wesley Bell","Russell Bent","Hao Zhu","Wenting Li"],"pdf_url":"https://arxiv.org/pdf/2408.08824v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08823v1","updated":"2024-08-16T16:15:18Z","published":"2024-08-16T16:15:18Z","title":"Optimal Symmetries in Binary Classification","summary":" We explore the role of group symmetries in binary classification tasks,\npresenting a novel framework that leverages the principles of Neyman-Pearson\noptimality. Contrary to the common intuition that larger symmetry groups lead\nto improved classification performance, our findings show that selecting the\nappropriate group symmetries is crucial for optimising generalisation and\nsample efficiency. We develop a theoretical foundation for designing group\nequivariant neural networks that align the choice of symmetries with the\nunderlying probability distributions of the data. Our approach provides a\nunified methodology for improving classification accuracy across a broad range\nof applications by carefully tailoring the symmetry group to the specific\ncharacteristics of the problem. Theoretical analysis and experimental results\ndemonstrate that optimal classification performance is not always associated\nwith the largest equivariant groups possible in the domain, even when the\nlikelihood ratio is invariant under one of its proper subgroups, but rather\nwith those subgroups themselves. This work offers insights and practical\nguidelines for constructing more effective group equivariant architectures in\ndiverse machine-learning contexts.\n","authors":["Vishal S. Ngairangbam","Michael Spannowsky"],"pdf_url":"https://arxiv.org/pdf/2408.08823v1.pdf","comment":"13 pages, 1 figure, 2 tables"},{"id":"http://arxiv.org/abs/2405.15019v2","updated":"2024-08-16T15:56:46Z","published":"2024-05-23T19:44:03Z","title":"Agentic Skill Discovery","summary":" Language-conditioned robotic skills make it possible to apply the high-level\nreasoning of Large Language Models (LLMs) to low-level robotic control. A\nremaining challenge is to acquire a diverse set of fundamental skills. Existing\napproaches either manually decompose a complex task into atomic robotic actions\nin a top-down fashion, or bootstrap as many combinations as possible in a\nbottom-up fashion to cover a wider range of task possibilities. These\ndecompositions or combinations, however, require an initial skill library. 
For\nexample, a ``grasping'' capability can never emerge from a skill library\ncontaining only diverse ``pushing'' skills. Existing skill discovery techniques\nwith reinforcement learning acquire skills by an exhaustive exploration but\noften yield non-meaningful behaviors. In this study, we introduce a novel\nframework for skill discovery that is entirely driven by LLMs. The framework\nbegins with an LLM generating task proposals based on the provided scene\ndescription and the robot's configurations, aiming to incrementally acquire new\nskills upon task completion. For each proposed task, a series of reinforcement\nlearning processes are initiated, utilizing reward and success determination\nfunctions sampled by the LLM to develop the corresponding policy. The\nreliability and trustworthiness of learned behaviors are further ensured by an\nindependent vision-language model. We show that starting with zero skill, the\nskill library emerges and expands to more and more meaningful and reliable\nskills, enabling the robot to efficiently further propose and complete advanced\ntasks. Project page: \\url{https://agentic-skill-discovery.github.io}.\n","authors":["Xufeng Zhao","Cornelius Weber","Stefan Wermter"],"pdf_url":"https://arxiv.org/pdf/2405.15019v2.pdf","comment":"Webpage see https://agentic-skill-discovery.github.io/"},{"id":"http://arxiv.org/abs/2408.08815v1","updated":"2024-08-16T15:49:30Z","published":"2024-08-16T15:49:30Z","title":"An Empirical Examination of Balancing Strategy for Counterfactual\n Estimation on Time Series","summary":" Counterfactual estimation from observations represents a critical endeavor in\nnumerous application fields, such as healthcare and finance, with the primary\nchallenge being the mitigation of treatment bias. The balancing strategy aimed\nat reducing covariate disparities between different treatment groups serves as\na universal solution. However, when it comes to the time series data, the\neffectiveness of balancing strategies remains an open question, with a thorough\nanalysis of the robustness and applicability of balancing strategies still\nlacking. This paper revisits counterfactual estimation in the temporal setting\nand provides a brief overview of recent advancements in balancing strategies.\nMore importantly, we conduct a critical empirical examination for the\neffectiveness of the balancing strategies within the realm of temporal\ncounterfactual estimation in various settings on multiple datasets. Our\nfindings could be of significant interest to researchers and practitioners and\ncall for a reexamination of the balancing strategy in time series settings.\n","authors":["Qiang Huang","Chuizheng Meng","Defu Cao","Biwei Huang","Yi Chang","Yan Liu"],"pdf_url":"https://arxiv.org/pdf/2408.08815v1.pdf","comment":"ICML 2024 Carema Ready Version. 20 Pages, 12 Figures, 10 Tables"},{"id":"http://arxiv.org/abs/2408.08812v1","updated":"2024-08-16T15:47:08Z","published":"2024-08-16T15:47:08Z","title":"CAT: Caution Aware Transfer in Reinforcement Learning via Distributional\n Risk","summary":" Transfer learning in reinforcement learning (RL) has become a pivotal\nstrategy for improving data efficiency in new, unseen tasks by utilizing\nknowledge from previously learned tasks. This approach is especially beneficial\nin real-world deployment scenarios where computational resources are\nconstrained and agents must adapt rapidly to novel environments. 
However,\ncurrent state-of-the-art methods often fall short in ensuring safety during the\ntransfer process, particularly when unforeseen risks emerge in the deployment\nphase. In this work, we address these limitations by introducing a novel\nCaution-Aware Transfer Learning (CAT) framework. Unlike traditional approaches\nthat limit risk considerations to mean-variance, we define \"caution\" as a more\ngeneralized and comprehensive notion of risk. Our core innovation lies in\noptimizing a weighted sum of reward return and caution-based on state-action\noccupancy measures-during the transfer process, allowing for a rich\nrepresentation of diverse risk factors. To the best of our knowledge, this is\nthe first work to explore the optimization of such a generalized risk notion\nwithin the context of transfer RL. Our contributions are threefold: (1) We\npropose a Caution-Aware Transfer (CAT) framework that evaluates source policies\nwithin the test environment and constructs a new policy that balances reward\nmaximization and caution. (2) We derive theoretical sub-optimality bounds for\nour method, providing rigorous guarantees of its efficacy. (3) We empirically\nvalidate CAT, demonstrating that it consistently outperforms existing methods\nby delivering safer policies under varying risk conditions in the test tasks.\n","authors":["Mohamad Fares El Hajj Chehade","Amrit Singh Bedi","Amy Zhang","Hao Zhu"],"pdf_url":"https://arxiv.org/pdf/2408.08812v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08808v1","updated":"2024-08-16T15:41:43Z","published":"2024-08-16T15:41:43Z","title":"Constructing Domain-Specific Evaluation Sets for LLM-as-a-judge","summary":" Large Language Models (LLMs) have revolutionized the landscape of machine\nlearning, yet current benchmarks often fall short in capturing the diverse\nbehavior of these models in real-world applications. A benchmark's usefulness\nis determined by its ability to clearly differentiate between models of varying\ncapabilities (separability) and closely align with human preferences. Existing\nframeworks like Alpaca-Eval 2.0 LC\n\\cite{dubois2024lengthcontrolledalpacaevalsimpleway} and Arena-Hard v0.1\n\\cite{li2024crowdsourced} are limited by their focus on general-purpose queries\nand lack of diversity across domains such as law, medicine, and multilingual\ncontexts. In this paper, we address these limitations by introducing a novel\ndata pipeline that curates diverse, domain-specific evaluation sets tailored\nfor LLM-as-a-Judge frameworks. Our approach leverages a combination of manual\ncuration, semi-supervised learning to generate clusters, and stratified\nsampling to ensure balanced representation across a wide range of domains and\nlanguages. The resulting evaluation set, which includes 1573 samples across 14\ncategories, demonstrates high separability (84\\%) across ten top-ranked models,\nand agreement (84\\%) with Chatbot Arena and (0.915) Spearman correlation. The\nagreement values are 9\\% better than Arena Hard and 20\\% better than AlpacaEval\n2.0 LC, while the Spearman coefficient is 0.7 more than the next best\nbenchmark, showcasing a significant improvement in the usefulness of the\nbenchmark. We further provide an open-source evaluation tool that enables\nfine-grained analysis of model performance across user-defined categories,\noffering valuable insights for practitioners. 
This work contributes to the\nongoing effort to enhance the transparency, diversity, and effectiveness of LLM\nevaluation methodologies.\n","authors":["Ravi Raju","Swayambhoo Jain","Bo Li","Jonathan Li","Urmish Thakkar"],"pdf_url":"https://arxiv.org/pdf/2408.08808v1.pdf","comment":"14 pages, 8 figures"},{"id":"http://arxiv.org/abs/2405.17391v2","updated":"2024-08-16T15:29:52Z","published":"2024-05-27T17:44:33Z","title":"Dataset-learning duality and emergent criticality","summary":" In artificial neural networks, the activation dynamics of non-trainable\nvariables is strongly coupled to the learning dynamics of trainable variables.\nDuring the activation pass, the boundary neurons (e.g., input neurons) are\nmapped to the bulk neurons (e.g., hidden neurons), and during the learning\npass, both bulk and boundary neurons are mapped to changes in trainable\nvariables (e.g., weights and biases). For example, in feed-forward neural\nnetworks, forward propagation is the activation pass and backward propagation\nis the learning pass. We show that a composition of the two maps establishes a\nduality map between a subspace of non-trainable boundary variables (e.g.,\ndataset) and a tangent subspace of trainable variables (i.e., learning). In\ngeneral, the dataset-learning duality is a complex non-linear map between\nhigh-dimensional spaces, but in a learning equilibrium, the problem can be\nlinearized and reduced to many weakly coupled one-dimensional problems. We use\nthe duality to study the emergence of criticality, or the power-law\ndistributions of fluctuations of the trainable variables. In particular, we\nshow that criticality can emerge in the learning system even from the dataset\nin a non-critical state, and that the power-law distribution can be modified by\nchanging either the activation function or the loss function.\n","authors":["Ekaterina Kukleva","Vitaly Vanchurin"],"pdf_url":"https://arxiv.org/pdf/2405.17391v2.pdf","comment":"29 pages, 9 figures, 1 table, minor corrections"},{"id":"http://arxiv.org/abs/2408.08799v1","updated":"2024-08-16T15:16:35Z","published":"2024-08-16T15:16:35Z","title":"Representation Learning of Geometric Trees","summary":" Geometric trees are characterized by their tree-structured layout and\nspatially constrained nodes and edges, which significantly impacts their\ntopological attributes. This inherent hierarchical structure plays a crucial\nrole in domains such as neuron morphology and river geomorphology, but\ntraditional graph representation methods often overlook these specific\ncharacteristics of tree structures. To address this, we introduce a new\nrepresentation learning framework tailored for geometric trees. It first\nfeatures a unique message passing neural network, which is both provably\ngeometrical structure-recoverable and rotation-translation invariant. To\naddress the data label scarcity issue, our approach also includes two\ninnovative training targets that reflect the hierarchical ordering and\ngeometric structure of these geometric trees. This enables fully\nself-supervised learning without explicit labels. 
We validate our method's\neffectiveness on eight real-world datasets, demonstrating its capability to\nrepresent geometric trees.\n","authors":["Zheng Zhang","Allen Zhang","Ruth Nelson","Giorgio Ascoli","Liang Zhao"],"pdf_url":"https://arxiv.org/pdf/2408.08799v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.06945v2","updated":"2024-08-16T15:09:09Z","published":"2024-08-13T15:03:46Z","title":"Heavy-Ball Momentum Accelerated Actor-Critic With Function Approximation","summary":" By using a parametric value function to replace the Monte-Carlo rollouts for\nvalue estimation, the actor-critic (AC) algorithms can reduce the variance of\nstochastic policy gradient so as to improve the convergence rate. While\nexisting works mainly focus on analyzing the convergence rate of AC algorithms\nunder Markovian noise, the impacts of momentum on AC algorithms remain largely\nunexplored. In this work, we first propose a heavy-ball momentum based\nadvantage actor-critic (\\mbox{HB-A2C}) algorithm by integrating the heavy-ball\nmomentum into the critic recursion that is parameterized by a linear function.\nWhen the sample trajectory follows a Markov decision process, we quantitatively\ncertify the acceleration capability of the proposed HB-A2C algorithm. Our\ntheoretical results demonstrate that the proposed HB-A2C finds an\n$\\epsilon$-approximate stationary point with $\\oo{\\epsilon^{-2}}$ iterations\nfor reinforcement learning tasks with Markovian noise. Moreover, we also reveal\nthe dependence of learning rates on the length of the sample trajectory. By\ncarefully selecting the momentum factor of the critic recursion, the proposed\nHB-A2C can balance the errors introduced by the initialization and the\nstochastic approximation.\n","authors":["Yanjie Dong","Haijun Zhang","Gang Wang","Shisheng Cui","Xiping Hu"],"pdf_url":"https://arxiv.org/pdf/2408.06945v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.14322v3","updated":"2024-08-16T15:02:45Z","published":"2024-06-20T13:54:32Z","title":"Mind the Privacy Unit! User-Level Differential Privacy for Language\n Model Fine-Tuning","summary":" Large language models (LLMs) have emerged as powerful tools for tackling\ncomplex tasks across diverse domains, but they also raise privacy concerns when\nfine-tuned on sensitive data due to potential memorization. While differential\nprivacy (DP) offers a promising solution by ensuring models are 'almost\nindistinguishable' with or without any particular privacy unit, current\nevaluations on LLMs mostly treat each example (text record) as the privacy\nunit. This leads to uneven user privacy guarantees when contributions per user\nvary. We therefore study user-level DP motivated by applications where it is\nnecessary to ensure uniform privacy protection across users. We present a\nsystematic evaluation of user-level DP for LLM fine-tuning on natural language\ngeneration tasks. 
Focusing on two mechanisms for achieving user-level DP\nguarantees, Group Privacy and User-wise DP-SGD, we investigate design choices\nlike data selection strategies and parameter tuning for the best\nprivacy-utility tradeoff.\n","authors":["Lynn Chua","Badih Ghazi","Yangsibo Huang","Pritish Kamath","Ravi Kumar","Daogao Liu","Pasin Manurangsi","Amer Sinha","Chiyuan Zhang"],"pdf_url":"https://arxiv.org/pdf/2406.14322v3.pdf","comment":"Published as a conference paper at COLM 2024"},{"id":"http://arxiv.org/abs/2408.08788v1","updated":"2024-08-16T15:01:28Z","published":"2024-08-16T15:01:28Z","title":"Neighbor Overlay-Induced Graph Attention Network","summary":" Graph neural networks (GNNs) have garnered significant attention due to their\nability to represent graph data. Among various GNN variants, graph attention\nnetwork (GAT) stands out since it is able to dynamically learn the importance\nof different nodes. However, present GATs heavily rely on the smoothed node\nfeatures to obtain the attention coefficients rather than graph structural\ninformation, which fails to provide crucial contextual cues for node\nrepresentations. To address this issue, this study proposes a neighbor\noverlay-induced graph attention network (NO-GAT) with the following two-fold\nideas: a) learning favorable structural information, i.e., overlaid neighbors,\noutside the node feature propagation process from an adjacency matrix; b)\ninjecting the information of overlaid neighbors into the node feature\npropagation process to compute the attention coefficient jointly. Empirical\nstudies on graph benchmark datasets indicate that the proposed NO-GAT\nconsistently outperforms state-of-the-art models.\n","authors":["Tiqiao Wei","Ye Yuan"],"pdf_url":"https://arxiv.org/pdf/2408.08788v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.06183v2","updated":"2024-08-16T14:59:52Z","published":"2024-08-12T14:29:54Z","title":"Centralized and Federated Heart Disease Classification Models Using UCI\n Dataset and their Shapley-value Based Interpretability","summary":" Cardiovascular diseases are a leading cause of mortality worldwide,\nhighlighting the need for accurate diagnostic methods. This study benchmarks\ncentralized and federated machine learning algorithms for heart disease\nclassification using the UCI dataset which includes 920 patient records from\nfour hospitals in the USA, Hungary and Switzerland. Our benchmark is supported\nby Shapley-value interpretability analysis to quantify features' importance for\nclassification. In the centralized setup, various binary classification\nalgorithms are trained on pooled data, with a support vector machine (SVM)\nachieving the highest testing accuracy of 83.3\\%, surpassing the established\nbenchmark of 78.7\\% with logistic regression. Additionally, federated learning\nalgorithms with four clients (hospitals) are explored, leveraging the dataset's\nnatural partition to enhance privacy without sacrificing accuracy. Federated\nSVM, an uncommon approach in the literature, achieves a top testing accuracy of\n73.8\\%. Our interpretability analysis aligns with existing medical knowledge of\nheart disease indicators. Overall, this study establishes a benchmark for\nefficient and interpretable pre-screening tools for heart disease while\nmaintaining patients' privacy. 
This work is available at\nhttps://github.com/padillma1/Heart-Disease-Classification-on-UCI-dataset-and-Shapley-Interpretability-Analysis.\n","authors":["Mario Padilla Rodriguez","Mohamed Nafea"],"pdf_url":"https://arxiv.org/pdf/2408.06183v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08785v1","updated":"2024-08-16T14:59:00Z","published":"2024-08-16T14:59:00Z","title":"A Transparency Paradox? Investigating the Impact of Explanation\n Specificity and Autonomous Vehicle Perceptual Inaccuracies on Passengers","summary":" Transparency in automated systems could be afforded through the provision of\nintelligible explanations. While transparency is desirable, might it lead to\ncatastrophic outcomes (such as anxiety), that could outweigh its benefits? It's\nquite unclear how the specificity of explanations (level of transparency)\ninfluences recipients, especially in autonomous driving (AD). In this work, we\nexamined the effects of transparency mediated through varying levels of\nexplanation specificity in AD. We first extended a data-driven explainer model\nby adding a rule-based option for explanation generation in AD, and then\nconducted a within-subject lab study with 39 participants in an immersive\ndriving simulator to study the effect of the resulting explanations.\nSpecifically, our investigation focused on: (1) how different types of\nexplanations (specific vs. abstract) affect passengers' perceived safety,\nanxiety, and willingness to take control of the vehicle when the vehicle\nperception system makes erroneous predictions; and (2) the relationship between\npassengers' behavioural cues and their feelings during the autonomous drives.\nOur findings showed that passengers felt safer with specific explanations when\nthe vehicle's perception system had minimal errors, while abstract explanations\nthat hid perception errors led to lower feelings of safety. Anxiety levels\nincreased when specific explanations revealed perception system errors (high\ntransparency). We found no significant link between passengers' visual patterns\nand their anxiety levels. Our study suggests that passengers prefer clear and\nspecific explanations (high transparency) when they originate from autonomous\nvehicles (AVs) with optimal perceptual accuracy.\n","authors":["Daniel Omeiza","Raunak Bhattacharyya","Marina Jirotka","Nick Hawes","Lars Kunze"],"pdf_url":"https://arxiv.org/pdf/2408.08785v1.pdf","comment":"Submitted to Transportation Research Part F: Traffic Psychology and\n Behaviour. arXiv admin note: text overlap with arXiv:2307.00633"},{"id":"http://arxiv.org/abs/2405.13712v3","updated":"2024-08-16T14:54:02Z","published":"2024-05-22T15:04:06Z","title":"Learning Diffusion Priors from Observations by Expectation Maximization","summary":" Diffusion models recently proved to be remarkable priors for Bayesian inverse\nproblems. However, training these models typically requires access to large\namounts of clean data, which could prove difficult in some settings. In this\nwork, we present a novel method based on the expectation-maximization algorithm\nfor training diffusion models from incomplete and noisy observations only.\nUnlike previous works, our method leads to proper diffusion models, which is\ncrucial for downstream tasks. As part of our method, we propose and motivate an\nimproved posterior sampling scheme for unconditional diffusion models. 
We\npresent empirical evidence supporting the effectiveness of our method.\n","authors":["François Rozet","Gérôme Andry","François Lanusse","Gilles Louppe"],"pdf_url":"https://arxiv.org/pdf/2405.13712v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.12641v2","updated":"2024-08-16T14:49:02Z","published":"2024-03-19T11:24:14Z","title":"Automated Contrastive Learning Strategy Search for Time Series","summary":" In recent years, Contrastive Learning (CL) has become a predominant\nrepresentation learning paradigm for time series. Most existing methods\nmanually build specific CL Strategies (CLS) by human heuristics for certain\ndatasets and tasks. However, manually developing CLS usually requires excessive\nprior knowledge about the data, and massive experiments to determine the\ndetailed CL configurations. In this paper, we present an Automated Machine\nLearning (AutoML) practice at Microsoft, which automatically learns CLS for\ntime series datasets and tasks, namely Automated Contrastive Learning (AutoCL).\nWe first construct a principled search space of size over $3\\times10^{12}$,\ncovering data augmentation, embedding transformation, contrastive pair\nconstruction, and contrastive losses. Further, we introduce an efficient\nreinforcement learning algorithm, which optimizes CLS from the performance on\nthe validation tasks, to obtain effective CLS within the space. Experimental\nresults on various real-world datasets demonstrate that AutoCL could\nautomatically find the suitable CLS for the given dataset and task. From the\ncandidate CLS found by AutoCL on several public datasets/tasks, we compose a\ntransferable Generally Good Strategy (GGS), which has a strong performance for\nother datasets. We also provide empirical analysis as a guide for the future\ndesign of CLS.\n","authors":["Baoyu Jing","Yansen Wang","Guoxin Sui","Jing Hong","Jingrui He","Yuqing Yang","Dongsheng Li","Kan Ren"],"pdf_url":"https://arxiv.org/pdf/2403.12641v2.pdf","comment":"Accepted by CIKM'2024"},{"id":"http://arxiv.org/abs/2403.11960v2","updated":"2024-08-16T14:47:22Z","published":"2024-03-18T16:57:16Z","title":"Causality-Aware Spatiotemporal Graph Neural Networks for Spatiotemporal\n Time Series Imputation","summary":" Spatiotemporal time series are usually collected via monitoring sensors\nplaced at different locations, which usually contain missing values due to\nvarious mechanical failures. Imputing the missing values is crucial for\nanalyzing time series. When recovering a specific data point, most existing\nmethods consider all the information relevant to that point regardless of the\ncause-and-effect relationship. During data collection, it is inevitable that\nsome unknown confounders are included, e.g., background noise in time series\nand non-causal shortcut edges in the constructed sensor network. These\nconfounders could open backdoor paths and establish non-causal correlations\nbetween the input and output. Over-exploiting these non-causal correlations\ncould cause overfitting. In this paper, we first revisit spatiotemporal time\nseries imputation from a causal perspective and show how to block the\nconfounders via the frontdoor adjustment. Based on the results of frontdoor\nadjustment, we introduce a novel Causality-Aware Spatiotemporal Graph Neural\nNetwork (Casper), which contains a novel Prompt Based Decoder (PBD) and a\nSpatiotemporal Causal Attention (SCA). PBD could reduce the impact of\nconfounders and SCA could discover the sparse causal relationships among\nembeddings. 
Theoretical analysis reveals that SCA discovers causal\nrelationships based on the values of gradients. We evaluate Casper on three\nreal-world datasets, and the experimental results show that Casper could\noutperform the baselines and could effectively discover causal relationships.\n","authors":["Baoyu Jing","Dawei Zhou","Kan Ren","Carl Yang"],"pdf_url":"https://arxiv.org/pdf/2403.11960v2.pdf","comment":"Accepted by CIKM'2024"},{"id":"http://arxiv.org/abs/2408.08776v1","updated":"2024-08-16T14:38:14Z","published":"2024-08-16T14:38:14Z","title":"NEAR: A Training-Free Pre-Estimator of Machine Learning Model\n Performance","summary":" Artificial neural networks have been shown to be state-of-the-art machine\nlearning models in a wide variety of applications, including natural language\nprocessing and image recognition. However, building a performant neural network\nis a laborious task and requires substantial computing power. Neural\nArchitecture Search (NAS) addresses this issue by an automatic selection of the\noptimal network from a set of potential candidates. While many NAS methods\nstill require training of (some) neural networks, zero-cost proxies promise to\nidentify the optimal network without training. In this work, we propose the\nzero-cost proxy Network Expressivity by Activation Rank (NEAR). It is based on\nthe effective rank of the pre- and post-activation matrix, i.e., the values of\na neural network layer before and after applying its activation function. We\ndemonstrate the cutting-edge correlation between this network score and the\nmodel accuracy on NAS-Bench-101 and NATS-Bench-SSS/TSS. In addition, we present\na simple approach to estimate the optimal layer sizes in multi-layer\nperceptrons. Furthermore, we show that this score can be utilized to select\nhyperparameters such as the activation function and the neural network weight\ninitialization scheme.\n","authors":["Raphael T. Husistein","Markus Reiher","Marco Eckhoff"],"pdf_url":"https://arxiv.org/pdf/2408.08776v1.pdf","comment":"12 pages, 4 figures, 10 tables"},{"id":"http://arxiv.org/abs/2408.08774v1","updated":"2024-08-16T14:33:02Z","published":"2024-08-16T14:33:02Z","title":"Speckle Noise Analysis for Synthetic Aperture Radar (SAR) Space Data","summary":" This research tackles the challenge of speckle noise in Synthetic Aperture\nRadar (SAR) space data, a prevalent issue that hampers the clarity and utility\nof SAR images. The study presents a comparative analysis of six distinct\nspeckle noise reduction techniques: Lee Filtering, Frost Filtering, Kuan\nFiltering, Gaussian Filtering, Median Filtering, and Bilateral Filtering. These\nmethods, selected for their unique approaches to noise reduction and image\npreservation, were applied to SAR datasets sourced from the Alaska Satellite\nFacility (ASF). The performance of each technique was evaluated using a\ncomprehensive set of metrics, including Peak Signal-to-Noise Ratio (PSNR), Mean\nSquared Error (MSE), Structural Similarity Index (SSIM), Equivalent Number of\nLooks (ENL), and Speckle Suppression Index (SSI). 
The study concludes that both\nthe Lee and Kuan Filters are effective, with the choice of filter depending on\nthe specific application requirements for image quality and noise suppression.\nThis work provides valuable insights into optimizing SAR image processing, with\nsignificant implications for remote sensing, environmental monitoring, and\ngeological surveying.\n","authors":["Sanjjushri Varshini R","Rohith Mahadevan","Bagiya Lakshmi S","Mathivanan Periasamy","Raja CSP Raman","Lokesh M"],"pdf_url":"https://arxiv.org/pdf/2408.08774v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08770v1","updated":"2024-08-16T14:25:20Z","published":"2024-08-16T14:25:20Z","title":"Pessimistic Iterative Planning for Robust POMDPs","summary":" Robust partially observable Markov decision processes (robust POMDPs) extend\nclassical POMDPs to handle additional uncertainty on the transition and\nobservation probabilities via so-called uncertainty sets. Policies for robust\nPOMDPs must not only be memory-based to account for partial observability but\nalso robust against model uncertainty to account for the worst-case instances\nfrom the uncertainty sets. We propose the pessimistic iterative planning (PIP)\nframework, which finds robust memory-based policies for robust POMDPs. PIP\nalternates between two main steps: (1) selecting an adversarial (non-robust)\nPOMDP via worst-case probability instances from the uncertainty sets; and (2)\ncomputing a finite-state controller (FSC) for this adversarial POMDP. We\nevaluate the performance of this FSC on the original robust POMDP and use this\nevaluation in step (1) to select the next adversarial POMDP. Within PIP, we\npropose the rFSCNet algorithm. In each iteration, rFSCNet finds an FSC through\na recurrent neural network trained using supervision policies optimized for the\nadversarial POMDP. The empirical evaluation in four benchmark environments\nshowcases improved robustness against a baseline method in an ablation study\nand competitive performance compared to a state-of-the-art robust POMDP solver.\n","authors":["Maris F. L. Galesloot","Marnix Suilen","Thiago D. Simão","Steven Carr","Matthijs T. J. Spaan","Ufuk Topcu","Nils Jansen"],"pdf_url":"https://arxiv.org/pdf/2408.08770v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08761v1","updated":"2024-08-16T14:04:40Z","published":"2024-08-16T14:04:40Z","title":"SYMPOL: Symbolic Tree-Based On-Policy Reinforcement Learning","summary":" Reinforcement learning (RL) has seen significant success across various\ndomains, but its adoption is often limited by the black-box nature of neural\nnetwork policies, making them difficult to interpret. In contrast, symbolic\npolicies allow representing decision-making strategies in a compact and\ninterpretable way. However, learning symbolic policies directly within\non-policy methods remains challenging. In this paper, we introduce SYMPOL, a\nnovel method for SYMbolic tree-based on-POLicy RL. SYMPOL employs a tree-based\nmodel integrated with a policy gradient method, enabling the agent to learn and\nadapt its actions while maintaining a high level of interpretability. We\nevaluate SYMPOL on a set of benchmark RL tasks, demonstrating its superiority\nover alternative tree-based RL approaches in terms of performance and\ninterpretability. To the best of our knowledge, this is the first method, that\nallows a gradient-based end-to-end learning of interpretable, axis-aligned\ndecision trees on-policy. 
Therefore, SYMPOL can become the foundation for a new\nclass of interpretable RL based on decision trees. Our implementation is\navailable under: https://github.com/s-marton/SYMPOL\n","authors":["Sascha Marton","Tim Grams","Florian Vogt","Stefan Lüdtke","Christian Bartelt","Heiner Stuckenschmidt"],"pdf_url":"https://arxiv.org/pdf/2408.08761v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08754v1","updated":"2024-08-16T13:54:50Z","published":"2024-08-16T13:54:50Z","title":"SE-SGformer: A Self-Explainable Signed Graph Transformer for Link Sign\n Prediction","summary":" Signed Graph Neural Networks (SGNNs) have been shown to be effective in\nanalyzing complex patterns in real-world situations where positive and negative\nlinks coexist. However, SGNN models suffer from poor explainability, which\nlimits their adoption in critical scenarios that require understanding the\nrationale behind predictions. To the best of our knowledge, there is currently\nno research work on the explainability of the SGNN models. Our goal is to\naddress the explainability of decision-making for the downstream task of link\nsign prediction specific to signed graph neural networks. Since post-hoc\nexplanations are not derived directly from the models, they may be biased and\nmisrepresent the true explanations. Therefore, in this paper we introduce a\nSelf-Explainable Signed Graph transformer (SE-SGformer) framework, which can\nnot only output explainable information but also ensure high prediction\naccuracy. Specifically, we propose a new Transformer architecture for signed\ngraphs and theoretically demonstrate that using positional encoding based on\nsigned random walks has greater expressive power than current SGNN methods and\nother positional encoding graph Transformer-based approaches. We construct a\nnovel explainable decision process by discovering the $K$-nearest (farthest)\npositive (negative) neighbors of a node to replace the neural network-based\ndecoder for predicting edge signs. These $K$ positive (negative) neighbors\nrepresent crucial information about the formation of positive (negative) edges\nbetween nodes and thus can serve as important explanatory information in the\ndecision-making process. We conducted experiments on several real-world\ndatasets to validate the effectiveness of SE-SGformer, which outperforms the\nstate-of-the-art methods by improving prediction accuracy by 2.2\\% and\nexplainability accuracy by 73.1\\% in the best-case scenario.\n","authors":["Lu Li","Jiale Liu","Xingyu Ji","Maojun Wang","Zeyu Zhang"],"pdf_url":"https://arxiv.org/pdf/2408.08754v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.06277v2","updated":"2024-08-16T13:51:59Z","published":"2024-08-12T16:39:18Z","title":"Multi-marginal Schrödinger Bridges with Iterative Reference Refinement","summary":" Practitioners frequently aim to infer an unobserved population trajectory\nusing sample snapshots at multiple time points. For instance, in single-cell\nsequencing, scientists would like to learn how gene expression evolves over\ntime. But sequencing any cell destroys that cell. So we cannot access any\ncell's full trajectory, but we can access snapshot samples from many cells.\nStochastic differential equations are commonly used to analyze systems with\nfull individual-trajectory access; since here we have only sample snapshots,\nthese methods are inapplicable. The deep learning community has recently\nexplored using Schr\\\"odinger bridges (SBs) and their extensions to estimate\nthese dynamics. 
However, these methods either (1) interpolate between just two\ntime points or (2) require a single fixed reference dynamic within the SB,\nwhich is often just set to be Brownian motion. But learning piecewise from\nadjacent time points can fail to capture long-term dependencies. And\npractitioners are typically able to specify a model class for the reference\ndynamic but not the exact values of the parameters within it. So we propose a\nnew method that (1) learns the unobserved trajectories from sample snapshots\nacross multiple time points and (2) requires specification only of a class of\nreference dynamics, not a single fixed one. In particular, we suggest an\niterative projection method inspired by Schr\\\"odinger bridges; we alternate\nbetween learning a piecewise SB on the unobserved trajectories and using the\nlearned SB to refine our best guess for the dynamics within the reference\nclass. We demonstrate the advantages of our method via a well-known simulated\nparametric model from ecology, simulated and real data from systems biology,\nand real motion-capture data.\n","authors":["Yunyi Shen","Renato Berlinghieri","Tamara Broderick"],"pdf_url":"https://arxiv.org/pdf/2408.06277v2.pdf","comment":"Updated to fix title error"},{"id":"http://arxiv.org/abs/2408.08749v1","updated":"2024-08-16T13:50:04Z","published":"2024-08-16T13:50:04Z","title":"ML Study of Malicious Transactions in Ethereum","summary":" Smart contracts are a major tool in Ethereum transactions. Therefore, hackers\ncan exploit them by adding code vulnerabilities to their sources and using\nthese vulnerabilities for performing malicious transactions. This paper\npresents two successful approaches for detecting malicious contracts: one uses\nopcodes and relies on GPT2, and the other uses the Solidity source and a LoRA\nfine-tuned CodeLlama. Finally, we present an XGBoost model that combines gas\nproperties and hexadecimal signatures for detecting malicious transactions.\nThis approach relies on early assumptions that maliciousness is manifested by\nthe uncommon usage of the contracts' functions and the effort to pursue the\ntransaction.\n","authors":["Natan Katz"],"pdf_url":"https://arxiv.org/pdf/2408.08749v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.05807v2","updated":"2024-08-16T13:03:02Z","published":"2024-08-11T15:56:44Z","title":"Kernel Density Estimators in Large Dimensions","summary":" This paper studies kernel density estimation for a high-dimensional\ndistribution $\\rho(x)$. Traditional approaches have focused on the limit of a\nlarge number of data points $n$ and fixed dimension $d$. We analyze instead the\nregime where both the number $n$ of data points $y_i$ and their dimensionality\n$d$ grow with a fixed ratio $\\alpha=(\\log n)/d$. Our study reveals three\ndistinct statistical regimes for the kernel-based estimate of the density $\\hat\n\\rho_h^{\\mathcal {D}}(x)=\\frac{1}{n h^d}\\sum_{i=1}^n\nK\\left(\\frac{x-y_i}{h}\\right)$, depending on the bandwidth $h$: a classical\nregime for large bandwidth where the Central Limit Theorem (CLT) holds, which\nis akin to the one found in traditional approaches. Below a certain value of\nthe bandwidth, $h_{CLT}(\\alpha)$, we find that the CLT breaks down. The\nstatistics of $\\hat \\rho_h^{\\mathcal {D}}(x)$ for a fixed $x$ drawn from\n$\\rho(x)$ is given by a heavy-tailed distribution (an alpha-stable\ndistribution). 
In particular below a value $h_G(\\alpha)$, we find that $\\hat\n\\rho_h^{\\mathcal {D}}(x)$ is governed by extreme value statistics: only a few\npoints in the database matter and give the dominant contribution to the density\nestimator. We provide a detailed analysis for high-dimensional multivariate\nGaussian data. We show that the optimal bandwidth threshold based on\nKullback-Leibler divergence lies in the new statistical regime identified in\nthis paper. Our findings reveal limitations of classical approaches, show the\nrelevance of these new statistical regimes, and offer new insights for Kernel\ndensity estimation in high-dimensional settings.\n","authors":["Giulio Biroli","Marc Mézard"],"pdf_url":"https://arxiv.org/pdf/2408.05807v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08713v1","updated":"2024-08-16T12:51:52Z","published":"2024-08-16T12:51:52Z","title":"Beyond KAN: Introducing KarSein for Adaptive High-Order Feature\n Interaction Modeling in CTR Prediction","summary":" Modeling feature interactions is crucial for click-through rate (CTR)\nprediction, particularly when it comes to high-order explicit interactions.\nTraditional methods struggle with this task because they often predefine a\nmaximum interaction order, which relies heavily on prior knowledge and can\nlimit the model's effectiveness. Additionally, modeling high-order interactions\ntypically leads to increased computational costs. Therefore, the challenge lies\nin adaptively modeling high-order feature interactions while maintaining\nefficiency. To address this issue, we introduce Kolmogorov-Arnold Represented\nSparse Efficient Interaction Network (KarSein), designed to optimize both\npredictive accuracy and computational efficiency. We firstly identify\nlimitations of directly applying Kolmogorov-Arnold Networks (KAN) to CTR and\nthen introduce KarSein to overcome these issues. It features a novel\narchitecture that reduces the computational costs of KAN and supports embedding\nvectors as feature inputs. Additionally, KarSein employs guided symbolic\nregression to address the challenge of KAN in spontaneously learning\nmultiplicative relationships. Extensive experiments demonstrate KarSein's\nsuperior performance, achieving significant predictive accuracy with minimal\ncomputational overhead. Furthermore, KarSein maintains strong global\nexplainability while enabling the removal of redundant features, resulting in a\nsparse network structure. These advantages also position KarSein as a promising\nmethod for efficient inference.\n","authors":["Yunxiao Shi","Wujiang Wu","Mingyu Jin","Haimin Zhang","Qiang Wu","Yongfeng Zhang","Min Xu"],"pdf_url":"https://arxiv.org/pdf/2408.08713v1.pdf","comment":"KarSein for CTR"},{"id":"http://arxiv.org/abs/2403.13108v2","updated":"2024-08-16T12:49:56Z","published":"2024-03-19T19:15:38Z","title":"Resilience in Online Federated Learning: Mitigating Model-Poisoning\n Attacks via Partial Sharing","summary":" Federated learning (FL) allows training machine learning models on\ndistributed data without compromising privacy. However, FL is vulnerable to\nmodel-poisoning attacks where malicious clients tamper with their local models\nto manipulate the global model. In this work, we investigate the resilience of\nthe partial-sharing online FL (PSO-Fed) algorithm against such attacks. PSO-Fed\nreduces communication overhead by allowing clients to share only a fraction of\ntheir model updates with the server. 
We demonstrate that this partial sharing\nmechanism has the added advantage of enhancing PSO-Fed's robustness to\nmodel-poisoning attacks. Through theoretical analysis, we show that PSO-Fed\nmaintains convergence even under Byzantine attacks, where malicious clients\ninject noise into their updates. Furthermore, we derive a formula for PSO-Fed's\nmean square error, considering factors like stepsize, attack probability, and\nthe number of malicious clients. Interestingly, we find a non-trivial optimal\nstepsize that maximizes PSO-Fed's resistance to these attacks. Extensive\nnumerical experiments confirm our theoretical findings and showcase PSO-Fed's\nsuperior performance against model-poisoning attacks compared to other leading\nFL algorithms.\n","authors":["Ehsan Lari","Reza Arablouei","Vinay Chakravarthi Gogineni","Stefan Werner"],"pdf_url":"https://arxiv.org/pdf/2403.13108v2.pdf","comment":"13 pages, 9 figures, Submitted to TSIPN"},{"id":"http://arxiv.org/abs/2401.17542v3","updated":"2024-08-16T12:46:03Z","published":"2024-01-31T02:09:21Z","title":"A Medical Data-Effective Learning Benchmark for Highly Efficient\n Pre-training of Foundation Models","summary":" Foundation models, pre-trained on massive datasets, have achieved\nunprecedented generalizability. However, is it truly necessary to involve such\nvast amounts of data in pre-training, consuming extensive computational\nresources? This paper introduces data-effective learning, aiming to use data in\nthe most impactful way to pre-train foundation models. This involves strategies\nthat focus on data quality rather than quantity, ensuring the data used for\ntraining has high informational value. Data-effective learning plays a profound\nrole in accelerating foundation model training, reducing computational costs,\nand saving data storage, which is very important as the volume of medical data\nin recent years has grown beyond many people's expectations. However, due to\nthe lack of standards and comprehensive benchmarks, research on medical\ndata-effective learning is poorly studied. To address this gap, our paper\nintroduces a comprehensive benchmark specifically for evaluating data-effective\nlearning in the medical field. This benchmark includes a dataset with millions\nof data samples from 31 medical centers (DataDEL), a baseline method for\ncomparison (MedDEL), and a new evaluation metric (NormDEL) to objectively\nmeasure data-effective learning performance. Our extensive experimental results\nshow the baseline MedDEL can achieve performance comparable to the original\nlarge dataset with only 5% of the data. Establishing such an open\ndata-effective learning benchmark is crucial for the medical foundation model\nresearch community because it facilitates efficient data use, promotes\ncollaborative breakthroughs, and fosters the development of cost-effective,\nscalable, and impactful healthcare solutions.\n","authors":["Wenxuan Yang","Weimin Tan","Yuqi Sun","Bo Yan"],"pdf_url":"https://arxiv.org/pdf/2401.17542v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08707v1","updated":"2024-08-16T12:40:01Z","published":"2024-08-16T12:40:01Z","title":"Beam Prediction based on Large Language Models","summary":" Millimeter-wave (mmWave) communication is promising for next-generation\nwireless networks but suffers from significant path loss, requiring extensive\nantenna arrays and frequent beam training. 
Traditional deep learning models,\nsuch as long short-term memory (LSTM), enhance beam tracking accuracy but\nare limited by poor robustness and generalization. In this letter, we use large\nlanguage models (LLMs) to improve the robustness of beam prediction. By\nconverting time series data into text-based representations and employing the\nPrompt-as-Prefix (PaP) technique for contextual enrichment, our approach\nunleashes the strength of LLMs for time series forecasting. Simulation results\ndemonstrate that our LLM-based method offers superior robustness and\ngeneralization compared to LSTM-based models, showcasing the potential of LLMs\nin wireless communications.\n","authors":["Yucheng Sheng","Kai Huang","Le Liang","Peng Liu","Shi Jin","Geoffrey Ye Li"],"pdf_url":"https://arxiv.org/pdf/2408.08707v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08706v1","updated":"2024-08-16T12:33:40Z","published":"2024-08-16T12:33:40Z","title":"Efficient Multi-Policy Evaluation for Reinforcement Learning","summary":" To unbiasedly evaluate multiple target policies, the dominant approach among\nRL practitioners is to run and evaluate each target policy separately. However,\nthis evaluation method is far from efficient because samples are not shared\nacross policies, and running target policies to evaluate themselves is actually\nnot optimal. In this paper, we address these two weaknesses by designing a\ntailored behavior policy to reduce the variance of estimators across all target\npolicies. Theoretically, we prove that executing this behavior policy with\nmanyfold fewer samples outperforms on-policy evaluation on every target policy\nunder characterized conditions. Empirically, we show our estimator has a\nsubstantially lower variance compared with previous best methods and achieves\nstate-of-the-art performance in a broad range of environments.\n","authors":["Shuze Liu","Yuxin Chen","Shangtong Zhang"],"pdf_url":"https://arxiv.org/pdf/2408.08706v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08699v1","updated":"2024-08-16T12:26:36Z","published":"2024-08-16T12:26:36Z","title":"RBLA: Rank-Based-LoRA-Aggregation for Fine-tuning Heterogeneous Models\n in FLaaS","summary":" Federated Learning (FL) is a promising privacy-aware distributed learning\nframework that can be deployed on various devices, such as mobile phones,\ndesktops, and devices equipped with CPUs or GPUs. In the context of\nserver-based Federated Learning as a Service (FLaaS), FL enables the central\nserver to coordinate the training process across multiple devices without\ndirect access to the local data, thereby enhancing privacy and data security.\nLow-Rank Adaptation (LoRA) is a method that fine-tunes models efficiently by\nfocusing on a low-dimensional subspace of the model's parameters. This approach\nsignificantly reduces computational and memory costs compared to fine-tuning\nall parameters from scratch. When integrated with FL, especially in an FLaaS\nenvironment, LoRA allows for flexible and efficient deployment across diverse\nhardware with varying computational capabilities by adjusting the local model's\nrank. 
However, in LoRA-enabled FL, different clients may train models with\nvarying ranks, which poses a challenge for model aggregation on the server.\nCurrent methods of aggregating models of different ranks require padding\nweights to a uniform shape, which can degrade the global model's performance.\nTo address this issue, we propose Rank-Based LoRA Aggregation (RBLA), a novel\nmodel aggregation method designed for heterogeneous LoRA structures. RBLA\npreserves key features across models with different ranks. This paper analyzes\nthe issues with current padding methods that reshape models for aggregation in\na FLaas environment. Then, we introduce RBLA, a rank-based aggregation method\nthat maintains both low-rank and high-rank features. Finally, we demonstrate\nthe effectiveness of RBLA through comparative experiments with state-of-the-art\nmethods.\n","authors":["Shuaijun Chen","Omid Tavallaie","Niousha Nazemi","Albert Y. Zomaya"],"pdf_url":"https://arxiv.org/pdf/2408.08699v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.02504v2","updated":"2024-08-16T12:24:54Z","published":"2023-02-05T22:51:27Z","title":"Motion-compensated MR CINE reconstruction with reconstruction-driven\n motion estimation","summary":" In cardiac CINE, motion-compensated MR reconstruction (MCMR) is an effective\napproach to address highly undersampled acquisitions by incorporating motion\ninformation between frames. In this work, we propose a novel perspective for\naddressing the MCMR problem and a more integrated and efficient solution to the\nMCMR field. Contrary to state-of-the-art (SOTA) MCMR methods which break the\noriginal problem into two sub-optimization problems, i.e. motion estimation and\nreconstruction, we formulate this problem as a single entity with one single\noptimization. Our approach is unique in that the motion estimation is directly\ndriven by the ultimate goal, reconstruction, but not by the canonical\nmotion-warping loss (similarity measurement between motion-warped images and\ntarget images). We align the objectives of motion estimation and\nreconstruction, eliminating the drawbacks of artifacts-affected motion\nestimation and therefore error-propagated reconstruction. Further, we can\ndeliver high-quality reconstruction and realistic motion without applying any\nregularization/smoothness loss terms, circumventing the non-trivial weighting\nfactor tuning. We evaluate our method on two datasets: 1) an in-house acquired\n2D CINE dataset for the retrospective study and 2) the public OCMR cardiac\ndataset for the prospective study. The conducted experiments indicate that the\nproposed MCMR framework can deliver artifact-free motion estimation and\nhigh-quality MR images even for imaging accelerations up to 20x, outperforming\nSOTA non-MCMR and MCMR methods in both qualitative and quantitative evaluation\nacross all experiments. The code is available at\nhttps://github.com/JZPeterPan/MCMR-Recon-Driven-Motion.\n","authors":["Jiazhen Pan","Wenqi Huang","Daniel Rueckert","Thomas Küstner","Kerstin Hammernik"],"pdf_url":"https://arxiv.org/pdf/2302.02504v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08696v1","updated":"2024-08-16T12:20:56Z","published":"2024-08-16T12:20:56Z","title":"Turning Trash into Treasure: Accelerating Inference of Large Language\n Models with Token Recycling","summary":" The rapid growth in the parameters of large language models (LLMs) has made\ninference latency a fundamental bottleneck, limiting broader application of\nLLMs. 
Speculative decoding represents a lossless approach to accelerate\ninference through a guess-and-verify paradigm, leveraging the parallel\ncapabilities of modern hardware. Some speculative decoding methods rely on\nadditional structures to guess draft tokens, such as small models or\nparameter-efficient architectures, which need extra training before use.\nAlternatively, retrieval-based train-free techniques build libraries from\npre-existing corpora or by n-gram generation. However, they face challenges\nlike large storage requirements, time-consuming retrieval, and limited\nadaptability. Observing that candidate tokens generated during the decoding\nprocess are likely to reoccur in future sequences, we propose Token Recycling.\nThis approach stores candidate tokens in an adjacency matrix and employs a\nbreadth-first search (BFS)-like algorithm on the matrix to construct a draft\ntree. The tree is then validated through tree attention. New candidate tokens\nfrom the decoding process are then used to update the matrix. Token Recycling\nrequires \\textless2MB of additional storage and achieves approximately 2x\nspeedup across all sizes of LLMs. It significantly outperforms existing\ntrain-free methods by 30\\% and even a training method by 25\\%. It can be\ndirectly applied to any existing LLMs and tasks without the need for\nadaptation.\n","authors":["Xianzhen Luo","Yixuan Wang","Qingfu Zhu","Zhiming Zhang","Xuanyu Zhang","Qing Yang","Dongliang Xu","Wanxiang Che"],"pdf_url":"https://arxiv.org/pdf/2408.08696v1.pdf","comment":"under review"},{"id":"http://arxiv.org/abs/2408.07107v2","updated":"2024-08-16T12:19:44Z","published":"2024-08-13T10:28:54Z","title":"Maximizing V-information for Pre-training Superior Foundation Models","summary":" Pre-training foundation models on large-scale datasets demonstrates\nexceptional performance. However, recent research questions this traditional\nnotion, exploring whether an increase in pre-training data always leads to\nenhanced model performance. To address this issue, data-effective learning\napproaches have been introduced. However, current methods in this area lack a\nclear standard for sample selection. Our experiments reveal that by maximizing\nV-information, sample selection can be framed as an optimization problem,\nenabling effective improvement in model performance even with fewer samples.\nUnder this guidance, we develop an optimal data-effective learning method\n(OptiDEL) to maximize V-information. The OptiDEL method generates hard samples\nto achieve or even exceed the performance of models trained on the full dataset\nwhile using substantially less data. We compare the OptiDEL method with\nstate-of-the-art approaches finding that OptiDEL consistently outperforms\nexisting approaches across different datasets, with foundation models trained\non only 5% of the pre-training data surpassing the performance of those trained\non the full dataset.\n","authors":["Wenxuan Yang","Weimin Tan","Hanyu Zhang","Bo Yan"],"pdf_url":"https://arxiv.org/pdf/2408.07107v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.04720v4","updated":"2024-08-16T12:08:03Z","published":"2024-03-07T18:16:29Z","title":"Rethinking of Encoder-based Warm-start Methods in Hyperparameter\n Optimization","summary":" Effectively representing heterogeneous tabular datasets for meta-learning\npurposes remains an open problem. Previous approaches rely on predefined\nmeta-features, for example, statistical measures or landmarkers. 
The emergence\nof dataset encoders opens new possibilities for the extraction of meta-features\nbecause they do not involve any handmade design. Moreover, they are proven to\ngenerate dataset representations with desired spatial properties. In this\nresearch, we evaluate an encoder-based approach to one of the most established\nmeta-tasks - warm-starting of the Bayesian Hyperparameter Optimization. To\nbroaden our analysis we introduce a new approach for representation learning on\ntabular data based on [Tomoharu Iwata and Atsutoshi Kumagai. Meta-learning from\nTasks with Heterogeneous Attribute Spaces. In Advances in Neural Information\nProcessing Systems, 2020]. The validation on over 100 datasets from UCI and an\nindependent metaMIMIC set of datasets highlights the nuanced challenges in\nrepresentation learning. We show that general representations may not suffice\nfor some meta-tasks where requirements are not explicitly considered during\nextraction.\n","authors":["Dawid Płudowski","Antoni Zajko","Anna Kozak","Katarzyna Woźnica"],"pdf_url":"https://arxiv.org/pdf/2403.04720v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08690v1","updated":"2024-08-16T12:06:09Z","published":"2024-08-16T12:06:09Z","title":"Explore-then-Commit Algorithms for Decentralized Two-Sided Matching\n Markets","summary":" Online learning in a decentralized two-sided matching markets, where the\ndemand-side (players) compete to match with the supply-side (arms), has\nreceived substantial interest because it abstracts out the complex interactions\nin matching platforms (e.g. UpWork, TaskRabbit). However, past works assume\nthat each arm knows their preference ranking over the players (one-sided\nlearning), and each player aim to learn the preference over arms through\nsuccessive interactions. Moreover, several (impractical) assumptions on the\nproblem are usually made for theoretical tractability such as broadcast\nplayer-arm match Liu et al. (2020; 2021); Kong & Li (2023) or serial\ndictatorship Sankararaman et al. (2021); Basu et al. (2021); Ghosh et al.\n(2022). In this paper, we study a decentralized two-sided matching market,\nwhere we do not assume that the preference ranking over players are known to\nthe arms apriori. Furthermore, we do not have any structural assumptions on the\nproblem. We propose a multi-phase explore-then-commit type algorithm namely\nepoch-based CA-ETC (collision avoidance explore then commit) (\\texttt{CA-ETC}\nin short) for this problem that does not require any communication across\nagents (players and arms) and hence decentralized. We show that for the initial\nepoch length of $T_{\\circ}$ and subsequent epoch-lengths of $2^{l/\\gamma}\nT_{\\circ}$ (for the $l-$th epoch with $\\gamma \\in (0,1)$ as an input parameter\nto the algorithm), \\texttt{CA-ETC} yields a player optimal expected regret of\n$\\mathcal{O}\\left(T_{\\circ} (\\frac{K \\log T}{T_{\\circ} \\Delta^2})^{1/\\gamma} +\nT_{\\circ} (\\frac{T}{T_{\\circ}})^\\gamma\\right)$ for the $i$-th player, where $T$\nis the learning horizon, $K$ is the number of arms and $\\Delta$ is an\nappropriately defined problem gap. 
Furthermore, we propose a blackboard\ncommunication based baseline achieving logarithmic regret in $T$.\n","authors":["Tejas Pagare","Avishek Ghosh"],"pdf_url":"https://arxiv.org/pdf/2408.08690v1.pdf","comment":"Accepted at International Symposium of Information Theory (ISIT) 2024"},{"id":"http://arxiv.org/abs/2408.08685v1","updated":"2024-08-16T11:58:34Z","published":"2024-08-16T11:58:34Z","title":"Can Large Language Models Improve the Adversarial Robustness of Graph\n Neural Networks?","summary":" Graph neural networks (GNNs) are vulnerable to adversarial perturbations,\nespecially for topology attacks, and many methods that improve the robustness\nof GNNs have received considerable attention. Recently, we have witnessed the\nsignificant success of large language models (LLMs), leading many to explore\nthe great potential of LLMs on GNNs. However, they mainly focus on improving\nthe performance of GNNs by utilizing LLMs to enhance the node features.\nTherefore, we ask: Will the robustness of GNNs also be enhanced with the\npowerful understanding and inference capabilities of LLMs? By presenting the\nempirical results, we find that despite that LLMs can improve the robustness of\nGNNs, there is still an average decrease of 23.1% in accuracy, implying that\nthe GNNs remain extremely vulnerable against topology attack. Therefore,\nanother question is how to extend the capabilities of LLMs on graph adversarial\nrobustness. In this paper, we propose an LLM-based robust graph structure\ninference framework, LLM4RGNN, which distills the inference capabilities of\nGPT-4 into a local LLM for identifying malicious edges and an LM-based edge\npredictor for finding missing important edges, so as to recover a robust graph\nstructure. Extensive experiments demonstrate that LLM4RGNN consistently\nimproves the robustness across various GNNs. Even in some cases where the\nperturbation ratio increases to 40%, the accuracy of GNNs is still better than\nthat on the clean graph.\n","authors":["Zhongjian Zhang","Xiao Wang","Huichi Zhou","Yue Yu","Mengmei Zhang","Cheng Yang","Chuan Shi"],"pdf_url":"https://arxiv.org/pdf/2408.08685v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08684v1","updated":"2024-08-16T11:56:49Z","published":"2024-08-16T11:56:49Z","title":"Research on Personalized Compression Algorithm for Pre-trained Models\n Based on Homomorphic Entropy Increase","summary":" In this article, we explore the challenges and evolution of two key\ntechnologies in the current field of AI: Vision Transformer model and Large\nLanguage Model (LLM). Vision Transformer captures global information by\nsplitting images into small pieces and leveraging Transformer's multi-head\nattention mechanism, but its high reference count and compute overhead limit\ndeployment on mobile devices. At the same time, the rapid development of LLM\nhas revolutionized natural language processing, but it also faces huge\ndeployment challenges. To address these issues, we investigate model pruning\ntechniques, with a particular focus on how to reduce redundant parameters\nwithout losing accuracy to accommodate personalized data and\nresource-constrained environments. In this paper, a new layered pruning\nstrategy is proposed to distinguish the personalized layer from the common\nlayer by compressed sensing and random sampling, thus significantly reducing\nthe model parameters. 
Our experimental results show that the introduced step\nbuffering mechanism further improves the accuracy of the model after pruning,\nproviding new directions and possibilities for the deployment of efficient and\npersonalized AI models on mobile devices in the future.\n","authors":["Yicong Li","Xing Guo","Haohua Du"],"pdf_url":"https://arxiv.org/pdf/2408.08684v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08681v1","updated":"2024-08-16T11:53:52Z","published":"2024-08-16T11:53:52Z","title":"A Mean Field Ansatz for Zero-Shot Weight Transfer","summary":" The pre-training cost of large language models (LLMs) is prohibitive. One\ncutting-edge approach to reduce the cost is zero-shot weight transfer, also\nknown as model growth for some cases, which magically transfers the weights\ntrained in a small model to a large model. However, there are still some\ntheoretical mysteries behind the weight transfer. In this paper, inspired by\nprior applications of mean field theory to neural network dynamics, we\nintroduce a mean field ansatz to provide a theoretical explanation for weight\ntransfer. Specifically, we propose the row-column (RC) ansatz under the mean\nfield point of view, which describes the measure structure of the weights in\nthe neural network (NN) and admits a close measure dynamic. Thus, the weights\nof different sizes NN admit a common distribution under proper assumptions, and\nweight transfer methods can be viewed as sampling methods. We empirically\nvalidate the RC ansatz by exploring simple MLP examples and LLMs such as GPT-3\nand Llama-3.1. We show the mean-field point of view is adequate under suitable\nassumptions which can provide theoretical support for zero-shot weight\ntransfer.\n","authors":["Xingyuan Chen","Wenwei Kuang","Lei Deng","Wei Han","Bo Bai","Goncalo dos Reis"],"pdf_url":"https://arxiv.org/pdf/2408.08681v1.pdf","comment":"40 pages, 6 Figures, 1 table"},{"id":"http://arxiv.org/abs/2408.08677v1","updated":"2024-08-16T11:44:27Z","published":"2024-08-16T11:44:27Z","title":"Neural Reward Machines","summary":" Non-markovian Reinforcement Learning (RL) tasks are very hard to solve,\nbecause agents must consider the entire history of state-action pairs to act\nrationally in the environment. Most works use symbolic formalisms (as Linear\nTemporal Logic or automata) to specify the temporally-extended task. These\napproaches only work in finite and discrete state environments or continuous\nproblems for which a mapping between the raw state and a symbolic\ninterpretation is known as a symbol grounding (SG) function. Here, we define\nNeural Reward Machines (NRM), an automata-based neurosymbolic framework that\ncan be used for both reasoning and learning in non-symbolic non-markovian RL\ndomains, which is based on the probabilistic relaxation of Moore Machines. We\ncombine RL with semisupervised symbol grounding (SSSG) and we show that NRMs\ncan exploit high-level symbolic knowledge in non-symbolic environments without\nany knowledge of the SG function, outperforming Deep RL methods which cannot\nincorporate prior knowledge. 
Moreover, we advance the research in SSSG,\nproposing an algorithm for analysing the groundability of temporal\nspecifications, which is more efficient than baseline techniques of a factor\n$10^3$.\n","authors":["Elena Umili","Francesco Argenziano","Roberto Capobianco"],"pdf_url":"https://arxiv.org/pdf/2408.08677v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08675v1","updated":"2024-08-16T11:41:06Z","published":"2024-08-16T11:41:06Z","title":"Misclassification excess risk bounds for PAC-Bayesian classification via\n convexified loss","summary":" PAC-Bayesian bounds have proven to be a valuable tool for deriving\ngeneralization bounds and for designing new learning algorithms in machine\nlearning. However, it typically focus on providing generalization bounds with\nrespect to a chosen loss function. In classification tasks, due to the\nnon-convex nature of the 0-1 loss, a convex surrogate loss is often used, and\nthus current PAC-Bayesian bounds are primarily specified for this convex\nsurrogate. This work shifts its focus to providing misclassification excess\nrisk bounds for PAC-Bayesian classification when using a convex surrogate loss.\nOur key ingredient here is to leverage PAC-Bayesian relative bounds in\nexpectation rather than relying on PAC-Bayesian bounds in probability. We\ndemonstrate our approach in several important applications.\n","authors":["The Tien Mai"],"pdf_url":"https://arxiv.org/pdf/2408.08675v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.19105v2","updated":"2024-08-16T11:28:13Z","published":"2024-02-29T12:36:10Z","title":"CollaFuse: Navigating Limited Resources and Privacy in Collaborative\n Generative AI","summary":" In the landscape of generative artificial intelligence, diffusion-based\nmodels present challenges for socio-technical systems in data requirements and\nprivacy. Traditional approaches like federated learning distribute the learning\nprocess but strain individual clients, especially with constrained resources\n(e.g., edge devices). In response to these challenges, we introduce CollaFuse,\na novel framework inspired by split learning. Tailored for efficient and\ncollaborative use of denoising diffusion probabilistic models, CollaFuse\nenables shared server training and inference, alleviating client computational\nburdens. This is achieved by retaining data and computationally inexpensive GPU\nprocesses locally at each client while outsourcing the computationally\nexpensive processes to the shared server. Demonstrated in a healthcare context,\nCollaFuse enhances privacy by highly reducing the need for sensitive\ninformation sharing. These capabilities hold the potential to impact various\napplication areas, such as the design of edge computing solutions, healthcare\nresearch, or autonomous driving. In essence, our work advances distributed\nmachine learning, shaping the future of collaborative GenAI networks.\n","authors":["Domenique Zipperling","Simeon Allmendinger","Lukas Struppek","Niklas Kühl"],"pdf_url":"https://arxiv.org/pdf/2402.19105v2.pdf","comment":"Thirty-Second European Conference on Information Systems (ECIS 2024)"},{"id":"http://arxiv.org/abs/2408.08666v1","updated":"2024-08-16T11:15:52Z","published":"2024-08-16T11:15:52Z","title":"A Multivocal Literature Review on Privacy and Fairness in Federated\n Learning","summary":" Federated Learning presents a way to revolutionize AI applications by\neliminating the necessity for data sharing. 
Yet, research has shown that\ninformation can still be extracted during training, making additional\nprivacy-preserving measures such as differential privacy imperative. To\nimplement real-world federated learning applications, fairness, ranging from a\nfair distribution of performance to non-discriminative behaviour, must be\nconsidered. Particularly in high-risk applications (e.g. healthcare), avoiding\nthe repetition of past discriminatory errors is paramount. As recent research\nhas demonstrated an inherent tension between privacy and fairness, we conduct a\nmultivocal literature review to examine the current methods to integrate\nprivacy and fairness in federated learning. Our analyses illustrate that the\nrelationship between privacy and fairness has been neglected, posing a critical\nrisk for real-world applications. We highlight the need to explore the\nrelationship between privacy, fairness, and performance, advocating for the\ncreation of integrated federated learning frameworks.\n","authors":["Beatrice Balbierer","Lukas Heinlein","Domenique Zipperling","Niklas Kühl"],"pdf_url":"https://arxiv.org/pdf/2408.08666v1.pdf","comment":"Accepted for publication at the Internationale Tagung\n Wirtschaftsinformatik 2024"},{"id":"http://arxiv.org/abs/2408.08664v1","updated":"2024-08-16T11:11:56Z","published":"2024-08-16T11:11:56Z","title":"A new perspective on Bayesian Operational Modal Analysis","summary":" In the field of operational modal analysis (OMA), obtained modal information\nis frequently used to assess the current state of aerospace, mechanical,\noffshore and civil structures. However, the stochasticity of operational\nsystems and the lack of forcing information can lead to inconsistent results.\nQuantifying the uncertainty of the recovered modal parameters through OMA is\ntherefore of significant value. In this article, a new perspective on Bayesian\nOMA is proposed: a Bayesian stochastic subspace identification (SSI) algorithm.\nDistinct from existing approaches to Bayesian OMA, a hierarchical probabilistic\nmodel is embedded at the core of covariance-driven SSI. Through substitution of\ncanonical correlation analysis with a Bayesian equivalent, posterior\ndistributions over the modal properties are obtained. Two inference schemes are\npresented for the proposed Bayesian formulation: Markov Chain Monte Carlo and\nvariational Bayes. Two case studies are then explored. The first is benchmark\nstudy using data from a simulated, multi degree-of-freedom, linear system.\nFollowing application of Bayesian SSI, it is shown that the same posterior is\ntargeted and recovered by both inference schemes, with good agreement between\nthe posterior mean and the conventional SSI result. The second study applies\nthe variational form to data obtained from an in-service structure: The Z24\nbridge. The results of this study are presented at single model orders, and\nthen using a stabilisation diagram. The recovered posterior uncertainty is\npresented and compared to the classic SSI result. It is observed that the\nposterior distributions with mean values coinciding with the natural\nfrequencies exhibit lower variance than values situated away from the natural\nfrequencies.\n","authors":["Brandon J. O'Connell","Max D. Champneys","Timothy J. 
Rogers"],"pdf_url":"https://arxiv.org/pdf/2408.08664v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08661v1","updated":"2024-08-16T11:09:56Z","published":"2024-08-16T11:09:56Z","title":"MIA-Tuner: Adapting Large Language Models as Pre-training Text Detector","summary":" The increasing parameters and expansive dataset of large language models\n(LLMs) highlight the urgent demand for a technical solution to audit the\nunderlying privacy risks and copyright issues associated with LLMs. Existing\nstudies have partially addressed this need through an exploration of the\npre-training data detection problem, which is an instance of a membership\ninference attack (MIA). This problem involves determining whether a given piece\nof text has been used during the pre-training phase of the target LLM. Although\nexisting methods have designed various sophisticated MIA score functions to\nachieve considerable detection performance in pre-trained LLMs, how to achieve\nhigh-confidence detection and how to perform MIA on aligned LLMs remain\nchallenging. In this paper, we propose MIA-Tuner, a novel instruction-based MIA\nmethod, which instructs LLMs themselves to serve as a more precise pre-training\ndata detector internally, rather than design an external MIA score function.\nFurthermore, we design two instruction-based safeguards to respectively\nmitigate the privacy risks brought by the existing methods and MIA-Tuner. To\ncomprehensively evaluate the most recent state-of-the-art LLMs, we collect a\nmore up-to-date MIA benchmark dataset, named WIKIMIA-24, to replace the widely\nadopted benchmark WIKIMIA. We conduct extensive experiments across various\naligned and unaligned LLMs over the two benchmark datasets. The results\ndemonstrate that MIA-Tuner increases the AUC of MIAs from 0.7 to a\nsignificantly high level of 0.9.\n","authors":["Wenjie Fu","Huandong Wang","Chen Gao","Guanghua Liu","Yong Li","Tao Jiang"],"pdf_url":"https://arxiv.org/pdf/2408.08661v1.pdf","comment":"code and dataset: https://github.com/wjfu99/MIA-Tuner"},{"id":"http://arxiv.org/abs/2408.02349v2","updated":"2024-08-16T11:09:18Z","published":"2024-08-05T09:54:08Z","title":"Active Sensing of Knee Osteoarthritis Progression with Reinforcement\n Learning","summary":" Osteoarthritis (OA) is the most common musculoskeletal disease, which has no\ncure. Knee OA (KOA) is one of the highest causes of disability worldwide, and\nit costs billions of United States dollars to the global community. Prediction\nof KOA progression has been of high interest to the community for years, as it\ncan advance treatment development through more efficient clinical trials and\nimprove patient outcomes through more efficient healthcare utilization.\nExisting approaches for predicting KOA, however, are predominantly static, i.e.\nconsider data from a single time point to predict progression many years into\nthe future, and knee level, i.e. consider progression in a single joint only.\nDue to these and related reasons, these methods fail to deliver the level of\npredictive performance, which is sufficient to result in cost savings and\nbetter patient outcomes. Collecting extensive data from all patients on a\nregular basis could address the issue, but it is limited by the high cost at a\npopulation level. 
In this work, we propose to go beyond static prediction\nmodels in OA, and bring a novel Active Sensing (AS) approach, designed to\ndynamically follow up patients with the objective of maximizing the number of\ninformative data acquisitions, while minimizing their total cost over a period\nof time. Our approach is based on Reinforcement Learning (RL), and it leverages\na novel reward function designed specifically for AS of disease progression in\nmore than one part of a human body. Our method is end-to-end, relies on\nmulti-modal Deep Learning, and requires no human input at inference time.\nThroughout an exhaustive experimental evaluation, we show that using RL can\nprovide a higher monetary benefit when compared to state-of-the-art baselines.\n","authors":["Khanh Nguyen","Huy Hoang Nguyen","Egor Panfilov","Aleksei Tiulpin"],"pdf_url":"https://arxiv.org/pdf/2408.02349v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.12842v3","updated":"2024-08-16T10:47:32Z","published":"2023-10-19T15:51:23Z","title":"Model-agnostic variable importance for predictive uncertainty: an\n entropy-based approach","summary":" In order to trust the predictions of a machine learning algorithm, it is\nnecessary to understand the factors that contribute to those predictions. In\nthe case of probabilistic and uncertainty-aware models, it is necessary to\nunderstand not only the reasons for the predictions themselves, but also the\nreasons for the model's level of confidence in those predictions. In this\npaper, we show how existing methods in explainability can be extended to\nuncertainty-aware models and how such extensions can be used to understand the\nsources of uncertainty in a model's predictive distribution. In particular, by\nadapting permutation feature importance, partial dependence plots, and\nindividual conditional expectation plots, we demonstrate that novel insights\ninto model behaviour may be obtained and that these methods can be used to\nmeasure the impact of features on both the entropy of the predictive\ndistribution and the log-likelihood of the ground truth labels under that\ndistribution. With experiments using both synthetic and real-world data, we\ndemonstrate the utility of these approaches to understand both the sources of\nuncertainty and their impact on model performance.\n","authors":["Danny Wood","Theodore Papamarkou","Matt Benatan","Richard Allmendinger"],"pdf_url":"https://arxiv.org/pdf/2310.12842v3.pdf","comment":"Data Mining and Knowledge Discovery. Springer"},{"id":"http://arxiv.org/abs/2408.08655v1","updated":"2024-08-16T10:44:14Z","published":"2024-08-16T10:44:14Z","title":"Mitigating Backdoor Attacks in Federated Learning via Flipping Weight\n Updates of Low-Activation Input Neurons","summary":" Federated learning enables multiple clients to collaboratively train machine\nlearning models under the overall planning of the server while adhering to\nprivacy requirements. However, the server cannot directly oversee the local\ntraining process, creating an opportunity for malicious clients to introduce\nbackdoors. Existing research shows that backdoor attacks activate specific\nneurons in the compromised model, which remain dormant when processing clean\ndata. Leveraging this insight, we propose a method called Flipping Weight\nUpdates of Low-Activation Input Neurons (FLAIN) to defend against backdoor\nattacks in federated learning. 
Specifically, after completing global training,\nwe employ an auxiliary dataset to identify low-activation input neurons and\nflip the associated weight updates. We incrementally raise the threshold for\nlow-activation inputs and flip the weight updates iteratively, until the\nperformance degradation on the auxiliary data becomes unacceptable. Extensive\nexperiments validate that our method can effectively reduce the success rate of\nbackdoor attacks to a low level in various attack scenarios including those\nwith non-IID data distribution or high MCRs, causing only minimal performance\ndegradation on clean data.\n","authors":["Binbin Ding","Penghui Yang","Zeqing Ge","Shengjun Huang"],"pdf_url":"https://arxiv.org/pdf/2408.08655v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.12378v2","updated":"2024-08-16T10:40:40Z","published":"2024-06-18T08:05:04Z","title":"Efficient mapping of phase diagrams with conditional Boltzmann\n Generators","summary":" The accurate prediction of phase diagrams is of central importance for both\nthe fundamental understanding of materials as well as for technological\napplications in material sciences. However, the computational prediction of the\nrelative stability between phases based on their free energy is a daunting\ntask, as traditional free energy estimators require a large amount of\nsimulation data to obtain uncorrelated equilibrium samples over a grid of\nthermodynamic states. In this work, we develop deep generative machine learning\nmodels based on the Boltzmann Generator approach for entire phase diagrams,\nemploying normalizing flows conditioned on the thermodynamic states, e.g.,\ntemperature and pressure, that they map to. By training a single normalizing\nflow to transform the equilibrium distribution sampled at only one reference\nthermodynamic state to a wide range of target temperatures and pressures, we\ncan efficiently generate equilibrium samples across the entire phase diagram.\nUsing a permutation-equivariant architecture allows us, thereby, to treat solid\nand liquid phases on the same footing. We demonstrate our approach by\npredicting the solid-liquid coexistence line for a Lennard-Jones system in\nexcellent agreement with state-of-the-art free energy methods while\nsignificantly reducing the number of energy evaluations needed.\n","authors":["Maximilian Schebek","Michele Invernizzi","Frank Noé","Jutta Rogal"],"pdf_url":"https://arxiv.org/pdf/2406.12378v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2011.11152v6","updated":"2024-08-16T10:36:24Z","published":"2020-11-23T00:39:49Z","title":"On the Overlooked Pitfalls of Weight Decay and How to Mitigate Them: A\n Gradient-Norm Perspective","summary":" Weight decay is a simple yet powerful regularization technique that has been\nvery widely used in training of deep neural networks (DNNs). While weight decay\nhas attracted much attention, previous studies fail to discover some overlooked\npitfalls on large gradient norms resulted by weight decay. In this paper, we\ndiscover that, weight decay can unfortunately lead to large gradient norms at\nthe final phase (or the terminated solution) of training, which often indicates\nbad convergence and poor generalization. To mitigate the gradient-norm-centered\npitfalls, we present the first practical scheduler for weight decay, called the\nScheduled Weight Decay (SWD) method that can dynamically adjust the weight\ndecay strength according to the gradient norm and significantly penalize large\ngradient norms during training. 
Our experiments also support that SWD indeed\nmitigates large gradient norms and often significantly outperforms the\nconventional constant weight decay strategy for Adaptive Moment Estimation\n(Adam).\n","authors":["Zeke Xie","Zhiqiang Xu","Jingzhao Zhang","Issei Sato","Masashi Sugiyama"],"pdf_url":"https://arxiv.org/pdf/2011.11152v6.pdf","comment":"NeurIPS 2023, 21 pages, 20 figures. Keywords: Weight Decay,\n Regularization, Optimization, Deep Learning"},{"id":"http://arxiv.org/abs/2408.08652v1","updated":"2024-08-16T10:36:08Z","published":"2024-08-16T10:36:08Z","title":"TextCAVs: Debugging vision models using text","summary":" Concept-based interpretability methods are a popular form of explanation for\ndeep learning models which provide explanations in the form of high-level human\ninterpretable concepts. These methods typically find concept activation vectors\n(CAVs) using a probe dataset of concept examples. This requires labelled data\nfor these concepts -- an expensive task in the medical domain. We introduce\nTextCAVs: a novel method which creates CAVs using vision-language models such\nas CLIP, allowing for explanations to be created solely using text descriptions\nof the concept, as opposed to image exemplars. This reduced cost in testing\nconcepts allows for many concepts to be tested and for users to interact with\nthe model, testing new ideas as they are thought of, rather than a delay caused\nby image collection and annotation. In early experimental results, we\ndemonstrate that TextCAVs produces reasonable explanations for a chest x-ray\ndataset (MIMIC-CXR) and natural images (ImageNet), and that these explanations\ncan be used to debug deep learning-based models.\n","authors":["Angus Nicolson","Yarin Gal","J. Alison Noble"],"pdf_url":"https://arxiv.org/pdf/2408.08652v1.pdf","comment":"11 pages, 2 figures. Accepted at iMIMIC Workshop at MICCAI 2024"},{"id":"http://arxiv.org/abs/2407.16058v2","updated":"2024-08-16T10:29:46Z","published":"2024-07-22T21:26:39Z","title":"Revisiting Score Function Estimators for $k$-Subset Sampling","summary":" Are score function estimators an underestimated approach to learning with\n$k$-subset sampling? Sampling $k$-subsets is a fundamental operation in many\nmachine learning tasks that is not amenable to differentiable parametrization,\nimpeding gradient-based optimization. Prior work has focused on relaxed\nsampling or pathwise gradient estimators. Inspired by the success of score\nfunction estimators in variational inference and reinforcement learning, we\nrevisit them within the context of $k$-subset sampling. Specifically, we\ndemonstrate how to efficiently compute the $k$-subset distribution's score\nfunction using a discrete Fourier transform, and reduce the estimator's\nvariance with control variates. The resulting estimator provides both exact\nsamples and unbiased gradient estimates while also applying to\nnon-differentiable downstream models, unlike existing methods. 
Experiments in\nfeature selection show results competitive with current methods, despite weaker\nassumptions.\n","authors":["Klas Wijk","Ricardo Vinuesa","Hossein Azizpour"],"pdf_url":"https://arxiv.org/pdf/2407.16058v2.pdf","comment":"ICML 2024 Workshop on Differentiable Almost Everything:\n Differentiable Relaxations, Algorithms, Operators, and Simulators"},{"id":"http://arxiv.org/abs/2408.08647v1","updated":"2024-08-16T10:22:54Z","published":"2024-08-16T10:22:54Z","title":"Modeling the Neonatal Brain Development Using Implicit Neural\n Representations","summary":" The human brain undergoes rapid development during the third trimester of\npregnancy. In this work, we model the neonatal development of the infant brain\nin this age range. As a basis, we use MR images of preterm- and term-birth\nneonates from the developing human connectome project (dHCP). We propose a\nneural network, specifically an implicit neural representation (INR), to\npredict 2D- and 3D images of varying time points. In order to model a\nsubject-specific development process, it is necessary to disentangle the age\nfrom the subjects' identity in the latent space of the INR. We propose two\nmethods, Subject Specific Latent Vectors (SSL) and Stochastic Global Latent\nAugmentation (SGLA), enabling this disentanglement. We perform an analysis of\nthe results and compare our proposed model to an age-conditioned denoising\ndiffusion model as a baseline. We also show that our method can be applied in a\nmemory-efficient way, which is especially important for 3D data.\n","authors":["Florentin Bieder","Paul Friedrich","Hélène Corbaz","Alicia Durrer","Julia Wolleb","Philippe C. Cattin"],"pdf_url":"https://arxiv.org/pdf/2408.08647v1.pdf","comment":"Preprint, Accepted for PRIME MICCAI 2024"},{"id":"http://arxiv.org/abs/2408.08642v1","updated":"2024-08-16T10:19:27Z","published":"2024-08-16T10:19:27Z","title":"The Power of Bias: Optimizing Client Selection in Federated Learning\n with Heterogeneous Differential Privacy","summary":" To preserve the data privacy, the federated learning (FL) paradigm emerges in\nwhich clients only expose model gradients rather than original data for\nconducting model training. To enhance the protection of model gradients in FL,\ndifferentially private federated learning (DPFL) is proposed which incorporates\ndifferentially private (DP) noises to obfuscate gradients before they are\nexposed. Yet, an essential but largely overlooked problem in DPFL is the\nheterogeneity of clients' privacy requirement, which can vary significantly\nbetween clients and extremely complicates the client selection problem in DPFL.\nIn other words, both the data quality and the influence of DP noises should be\ntaken into account when selecting clients. To address this problem, we conduct\nconvergence analysis of DPFL under heterogeneous privacy, a generic client\nselection strategy, popular DP mechanisms and convex loss. Based on convergence\nanalysis, we formulate the client selection problem to minimize the value of\nloss function in DPFL with heterogeneous privacy, which is a convex\noptimization problem and can be solved efficiently. Accordingly, we propose the\nDPFL-BCS (biased client selection) algorithm. The extensive experiment results\nwith real datasets under both convex and non-convex loss functions indicate\nthat DPFL-BCS can remarkably improve model utility compared with the SOTA\nbaselines.\n","authors":["Jiating Ma","Yipeng Zhou","Qi Li","Quan Z. 
Sheng","Laizhong Cui","Jiangchuan Liu"],"pdf_url":"https://arxiv.org/pdf/2408.08642v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08639v1","updated":"2024-08-16T10:09:45Z","published":"2024-08-16T10:09:45Z","title":"Solving The Quantum Many-Body Hamiltonian Learning Problem with Neural\n Differential Equations","summary":" Understanding and characterising quantum many-body dynamics remains a\nsignificant challenge due to both the exponential complexity required to\nrepresent quantum many-body Hamiltonians, and the need to accurately track\nstates in time under the action of such Hamiltonians. This inherent complexity\nlimits our ability to characterise quantum many-body systems, highlighting the\nneed for innovative approaches to unlock their full potential. To address this\nchallenge, we propose a novel method to solve the Hamiltonian Learning (HL)\nproblem-inferring quantum dynamics from many-body state trajectories-using\nNeural Differential Equations combined with an Ansatz Hamiltonian. Our method\nis reliably convergent, experimentally friendly, and interpretable, making it a\nstable solution for HL on a set of Hamiltonians previously unlearnable in the\nliterature. In addition to this, we propose a new quantitative benchmark based\non power laws, which can objectively compare the reliability and generalisation\ncapabilities of any two HL algorithms. Finally, we benchmark our method against\nstate-of-the-art HL algorithms with a 1D spin-1/2 chain proof of concept.\n","authors":["Timothy Heightman","Edward Jiang","Antonio Acín"],"pdf_url":"https://arxiv.org/pdf/2408.08639v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.07081v3","updated":"2024-08-16T09:54:23Z","published":"2024-08-07T18:07:15Z","title":"MathBridge: A Large Corpus Dataset for Translating Spoken Mathematical\n Expressions into $LaTeX$ Formulas for Improved Readability","summary":" Improving the readability of mathematical expressions in text-based document\nsuch as subtitle of mathematical video, is an significant task. To achieve\nthis, mathematical expressions should be convert to compiled formulas. For\ninstance, the spoken expression ``x equals minus b plus or minus the square\nroot of b squared minus four a c, all over two a'' from automatic speech\nrecognition is more readily comprehensible when displayed as a compiled formula\n$x = \\frac{-b \\pm \\sqrt{b^2 - 4ac}}{2a}$. To convert mathematical spoken\nsentences to compiled formulas, two processes are required: spoken sentences\nare converted into LaTeX formulas, and LaTeX formulas are converted into\ncompiled formulas. The latter can be managed by using LaTeX engines. However,\nthere is no way to do the former effectively. Even if we try to solve this\nusing language models, there is no paired data between spoken sentences and\nLaTeX formulas to train it. In this paper, we introduce MathBridge, the first\nextensive dataset for translating mathematical spoken sentences into LaTeX\nformulas. MathBridge comprises approximately 23 million LaTeX formulas paired\nwith the corresponding mathematical spoken sentences. 
Through comprehensive\nevaluations, including fine-tuning with proposed data, we discovered that\nMathBridge significantly enhances the capabilities of pretrained language\nmodels for converting to LaTeX formulas from mathematical spoken sentences.\nSpecifically, for the T5-large model, the sacreBLEU score increased from 4.77\nto 46.8, demonstrating substantial enhancement.\n","authors":["Kyudan Jung","Sieun Hyeon","Jeong Youn Kwon","Nam-Joon Kim","Hyun Gon Ryu","Hyuk-Jae Lee","Jaeyoung Do"],"pdf_url":"https://arxiv.org/pdf/2408.07081v3.pdf","comment":"9 pages, 6 figures"},{"id":"http://arxiv.org/abs/2403.00563v2","updated":"2024-08-16T09:53:46Z","published":"2024-03-01T14:41:51Z","title":"Indirectly Parameterized Concrete Autoencoders","summary":" Feature selection is a crucial task in settings where data is\nhigh-dimensional or acquiring the full set of features is costly. Recent\ndevelopments in neural network-based embedded feature selection show promising\nresults across a wide range of applications. Concrete Autoencoders (CAEs),\nconsidered state-of-the-art in embedded feature selection, may struggle to\nachieve stable joint optimization, hurting their training time and\ngeneralization. In this work, we identify that this instability is correlated\nwith the CAE learning duplicate selections. To remedy this, we propose a simple\nand effective improvement: Indirectly Parameterized CAEs (IP-CAEs). IP-CAEs\nlearn an embedding and a mapping from it to the Gumbel-Softmax distributions'\nparameters. Despite being simple to implement, IP-CAE exhibits significant and\nconsistent improvements over CAE in both generalization and training time\nacross several datasets for reconstruction and classification. Unlike CAE,\nIP-CAE effectively leverages non-linear relationships and does not require\nretraining the jointly optimized decoder. Furthermore, our approach is, in\nprinciple, generalizable to Gumbel-Softmax distributions beyond feature\nselection.\n","authors":["Alfred Nilsson","Klas Wijk","Sai bharath chandra Gutha","Erik Englesson","Alexandra Hotti","Carlo Saccardi","Oskar Kviman","Jens Lagergren","Ricardo Vinuesa","Hossein Azizpour"],"pdf_url":"https://arxiv.org/pdf/2403.00563v2.pdf","comment":"ICML 2024"},{"id":"http://arxiv.org/abs/2408.08629v1","updated":"2024-08-16T09:43:01Z","published":"2024-08-16T09:43:01Z","title":"Navigating Uncertainties in Machine Learning for Structural Dynamics: A\n Comprehensive Review of Probabilistic and Non-Probabilistic Approaches in\n Forward and Inverse Problems","summary":" In the era of big data, machine learning (ML) has become a powerful tool in\nvarious fields, notably impacting structural dynamics. ML algorithms offer\nadvantages by modeling physical phenomena based on data, even in the absence of\nunderlying mechanisms. However, uncertainties such as measurement noise and\nmodeling errors can compromise the reliability of ML predictions, highlighting\nthe need for effective uncertainty awareness to enhance prediction robustness.\nThis paper presents a comprehensive review on navigating uncertainties in ML,\ncategorizing uncertainty-aware approaches into probabilistic methods (including\nBayesian and frequentist perspectives) and non-probabilistic methods (such as\ninterval learning and fuzzy learning). Bayesian neural networks, known for\ntheir uncertainty quantification and nonlinear mapping capabilities, are\nemphasized for their superior performance and potential. 
The review covers\nvarious techniques and methodologies for addressing uncertainties in ML,\ndiscussing fundamentals and implementation procedures of each method. While\nproviding a concise overview of fundamental concepts, the paper refrains from\nin-depth critical explanations. Strengths and limitations of each approach are\nexamined, along with their applications in structural dynamic forward problems\nlike response prediction, sensitivity assessment, and reliability analysis, and\ninverse problems like system identification, model updating, and damage\nidentification. Additionally, the review identifies research gaps and suggests\nfuture directions for investigations, aiming to provide comprehensive insights\nto the research community. By offering an extensive overview of both\nprobabilistic and non-probabilistic approaches, this review aims to assist\nresearchers and practitioners in making informed decisions when utilizing ML\ntechniques to address uncertainties in structural dynamic problems.\n","authors":["Wang-Ji Yan","Lin-Feng Mei","Jiang Mo","Costas Papadimitriou","Ka-Veng Yuen","Michael Beer"],"pdf_url":"https://arxiv.org/pdf/2408.08629v1.pdf","comment":"114 pages, 27 figures, 6 tables, references added"},{"id":"http://arxiv.org/abs/2311.01205v2","updated":"2024-08-16T09:42:19Z","published":"2023-11-02T12:59:32Z","title":"Attacking Graph Neural Networks with Bit Flips: Weisfeiler and Lehman Go\n Indifferent","summary":" Prior attacks on graph neural networks have mostly focused on graph poisoning\nand evasion, neglecting the network's weights and biases. Traditional\nweight-based fault injection attacks, such as bit flip attacks used for\nconvolutional neural networks, do not consider the unique properties of graph\nneural networks. We propose the Injectivity Bit Flip Attack, the first bit flip\nattack designed specifically for graph neural networks. Our attack targets the\nlearnable neighborhood aggregation functions in quantized message passing\nneural networks, degrading their ability to distinguish graph structures and\nlosing the expressivity of the Weisfeiler-Lehman test. Our findings suggest\nthat exploiting mathematical properties specific to certain graph neural\nnetwork architectures can significantly increase their vulnerability to bit\nflip attacks. Injectivity Bit Flip Attacks can degrade the maximal expressive\nGraph Isomorphism Networks trained on various graph property prediction\ndatasets to random output by flipping only a small fraction of the network's\nbits, demonstrating its higher destructive power compared to a bit flip attack\ntransferred from convolutional neural networks. Our attack is transparent and\nmotivated by theoretical insights which are confirmed by extensive empirical\nresults.\n","authors":["Lorenz Kummer","Samir Moustafa","Nils N. Kriege","Wilfried N. Gansterer"],"pdf_url":"https://arxiv.org/pdf/2311.01205v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08628v1","updated":"2024-08-16T09:42:19Z","published":"2024-08-16T09:42:19Z","title":"A survey on secure decentralized optimization and learning","summary":" Decentralized optimization has become a standard paradigm for solving\nlarge-scale decision-making problems and training large machine learning models\nwithout centralizing data. However, this paradigm introduces new privacy and\nsecurity risks, with malicious agents potentially able to infer private data or\nimpair the model accuracy. 
Over the past decade, significant advancements have\nbeen made in developing secure decentralized optimization and learning\nframeworks and algorithms. This survey provides a comprehensive tutorial on\nthese advancements. We begin with the fundamentals of decentralized\noptimization and learning, highlighting centralized aggregation and distributed\nconsensus as key modules exposed to security risks in federated and distributed\noptimization, respectively. Next, we focus on privacy-preserving algorithms,\ndetailing three cryptographic tools and their integration into decentralized\noptimization and learning systems. Additionally, we examine resilient\nalgorithms, exploring the design and analysis of resilient aggregation and\nconsensus protocols that support these systems. We conclude the survey by\ndiscussing current trends and potential future directions.\n","authors":["Changxin Liu","Nicola Bastianello","Wei Huo","Yang Shi","Karl H. Johansson"],"pdf_url":"https://arxiv.org/pdf/2408.08628v1.pdf","comment":"38 pages"},{"id":"http://arxiv.org/abs/2408.08622v1","updated":"2024-08-16T09:30:36Z","published":"2024-08-16T09:30:36Z","title":"DeepDFA: Automata Learning through Neural Probabilistic Relaxations","summary":" In this work, we introduce DeepDFA, a novel approach to identifying\nDeterministic Finite Automata (DFAs) from traces, harnessing a differentiable\nyet discrete model. Inspired by both the probabilistic relaxation of DFAs and\nRecurrent Neural Networks (RNNs), our model offers interpretability\npost-training, alongside reduced complexity and enhanced training efficiency\ncompared to traditional RNNs. Moreover, by leveraging gradient-based\noptimization, our method surpasses combinatorial approaches in both scalability\nand noise resilience. Validation experiments conducted on target regular\nlanguages of varying size and complexity demonstrate that our approach is\naccurate, fast, and robust to noise in both the input symbols and the output\nlabels of training data, integrating the strengths of both logical grammar\ninduction and deep learning.\n","authors":["Elena Umili","Roberto Capobianco"],"pdf_url":"https://arxiv.org/pdf/2408.08622v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.01295v4","updated":"2024-08-16T09:26:37Z","published":"2024-02-02T10:34:13Z","title":"ExtremeCast: Boosting Extreme Value Prediction for Global Weather\n Forecast","summary":" Data-driven weather forecast based on machine learning (ML) has experienced\nrapid development and demonstrated superior performance in the global\nmedium-range forecast compared to traditional physics-based dynamical models.\nHowever, most of these ML models struggle with accurately predicting extreme\nweather, which is related to training loss and the uncertainty of weather\nsystems. Through mathematical analysis, we prove that the use of symmetric\nlosses, such as the Mean Squared Error (MSE), leads to biased predictions and\nunderestimation of extreme values. To address this issue, we introduce Exloss,\na novel loss function that performs asymmetric optimization and highlights\nextreme values to obtain accurate extreme weather forecast. Beyond the\nevolution in training loss, we introduce a training-free extreme value\nenhancement module named ExBooster, which captures the uncertainty in\nprediction outcomes by employing multiple random samples, thereby increasing\nthe hit rate of low-probability extreme events. 
Combined with an advanced\nglobal weather forecast model, extensive experiments show that our solution can\nachieve state-of-the-art performance in extreme weather prediction, while\nmaintaining the overall forecast accuracy comparable to the top medium-range\nforecast models.\n","authors":["Wanghan Xu","Kang Chen","Tao Han","Hao Chen","Wanli Ouyang","Lei Bai"],"pdf_url":"https://arxiv.org/pdf/2402.01295v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.13044v3","updated":"2024-08-16T09:06:08Z","published":"2024-07-17T22:48:47Z","title":"DropKAN: Dropout Kolmogorov-Arnold Networks","summary":" We propose DropKAN (Dropout Kolmogorov-Arnold Networks), a regularization\nmethod that prevents co-adaptation of activation function weights in\nKolmogorov-Arnold Networks (KANs). DropKAN functions by embedding the drop mask\ndirectly within the KAN layer, randomly masking the outputs of some activations\nwithin the KANs' computation graph. We show that this simple procedure, which\nrequires minimal coding effort, has a regularizing effect and consistently leads\nto better generalization of KANs. We analyze the adaptation of the standard\nDropout with KANs and demonstrate that Dropout applied to KANs' neurons can\nlead to unpredictable behavior in the feedforward pass. We carry out an empirical\nstudy with real-world Machine Learning datasets to validate our findings. Our\nresults suggest that DropKAN is consistently a better alternative to using\nstandard Dropout with KANs, and improves the generalization performance of\nKANs. Our implementation of DropKAN is available at:\n\\url{https://github.com/Ghaith81/dropkan}.\n","authors":["Mohammed Ghaith Altarabichi"],"pdf_url":"https://arxiv.org/pdf/2407.13044v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08610v1","updated":"2024-08-16T08:52:02Z","published":"2024-08-16T08:52:02Z","title":"Generative Dataset Distillation Based on Diffusion Model","summary":" This paper presents our method for the generative track of The First Dataset\nDistillation Challenge at ECCV 2024. Since the diffusion model has become the\nmainstay of generative models because of its high-quality generative effects,\nwe focus on distillation methods based on the diffusion model. Considering that\nthe track can only generate a fixed number of images in 10 minutes using a\ngenerative model for CIFAR-100 and Tiny-ImageNet datasets, we need to use a\ngenerative model that can generate images at high speed. In this study, we\npropose a novel generative dataset distillation method based on Stable\nDiffusion. Specifically, we use the SDXL-Turbo model, which can generate images\nat high speed and quality. Compared to other diffusion models that can only\nachieve images per class (IPC) = 1, our method can achieve an IPC = 10 for\nTiny-ImageNet and an IPC = 20 for CIFAR-100, respectively. Additionally, to\ngenerate high-quality distilled datasets for CIFAR-100 and Tiny-ImageNet, we\nuse the class information as text prompts and post data augmentation for the\nSDXL-Turbo model. Experimental results show the effectiveness of the proposed\nmethod, and we achieved third place in the generative track of the ECCV 2024 DD\nChallenge. 
Codes are available at https://github.com/Guang000/BANKO.\n","authors":["Duo Su","Junjie Hou","Guang Li","Ren Togo","Rui Song","Takahiro Ogawa","Miki Haseyama"],"pdf_url":"https://arxiv.org/pdf/2408.08610v1.pdf","comment":"The Third Place Winner in Generative Track of the ECCV 2024 DD\n Challenge"},{"id":"http://arxiv.org/abs/2401.03140v4","updated":"2024-08-16T08:27:00Z","published":"2024-01-06T06:55:26Z","title":"Fair Sampling in Diffusion Models through Switching Mechanism","summary":" Diffusion models have shown their effectiveness in generation tasks by\nwell-approximating the underlying probability distribution. However, diffusion\nmodels are known to suffer from an amplified inherent bias from the training\ndata in terms of fairness. While the sampling process of diffusion models can\nbe controlled by conditional guidance, previous works have attempted to find\nempirical guidance to achieve quantitative fairness. To address this\nlimitation, we propose a fairness-aware sampling method called\n\\textit{attribute switching} mechanism for diffusion models. Without additional\ntraining, the proposed sampling can obfuscate sensitive attributes in generated\ndata without relying on classifiers. We mathematically prove and experimentally\ndemonstrate the effectiveness of the proposed method on two key aspects: (i)\nthe generation of fair data and (ii) the preservation of the utility of the\ngenerated data.\n","authors":["Yujin Choi","Jinseong Park","Hoki Kim","Jaewook Lee","Saeroom Park"],"pdf_url":"https://arxiv.org/pdf/2401.03140v4.pdf","comment":"AAAI 2024"},{"id":"http://arxiv.org/abs/2202.02466v5","updated":"2024-08-16T08:25:42Z","published":"2022-02-05T02:31:01Z","title":"Handling Distribution Shifts on Graphs: An Invariance Perspective","summary":" There is increasing evidence suggesting neural networks' sensitivity to\ndistribution shifts, so that research on out-of-distribution (OOD)\ngeneralization comes into the spotlight. Nonetheless, current endeavors mostly\nfocus on Euclidean data, and its formulation for graph-structured data is not\nclear and remains under-explored, given two-fold fundamental challenges: 1) the\ninter-connection among nodes in one graph, which induces non-IID generation of\ndata points even under the same environment, and 2) the structural information\nin the input graph, which is also informative for prediction. In this paper, we\nformulate the OOD problem on graphs and develop a new invariant learning\napproach, Explore-to-Extrapolate Risk Minimization (EERM), that facilitates\ngraph neural networks to leverage invariance principles for prediction. EERM\nresorts to multiple context explorers (specified as graph structure editers in\nour case) that are adversarially trained to maximize the variance of risks from\nmultiple virtual environments. Such a design enables the model to extrapolate\nfrom a single observed environment which is the common case for node-level\nprediction. 
We prove the validity of our method by theoretically showing its\nguarantee of a valid OOD solution and further demonstrate its power on various\nreal-world datasets for handling distribution shifts from artificial spurious\nfeatures, cross-domain transfers and dynamic graph evolution.\n","authors":["Qitian Wu","Hengrui Zhang","Junchi Yan","David Wipf"],"pdf_url":"https://arxiv.org/pdf/2202.02466v5.pdf","comment":"ICLR2022, 30 pages"},{"id":"http://arxiv.org/abs/2306.10759v5","updated":"2024-08-16T08:24:25Z","published":"2023-06-19T08:03:25Z","title":"SGFormer: Simplifying and Empowering Transformers for Large-Graph\n Representations","summary":" Learning representations on large-sized graphs is a long-standing challenge\ndue to the inter-dependence nature involved in massive data points.\nTransformers, as an emerging class of foundation encoders for graph-structured\ndata, have shown promising performance on small graphs due to its global\nattention capable of capturing all-pair influence beyond neighboring nodes.\nEven so, existing approaches tend to inherit the spirit of Transformers in\nlanguage and vision tasks, and embrace complicated models by stacking deep\nmulti-head attentions. In this paper, we critically demonstrate that even using\na one-layer attention can bring up surprisingly competitive performance across\nnode property prediction benchmarks where node numbers range from\nthousand-level to billion-level. This encourages us to rethink the design\nphilosophy for Transformers on large graphs, where the global attention is a\ncomputation overhead hindering the scalability. We frame the proposed scheme as\nSimplified Graph Transformers (SGFormer), which is empowered by a simple\nattention model that can efficiently propagate information among arbitrary\nnodes in one layer. SGFormer requires none of positional encodings,\nfeature/graph pre-processing or augmented loss. Empirically, SGFormer\nsuccessfully scales to the web-scale graph ogbn-papers100M and yields up to\n141x inference acceleration over SOTA Transformers on medium-sized graphs.\nBeyond current results, we believe the proposed methodology alone enlightens a\nnew technical path of independent interest for building Transformers on large\ngraphs.\n","authors":["Qitian Wu","Wentao Zhao","Chenxiao Yang","Hengrui Zhang","Fan Nie","Haitian Jiang","Yatao Bian","Junchi Yan"],"pdf_url":"https://arxiv.org/pdf/2306.10759v5.pdf","comment":"Accepted to NeurIPS 2023, the codes are available at\n https://github.com/qitianwu/SGFormer"},{"id":"http://arxiv.org/abs/2402.11494v2","updated":"2024-08-16T08:22:17Z","published":"2024-02-18T07:49:22Z","title":"Graph Out-of-Distribution Generalization via Causal Intervention","summary":" Out-of-distribution (OOD) generalization has gained increasing attentions for\nlearning on graphs, as graph neural networks (GNNs) often exhibit performance\ndegradation with distribution shifts. The challenge is that distribution shifts\non graphs involve intricate interconnections between nodes, and the environment\nlabels are often absent in data. In this paper, we adopt a bottom-up\ndata-generative perspective and reveal a key observation through causal\nanalysis: the crux of GNNs' failure in OOD generalization lies in the latent\nconfounding bias from the environment. The latter misguides the model to\nleverage environment-sensitive correlations between ego-graph features and\ntarget nodes' labels, resulting in undesirable generalization on new unseen\nnodes. 
Built upon this analysis, we introduce a conceptually simple yet\nprincipled approach for training robust GNNs under node-level distribution\nshifts, without prior knowledge of environment labels. Our method resorts to a\nnew learning objective derived from causal inference that coordinates an\nenvironment estimator and a mixture-of-expert GNN predictor. The new approach\ncan counteract the confounding bias in training data and facilitate learning\ngeneralizable predictive relations. Extensive experiment demonstrates that our\nmodel can effectively enhance generalization with various types of distribution\nshifts and yield up to 27.4\\% accuracy improvement over state-of-the-arts on\ngraph OOD generalization benchmarks. Source codes are available at\nhttps://github.com/fannie1208/CaNet.\n","authors":["Qitian Wu","Fan Nie","Chenxiao Yang","Tianyi Bao","Junchi Yan"],"pdf_url":"https://arxiv.org/pdf/2402.11494v2.pdf","comment":"Accepted by the research paper track of The Web Conference (WWW)\n 2024. The codes are available at https://github.com/fannie1208/CaNet"},{"id":"http://arxiv.org/abs/2407.06992v2","updated":"2024-08-16T08:18:19Z","published":"2024-07-09T16:07:01Z","title":"Robust Neural Information Retrieval: An Adversarial and\n Out-of-distribution Perspective","summary":" Recent advances in neural information retrieval (IR) models have\nsignificantly enhanced their effectiveness over various IR tasks. The\nrobustness of these models, essential for ensuring their reliability in\npractice, has also garnered significant attention. With a wide array of\nresearch on robust IR being proposed, we believe it is the opportune moment to\nconsolidate the current status, glean insights from existing methodologies, and\nlay the groundwork for future development. We view the robustness of IR to be a\nmultifaceted concept, emphasizing its necessity against adversarial attacks,\nout-of-distribution (OOD) scenarios and performance variance. With a focus on\nadversarial and OOD robustness, we dissect robustness solutions for dense\nretrieval models (DRMs) and neural ranking models (NRMs), respectively,\nrecognizing them as pivotal components of the neural IR pipeline. We provide an\nin-depth discussion of existing methods, datasets, and evaluation metrics,\nshedding light on challenges and future directions in the era of large language\nmodels. To the best of our knowledge, this is the first comprehensive survey on\nthe robustness of neural IR models, and we will also be giving our first\ntutorial presentation at SIGIR 2024\n\\url{https://sigir2024-robust-information-retrieval.github.io}. Along with the\norganization of existing work, we introduce a Benchmark for robust IR (BestIR),\na heterogeneous evaluation benchmark for robust neural information retrieval,\nwhich is publicly available at \\url{https://github.com/Davion-Liu/BestIR}. We\nhope that this study provides useful clues for future research on the\nrobustness of IR models and helps to develop trustworthy search engines\n\\url{https://github.com/Davion-Liu/Awesome-Robustness-in-Information-Retrieval}.\n","authors":["Yu-An Liu","Ruqing Zhang","Jiafeng Guo","Maarten de Rijke","Yixing Fan","Xueqi Cheng"],"pdf_url":"https://arxiv.org/pdf/2407.06992v2.pdf","comment":"Survey paper"},{"id":"http://arxiv.org/abs/2402.16998v2","updated":"2024-08-16T08:13:38Z","published":"2024-02-26T20:13:58Z","title":"What Do Language Models Hear? 
Probing for Auditory Representations in\n Language Models","summary":" This work explores whether language models encode meaningfully grounded\nrepresentations of sounds of objects. We learn a linear probe that retrieves\nthe correct text representation of an object given a snippet of audio related\nto that object, where the sound representation is given by a pretrained audio\nmodel. This probe is trained via a contrastive loss that pushes the language\nrepresentations and sound representations of an object to be close to one\nanother. After training, the probe is tested on its ability to generalize to\nobjects that were not seen during training. Across different language models\nand audio models, we find that the probe generalization is above chance in many\ncases, indicating that despite being trained only on raw text, language models\nencode grounded knowledge of sounds for some objects.\n","authors":["Jerry Ngo","Yoon Kim"],"pdf_url":"https://arxiv.org/pdf/2402.16998v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08593v1","updated":"2024-08-16T08:02:00Z","published":"2024-08-16T08:02:00Z","title":"RadioDiff: An Effective Generative Diffusion Model for Sampling-Free\n Dynamic Radio Map Construction","summary":" Radio map (RM) is a promising technology that can obtain pathloss based on\nonly location, which is significant for 6G network applications to reduce the\ncommunication costs for pathloss estimation. However, the traditional construction of RM\nis either computationally intensive or depends on costly\nsampling-based pathloss measurements. Although the neural network (NN)-based\nmethod can efficiently construct the RM without sampling, its performance is\nstill suboptimal. This is primarily due to the misalignment between the\ngenerative characteristics of the RM construction problem and the\ndiscrimination modeling exploited by existing NN-based methods. Thus, to\nenhance RM construction performance, in this paper, the sampling-free RM\nconstruction is modeled as a conditional generative problem, where a denoising\ndiffusion-based method, named RadioDiff, is proposed to achieve high-quality RM\nconstruction. In addition, to enhance the diffusion model's capability of\nextracting features from dynamic environments, an attention U-Net with an\nadaptive fast Fourier transform module is employed as the backbone network to\nimprove the extraction of dynamic environmental features. Meanwhile,\nthe decoupled diffusion model is utilized to further enhance the construction\nperformance of RMs. Moreover, a comprehensive theoretical analysis of why the\nRM construction is a generative problem is provided for the first time, from\nboth perspectives of data features and NN training methods. Experimental\nresults show that the proposed RadioDiff achieves state-of-the-art performance\nin all three metrics of accuracy, structural similarity, and peak\nsignal-to-noise ratio. 
The code is available at\nhttps://github.com/UNIC-Lab/RadioDiff.\n","authors":["Xiucheng Wang","Keda Tao","Nan Cheng","Zhisheng Yin","Zan Li","Yuan Zhang","Xuemin Shen"],"pdf_url":"https://arxiv.org/pdf/2408.08593v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08590v1","updated":"2024-08-16T07:47:39Z","published":"2024-08-16T07:47:39Z","title":"A Mechanistic Interpretation of Syllogistic Reasoning in Auto-Regressive\n Language Models","summary":" Recent studies on logical reasoning in auto-regressive Language Models (LMs)\nhave sparked a debate on whether such models can learn systematic reasoning\nprinciples during pre-training or merely exploit superficial patterns in the\ntraining data. This paper presents a mechanistic interpretation of syllogistic\nreasoning in LMs to further enhance our understanding of internal dynamics.\nSpecifically, we present a methodology for circuit discovery aimed at\ndisentangling content-independent reasoning mechanisms from world knowledge\nacquired during pre-training. Through two distinct intervention methods, we\nuncover a sufficient and necessary circuit involving middle-term suppression\nthat elucidates how LMs transfer information to derive valid conclusions from\npremises. Furthermore, we investigate how belief biases manifest in syllogistic\nreasoning, finding evidence of partial contamination from additional attention\nheads responsible for encoding commonsense and contextualized knowledge.\nFinally, we explore the generalization of the discovered mechanisms across\nvarious syllogistic schemes and model sizes, finding that the identified\ncircuit is sufficient and necessary for all the schemes on which the model\nachieves high downstream accuracy ($\\geq$ 60\\%). Overall, our findings suggest\nthat LMs indeed learn transferable content-independent reasoning mechanisms,\nbut that, at the same time, such mechanisms do not involve generalisable and\nabstract logical primitives, being susceptible to contamination by the same\nworld knowledge acquired during pre-training.\n","authors":["Geonhee Kim","Marco Valentino","André Freitas"],"pdf_url":"https://arxiv.org/pdf/2408.08590v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.06566v4","updated":"2024-08-16T07:43:55Z","published":"2024-06-03T07:44:32Z","title":"Natural Language Interaction with a Household Electricity\n Knowledge-based Digital Twin","summary":" Domain specific digital twins, representing a digital replica of various\nsegments of the smart grid, are foreseen as able to model, simulate, and\ncontrol the respective segments. At the same time, knowledge-based digital\ntwins, coupled with AI, may also empower humans to understand aspects of the\nsystem through natural language interaction in view of planning and policy\nmaking. This paper is the first to assess and report on the potential of\nRetrieval Augmented Generation (RAG) question answers related to household\nelectrical energy measurement aspects leveraging a knowledge-based energy\ndigital twin. Relying on the recently published electricity consumption\nknowledge graph that actually represents a knowledge-based digital twin, we\nstudy the capabilities of ChatGPT, Gemini and Llama in answering electricity\nrelated questions. Furthermore, we compare the answers with the ones generated\nthrough a RAG techniques that leverages an existing electricity knowledge-based\ndigital twin. 
Our findings illustrate that the RAG approach not only reduces\nthe incidence of incorrect information typically generated by LLMs but also\nsignificantly improves the quality of the output by grounding responses in\nverifiable data. This paper details our methodology, presents a comparative\nanalysis of responses with and without RAG, and discusses the implications of\nour findings for future applications of AI in specialized sectors like energy\ndata analysis.\n","authors":["Carolina Fortuna","Vid Hanžel","Blaž Bertalanič"],"pdf_url":"https://arxiv.org/pdf/2406.06566v4.pdf","comment":"Accepted at IEEE SmartGridComm'24"},{"id":"http://arxiv.org/abs/2408.08585v1","updated":"2024-08-16T07:39:38Z","published":"2024-08-16T07:39:38Z","title":"OptDist: Learning Optimal Distribution for Customer Lifetime Value\n Prediction","summary":" Customer Lifetime Value (CLTV) prediction is a critical task in business\napplications. Accurately predicting CLTV is challenging in real-world business\nscenarios, as the distribution of CLTV is complex and mutable. Firstly, there\nis a large number of users without any consumption, forming a long-tailed\npart that is too complex to fit. Secondly, the small set of high-value users\nspend orders of magnitude more than a typical user, leading to a wide range of\nthe CLTV distribution that is hard to capture in a single distribution.\nExisting approaches for CLTV estimation either assume a prior probability\ndistribution and fit a single group of distribution-related parameters for all\nsamples, or directly learn from the posterior distribution with manually\npredefined buckets in a heuristic manner. However, all these methods fail to\nhandle complex and mutable distributions. In this paper, we propose a novel\noptimal distribution selection model OptDist for CLTV prediction, which\nutilizes an adaptive optimal sub-distribution selection mechanism to improve\nthe accuracy of complex distribution modeling. Specifically, OptDist trains\nseveral candidate sub-distribution networks in the distribution learning module\n(DLM) for modeling the probability distribution of CLTV. Then, a distribution\nselection module (DSM) is proposed to select the sub-distribution for each\nsample, thus making the selection automatic and adaptive. Besides, we\ndesign an alignment mechanism that connects both modules, which effectively\nguides the optimization. We conduct extensive experiments on two public\ndatasets and one private dataset to verify that OptDist outperforms state-of-the-art\nbaselines. Furthermore, OptDist has been deployed on a large-scale financial\nplatform for customer acquisition marketing campaigns and the online\nexperiments also demonstrate the effectiveness of OptDist.\n","authors":["Yunpeng Weng","Xing Tang","Zhenhao Xu","Fuyuan Lyu","Dugang Liu","Zexu Sun","Xiuqiang He"],"pdf_url":"https://arxiv.org/pdf/2408.08585v1.pdf","comment":"CIKM 2024"},{"id":"http://arxiv.org/abs/2310.15290v3","updated":"2024-08-16T07:37:20Z","published":"2023-10-23T18:56:01Z","title":"Reliable Generation of Privacy-preserving Synthetic EHR Time Series via\n Diffusion Models","summary":" Electronic Health Records (EHRs) are rich sources of patient-level data,\noffering valuable resources for medical data analysis. However, privacy\nconcerns often restrict access to EHRs, hindering downstream analysis. Current\nEHR de-identification methods are flawed and can lead to potential privacy\nleakage. 
Additionally, existing publicly available EHR databases are limited,\npreventing the advancement of medical research using EHR. This study aims to\novercome these challenges by generating realistic and privacy-preserving\nsynthetic electronic health records (EHRs) time series efficiently. We\nintroduce a new method for generating diverse and realistic synthetic EHR time\nseries data using Denoising Diffusion Probabilistic Models (DDPM). We conducted\nexperiments on six databases: Medical Information Mart for Intensive Care III\nand IV (MIMIC-III/IV), the eICU Collaborative Research Database (eICU), and\nnon-EHR datasets on Stocks and Energy. We compared our proposed method with\neight existing methods. Our results demonstrate that our approach significantly\noutperforms all existing methods in terms of data fidelity while requiring less\ntraining effort. Additionally, data generated by our method yields a lower\ndiscriminative accuracy compared to other baseline methods, indicating the\nproposed method can generate data with less privacy risk. The proposed\ndiffusion-model-based method can reliably and efficiently generate synthetic\nEHR time series, which facilitates the downstream medical data analysis. Our\nnumerical results show the superiority of the proposed method over all other\nexisting methods.\n","authors":["Muhang Tian","Bernie Chen","Allan Guo","Shiyi Jiang","Anru R. Zhang"],"pdf_url":"https://arxiv.org/pdf/2310.15290v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08584v1","updated":"2024-08-16T07:37:05Z","published":"2024-08-16T07:37:05Z","title":"S-RAF: A Simulation-Based Robustness Assessment Framework for\n Responsible Autonomous Driving","summary":" As artificial intelligence (AI) technology advances, ensuring the robustness\nand safety of AI-driven systems has become paramount. However, varying\nperceptions of robustness among AI developers create misaligned evaluation\nmetrics, complicating the assessment and certification of safety-critical and\ncomplex AI systems such as autonomous driving (AD) agents. To address this\nchallenge, we introduce Simulation-Based Robustness Assessment Framework\n(S-RAF) for autonomous driving. S-RAF leverages the CARLA Driving simulator to\nrigorously assess AD agents across diverse conditions, including faulty\nsensors, environmental changes, and complex traffic situations. By quantifying\nrobustness and its relationship with other safety-critical factors, such as\ncarbon emissions, S-RAF aids developers and stakeholders in building safe and\nresponsible driving agents, and streamlining safety certification processes.\nFurthermore, S-RAF offers significant advantages, such as reduced testing\ncosts, and the ability to explore edge cases that may be unsafe to test in the\nreal world. The code for this framework is available here:\nhttps://github.com/cognitive-robots/rai-leaderboard\n","authors":["Daniel Omeiza","Pratik Somaiya","Jo-Ann Pattinson","Carolyn Ten-Holter","Jack Stilgoe","Marina Jirotka","Lars Kunze"],"pdf_url":"https://arxiv.org/pdf/2408.08584v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08583v1","updated":"2024-08-16T07:33:58Z","published":"2024-08-16T07:33:58Z","title":"GrassNet: State Space Model Meets Graph Neural Network","summary":" Designing spectral convolutional networks is a formidable task in graph\nlearning. In traditional spectral graph neural networks (GNNs),\npolynomial-based methods are commonly used to design filters via the Laplacian\nmatrix. 
In practical applications, however, these polynomial methods encounter\ninherent limitations, which primarily arise from the low-order truncation\nof polynomial filters and the lack of overall modeling of the graph spectrum.\nThis leads to poor performance of existing spectral approaches on real-world\ngraph data, especially when the spectrum is highly concentrated or contains\nmany numerically identical values, as they tend to apply the exact same\nmodulation to signals with the same frequencies. To overcome these issues, in\nthis paper, we propose Graph State Space Network (GrassNet), a novel graph\nneural network with theoretical support that provides a simple yet effective\nscheme for designing and learning arbitrary graph spectral filters. In\nparticular, our GrassNet introduces structured state space models (SSMs) to\nmodel the correlations of graph signals at different frequencies and derives a\nunique rectification for each frequency in the graph spectrum. To the best of\nour knowledge, our work is the first to employ SSMs for the design of GNN\nspectral filters, and it theoretically offers greater expressive power compared\nwith polynomial filters. Extensive experiments on nine public benchmarks reveal\nthat GrassNet achieves superior performance in real-world graph modeling tasks.\n","authors":["Gongpei Zhao","Tao Wang","Yi Jin","Congyan Lang","Yidong Li","Haibin Ling"],"pdf_url":"https://arxiv.org/pdf/2408.08583v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.14573v3","updated":"2024-08-16T07:30:29Z","published":"2024-07-21T06:27:45Z","title":"Trading Devil Final: Backdoor attack via Stock market and Bayesian\n Optimization","summary":" Since the advent of generative artificial intelligence, every company and\nresearcher has been rushing to develop their own generative models, whether\ncommercial or not. Given the large number of users of these powerful new tools,\nthere is currently no intrinsically verifiable way to explain from the ground\nup what happens when LLMs (large language models) learn. For example, models\nbased on automatic speech recognition systems have to rely on huge and\nastronomical amounts of data collected from all over the web to produce fast\nand efficient results. In this article, we develop a backdoor attack called\nMarketBackFinal 2.0, based on acoustic data poisoning; MarketBackFinal 2.0 is\nmainly based on modern stock market models. Our aim is to show the possible\nvulnerabilities of speech-based transformers that may rely on LLMs.\n","authors":["Orson Mengara"],"pdf_url":"https://arxiv.org/pdf/2407.14573v3.pdf","comment":"END (will never be modified again) :Jumps-Diffusion and stock market:\n Better quantify uncertainty in financial simulations"},{"id":"http://arxiv.org/abs/2408.08185v2","updated":"2024-08-16T07:13:38Z","published":"2024-08-15T14:42:28Z","title":"Data-driven identification of latent port-Hamiltonian systems","summary":" Conventional physics-based modeling techniques involve high effort, e.g.,\ntime and expert knowledge, while data-driven methods often lack\ninterpretability, structure, and sometimes reliability. To mitigate this, we\npresent a data-driven system identification framework that derives models in\nthe port-Hamiltonian (pH) formulation. This formulation is suitable for\nmulti-physical systems while guaranteeing the useful system theoretical\nproperties of passivity and stability. 
Our framework combines linear and\nnonlinear reduction with structured, physics-motivated system identification.\nIn this process, high-dimensional state data obtained from possibly nonlinear\nsystems serves as input for an autoencoder, which then performs two tasks: (i)\nnonlinearly transforming and (ii) reducing this data onto a low-dimensional\nlatent space. In this space, a linear pH system, that satisfies the pH\nproperties per construction, is parameterized by the weights of a neural\nnetwork. The mathematical requirements are met by defining the pH matrices\nthrough Cholesky factorizations. The neural networks that define the coordinate\ntransformation and the pH system are identified in a joint optimization process\nto match the dynamics observed in the data while defining a linear pH system in\nthe latent space. The learned, low-dimensional pH system can describe even\nnonlinear systems and is rapidly computable due to its small size. The method\nis exemplified by a parametric mass-spring-damper and a nonlinear pendulum\nexample, as well as the high-dimensional model of a disc brake with linear\nthermoelastic behavior.\n","authors":["Johannes Rettberg","Jonas Kneifl","Julius Herb","Patrick Buchfink","Jörg Fehr","Bernard Haasdonk"],"pdf_url":"https://arxiv.org/pdf/2408.08185v2.pdf","comment":"33 pages, 8 figures"},{"id":"http://arxiv.org/abs/2404.00204v2","updated":"2024-08-16T07:08:59Z","published":"2024-03-30T00:46:43Z","title":"AirPilot: A PPO-based DRL Auto-Tuned Nonlinear PID Drone Controller for\n Robust Autonomous Flights","summary":" Navigation precision, speed and stability are crucial for safe UAV flight\nmaneuvers and effective flight mission executions in dynamic environments.\nDifferent flight missions may have varying objectives, such as minimizing\nenergy consumption, achieving precise positioning, or maximizing speed. A\ncontroller that can adapt to different objectives on the fly is highly\nvaluable. Proportional Integral Derivative controllers are one of the most\npopular and widely used control algorithms for drones control systems, but\ntheir linear control algorithm fails to capture the nonlinear nature of the\ndynamic wind conditions and complex drone system. Manually tuning the PID gains\nfor various missions can be time-consuming and requires significant expertise.\nThis paper aims to revolutionize drone flight control by presenting the\nAirPilot, a nonlinear Deep Reinforcement Learning (DRL) - enhanced PID drone\ncontroller using Proximal Policy Optimization. AirPilot controller combines the\nsimplicity and effectiveness of traditional PID control with the adaptability,\nlearning capability, and optimization potential of DRL. This makes it better\nsuited for modern drone applications where the environment is dynamic, and\nmission-specific performance demands are high. We employed a COEX Clover\nautonomous drone for training the DRL agent within the Gazebo simulator and\nsubsequently implemented it in a real-world lab setting, which marks a\nsignificant milestone as one of the first attempts to apply a DRL-based flight\ncontroller on an actual drone. 
Airpilot is capable of reducing the navigation\nerror by more than 82% and improving overshoot, speed and settling time\nsignificantly.\n","authors":["Junyang Zhang","Cristian Emanuel Ocampo Rivera","Kyle Tyni","Steven Nguyen","Ulices Santa Cruz Leal","Yasser Shoukry"],"pdf_url":"https://arxiv.org/pdf/2404.00204v2.pdf","comment":"14 pages, 17 figures"},{"id":"http://arxiv.org/abs/2408.08567v1","updated":"2024-08-16T07:01:46Z","published":"2024-08-16T07:01:46Z","title":"S$^3$Attention: Improving Long Sequence Attention with Smoothed Skeleton\n Sketching","summary":" Attention based models have achieved many remarkable breakthroughs in\nnumerous applications. However, the quadratic complexity of Attention makes the\nvanilla Attention based models hard to apply to long sequence tasks. Various\nimproved Attention structures are proposed to reduce the computation cost by\ninducing low rankness and approximating the whole sequence by sub-sequences.\nThe most challenging part of those approaches is maintaining the proper balance\nbetween information preservation and computation reduction: the longer\nsub-sequences used, the better information is preserved, but at the price of\nintroducing more noise and computational costs. In this paper, we propose a\nsmoothed skeleton sketching based Attention structure, coined S$^3$Attention,\nwhich significantly improves upon the previous attempts to negotiate this\ntrade-off. S$^3$Attention has two mechanisms to effectively minimize the impact\nof noise while keeping the linear complexity to the sequence length: a\nsmoothing block to mix information over long sequences and a matrix sketching\nmethod that simultaneously selects columns and rows from the input matrix. We\nverify the effectiveness of S$^3$Attention both theoretically and empirically.\nExtensive studies over Long Range Arena (LRA) datasets and six time-series\nforecasting show that S$^3$Attention significantly outperforms both vanilla\nAttention and other state-of-the-art variants of Attention structures.\n","authors":["Xue Wang","Tian Zhou","Jianqing Zhu","Jialin Liu","Kun Yuan","Tao Yao","Wotao Yin","Rong Jin","HanQin Cai"],"pdf_url":"https://arxiv.org/pdf/2408.08567v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08560v1","updated":"2024-08-16T06:52:06Z","published":"2024-08-16T06:52:06Z","title":"A training regime to learn unified representations from complementary\n breast imaging modalities","summary":" Full Field Digital Mammograms (FFDMs) and Digital Breast Tomosynthesis (DBT)\nare the two most widely used imaging modalities for breast cancer screening.\nAlthough DBT has increased cancer detection compared to FFDM, its widespread\nadoption in clinical practice has been slowed by increased interpretation times\nand a perceived decrease in the conspicuity of specific lesion types.\nSpecifically, the non-inferiority of DBT for microcalcifications remains under\ndebate. Due to concerns about the decrease in visual acuity, combined DBT-FFDM\nacquisitions remain popular, leading to overall increased exam times and\nradiation dosage. Enabling DBT to provide diagnostic information present in\nboth FFDM and DBT would reduce reliance on FFDM, resulting in a reduction in\nboth quantities. We propose a machine learning methodology that learns\nhigh-level representations leveraging the complementary diagnostic signal from\nboth DBT and FFDM. 
Experiments on a large-scale data set validate our claims\nand show that our representations enable more accurate breast lesion detection\nthan any DBT- or FFDM-based model.\n","authors":["Umang Sharma","Jungkyu Park","Laura Heacock","Sumit Chopra","Krzysztof Geras"],"pdf_url":"https://arxiv.org/pdf/2408.08560v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08558v1","updated":"2024-08-16T06:43:58Z","published":"2024-08-16T06:43:58Z","title":"Linear combinations of latents in diffusion models: interpolation and\n beyond","summary":" Generative models are crucial for applications like data synthesis and\naugmentation. Diffusion, Flow Matching and Continuous Normalizing Flows have\nshown effectiveness across various modalities, and rely on Gaussian latent\nvariables for generation. As any generated object is directly associated with a\nparticular latent variable, we can manipulate the variables to exert control\nover the generation process. However, standard approaches for combining latent\nvariables, such as spherical interpolation, only apply or work well in special\ncases. Moreover, current methods for obtaining low-dimensional representations\nof the data, important for e.g. surrogate models for search and creative\napplications, are network and data modality specific. In this work we show that\nthe standard methods to combine variables do not yield intermediates following\nthe distribution the models are trained to expect. We propose Combination of\nGaussian variables (COG), a novel interpolation method that addresses this, is\neasy to implement yet matches or improves upon current methods. COG addresses\nlinear combinations in general and, as we demonstrate, also supports other\noperations including e.g. defining subspaces of the latent space, simplifying\nthe creation of expressive low-dimensional spaces of high-dimensional objects\nusing generative models based on Gaussian latents.\n","authors":["Erik Bodin","Henry Moss","Carl Henrik Ek"],"pdf_url":"https://arxiv.org/pdf/2408.08558v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08554v1","updated":"2024-08-16T06:39:08Z","published":"2024-08-16T06:39:08Z","title":"ABQ-LLM: Arbitrary-Bit Quantized Inference Acceleration for Large\n Language Models","summary":" Large Language Models (LLMs) have revolutionized natural language processing\ntasks. However, their practical application is constrained by substantial\nmemory and computational demands. Post-training quantization (PTQ) is\nconsidered an effective method to accelerate LLM inference. Despite its growing\npopularity in LLM model compression, PTQ deployment faces two major challenges.\nFirst, low-bit quantization leads to performance degradation. Second,\nrestricted by the limited integer computing unit type on GPUs, quantized matrix\noperations with different precisions cannot be effectively accelerated. To\naddress these issues, we introduce a novel arbitrary-bit quantization algorithm\nand inference framework, ABQ-LLM. It achieves superior performance across\nvarious quantization settings and enables efficient arbitrary-precision\nquantized inference on the GPU. ABQ-LLM introduces several key innovations: (1)\na distribution correction method for transformer blocks to mitigate\ndistribution differences caused by full quantization of weights and\nactivations, improving performance at low bit-widths. (2) the bit balance\nstrategy to counteract performance degradation from asymmetric distribution\nissues at very low bit-widths (e.g., 2-bit). 
(3) an innovative quantization\nacceleration framework that reconstructs the quantization matrix multiplication\nof arbitrary precision combinations based on BTC (Binary TensorCore)\nequivalents, gets rid of the limitations of INT4/INT8 computing units. ABQ-LLM\ncan convert each component bit width gain into actual acceleration gain,\nmaximizing performance under mixed precision(e.g., W6A6, W2A8). Based on W2*A8\nquantization configuration on LLaMA-7B model, it achieved a WikiText2\nperplexity of 7.59 (2.17$\\downarrow $ vs 9.76 in AffineQuant). Compared to\nSmoothQuant, we realized 1.6$\\times$ acceleration improvement and 2.7$\\times$\nmemory compression gain.\n","authors":["Chao Zeng","Songwei Liu","Yusheng Xie","Hong Liu","Xiaojian Wang","Miao Wei","Shu Yang","Fangmin Chen","Xing Mei"],"pdf_url":"https://arxiv.org/pdf/2408.08554v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.05319v5","updated":"2024-08-16T06:38:32Z","published":"2023-02-10T15:28:55Z","title":"Large Language Models for Code: Security Hardening and Adversarial\n Testing","summary":" Large language models (large LMs) are increasingly trained on massive\ncodebases and used to generate code. However, LMs lack awareness of security\nand are found to frequently produce unsafe code. This work studies the security\nof LMs along two important axes: (i) security hardening, which aims to enhance\nLMs' reliability in generating secure code, and (ii) adversarial testing, which\nseeks to evaluate LMs' security at an adversarial standpoint. We address both\nof these by formulating a new security task called controlled code generation.\nThe task is parametric and takes as input a binary property to guide the LM to\ngenerate secure or unsafe code, while preserving the LM's capability of\ngenerating functionally correct code. We propose a novel learning-based\napproach called SVEN to solve this task. SVEN leverages property-specific\ncontinuous vectors to guide program generation towards the given property,\nwithout modifying the LM's weights. Our training procedure optimizes these\ncontinuous vectors by enforcing specialized loss terms on different regions of\ncode, using a high-quality dataset carefully curated by us. Our extensive\nevaluation shows that SVEN is highly effective in achieving strong security\ncontrol. For instance, a state-of-the-art CodeGen LM with 2.7B parameters\ngenerates secure code for 59.1% of the time. When we employ SVEN to perform\nsecurity hardening (or adversarial testing) on this LM, the ratio is\nsignificantly boosted to 92.3% (or degraded to 36.8%). Importantly, SVEN\nclosely matches the original LMs in functional correctness.\n","authors":["Jingxuan He","Martin Vechev"],"pdf_url":"https://arxiv.org/pdf/2302.05319v5.pdf","comment":"Accepted to ACM CCS 2023"},{"id":"http://arxiv.org/abs/2401.13913v2","updated":"2024-08-16T06:00:05Z","published":"2024-01-25T03:17:03Z","title":"Spectral Clustering for Discrete Distributions","summary":" The discrete distribution is often used to describe complex instances in\nmachine learning, such as images, sequences, and documents. Traditionally,\nclustering of discrete distributions (D2C) has been approached using\nWasserstein barycenter methods. These methods operate under the assumption that\nclusters can be well-represented by barycenters, which is seldom true in many\nreal-world applications. Additionally, these methods are not scalable for large\ndatasets due to the high computational cost of calculating Wasserstein\nbarycenters. 
In this work, we explore the feasibility of using spectral\nclustering combined with distribution affinity measures (e.g., maximum mean\ndiscrepancy and Wasserstein distance) to cluster discrete distributions. We\ndemonstrate that these methods can be more accurate and efficient than\nbarycenter methods. To further enhance scalability, we propose using linear\noptimal transport to construct affinity matrices efficiently for large\ndatasets. We provide theoretical guarantees for the success of our methods in\nclustering distributions. Experiments on both synthetic and real data show that\nour methods outperform existing baselines.\n","authors":["Zixiao Wang","Dong Qiao","Jicong Fan"],"pdf_url":"https://arxiv.org/pdf/2401.13913v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08541v1","updated":"2024-08-16T05:56:10Z","published":"2024-08-16T05:56:10Z","title":"Where is the signal in tokenization space?","summary":" Large Language Models (LLMs) are typically shipped with tokenizers that\ndeterministically encode text into so-called canonical token sequences, to\nwhich the LLMs assign probability values. One common assumption is that the\nprobability of a piece of text is the probability of its canonical token\nsequence. However, the tokenization of a string is not unique: e.g., the Llama2\ntokenizer encodes Tokens as [Tok,ens], but [Tok,en,s] also represents the same\ntext. In this paper, we study non-canonical tokenizations. We prove that, given\na string, it is computationally hard to find the most likely tokenization for\nan autoregressive LLM, as well as to compute the marginal probability over all\npossible tokenizations. We then show how the marginal is, in most cases,\nindistinguishable from the canonical probability. Surprisingly, we then\nempirically demonstrate the existence of a significant amount of signal hidden\nwithin tokenization space. Notably, by simply aggregating the probabilities of\nnon-canonical tokenizations, we achieve improvements across a range of LLM\nevaluation benchmarks for a variety of architectures, including transformers\nand state space models.\n","authors":["Renato Lui Geh","Honghua Zhang","Kareem Ahmed","Benjie Wang","Guy Van den Broeck"],"pdf_url":"https://arxiv.org/pdf/2408.08541v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08536v1","updated":"2024-08-16T05:34:50Z","published":"2024-08-16T05:34:50Z","title":"Blockchain-Enabled Accountability in Data Supply Chain: A Data Bill of\n Materials Approach","summary":" In the era of advanced artificial intelligence, highlighted by large-scale\ngenerative models like GPT-4, ensuring the traceability, verifiability, and\nreproducibility of datasets throughout their lifecycle is paramount for\nresearch institutions and technology companies. These organisations\nincreasingly rely on vast corpora to train and fine-tune advanced AI models,\nresulting in intricate data supply chains that demand effective data governance\nmechanisms. In addition, the challenge intensifies as diverse stakeholders may\nuse assorted tools, often without adequate measures to ensure the\naccountability of data and the reliability of outcomes. In this study, we adapt\nthe concept of ``Software Bill of Materials\" into the field of data governance\nand management to address the above challenges, and introduce ``Data Bill of\nMaterials\" (DataBOM) to capture the dependency relationship between different\ndatasets and stakeholders by storing specific metadata. 
We demonstrate a\nplatform architecture for providing blockchain-based DataBOM services, present\nthe interaction protocol for stakeholders, and discuss the minimal requirements\nfor DataBOM metadata. The proposed solution is evaluated in terms of\nfeasibility and performance via case study and quantitative analysis\nrespectively.\n","authors":["Yue Liu","Dawen Zhang","Boming Xia","Julia Anticev","Tunde Adebayo","Zhenchang Xing","Moses Machao"],"pdf_url":"https://arxiv.org/pdf/2408.08536v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08533v1","updated":"2024-08-16T05:11:52Z","published":"2024-08-16T05:11:52Z","title":"Unsupervised Transfer Learning via Adversarial Contrastive Training","summary":" Learning a data representation for downstream supervised learning tasks under\nunlabeled scenario is both critical and challenging. In this paper, we propose\na novel unsupervised transfer learning approach using adversarial contrastive\ntraining (ACT). Our experimental results demonstrate outstanding classification\naccuracy with both fine-tuned linear probe and K-NN protocol across various\ndatasets, showing competitiveness with existing state-of-the-art\nself-supervised learning methods. Moreover, we provide an end-to-end\ntheoretical guarantee for downstream classification tasks in a misspecified,\nover-parameterized setting, highlighting how a large amount of unlabeled data\ncontributes to prediction accuracy. Our theoretical findings suggest that the\ntesting error of downstream tasks depends solely on the efficiency of data\naugmentation used in ACT when the unlabeled sample size is sufficiently large.\nThis offers a theoretical understanding of learning downstream tasks with a\nsmall sample size.\n","authors":["Chenguang Duan","Yuling Jiao","Huazhen Lin","Wensen Ma","Jerry Zhijian Yang"],"pdf_url":"https://arxiv.org/pdf/2408.08533v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08531v1","updated":"2024-08-16T04:57:54Z","published":"2024-08-16T04:57:54Z","title":"Detecting Unsuccessful Students in Cybersecurity Exercises in Two\n Different Learning Environments","summary":" This full paper in the research track evaluates the usage of data logged from\ncybersecurity exercises in order to predict students who are potentially at\nrisk of performing poorly. Hands-on exercises are essential for learning since\nthey enable students to practice their skills. In cybersecurity, hands-on\nexercises are often complex and require knowledge of many topics. Therefore,\nstudents may miss solutions due to gaps in their knowledge and become\nfrustrated, which impedes their learning. Targeted aid by the instructor helps,\nbut since the instructor's time is limited, efficient ways to detect struggling\nstudents are needed. This paper develops automated tools to predict when a\nstudent is having difficulty. We formed a dataset with the actions of 313\nstudents from two countries and two learning environments: KYPO CRP and\nEDURange. These data are used in machine learning algorithms to predict the\nsuccess of students in exercises deployed in these environments. After\nextracting features from the data, we trained and cross-validated eight\nclassifiers for predicting the exercise outcome and evaluated their predictive\npower. The contribution of this paper is comparing two approaches to feature\nengineering, modeling, and classification performance on data from two learning\nenvironments. 
Using the features from either learning environment, we were able\nto detect and distinguish between successful and struggling students. A\ndecision tree classifier achieved the highest balanced accuracy and sensitivity\nwith data from both learning environments. The results show that activity data\nfrom cybersecurity exercises are suitable for predicting student success. In a\npotential application, such models can aid instructors in detecting struggling\nstudents and providing targeted help. We publish data and code for building\nthese models so that others can adopt or adapt them.\n","authors":["Valdemar Švábenský","Kristián Tkáčik","Aubrey Birdwell","Richard Weiss","Ryan S. Baker","Pavel Čeleda","Jan Vykopal","Jens Mache","Ankur Chattopadhyay"],"pdf_url":"https://arxiv.org/pdf/2408.08531v1.pdf","comment":"To appear for publication in the FIE 2024 conference proceedings"},{"id":"http://arxiv.org/abs/2408.08526v1","updated":"2024-08-16T04:54:09Z","published":"2024-08-16T04:54:09Z","title":"Inverse design with conditional cascaded diffusion models","summary":" Adjoint-based design optimizations are usually computationally expensive and\nthose costs scale with resolution. To address this, researchers have proposed\nmachine learning approaches for inverse design that can predict\nhigher-resolution solutions from lower cost/resolution ones. Due to the recent\nsuccess of diffusion models over traditional generative models, we extend the\nuse of diffusion models for multi-resolution tasks by proposing the conditional\ncascaded diffusion model (cCDM). Compared to GANs, cCDM is more stable to\ntrain, and each diffusion model within the cCDM can be trained independently,\nthus each model's parameters can be tuned separately to maximize the\nperformance of the pipeline. Our study compares cCDM against a cGAN model with\ntransfer learning.\n Our results demonstrate that the cCDM excels in capturing finer details,\npreserving volume fraction constraints, and minimizing compliance errors in\nmulti-resolution tasks when a sufficient amount of high-resolution training\ndata (more than 102 designs) is available. Furthermore, we explore the impact\nof training data size on the performance of both models. While both models show\ndecreased performance with reduced high-resolution training data, the cCDM\nloses its superiority to the cGAN model with transfer learning when training\ndata is limited (less than 102), and we show the break-even point for this\ntransition. Also, we highlight that while the diffusion model may achieve\nbetter pixel-wise performance in both low-resolution and high-resolution\nscenarios, this does not necessarily guarantee that the model produces optimal\ncompliance error or constraint satisfaction.\n","authors":["Milad Habibi","Mark Fuge"],"pdf_url":"https://arxiv.org/pdf/2408.08526v1.pdf","comment":"Accepted for presentation at IDETC/CIE 2024 conference, Washington,\n DC. 11 pages, 9 figures"},{"id":"http://arxiv.org/abs/2402.11124v4","updated":"2024-08-16T03:43:17Z","published":"2024-02-16T23:17:00Z","title":"Implicit Causal Representation Learning via Switchable Mechanisms","summary":" Learning causal representations from observational and interventional data in\nthe absence of known ground-truth graph structures necessitates implicit latent\ncausal representation learning. Implicit learning of causal mechanisms\ntypically involves two categories of interventional data: hard and soft\ninterventions. 
In real-world scenarios, soft interventions are often more\nrealistic than hard interventions, as the latter require fully controlled\nenvironments. Unlike hard interventions, which directly force changes in a\ncausal variable, soft interventions exert influence indirectly by affecting the\ncausal mechanism. However, the subtlety of soft interventions impose several\nchallenges for learning causal models. One challenge is that soft\nintervention's effects are ambiguous, since parental relations remain intact.\nIn this paper, we tackle the challenges of learning causal models using soft\ninterventions while retaining implicit modelling. We propose ICLR-SM, which\nmodels the effects of soft interventions by employing a causal mechanism switch\nvariable designed to toggle between different causal mechanisms. In our\nexperiments, we consistently observe improved learning of identifiable, causal\nrepresentations, compared to baseline approaches.\n","authors":["Shayan Shirahmad Gale Bagi","Zahra Gharaee","Oliver Schulte","Mark Crowley"],"pdf_url":"https://arxiv.org/pdf/2402.11124v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.18753v2","updated":"2024-08-16T03:29:18Z","published":"2024-05-29T04:37:19Z","title":"Confronting the Reproducibility Crisis: A Case Study of Challenges in\n Cybersecurity AI","summary":" In the rapidly evolving field of cybersecurity, ensuring the reproducibility\nof AI-driven research is critical to maintaining the reliability and integrity\nof security systems. This paper addresses the reproducibility crisis within the\ndomain of adversarial robustness -- a key area in AI-based cybersecurity that\nfocuses on defending deep neural networks against malicious perturbations.\nThrough a detailed case study, we attempt to validate results from prior work\non certified robustness using the VeriGauge toolkit, revealing significant\nchallenges due to software and hardware incompatibilities, version conflicts,\nand obsolescence. Our findings underscore the urgent need for standardized\nmethodologies, containerization, and comprehensive documentation to ensure the\nreproducibility of AI models deployed in critical cybersecurity applications.\nBy tackling these reproducibility challenges, we aim to contribute to the\nbroader discourse on securing AI systems against advanced persistent threats,\nenhancing network and IoT security, and protecting critical infrastructure.\nThis work advocates for a concerted effort within the research community to\nprioritize reproducibility, thereby strengthening the foundation upon which\nfuture cybersecurity advancements are built.\n","authors":["Richard H. Moulton","Gary A. McCully","John D. Hastings"],"pdf_url":"https://arxiv.org/pdf/2405.18753v2.pdf","comment":"8 pages, 0 figures, 2 tables, updated to incorporate feedback and\n improvements"},{"id":"http://arxiv.org/abs/2408.08508v1","updated":"2024-08-16T03:22:18Z","published":"2024-08-16T03:22:18Z","title":"Mitigating Degree Bias in Signed Graph Neural Networks","summary":" Like Graph Neural Networks (GNNs), Signed Graph Neural Networks (SGNNs) are\nalso up against fairness issues from source data and typical aggregation\nmethod. In this paper, we are pioneering to make the investigation of fairness\nin SGNNs expanded from GNNs. We identify the issue of degree bias within signed\ngraphs, offering a new perspective on the fairness issues related to SGNNs. 
To\nhandle the confronted bias issue, inspired by previous work on degree bias, a\nnew Model-Agnostic method is consequently proposed to enhance representation of\nnodes with different degrees, which named as Degree Debiased Signed Graph\nNeural Network (DD-SGNN) . More specifically, in each layer, we make a transfer\nfrom nodes with high degree to nodes with low degree inside a head-to-tail\ntriplet, which to supplement the underlying domain missing structure of the\ntail nodes and meanwhile maintain the positive and negative semantics specified\nby balance theory in signed graphs. We make extensive experiments on four\nreal-world datasets. The result verifies the validity of the model, that is,\nour model mitigates the degree bias issue without compromising\nperformance($\\textit{i.e.}$, AUC, F1). The code is provided in supplementary\nmaterial.\n","authors":["Fang He","Jinhai Deng","Ruizhan Xue","Maojun Wang","Zeyu Zhang"],"pdf_url":"https://arxiv.org/pdf/2408.08508v1.pdf","comment":"10 pages, 7 figures, The 39th Annual AAAI Conference on Artificial\n Intelligence"},{"id":"http://arxiv.org/abs/2408.02223v2","updated":"2024-08-16T03:18:12Z","published":"2024-08-05T03:54:52Z","title":"Large Language Model Aided QoS Prediction for Service Recommendation","summary":" Large language models (LLMs) have seen rapid improvement in the recent years,\nand have been used in a wider range of applications. After being trained on\nlarge text corpus, LLMs obtain the capability of extracting rich features from\ntextual data. Such capability is potentially useful for the web service\nrecommendation task, where the web users and services have intrinsic attributes\nthat can be described using natural language sentences and are useful for\nrecommendation. In this paper, we explore the possibility and practicality of\nusing LLMs for web service recommendation. We propose the large language model\naided QoS prediction (llmQoS) model, which use LLMs to extract useful\ninformation from attributes of web users and services via descriptive\nsentences. This information is then used in combination with the QoS values of\nhistorical interactions of users and services, to predict QoS values for any\ngiven user-service pair. On the WSDream dataset, llmQoS is shown to overcome\nthe data sparsity issue inherent to the QoS prediction problem, and outperforms\ncomparable baseline models consistently.\n","authors":["Huiying Liu","Zekun Zhang","Honghao Li","Qilin Wu","Yiwen Zhang"],"pdf_url":"https://arxiv.org/pdf/2408.02223v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.15734v2","updated":"2024-08-16T02:57:35Z","published":"2024-03-23T06:01:45Z","title":"Space Group Informed Transformer for Crystalline Materials Generation","summary":" We introduce CrystalFormer, a transformer-based autoregressive model\nspecifically designed for space group-controlled generation of crystalline\nmaterials. The incorporation of space group symmetry significantly simplifies\nthe crystal space, which is crucial for data and compute efficient generative\nmodeling of crystalline materials. Leveraging the prominent discrete and\nsequential nature of the Wyckoff positions, CrystalFormer learns to generate\ncrystals by directly predicting the species and locations of\nsymmetry-inequivalent atoms in the unit cell. 
We demonstrate the advantages of\nCrystalFormer in standard tasks such as symmetric structure initialization and\nelement substitution compared to conventional methods implemented in popular\ncrystal structure prediction software. Moreover, we showcase the application of\nCrystalFormer of property-guided materials design in a plug-and-play manner.\nOur analysis shows that CrystalFormer ingests sensible solid-state chemistry\nknowledge and heuristics by compressing the material dataset, thus enabling\nsystematic exploration of crystalline materials. The simplicity, generality,\nand flexibility of CrystalFormer position it as a promising architecture to be\nthe foundational model of the entire crystalline materials space, heralding a\nnew era in materials modeling and discovery.\n","authors":["Zhendong Cao","Xiaoshan Luo","Jian Lv","Lei Wang"],"pdf_url":"https://arxiv.org/pdf/2403.15734v2.pdf","comment":"26 pages, 11 figures"},{"id":"http://arxiv.org/abs/2408.08499v1","updated":"2024-08-16T02:53:01Z","published":"2024-08-16T02:53:01Z","title":"The Limitations of Model Retraining in the Face of Performativity","summary":" We study stochastic optimization in the context of performative shifts, where\nthe data distribution changes in response to the deployed model. We demonstrate\nthat naive retraining can be provably suboptimal even for simple distribution\nshifts. The issue worsens when models are retrained given a finite number of\nsamples at each retraining step. We show that adding regularization to\nretraining corrects both of these issues, attaining provably optimal models in\nthe face of distribution shifts. Our work advocates rethinking how machine\nlearning models are retrained in the presence of performative effects.\n","authors":["Anmol Kabra","Kumar Kshitij Patel"],"pdf_url":"https://arxiv.org/pdf/2408.08499v1.pdf","comment":"Accepted to 2024 ICML Workshop on Humans, Algorithmic Decision-Making\n and Society"},{"id":"http://arxiv.org/abs/2405.17938v2","updated":"2024-08-16T02:43:59Z","published":"2024-05-28T08:02:42Z","title":"RC-Mixup: A Data Augmentation Strategy against Noisy Data for Regression\n Tasks","summary":" We study the problem of robust data augmentation for regression tasks in the\npresence of noisy data. Data augmentation is essential for generalizing deep\nlearning models, but most of the techniques like the popular Mixup are\nprimarily designed for classification tasks on image data. Recently, there are\nalso Mixup techniques that are specialized to regression tasks like C-Mixup. In\ncomparison to Mixup, which takes linear interpolations of pairs of samples,\nC-Mixup is more selective in which samples to mix based on their label\ndistances for better regression performance. However, C-Mixup does not\ndistinguish noisy versus clean samples, which can be problematic when mixing\nand lead to suboptimal model performance. At the same time, robust training has\nbeen heavily studied where the goal is to train accurate models against noisy\ndata through multiple rounds of model training. We thus propose our data\naugmentation strategy RC-Mixup, which tightly integrates C-Mixup with\nmulti-round robust training methods for a synergistic effect. In particular,\nC-Mixup improves robust training in identifying clean data, while robust\ntraining provides cleaner data to C-Mixup for it to perform better. 
A key\nadvantage of RC-Mixup is that it is data-centric where the robust model\ntraining algorithm itself does not need to be modified, but can simply benefit\nfrom data mixing. We show in our experiments that RC-Mixup significantly\noutperforms C-Mixup and robust training baselines on noisy data benchmarks and\ncan be integrated with various robust training methods.\n","authors":["Seong-Hyeon Hwang","Minsu Kim","Steven Euijong Whang"],"pdf_url":"https://arxiv.org/pdf/2405.17938v2.pdf","comment":"Accepted to KDD 2024"},{"id":"http://arxiv.org/abs/2408.08494v1","updated":"2024-08-16T02:33:07Z","published":"2024-08-16T02:33:07Z","title":"Optimal Sketching for Residual Error Estimation for Matrix and Vector\n Norms","summary":" We study the problem of residual error estimation for matrix and vector norms\nusing a linear sketch. Such estimates can be used, for example, to quickly\nassess how useful a more expensive low-rank approximation computation will be.\nThe matrix case concerns the Frobenius norm and the task is to approximate the\n$k$-residual $\\|A - A_k\\|_F$ of the input matrix $A$ within a\n$(1+\\epsilon)$-factor, where $A_k$ is the optimal rank-$k$ approximation. We\nprovide a tight bound of $\\Theta(k^2/\\epsilon^4)$ on the size of bilinear\nsketches, which have the form of a matrix product $SAT$. This improves the\nprevious $O(k^2/\\epsilon^6)$ upper bound in (Andoni et al. SODA 2013) and gives\nthe first non-trivial lower bound, to the best of our knowledge. In our\nalgorithm, our sketching matrices $S$ and $T$ can both be sparse matrices,\nallowing for a very fast update time. We demonstrate that this gives a\nsubstantial advantage empirically, for roughly the same sketch size and\naccuracy as in previous work.\n For the vector case, we consider the $\\ell_p$-norm for $p>2$, where the task\nis to approximate the $k$-residual $\\|x - x_k\\|_p$ up to a constant factor,\nwhere $x_k$ is the optimal $k$-sparse approximation to $x$. Such vector norms\nare frequently studied in the data stream literature and are useful for finding\nfrequent items or so-called heavy hitters. We establish an upper bound of\n$O(k^{2/p}n^{1-2/p}\\operatorname{poly}(\\log n))$ for constant $\\epsilon$ on the\ndimension of a linear sketch for this problem. Our algorithm can be extended to\nthe $\\ell_p$ sparse recovery problem with the same sketching dimension, which\nseems to be the first such bound for $p > 2$. We also show an\n$\\Omega(k^{2/p}n^{1-2/p})$ lower bound for the sparse recovery problem, which\nis tight up to a $\\mathrm{poly}(\\log n)$ factor.\n","authors":["Yi Li","Honghao Lin","David P. Woodruff"],"pdf_url":"https://arxiv.org/pdf/2408.08494v1.pdf","comment":"Published as a conference paper at ICLR 2024"},{"id":"http://arxiv.org/abs/2408.08493v1","updated":"2024-08-16T02:29:38Z","published":"2024-08-16T02:29:38Z","title":"Fishers Harvest Parallel Unlearning in Inherited Model Networks","summary":" Unlearning in various learning frameworks remains challenging, with the\ncontinuous growth and updates of models exhibiting complex inheritance\nrelationships. This paper presents a novel unlearning framework, which enables\nfully parallel unlearning among models exhibiting inheritance. 
A key enabler is\nthe new Unified Model Inheritance Graph (UMIG), which captures the inheritance\nusing a Directed Acyclic Graph (DAG).Central to our framework is the new Fisher\nInheritance Unlearning (FIUn) algorithm, which utilizes the Fisher Information\nMatrix (FIM) from initial unlearning models to pinpoint impacted parameters in\ninherited models. By employing FIM, the FIUn method breaks the sequential\ndependencies among the models, facilitating simultaneous unlearning and\nreducing computational overhead. We further design to merge disparate FIMs into\na single matrix, synchronizing updates across inherited models. Experiments\nconfirm the effectiveness of our unlearning framework. For single-class tasks,\nit achieves complete unlearning with 0\\% accuracy for unlearned labels while\nmaintaining 94.53\\% accuracy for retained labels on average. For multi-class\ntasks, the accuracy is 1.07\\% for unlearned labels and 84.77\\% for retained\nlabels on average. Our framework accelerates unlearning by 99\\% compared to\nalternative methods.\n","authors":["Xiao Liu","Mingyuan Li","Xu Wang","Guangsheng Yu","Wei Ni","Lixiang Li","Haipeng Peng","Renping Liu"],"pdf_url":"https://arxiv.org/pdf/2408.08493v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.17789v3","updated":"2024-08-16T02:19:23Z","published":"2024-04-27T06:06:41Z","title":"BiLO: Bilevel Local Operator Learning for PDE inverse problems","summary":" We propose a new neural network based method for solving inverse problems for\npartial differential equations (PDEs) by formulating the PDE inverse problem as\na bilevel optimization problem. At the upper level, we minimize the data loss\nwith respect to the PDE parameters. At the lower level, we train a neural\nnetwork to locally approximate the PDE solution operator in the neighborhood of\na given set of PDE parameters, which enables an accurate approximation of the\ndescent direction for the upper level optimization problem. The lower level\nloss function includes the L2 norms of both the residual and its derivative\nwith respect to the PDE parameters. We apply gradient descent simultaneously on\nboth the upper and lower level optimization problems, leading to an effective\nand fast algorithm. The method, which we refer to as BiLO (Bilevel Local\nOperator learning), is also able to efficiently infer unknown functions in the\nPDEs through the introduction of an auxiliary variable. Through extensive\nexperiments over multiple PDE systems, we demonstrate that our method enforces\nstrong PDE constraints, is robust to sparse and noisy data, and eliminates the\nneed to balance the residual and the data loss, which is inherent to the soft\nPDE constraints in many existing methods.\n","authors":["Ray Zirui Zhang","Xiaohui Xie","John S. Lowengrub"],"pdf_url":"https://arxiv.org/pdf/2404.17789v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08488v1","updated":"2024-08-16T02:17:21Z","published":"2024-08-16T02:17:21Z","title":"Adversarial Contrastive Learning Based Physics-Informed Temporal\n Networks for Cuffless Blood Pressure Estimation","summary":" Time series data mining is immensely important in extensive applications,\nsuch as traffic, medical, and e-commerce. In this paper, we focus on medical\ntemporal variation modeling, \\emph{i.e.,} cuffless blood pressure (BP)\nmonitoring which has great value in cardiovascular healthcare. 
Although\nproviding a comfortable user experience, such methods are suffering from the\ndemand for a significant amount of realistic data to train an individual model\nfor each subject, especially considering the invasive or obtrusive BP\nground-truth measurements. To tackle this challenge, we introduce a novel\nphysics-informed temporal network~(PITN) with adversarial contrastive learning\nto enable precise BP estimation with very limited data. Specifically, we first\nenhance the physics-informed neural network~(PINN) with the temporal block for\ninvestigating BP dynamics' multi-periodicity for personal cardiovascular cycle\nmodeling and temporal variation. We then employ adversarial training to\ngenerate extra physiological time series data, improving PITN's robustness in\nthe face of sparse subject-specific training data. Furthermore, we utilize\ncontrastive learning to capture the discriminative variations of cardiovascular\nphysiologic phenomena. This approach aggregates physiological signals with\nsimilar blood pressure values in latent space while separating clusters of\nsamples with dissimilar blood pressure values. Experiments on three\nwidely-adopted datasets with different modailties (\\emph{i.e.,} bioimpedance,\nPPG, millimeter-wave) demonstrate the superiority and effectiveness of the\nproposed methods over previous state-of-the-art approaches. The code is\navailable at~\\url{https://github.com/Zest86/ACL-PITN}.\n","authors":["Rui Wang","Mengshi Qi","Yingxia Shao","Anfu Zhou","Huadong Ma"],"pdf_url":"https://arxiv.org/pdf/2408.08488v1.pdf","comment":"14 pages, 8 figures"},{"id":"http://arxiv.org/abs/2408.08484v1","updated":"2024-08-16T02:07:34Z","published":"2024-08-16T02:07:34Z","title":"An Unsupervised Learning Framework Combined with Heuristics for the\n Maximum Minimal Cut Problem","summary":" The Maximum Minimal Cut Problem (MMCP), a NP-hard combinatorial optimization\n(CO) problem, has not received much attention due to the demanding and\nchallenging bi-connectivity constraint. Moreover, as a CO problem, it is also a\ndaunting task for machine learning, especially without labeled instances. To\ndeal with these problems, this work proposes an unsupervised learning framework\ncombined with heuristics for MMCP that can provide valid and high-quality\nsolutions. As far as we know, this is the first work that explores machine\nlearning and heuristics to solve MMCP. The unsupervised solver is inspired by a\nrelaxation-plus-rounding approach, the relaxed solution is parameterized by\ngraph neural networks, and the cost and penalty of MMCP are explicitly written\nout, which can train the model end-to-end. A crucial observation is that each\nsolution corresponds to at least one spanning tree. Based on this finding, a\nheuristic solver that implements tree transformations by adding vertices is\nutilized to repair and improve the solution quality of the unsupervised solver.\nAlternatively, the graph is simplified while guaranteeing solution consistency,\nwhich reduces the running time. We conduct extensive experiments to evaluate\nour framework and give a specific application. 
The results demonstrate the\nsuperiority of our method against two techniques designed.\n","authors":["Huaiyuan Liu","Xianzhang Liu","Donghua Yang","Hongzhi Wang","Yingchi Long","Mengtong Ji","Dongjing Miao","Zhiyu Liang"],"pdf_url":"https://arxiv.org/pdf/2408.08484v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.17987v3","updated":"2024-08-16T01:37:41Z","published":"2024-02-28T02:11:47Z","title":"Multistatic-Radar RCS-Signature Recognition of Aerial Vehicles: A\n Bayesian Fusion Approach","summary":" Radar Automated Target Recognition (RATR) for Unmanned Aerial Vehicles (UAVs)\ninvolves transmitting Electromagnetic Waves (EMWs) and performing target type\nrecognition on the received radar echo, crucial for defense and aerospace\napplications. Previous studies highlighted the advantages of multistatic radar\nconfigurations over monostatic ones in RATR. However, fusion methods in\nmultistatic radar configurations often suboptimally combine classification\nvectors from individual radars probabilistically. To address this, we propose a\nfully Bayesian RATR framework employing Optimal Bayesian Fusion (OBF) to\naggregate classification probability vectors from multiple radars. OBF, based\non expected 0-1 loss, updates a Recursive Bayesian Classification (RBC)\nposterior distribution for target UAV type, conditioned on historical\nobservations across multiple time steps. We evaluate the approach using\nsimulated random walk trajectories for seven drones, correlating target aspect\nangles to Radar Cross Section (RCS) measurements in an anechoic chamber.\nComparing against single radar Automated Target Recognition (ATR) systems and\nsuboptimal fusion methods, our empirical results demonstrate that the OBF\nmethod integrated with RBC significantly enhances classification accuracy\ncompared to other fusion methods and single radar configurations.\n","authors":["Michael Potter","Murat Akcakaya","Marius Necsoiu","Gunar Schirner","Deniz Erdogmus","Tales Imbiriba"],"pdf_url":"https://arxiv.org/pdf/2402.17987v3.pdf","comment":"Accepted to IEEE Transactions on Aerospace and Electronic Systems"},{"id":"http://arxiv.org/abs/2311.16536v3","updated":"2024-08-16T01:27:18Z","published":"2023-11-28T05:45:20Z","title":"Personalized Predictions of Glioblastoma Infiltration: Mathematical\n Models, Physics-Informed Neural Networks and Multimodal Scans","summary":" Predicting the infiltration of Glioblastoma (GBM) from medical MRI scans is\ncrucial for understanding tumor growth dynamics and designing personalized\nradiotherapy treatment plans.Mathematical models of GBM growth can complement\nthe data in the prediction of spatial distributions of tumor cells. However,\nthis requires estimating patient-specific parameters of the model from clinical\ndata, which is a challenging inverse problem due to limited temporal data and\nthe limited time between imaging and diagnosis. This work proposes a method\nthat uses Physics-Informed Neural Networks (PINNs) to estimate patient-specific\nparameters of a reaction-diffusion PDE model of GBM growth from a single 3D\nstructural MRI snapshot. PINNs embed both the data and the PDE into a loss\nfunction, thus integrating theory and data. Key innovations include the\nidentification and estimation of characteristic non-dimensional parameters, a\npre-training step that utilizes the non-dimensional parameters and a\nfine-tuning step to determine the patient specific parameters. 
Additionally,\nthe diffuse domain method is employed to handle the complex brain geometry\nwithin the PINN framework. Our method is validated both on synthetic and\npatient datasets, and shows promise for real-time parametric inference in the\nclinical setting for personalized GBM treatment.\n","authors":["Ray Zirui Zhang","Ivan Ezhov","Michal Balcerak","Andy Zhu","Benedikt Wiestler","Bjoern Menze","John S. Lowengrub"],"pdf_url":"https://arxiv.org/pdf/2311.16536v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08474v1","updated":"2024-08-16T01:20:27Z","published":"2024-08-16T01:20:27Z","title":"Enhancing Events in Neutrino Telescopes through Deep Learning-Driven\n Super-Resolution","summary":" Recent discoveries by neutrino telescopes, such as the IceCube Neutrino\nObservatory, relied extensively on machine learning (ML) tools to infer\nphysical quantities from the raw photon hits detected. Neutrino telescope\nreconstruction algorithms are limited by the sparse sampling of photons by the\noptical modules due to the relatively large spacing ($10-100\\,{\\rm m})$ between\nthem. In this letter, we propose a novel technique that learns photon transport\nthrough the detector medium through the use of deep learning-driven\nsuper-resolution of data events. These ``improved'' events can then be\nreconstructed using traditional or ML techniques, resulting in improved\nresolution. Our strategy arranges additional ``virtual'' optical modules within\nan existing detector geometry and trains a convolutional neural network to\npredict the hits on these virtual optical modules. We show that this technique\nimproves the angular reconstruction of muons in a generic ice-based neutrino\ntelescope. Our results readily extend to water-based neutrino telescopes and\nother event morphologies.\n","authors":["Felix J. Yu","Nicholas Kamp","Carlos A. Argüelles"],"pdf_url":"https://arxiv.org/pdf/2408.08474v1.pdf","comment":"5+1 pages, 4+1 figures"},{"id":"http://arxiv.org/abs/2408.03599v2","updated":"2024-08-16T01:19:04Z","published":"2024-08-07T07:36:49Z","title":"Activations Through Extensions: A Framework To Boost Performance Of\n Neural Networks","summary":" Activation functions are non-linearities in neural networks that allow them\nto learn complex mapping between inputs and outputs. Typical choices for\nactivation functions are ReLU, Tanh, Sigmoid etc., where the choice generally\ndepends on the application domain. In this work, we propose a\nframework/strategy that unifies several works on activation functions and\ntheoretically explains the performance benefits of these works. We also propose\nnovel techniques that originate from the framework and allow us to obtain\n``extensions'' (i.e. special generalizations of a given neural network) of\nneural networks through operations on activation functions. We theoretically\nand empirically show that ``extensions'' of neural networks have performance\nbenefits compared to vanilla neural networks with insignificant space and time\ncomplexity costs on standard test functions. 
We also show the benefits of\nneural network ``extensions'' in the time-series domain on real-world datasets.\n","authors":["Chandramouli Kamanchi","Sumanta Mukherjee","Kameshwaran Sampath","Pankaj Dayama","Arindam Jati","Vijay Ekambaram","Dzung Phan"],"pdf_url":"https://arxiv.org/pdf/2408.03599v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.16837v2","updated":"2024-08-16T01:18:11Z","published":"2024-05-27T05:10:49Z","title":"Enhancing Accuracy in Generative Models via Knowledge Transfer","summary":" This paper investigates the accuracy of generative models and the impact of\nknowledge transfer on their generation precision. Specifically, we examine a\ngenerative model for a target task, fine-tuned using a pre-trained model from a\nsource task. Building on the \"Shared Embedding\" concept, which bridges the\nsource and target tasks, we introduce a novel framework for transfer learning\nunder distribution metrics such as the Kullback-Leibler divergence. This\nframework underscores the importance of leveraging inherent similarities\nbetween diverse tasks despite their distinct data distributions. Our theory\nsuggests that the shared structures can augment the generation accuracy for a\ntarget task, reliant on the capability of a source model to identify shared\nstructures and effective knowledge transfer from source to target learning. To\ndemonstrate the practical utility of this framework, we explore the theoretical\nimplications for two specific generative models: diffusion and normalizing\nflows. The results show enhanced performance in both models over their\nnon-transfer counterparts, indicating advancements for diffusion models and\nproviding fresh insights into normalizing flows in transfer and non-transfer\nsettings. These results highlight the significant contribution of knowledge\ntransfer in boosting the generation capabilities of these models.\n","authors":["Xinyu Tian","Xiaotong Shen"],"pdf_url":"https://arxiv.org/pdf/2405.16837v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08470v1","updated":"2024-08-16T01:12:21Z","published":"2024-08-16T01:12:21Z","title":"Context-Aware Assistant Selection for Improved Inference Acceleration\n with Large Language Models","summary":" Despite their widespread adoption, large language models (LLMs) remain\nprohibitive to use under resource constraints, with their ever growing sizes\nonly increasing the barrier for use. One noted issue is the high latency\nassociated with auto-regressive generation, rendering large LLMs use dependent\non advanced computing infrastructure. Assisted decoding, where a smaller draft\nmodel guides a larger target model's generation, has helped alleviate this, but\nremains dependent on alignment between the two models. Thus if the draft model\nis insufficiently capable on some domain relative to the target model,\nperformance can degrade. Alternatively, one can leverage multiple draft models\nto better cover the expertise of the target, but when multiple black-box draft\nmodels are available, selecting an assistant without details about its\nconstruction can be difficult. To better understand this decision making\nproblem, we observe it as a contextual bandit, where a policy must choose a\ndraft model based on a context. 
We show that even without prior knowledge of\nthe draft models, creating an offline dataset from only outputs of independent\ndraft/target models and training a policy over the alignment of these outputs\ncan accelerate performance on multiple domains provided the candidates are\neffective. Further results show this to hold on various settings with multiple\nassisted decoding candidates, highlighting its flexibility and the advantageous\nrole that such decision making can play.\n","authors":["Jerry Huang","Prasanna Parthasarathi","Mehdi Rezagholizadeh","Sarath Chandar"],"pdf_url":"https://arxiv.org/pdf/2408.08470v1.pdf","comment":"14 pages (9 pages main content + references + appendix)"},{"id":"http://arxiv.org/abs/2407.11790v3","updated":"2024-08-16T01:11:48Z","published":"2024-07-16T14:45:46Z","title":"Characterizing and Understanding HGNN Training on GPUs","summary":" Owing to their remarkable representation capabilities for heterogeneous graph\ndata, Heterogeneous Graph Neural Networks (HGNNs) have been widely adopted in\nmany critical real-world domains such as recommendation systems and medical\nanalysis. Prior to their practical application, identifying the optimal HGNN\nmodel parameters tailored to specific tasks through extensive training is a\ntime-consuming and costly process. To enhance the efficiency of HGNN training,\nit is essential to characterize and analyze the execution semantics and\npatterns within the training process to identify performance bottlenecks. In\nthis study, we conduct an in-depth quantification and analysis of two\nmainstream HGNN training scenarios, including single-GPU and multi-GPU\ndistributed training. Based on the characterization results, we disclose the\nperformance bottlenecks and their underlying causes in different HGNN training\nscenarios and provide optimization guidelines from both software and hardware\nperspectives.\n","authors":["Dengke Han","Mingyu Yan","Xiaochun Ye","Dongrui Fan"],"pdf_url":"https://arxiv.org/pdf/2407.11790v3.pdf","comment":"23 pages, 14 figures, submitted to ACM TACO"},{"id":"http://arxiv.org/abs/2301.08028v3","updated":"2024-08-16T00:59:44Z","published":"2023-01-19T12:01:41Z","title":"A Survey of Meta-Reinforcement Learning","summary":" While deep reinforcement learning (RL) has fueled multiple high-profile\nsuccesses in machine learning, it is held back from more widespread adoption by\nits often poor data efficiency and the limited generality of the policies it\nproduces. A promising approach for alleviating these limitations is to cast the\ndevelopment of better RL algorithms as a machine learning problem itself in a\nprocess called meta-RL. Meta-RL is most commonly studied in a problem setting\nwhere, given a distribution of tasks, the goal is to learn a policy that is\ncapable of adapting to any new task from the task distribution with as little\ndata as possible. In this survey, we describe the meta-RL problem setting in\ndetail as well as its major variations. We discuss how, at a high level,\nmeta-RL research can be clustered based on the presence of a task distribution\nand the learning budget available for each individual task. Using these\nclusters, we then survey meta-RL algorithms and applications. 
We conclude by\npresenting the open problems on the path to making meta-RL part of the standard\ntoolbox for a deep RL practitioner.\n","authors":["Jacob Beck","Risto Vuorio","Evan Zheran Liu","Zheng Xiong","Luisa Zintgraf","Chelsea Finn","Shimon Whiteson"],"pdf_url":"https://arxiv.org/pdf/2301.08028v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.06261v2","updated":"2024-08-16T00:00:05Z","published":"2024-08-12T16:21:29Z","title":"Open-Source Molecular Processing Pipeline for Generating Molecules","summary":" Generative models for molecules have shown considerable promise for use in\ncomputational chemistry, but remain difficult to use for non-experts. For this\nreason, we introduce open-source infrastructure for easily building generative\nmolecular models into the widely used DeepChem [Ramsundar et al., 2019] library\nwith the aim of creating a robust and reusable molecular generation pipeline.\nIn particular, we add high quality PyTorch [Paszke et al., 2019]\nimplementations of the Molecular Generative Adversarial Networks (MolGAN) [Cao\nand Kipf, 2022] and Normalizing Flows [Papamakarios et al., 2021]. Our\nimplementations show strong performance comparable with past work [Kuznetsov\nand Polykovskiy, 2021, Cao and Kipf, 2022].\n","authors":["V Shreyas","Jose Siguenza","Karan Bania","Bharath Ramsundar"],"pdf_url":"https://arxiv.org/pdf/2408.06261v2.pdf","comment":"Presented at the 2024 Molecular Machine Learning Conference (MoML\n 2024)"}],"Multimedia":[{"id":"http://arxiv.org/abs/2408.08544v1","updated":"2024-08-16T06:04:25Z","published":"2024-08-16T06:04:25Z","title":"Scaling up Multimodal Pre-training for Sign Language Understanding","summary":" Sign language serves as the primary meaning of communication for the\ndeaf-mute community. Different from spoken language, it commonly conveys\ninformation by the collaboration of manual features, i.e., hand gestures and\nbody movements, and non-manual features, i.e., facial expressions and mouth\ncues. To facilitate communication between the deaf-mute and hearing people, a\nseries of sign language understanding (SLU) tasks have been studied in recent\nyears, including isolated/continuous sign language recognition (ISLR/CSLR),\ngloss-free sign language translation (GF-SLT) and sign language retrieval\n(SL-RT). Sign language recognition and translation aims to understand the\nsemantic meaning conveyed by sign languages from gloss-level and\nsentence-level, respectively. In contrast, SL-RT focuses on retrieving sign\nvideos or corresponding texts from a closed-set under the query-by-example\nsearch paradigm. These tasks investigate sign language topics from diverse\nperspectives and raise challenges in learning effective representation of sign\nlanguage videos. To advance the development of sign language understanding,\nexploring a generalized model that is applicable across various SLU tasks is a\nprofound research direction.\n","authors":["Wengang Zhou","Weichao Zhao","Hezhen Hu","Zecheng Li","Houqiang Li"],"pdf_url":"https://arxiv.org/pdf/2408.08544v1.pdf","comment":"Sign language recognition; Sign language translation; Sign language\n retrieval"},{"id":"http://arxiv.org/abs/2407.19988v2","updated":"2024-08-16T05:12:35Z","published":"2024-07-29T13:20:22Z","title":"HeadsetOff: Enabling Photorealistic Video Conferencing on Economical VR\n Headsets","summary":" Virtual Reality (VR) has become increasingly popular for remote\ncollaboration, but video conferencing poses challenges when the user's face is\ncovered by the headset. 
Existing solutions have limitations in terms of\naccessibility. In this paper, we propose HeadsetOff, a novel system that\nachieves photorealistic video conferencing on economical VR headsets by\nleveraging voice-driven face reconstruction. HeadsetOff consists of three main\ncomponents: a multimodal predictor, a generator, and an adaptive controller.\nThe predictor effectively predicts user future behavior based on different\nmodalities. The generator employs voice, head motion, and eye blink to animate\nthe human face. The adaptive controller dynamically selects the appropriate\ngenerator model based on the trade-off between video quality and delay.\nExperimental results demonstrate the effectiveness of HeadsetOff in achieving\nhigh-quality, low-latency video conferencing on economical VR headsets.\n","authors":["Yili Jin","Xize Duan","Fangxin Wang","Xue Liu"],"pdf_url":"https://arxiv.org/pdf/2407.19988v2.pdf","comment":"Accepted by ACM Multimedia 2024"}]}} \ No newline at end of file diff --git a/favicon.ico b/favicon.ico new file mode 100644 index 00000000..7f5166c7 Binary files /dev/null and b/favicon.ico differ diff --git a/index.css b/index.css new file mode 100644 index 00000000..9ded9d94 --- /dev/null +++ b/index.css @@ -0,0 +1,355 @@ +:root { + /* Palette: Nord (https://www.nordtheme.com)*/ + --nord00: #2e3440; + --nord01: #3b4252; + --nord02: #434c5e; + --nord03: #4c566a; + --nord04: #d8dee9; + --nord05: #e5e9f0; + --nord06: #eceff4; + --nord07: #8fbcbb; + --nord08: #88c0d0; + --nord09: #81a1c1; + --nord0A: #5e81ac; + --nord0B: #bf616a; + --nord0C: #d08770; + --nord0D: #ebcb8b; + --nord0E: #a3be8c; + --nord0F: #b48ead; + + + /* Typograph */ + --font-family-default: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, Oxygen-Sans, Ubuntu, Cantarell, "Helvetica Neue", + sans-serif; + --font-size-scaler: 62.5%; + --font-size-m: 1.6rem; + --font-size-s: 1.4rem; + + /* Components */ + --body-color: var(--nord06); + --body-bg: var(--nord00); + + --header-title: var(--nord06); + --header-container: var(--nord00); + --header-title-preffix: var(--nord0F); + + --chip-font: var(--nord08); + --chip-color: var(--nord0B); + + --icons: var(--nord06); + --icons-hover: var(--nord0F); + + --day-container: var(--nord01); + --date: var(--nord09); + + --summary: var(--nord0E); + --summary-hover: var(--nord0F); + + --details-open: var(--nord02); + --details-content: var(--nord05); + --details-a: var(--nord07); + --details-a-hover: var(--nord0F); + + --highlight-title: var(--nord0B); + --highlight-author: var(--nord0B); + + --article-summary-hover-color: var(--nord0D); + --article-summary-color: var(--nord04); + + --article-title-color: var(--nord05); + --article-title-hover-color: var(--nord0E); + + --accordion-content-rail-color: var(--nord01); + --accordion-content-hover-rail-color: var(--nord0D); + --accordion-title-marker-color: var(--nord01); + --accordion-title-hover-marker-color: var(--nord0E); + + --footer-color: var(--nord04); + --footer-link-hover-color: var(--nord0D); +} + +[data-theme="light"] { + /* Theme design */ + + --color-primary: var(--nord07); + --color-primary-second: var(--nord00); + --color-info: var(--nord0A); + --color-success: var(--nord0E); + --color-warning: var(--nord0C); + --color-danger: var(--nord0B); + + --color-text: var(--nord00); + --color-hover: var(--nord0D); + --color-shadow: var(--nord03); + + --color-primary-h: var(--nord09); + --color-primary-s: var(--nord08); + --color-primary-l: var(--nord07); + + --color-contrast-higher-h: var(--nord01); + 
--color-contrast-higher-l: var(--nord02); + --color-contrast-higher-s: var(--nord03); + + --color-content: white; + + --background: var(--nord06); + --background-content: var(--nord05); + --background-color: var(--nord04); + + /* Components */ + + --chip-font: var(--nord06); + --chip-color: var(--nord09); + + --body-color: var(--background-color); + --body-bg: var(--background); + + --header-title: var(--color-shadow); + --header-container: var(--background); + --header-title-preffix: var(--color-primary-h); + + --icons: var(--color-shadow); + --icons-hover: var(--color-hover); + + --day-container: var(--background-content); + --date: var(--color-primary-l); + + --summary: var(--color-info); + --summary-hover: var(--color-success); + + --details-open: var(--color-content); + --details-content: var(--color-text); + --details-a: var(--color-primary-h); + --details-a-hover: var(--color-hover); + + --highlight-title: var(--color-danger); + --highlight-author: var(--color-warning); + + --article-summary-color: var(--color-text); + --article-summary-hover-color: var(--color-primary-s); + + --article-title-color: var(--color-primary); + --article-title-hover-color: var(--color-success); + + --accordion-content-rail-color: var(--color-warning); + --accordion-content-hover-rail-color: var(--color-warning); + --accordion-title-marker-color: var(--color-success); + --accordion-title-hover-marker-color: var(--color-success); + + --footer-color: var(--color-text); + --footer-link-hover-color: var(--color-hover); +} + +html { + font-size: var(--font-size-scaler); +} + +body { + background-color: var(--body-bg); + font-family: var(--font-family-default); + color: var(--body-color); + margin: 0; + padding-top: 16px; + display: grid; +} + +.header-container { + width: 90%; + max-width: 1200px; + background: var(--header-container); + margin: 0 auto; +} + +.header-title { + font-size: 32px; + font-weight: bold; + color: var(--header-title); + margin: 0; + padding-bottom: 14px; +} + +.header-title-preffix { + color: var(--header-title-preffix); +} + +.icons { + color: var(--icons); + padding-bottom: 16px; +} + +.icons a { + color: var(--icons); + text-decoration: none; +} + +.icons a:hover { + color: var(--icons-hover); +} + +.day-container { + padding: 16px 16px 16px 16px; + background: var(--day-container); + width: 90%; + max-width: 1200px; + margin: 0 auto; + margin-bottom: 8px; + border-radius: 10px; +} + +.date { + font-size: 24px; + font-weight: 700; + margin: 0; + color: var(--date); +} + +p { + margin: 0; +} + +summary { + font-weight: 600; + color: var(--summary); +} + +summary:hover { + text-decoration: underline; + cursor: pointer; + color: var(--summary-hover); +} + +details { + --border-color: transparent; + + padding: 2px 4px; + font-size: 20px; + border: 1px solid var(--border-color); + border-radius: 4px; +} + +details[open] { + background-color: var(--details-open); + margin-bottom: 8px; +} + +.details-content { + padding: 12px 3px; + gap: 16px; + color: var(--details-content); +} + +details a { + color: var(--details-a); +} + +details a:hover { + color: var(--details-a-hover); +} + +footer { + margin: 0 auto; + color: var(--footer-color); + font-size: var(--font-size-s); + display: flex; + padding: 0 16px; + justify-content: space-between; +} + +.description { + margin: 0 auto; + color: var(--footer-color); + font-size: var(--font-size-s); + display: flex; + padding: 0 16px; + text-align: center; +} + +.highlight-author { + color: var(--highlight-author); + font-weight: bold; +} + 
+.highlight-title { + color: var(--highlight-title); + font-weight: bold; +} + +.channel-description { + text-align: center; + font-size: var(--font-size-scaler); +} + +.article-summary-link { + color: var(--article-summary-color); + font-size: var(--font-size-s); + text-decoration: none; +} + +.article-summary-link:hover { + color: var(--article-summary-hover-color); + --accordion-content-rail-color: var(--accordion-content-hover-rail-color); +} + +.article-summary-box-outer { + display: block; + padding: 4px 8px 8px 4px; +} + +.article-summary-box-inner { + padding-left: 8px; + border-left: 1px solid var(--accordion-content-rail-color); + font-size: var(--font-size-m); +} + +.article-expander { + padding: 10px 4px; + border-radius: 4px; +} + +.article-authors { + font-size: var(--font-size-m); + padding: 0.25em 1em; +} + +.article-authors a { + text-decoration: none; +} + +.article-expander-title { + font-size: var(--font-size-m); + font-weight: 600; +} + +.article-expander-title:hover { + cursor: pointer; +} + +.article-expander-title::marker { + color: var(--accordion-title-marker-color); +} + +.article-expander-title:hover::marker { + color: var(--accordion-title-hover-marker-color); +} + +/* for switcher */ +.theme-switch { + display: inline-block; + position: relative; +} + +.theme-switch input { + display: none; +} + +/* chip */ +.chip { + font-size: 90%; + align-items: center; + color: var(--chip-font); + background: var(--chip-color); + border-radius: 5rem; + display: inline-flex; + padding: .2rem .4rem; + vertical-align: middle; +} \ No newline at end of file diff --git a/index.html b/index.html new file mode 100644 index 00000000..74a7057f --- /dev/null +++ b/index.html @@ -0,0 +1,75334 @@ + + + + + MyArxiv + + + + + + + + + + + + + + + +
+
+
+
+ MyArxiv +
+
+ +
+ +
+
+
+ +
+
+ +
+
+
+ + Computation and Language 64 + +
+
+
+ + ☆ xGen-MM (BLIP-3): A Family of Open Large Multimodal Models + + +
+ This report introduces xGen-MM (also known as BLIP-3), a framework for +developing Large Multimodal Models (LMMs). The framework comprises meticulously +curated datasets, a training recipe, model architectures, and a resulting suite +of LMMs. xGen-MM, short for xGen-MultiModal, expands the Salesforce xGen +initiative on foundation AI models. Our models undergo rigorous evaluation +across a range of tasks, including both single and multi-image benchmarks. Our +pre-trained base model exhibits strong in-context learning capabilities and the +instruction-tuned model demonstrates competitive performance among open-source +LMMs with similar model sizes. In addition, we introduce a safety-tuned model +with DPO, aiming to mitigate harmful behaviors such as hallucinations and +improve safety. We open-source our models, curated large-scale datasets, and +our fine-tuning codebase to facilitate further advancements in LMM research. +Associated resources will be available on our project page above. + +
+
+
+
+
+ + ☆ PEDAL: Enhancing Greedy Decoding with Large Language Models using + Diverse Exemplars + + +
+ Self-ensembling techniques with diverse reasoning paths such as +Self-Consistency have demonstrated remarkable gains in accuracy for Large +Language Models (LLMs). However, such techniques depend on the availability of +an accurate answer extraction process to aggregate across multiple outputs. +Moreover, they incur a higher inference cost, in comparison to Greedy Decoding, +due to the generation of a relatively higher number of output tokens. Research has +shown that the free-form text outputs from Self-Consistency can be aggregated +reliably using LLMs to produce the final output. Additionally, recent +advancements in LLM inference have demonstrated that the usage of diverse exemplars +in prompts has the ability to induce diversity in the LLM outputs. Such proven +techniques can be easily extended to self-ensembling based approaches to +achieve enhanced results in text generation. In this paper, we introduce PEDAL +(Prompts based on Exemplar Diversity Aggregated using LLMs), a hybrid +self-ensembling approach that combines the strengths of diverse exemplar-based +prompts and LLM-based aggregation to achieve an improvement in overall +performance. On the publicly available SVAMP and ARC datasets, our experiments +reveal that PEDAL can achieve better accuracy than Greedy Decoding based +strategies with a lower inference cost compared to Self-Consistency based +approaches. + 
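As a rough illustration of the PEDAL recipe sketched in this abstract (several prompts built from different exemplar subsets, greedy decoding of one candidate per prompt, then LLM-based aggregation of the free-form candidates), here is a minimal Python sketch. The generate helper, prompt templates, and exemplar sampling scheme are assumptions for illustration, not the authors' implementation.

import random

def generate(prompt: str) -> str:
    # Placeholder for a greedy-decoding call to any LLM API or local model.
    raise NotImplementedError

def pedal_answer(question: str, exemplar_pool: list[str], n_prompts: int = 3, k: int = 4) -> str:
    # 1) Build several prompts, each with a different exemplar subset, and
    #    greedily decode one candidate answer per prompt.
    candidates = []
    for seed in range(n_prompts):
        random.seed(seed)
        exemplars = "\n\n".join(random.sample(exemplar_pool, min(k, len(exemplar_pool))))
        candidates.append(generate(f"{exemplars}\n\nQuestion: {question}\nAnswer:"))
    # 2) Aggregate the free-form candidates with the LLM itself instead of
    #    relying on exact-match answer extraction as in Self-Consistency.
    numbered = "\n".join(f"[{i + 1}] {c}" for i, c in enumerate(candidates))
    return generate(
        "Here are candidate answers to the same question:\n"
        f"{numbered}\n\nQuestion: {question}\n"
        "Reply with the single most consistent final answer."
    )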
+
+
+
+
+ + ☆ PsychoLex: Unveiling the Psychological Mind of Large Language Models + + +
+ This paper explores the intersection of psychology and artificial +intelligence through the development and evaluation of specialized Large +Language Models (LLMs). We introduce PsychoLex, a suite of resources designed +to enhance LLMs' proficiency in psychological tasks in both Persian and +English. Key contributions include the PsychoLexQA dataset for instructional +content and the PsychoLexEval dataset for rigorous evaluation of LLMs in +complex psychological scenarios. Additionally, we present the PsychoLexLLaMA +model, optimized specifically for psychological applications, demonstrating +superior performance compared to general-purpose models. The findings +underscore the potential of tailored LLMs for advancing psychological research +and applications, while also highlighting areas for further refinement. This +research offers a foundational step towards integrating LLMs into specialized +psychological domains, with implications for future advancements in AI-driven +psychological practice. + +
+
+
+
+
+ + ☆ FLEXTAF: Enhancing Table Reasoning with Flexible Tabular Formats + + +
+ The table reasoning task aims to answer the question according to the given +table. Currently, using Large Language Models (LLMs) is the predominant method +for table reasoning. Most existing methods employ a fixed tabular format to +represent the table, which could limit the performance. Given that each +instance requires different capabilities and models possess varying abilities, +we assert that different instances and models suit different tabular formats. +We prove the aforementioned claim through quantitative analysis of experimental +results, where different instances and models achieve different performances +using various tabular formats. Building on this discussion, we propose +FLEXTAF-Single and FLEXTAF-Vote to enhance table reasoning performance by +employing flexible tabular formats. Specifically, (i) FLEXTAF-Single trains a +classifier to predict the most suitable tabular format based on the instance +and the LLM. (ii) FLEXTAF-Vote integrates the results across different formats. +Our experiments on WikiTableQuestions and TabFact reveal significant +improvements, with average gains of 2.3% and 4.8% compared to the best +performance achieved using a fixed tabular format with greedy decoding and +self-consistency decoding, thereby validating the effectiveness of our methods. + +
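A minimal sketch of the FLEXTAF-Vote idea described above: ask the same question over several serializations of one table and majority-vote the answers. The serializers, prompt template, and ask_llm placeholder are illustrative assumptions; the paper's exact formats and the FLEXTAF-Single classifier are not reproduced here.

import json
from collections import Counter

def ask_llm(prompt: str) -> str:
    # Placeholder for any LLM call that returns a short answer string.
    raise NotImplementedError

def serialize(table: dict, fmt: str) -> str:
    # table = {"header": [...], "rows": [[...], ...]}
    if fmt == "markdown":
        head = "| " + " | ".join(table["header"]) + " |"
        sep = "| " + " | ".join("---" for _ in table["header"]) + " |"
        body = ["| " + " | ".join(map(str, r)) + " |" for r in table["rows"]]
        return "\n".join([head, sep] + body)
    if fmt == "csv":
        return "\n".join(",".join(map(str, r)) for r in [table["header"]] + table["rows"])
    if fmt == "json_records":
        return json.dumps([dict(zip(table["header"], r)) for r in table["rows"]])
    raise ValueError(fmt)

def flextaf_vote(question: str, table: dict, formats=("markdown", "csv", "json_records")) -> str:
    answers = [
        ask_llm(f"Table:\n{serialize(table, fmt)}\n\nQuestion: {question}\nAnswer:")
        for fmt in formats
    ]
    # Majority vote across formats; ties fall back to the first format's answer.
    return Counter(answers).most_common(1)[0][0]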
+
+
+
+
+ + ☆ CIKMar: A Dual-Encoder Approach to Prompt-Based Reranking in Educational + Dialogue Systems + + +
+ In this study, we introduce CIKMar, an efficient approach to educational +dialogue systems powered by the Gemma Language model. By leveraging a +Dual-Encoder ranking system that incorporates both BERT and SBERT models, we +have designed CIKMar to deliver highly relevant and accurate responses, even +with the constraints of a smaller language model size. Our evaluation reveals +that CIKMar achieves a robust recall and F1-score of 0.70 using BERTScore +metrics. However, we have identified a significant challenge: the Dual-Encoder +tends to prioritize theoretical responses over practical ones. These findings +underscore the potential of compact and efficient models like Gemma in +democratizing access to advanced educational AI systems, ensuring effective and +contextually appropriate responses. + 
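The abstract does not spell out how the two encoders are combined, so the following is only a speculative sketch of a dual-encoder reranker in that spirit: candidate responses are scored by averaging cosine similarities from a mean-pooled BERT encoder and an SBERT model. The checkpoints, pooling, and equal weighting are assumptions.

import torch
from transformers import AutoTokenizer, AutoModel
from sentence_transformers import SentenceTransformer, util

bert_tok = AutoTokenizer.from_pretrained("bert-base-uncased")
bert = AutoModel.from_pretrained("bert-base-uncased")
sbert = SentenceTransformer("all-MiniLM-L6-v2")

def bert_embed(texts):
    batch = bert_tok(texts, padding=True, truncation=True, return_tensors="pt")
    with torch.no_grad():
        out = bert(**batch).last_hidden_state          # (B, T, H)
    mask = batch["attention_mask"].unsqueeze(-1)
    return (out * mask).sum(1) / mask.sum(1)           # mean pooling over tokens

def rerank(query: str, candidates: list[str]) -> list[str]:
    q_b, c_b = bert_embed([query]), bert_embed(candidates)
    q_s = sbert.encode([query], convert_to_tensor=True)
    c_s = sbert.encode(candidates, convert_to_tensor=True)
    # Average the two encoders' cosine similarities (illustrative 50/50 split).
    score = 0.5 * util.cos_sim(q_b, c_b)[0] + 0.5 * util.cos_sim(q_s, c_s)[0]
    return [c for _, c in sorted(zip(score.tolist(), candidates), reverse=True)]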
+
+ comment: This paper is the result of the final project of the Natural Language + Processing course, Master of Artificial Intelligence, Universitas Gadjah Mada +
+
+
+
+
+ + ☆ Leveraging FourierKAN Classification Head for Pre-Trained + Transformer-based Text Classification + + +
+ For many years, transformer-based pre-trained models with Multi-layer +Perceptron (MLP) heads have been the standard for text classification tasks. +However, the fixed non-linear functions employed by MLPs often fall short of +capturing the intricacies of the contextualized embeddings produced by +pre-trained encoders. Furthermore, MLPs usually require a significant number of +training parameters, which can be computationally expensive. In this work, we +introduce FourierKAN (FR-KAN), a variant of the promising MLP alternative +called Kolmogorov-Arnold Networks (KANs), as classification heads for +transformer-based encoders. Our studies reveal an average increase of 10% in +accuracy and 11% in F1-score when incorporating FR-KAN heads instead of +traditional MLP heads for several transformer-based pre-trained models across +multiple text classification tasks. Beyond improving model accuracy, FR-KAN +heads train faster and require fewer parameters. Our research opens new grounds +for broader applications of KAN across several Natural Language Processing +(NLP) tasks. + +
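Since the abstract only names FR-KAN as a drop-in replacement for the MLP head, here is a generic Fourier-feature, KAN-style classification head in PyTorch; the exact parameterization, frequency count, and initialization used in the paper may differ, so treat this purely as a sketch.

import torch
import torch.nn as nn

class FourierKANHead(nn.Module):
    # Each input dimension is expanded into sin/cos features at integer
    # frequencies 1..num_frequencies, then linearly combined per class.
    def __init__(self, in_dim: int, num_classes: int, num_frequencies: int = 4):
        super().__init__()
        self.register_buffer("freqs", torch.arange(1, num_frequencies + 1).float())
        self.coeff = nn.Parameter(
            torch.randn(num_classes, in_dim, num_frequencies, 2)
            / (in_dim * num_frequencies) ** 0.5
        )
        self.bias = nn.Parameter(torch.zeros(num_classes))

    def forward(self, x):                       # x: (batch, in_dim)
        angles = x.unsqueeze(-1) * self.freqs   # (batch, in_dim, F)
        feats = torch.stack([torch.cos(angles), torch.sin(angles)], dim=-1)
        logits = torch.einsum("bifp,cifp->bc", feats, self.coeff) + self.bias
        return logits

# E.g., replacing the MLP head on top of a 768-dim [CLS] embedding:
head = FourierKANHead(in_dim=768, num_classes=4)
logits = head(torch.randn(8, 768))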
+
+
+
+
+ + ☆ EmoDynamiX: Emotional Support Dialogue Strategy Prediction by Modelling + MiXed Emotions and Discourse Dynamics + + +
+ Designing emotionally intelligent conversational systems to provide comfort +and advice to people experiencing distress is a compelling area of research. +Previous efforts have focused on developing modular dialogue systems that treat +socio-emotional strategy prediction as an auxiliary task and generate +strategy-conditioned responses with customized decoders. Recently, with +advancements in large language models (LLMs), end-to-end dialogue agents +without explicit socio-emotional strategy prediction steps have become +prevalent. However, despite their excellence in language generation, recent +studies show that LLMs' inherent preference bias towards certain +socio-emotional strategies hinders the delivery of high-quality emotional +support. To address this challenge, we propose decoupling strategy prediction +from language generation, and introduce a novel dialogue strategy predictor, +EmoDynamiX, which models the discourse dynamics between user emotions and +system strategies using a heterogeneous graph. Additionally, we make use of the +Emotion Recognition in Conversations (ERC) task and design a flexible +mixed-emotion module to capture fine-grained emotional states of the user. +Experimental results on two ESC datasets show EmoDynamiX outperforms previous +state-of-the-art methods with a significant margin. + +
+
+
+
+
+ + ☆ Evaluating the Evaluator: Measuring LLMs' Adherence to Task Evaluation + Instructions + + +
+ LLMs-as-a-judge is a recently popularized method which replaces human
+judgements in task evaluation (Zheng et al. 2024) with automatic evaluation
+using LLMs. Due to the widespread use of RLHF (Reinforcement Learning from
+Human Feedback), state-of-the-art LLMs like GPT4 and Llama3 are expected to
+have strong alignment with human preferences when prompted for a quality
+judgement, such as the coherence of a text. While this seems beneficial, it is
+not clear whether the assessments by an LLM-as-a-judge constitute only an
+evaluation based on the instructions in the prompts, or reflect its preference
+for high-quality data similar to its fine-tuning data. To investigate how much
+influence prompting the LLMs-as-a-judge has on the alignment of AI judgements
+to human judgements, we analyze prompts with increasing levels of instructions
+about the target quality of an evaluation, for several LLMs-as-a-judge.
+Further, we compare to a prompt-free method using model perplexity as a quality
+measure instead. We aggregate a taxonomy of quality criteria commonly used
+across state-of-the-art evaluations with LLMs and provide this as a rigorous
+benchmark of models as judges. Overall, we show that LLMs-as-a-judge benefit
+only marginally from highly detailed instructions in prompts and that
+perplexity can sometimes align better with human judgements than prompting,
+especially on textual quality.
+
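+ A minimal sketch of the prompt-free baseline described above, scoring candidate texts by causal-LM perplexity (GPT-2 is only a stand-in model here; lower perplexity is read as higher textual quality):
+
+ import torch
+ from transformers import AutoModelForCausalLM, AutoTokenizer
+
+ tok = AutoTokenizer.from_pretrained("gpt2")
+ lm = AutoModelForCausalLM.from_pretrained("gpt2")
+
+ def perplexity(text: str) -> float:
+     """Mean-token perplexity of `text` under the causal LM."""
+     enc = tok(text, return_tensors="pt")
+     with torch.no_grad():
+         out = lm(**enc, labels=enc["input_ids"])
+     return torch.exp(out.loss).item()
+
+ candidates = ["A clear, coherent summary of the article.",
+               "summary article the of coherent clear A."]
+ ranked = sorted(candidates, key=perplexity)
+ print(ranked[0])   # the candidate the LM finds most fluent
+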
+
+
+
+
+ + ☆ Large Language Models Might Not Care What You Are Saying: Prompt Format + Beats Descriptions + + +
+ With the help of in-context learning (ICL), large language models (LLMs) have
+achieved impressive performance across various tasks. However, the function of
+descriptive instructions during ICL remains under-explored. In this work, we
+propose an ensemble prompt framework to describe the selection criteria of
+multiple in-context examples, and preliminary experiments on machine
+translation (MT) across six translation directions confirm that this framework
+boosts ICL performance. But to our surprise, LLMs might not necessarily care
+what the descriptions actually say, and the performance gain is primarily
+caused by the ensemble format, since the framework could lead to improvement
+even with random descriptive nouns. We further apply this new ensemble prompt
+to a range of commonsense, math, logical reasoning and hallucination tasks with
+three LLMs and achieve promising results, suggesting again that designing a
+proper prompt format would be much more effective and efficient than putting
+effort into specific descriptions. Our code will be publicly available once
+this paper is published.
+
+
+ comment: 10 pages, 6 figures, 3 tables +
+
+
+
+
+ + ☆ DAC: Decomposed Automation Correction for Text-to-SQL + + +
+ Text-to-SQL is an important task that helps people obtain information from
+databases by automatically generating SQL queries. Owing to their strong
+performance, approaches based on Large Language Models (LLMs) have become the
+mainstream for text-to-SQL. Among these approaches, automated correction is an
+effective approach that further enhances performance by correcting the mistakes
+in the generated results. Existing correction methods require LLMs to directly
+correct the generated SQL, while previous research shows that LLMs do not know
+how to detect mistakes, leading to poor performance. Therefore, in this paper,
+we propose to employ decomposed correction to enhance text-to-SQL performance.
+We first demonstrate that decomposed correction outperforms direct correction
+since detecting and fixing mistakes with the results of the decomposed
+sub-tasks is easier than with SQL. Based on this analysis, we introduce
+Decomposed Automation Correction (DAC), which corrects SQL by decomposing
+text-to-SQL into entity linking and skeleton parsing. DAC first generates the
+entity and skeleton corresponding to the question and then compares the
+differences between the initial SQL and the generated entities and skeleton as
+feedback for correction. Experimental results show that our method improves
+performance by $3.7\%$ on average across Spider, Bird, and KaggleDBQA compared
+with the baseline method, demonstrating the effectiveness of DAC.
+
+
+
+
+
+ + ☆ Lower Layer Matters: Alleviating Hallucination via Multi-Layer Fusion + Contrastive Decoding with Truthfulness Refocused + + +
+ Large Language Models (LLMs) have demonstrated exceptional performance across
+various natural language processing tasks, yet they occasionally tend to yield
+content that is factually inaccurate or discordant with the expected output, a
+phenomenon empirically referred to as "hallucination". To tackle this issue,
+recent works have investigated contrastive decoding between the original model
+and an amateur model with induced hallucination, which has shown promising
+results. Nonetheless, this method may undermine the output distribution of the
+original LLM because of its coarse contrast and simplistic subtraction
+operation, potentially leading to errors in certain cases. In this paper, we
+introduce a novel contrastive decoding framework termed LOL (LOwer Layer
+Matters). Our approach involves concatenating the contrastive decoding of both
+the final and lower layers between the original model and the amateur model,
+thereby achieving multi-layer fusion to aid in the mitigation of hallucination.
+Additionally, we incorporate a truthfulness refocused module that leverages
+contextual guidance to enhance factual encoding, further capturing truthfulness
+during contrastive decoding. Extensive experiments conducted on two publicly
+available datasets illustrate that our proposed LOL framework can substantially
+alleviate hallucination while surpassing existing baselines in most cases.
+Compared with the best baseline, we improve by an average of 4.5 points across
+all metrics of TruthfulQA. The source code is coming soon.
+
+
+ comment: 9 pages, 4 figures, 5 tables +
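+ A toy sketch in the spirit of the multi-layer contrast described above (not the authors' code): the expert/amateur pair, the lower-layer index, and the fusion weights are arbitrary stand-ins, with distilgpt2 playing the "amateur" role instead of a hallucination-induced copy of the model:
+
+ import torch
+ from transformers import AutoModelForCausalLM, AutoTokenizer
+
+ tok = AutoTokenizer.from_pretrained("gpt2")
+ expert = AutoModelForCausalLM.from_pretrained("gpt2")         # "original" model
+ amateur = AutoModelForCausalLM.from_pretrained("distilgpt2")  # stand-in amateur
+
+ def fused_contrastive_logits(prompt: str, alpha=1.0, beta=0.5, lower_layer=4):
+     """Contrast expert vs. amateur at the final AND a lower layer, then sum
+     the two contrasts (weights and layer index are illustrative only)."""
+     ids = tok(prompt, return_tensors="pt").input_ids
+     with torch.no_grad():
+         e = expert(ids, output_hidden_states=True)
+         a = amateur(ids, output_hidden_states=True)
+     final_contrast = e.logits[:, -1] - alpha * a.logits[:, -1]
+     # project intermediate hidden states through each model's LM head
+     e_low = expert.lm_head(e.hidden_states[lower_layer])[:, -1]
+     a_low = amateur.lm_head(a.hidden_states[lower_layer])[:, -1]
+     lower_contrast = e_low - alpha * a_low
+     return final_contrast + beta * lower_contrast
+
+ next_id = fused_contrastive_logits("The capital of France is").argmax(-1)
+ print(tok.decode(next_id))
+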
+
+
+
+
+ + ☆ ConcateNet: Dialogue Separation Using Local And Global Feature + Concatenation + + +
+ Dialogue separation involves isolating a dialogue signal from a mixture, such +as a movie or a TV program. This can be a necessary step to enable dialogue +enhancement for broadcast-related applications. In this paper, ConcateNet for +dialogue separation is proposed, which is based on a novel approach for +processing local and global features aimed at better generalization for +out-of-domain signals. ConcateNet is trained using a noise reduction-focused, +publicly available dataset and evaluated using three datasets: two noise +reduction-focused datasets (in-domain), which show competitive performance for +ConcateNet, and a broadcast-focused dataset (out-of-domain), which verifies the +better generalization performance for the proposed architecture compared to +considered state-of-the-art noise-reduction methods. + +
+
+
+
+
+ + ☆ ChatZero:Zero-shot Cross-Lingual Dialogue Generation via Pseudo-Target + Language ECAI2024 + + +
+ Although large language models (LLMs) show amazing capabilities, many of the
+exciting applications discovered for them fall short in low-resource languages.
+Besides, most existing methods depend on large-scale dialogue corpora and thus
+building systems for dialogue generation in a zero-shot scenario remains a
+considerable challenge. To address this challenge, we propose a novel
+end-to-end zero-shot dialogue generation model ChatZero based on a
+cross-lingual code-switching method. First, we construct code-switching
+language and pseudo-target language with placeholders. Then for cross-lingual
+semantic transfer, we employ unsupervised contrastive learning to minimize the
+semantic gap between the source language, code-switching language, and
+pseudo-target language, which are mutually positive examples in the high
+dimensional semantic space. Experiments on the multilingual DailyDialog and
+DSTC7-AVSD datasets demonstrate that ChatZero can achieve more than 90\% of the
+original performance under the zero-shot case compared to supervised learning,
+and achieve state-of-the-art performance compared with other baselines.
+
+
+ comment: ECAI2024 +
+
+
+
+
+ + ☆ Turning Trash into Treasure: Accelerating Inference of Large Language + Models with Token Recycling + + +
+ The rapid growth in the parameters of large language models (LLMs) has made +inference latency a fundamental bottleneck, limiting broader application of +LLMs. Speculative decoding represents a lossless approach to accelerate +inference through a guess-and-verify paradigm, leveraging the parallel +capabilities of modern hardware. Some speculative decoding methods rely on +additional structures to guess draft tokens, such as small models or +parameter-efficient architectures, which need extra training before use. +Alternatively, retrieval-based train-free techniques build libraries from +pre-existing corpora or by n-gram generation. However, they face challenges +like large storage requirements, time-consuming retrieval, and limited +adaptability. Observing that candidate tokens generated during the decoding +process are likely to reoccur in future sequences, we propose Token Recycling. +This approach stores candidate tokens in an adjacency matrix and employs a +breadth-first search (BFS)-like algorithm on the matrix to construct a draft +tree. The tree is then validated through tree attention. New candidate tokens +from the decoding process are then used to update the matrix. Token Recycling +requires \textless2MB of additional storage and achieves approximately 2x +speedup across all sizes of LLMs. It significantly outperforms existing +train-free methods by 30\% and even a training method by 25\%. It can be +directly applied to any existing LLMs and tasks without the need for +adaptation. + +
+
+ comment: under review +
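+ A toy illustration of the adjacency-structure-plus-BFS idea sketched above (not the released implementation; the update rule, token ids, and sizes are simplified assumptions):
+
+ from collections import deque
+
+ class TokenRecycler:
+     """adj[token] stores the most recent top-k candidate continuations seen
+     for that token; a BFS over adj yields a draft tree rooted at the last
+     accepted token, which the target LLM then verifies in parallel."""
+
+     def __init__(self, k: int = 4):
+         self.k = k
+         self.adj: dict[int, list[int]] = {}
+
+     def update(self, token: int, candidates: list[int]) -> None:
+         # keep only the freshest top-k candidates for this token
+         self.adj[token] = candidates[: self.k]
+
+     def draft_tree(self, root: int, depth: int = 3) -> list[tuple[int, int]]:
+         # BFS over the adjacency structure; returns (parent, child) edges
+         edges, frontier = [], deque([(root, 0)])
+         while frontier:
+             node, d = frontier.popleft()
+             if d == depth:
+                 continue
+             for child in self.adj.get(node, []):
+                 edges.append((node, child))
+                 frontier.append((child, d + 1))
+         return edges
+
+ rec = TokenRecycler(k=2)
+ rec.update(11, [42, 7])    # while decoding, token 11 once proposed 42 and 7
+ rec.update(42, [99, 11])
+ print(rec.draft_tree(root=11, depth=2))
+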
+
+
+
+
+ + ☆ Quantifying the Effectiveness of Student Organization Activities using + Natural Language Processing + + +
+ Student extracurricular activities play an important role in enriching the
+students' educational experiences. With the increasing popularity of Machine
+Learning and Natural Language Processing, incorporating ML-NLP to improve
+extracurricular activities becomes a logical focus of study in Artificial
+Intelligence (AI). This research study aims to develop a machine learning
+workflow that will quantify the effectiveness of student-organized activities
+based on student emotional responses using sentiment analysis. The study uses
+the Bidirectional Encoder Representations from Transformers (BERT) Large
+Language Model (LLM), called via the pysentimiento toolkit as a Transformer
+pipeline in Hugging Face. A sample data set from Organization C, a Recognized
+Student Organization (RSO) of a higher educational institute in the
+Philippines, College X, was used to develop the workflow. The workflow
+consisted of data preprocessing, key feature selection, LLM feature processing,
+and score aggregation, resulting in an Event Score for each data set. The
+results show that the BERT LLM can also be used effectively in analyzing
+sentiment beyond product reviews and post comments. For the student affairs
+offices of educational institutions, this study can provide a practical example
+of how NLP can be applied to real-world scenarios, showcasing the potential
+impact of data-driven decision making.
+
+
+ comment: 11 pages, 4 figures, presented at the International Conference on
+ Generative AI and its Applications (ICGAIA-24), 22nd-23rd July 2024, Jakarta,
+ Indonesia
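+ A small sketch of the scoring step using the pysentimiento pipeline mentioned above; the aggregation into an Event Score (the mean of P(POS) - P(NEG)) is an illustrative assumption, not the paper's exact formula:
+
+ from pysentimiento import create_analyzer
+
+ analyzer = create_analyzer(task="sentiment", lang="en")
+
+ def event_score(comments: list[str]) -> float:
+     """Aggregate feedback into a single score in [-1, 1]: the mean of
+     P(positive) - P(negative) over all comments (illustrative rule only)."""
+     scores = []
+     for text in comments:
+         probas = analyzer.predict(text).probas   # {'POS': ..., 'NEU': ..., 'NEG': ...}
+         scores.append(probas["POS"] - probas["NEG"])
+     return sum(scores) / len(scores)
+
+ print(event_score(["The workshop was fun and useful!",
+                    "Registration was confusing and slow."]))
+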
+
+
+
+
+ + ☆ Med-PMC: Medical Personalized Multi-modal Consultation with a Proactive + Ask-First-Observe-Next Paradigm + + +
+ The application of Multi-modal Large Language Models (MLLMs) in medical
+clinical scenarios remains underexplored. Previous benchmarks only focus on the
+capacity of the MLLMs in medical visual question-answering (VQA) or report
+generation and fail to assess the performance of the MLLMs on complex clinical
+multi-modal tasks. In this paper, we propose a novel Medical Personalized
+Multi-modal Consultation (Med-PMC) paradigm to evaluate the clinical capacity
+of the MLLMs. Med-PMC builds a simulated clinical environment where the MLLMs
+are required to interact with a patient simulator to complete the multi-modal
+information-gathering and decision-making task. Specifically, the patient
+simulator is decorated with personalized actors to simulate diverse patients in
+real scenarios. We conduct extensive experiments to assess 12 types of MLLMs,
+providing a comprehensive view of the MLLMs' clinical performance. We found
+that current MLLMs fail to gather multimodal information and show potential
+bias in the decision-making task when consulted with the personalized patient
+simulators. Further analysis demonstrates the effectiveness of Med-PMC, showing
+the potential to guide the development of robust and reliable clinical MLLMs.
+Code and data are available at https://github.com/LiuHC0428/Med-PMC.
+
+
+ comment: 26 pages, 5 figures +
+
+
+
+
+ + ☆ The Fellowship of the LLMs: Multi-Agent Workflows for Synthetic + Preference Optimization Dataset Generation + + +
+ This paper presents and evaluates multi-agent workflows for synthetic
+Preference Optimization (PO) dataset generation. PO dataset generation requires
+two modules: (1) response evaluation, and (2) response generation. In the
+response evaluation module, the responses from Large Language Models (LLMs) are
+evaluated and ranked - a task typically carried out by human annotators that we
+automate using LLMs. We assess the response evaluation module in a 2 step
+process. In step 1, we assess LLMs as evaluators using three distinct prompting
+strategies. In step 2, we apply the winning prompting strategy to compare the
+performance of LLM-as-a-Judge, LLMs-as-a-Jury, and LLM Debate. In each step, we
+measure inter-rater agreement between human annotators and LLMs using Cohen's
+Kappa. For the response generation module, we compare different configurations
+for the LLM Feedback Loop using the identified LLM evaluator configuration. We
+use the win rate (the fraction of times a generation framework is selected as
+the best by an LLM evaluator) to determine the best multi-agent configuration
+for generation. After identifying the best configurations for both modules, we
+use models from the GPT, Gemma, and Llama families to generate our PO datasets
+using the above pipeline. We generate two types of PO datasets, one to improve
+the generation capabilities of individual LLMs and the other to improve the
+multi-agent workflow. Our evaluation shows that GPT-4o-as-a-Judge is more
+consistent across datasets when the candidate responses do not include
+responses from the GPT family. Additionally, we find that the LLM Feedback
+Loop, with Llama as the generator and Gemma as the reviewer, achieves a notable
+71.8% and 73.8% win rate over single-agent Llama and Gemma, respectively.
+
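+ The two evaluation statistics used above are straightforward to compute; a sketch with hypothetical labels and choices (not the paper's data):
+
+ from sklearn.metrics import cohen_kappa_score
+
+ # Pairwise preference labels over the same comparisons:
+ # 0 = "response A is better", 1 = "response B is better", 2 = "tie"
+ human_labels = [0, 1, 1, 0, 2, 1, 0, 0]
+ llm_labels   = [0, 1, 0, 0, 2, 1, 1, 0]
+ kappa = cohen_kappa_score(human_labels, llm_labels)
+ print(f"Cohen's kappa between human and LLM judge: {kappa:.2f}")
+
+ # Win rate of a generation framework under an LLM evaluator
+ evaluator_choices = ["multi_agent", "single", "multi_agent", "multi_agent"]
+ win_rate = evaluator_choices.count("multi_agent") / len(evaluator_choices)
+ print(f"Win rate of the multi-agent framework: {win_rate:.1%}")
+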
+
+
+
+
+ + ☆ LLM-PCGC: Large Language Model-based Point Cloud Geometry Compression + + +
+ The key to effective point cloud compression is to obtain a robust context +model consistent with complex 3D data structures. Recently, the advancement of +large language models (LLMs) has highlighted their capabilities not only as +powerful generators for in-context learning and generation but also as +effective compressors. These dual attributes of LLMs make them particularly +well-suited to meet the demands of data compression. Therefore, this paper +explores the potential of using LLM for compression tasks, focusing on lossless +point cloud geometry compression (PCGC) experiments. However, applying LLM +directly to PCGC tasks presents some significant challenges, i.e., LLM does not +understand the structure of the point cloud well, and it is a difficult task to +fill the gap between text and point cloud through text description, especially +for large complicated and small shapeless point clouds. To address these +problems, we introduce a novel architecture, namely the Large Language +Model-based Point Cloud Geometry Compression (LLM-PCGC) method, using LLM to +compress point cloud geometry information without any text description or +aligning operation. By utilizing different adaptation techniques for +cross-modality representation alignment and semantic consistency, including +clustering, K-tree, token mapping invariance, and Low Rank Adaptation (LoRA), +the proposed method can translate LLM to a compressor/generator for point +cloud. To the best of our knowledge, this is the first structure to employ LLM +as a compressor for point cloud data. Experiments demonstrate that the LLM-PCGC +outperforms the other existing methods significantly, by achieving -40.213% bit +rate reduction compared to the reference software of MPEG Geometry-based Point +Cloud Compression (G-PCC) standard, and by achieving -2.267% bit rate reduction +compared to the state-of-the-art learning-based method. + +
+
+
+
+
+ + ☆ MIA-Tuner: Adapting Large Language Models as Pre-training Text Detector + + +
+ The increasing parameters and expansive dataset of large language models +(LLMs) highlight the urgent demand for a technical solution to audit the +underlying privacy risks and copyright issues associated with LLMs. Existing +studies have partially addressed this need through an exploration of the +pre-training data detection problem, which is an instance of a membership +inference attack (MIA). This problem involves determining whether a given piece +of text has been used during the pre-training phase of the target LLM. Although +existing methods have designed various sophisticated MIA score functions to +achieve considerable detection performance in pre-trained LLMs, how to achieve +high-confidence detection and how to perform MIA on aligned LLMs remain +challenging. In this paper, we propose MIA-Tuner, a novel instruction-based MIA +method, which instructs LLMs themselves to serve as a more precise pre-training +data detector internally, rather than design an external MIA score function. +Furthermore, we design two instruction-based safeguards to respectively +mitigate the privacy risks brought by the existing methods and MIA-Tuner. To +comprehensively evaluate the most recent state-of-the-art LLMs, we collect a +more up-to-date MIA benchmark dataset, named WIKIMIA-24, to replace the widely +adopted benchmark WIKIMIA. We conduct extensive experiments across various +aligned and unaligned LLMs over the two benchmark datasets. The results +demonstrate that MIA-Tuner increases the AUC of MIAs from 0.7 to a +significantly high level of 0.9. + +
+
+ comment: code and dataset: https://github.com/wjfu99/MIA-Tuner +
+
+
+
+
+ + ☆ LLMs Are Biased Towards Output Formats! Systematically Evaluating and + Mitigating Output Format Bias of LLMs + + +
+ We present the first systematic evaluation examining format bias in +performance of large language models (LLMs). Our approach distinguishes between +two categories of an evaluation metric under format constraints to reliably and +accurately assess performance: one measures performance when format constraints +are adhered to, while the other evaluates performance regardless of constraint +adherence. We then define a metric for measuring the format bias of LLMs and +establish effective strategies to reduce it. Subsequently, we present our +empirical format bias evaluation spanning four commonly used categories -- +multiple-choice question-answer, wrapping, list, and mapping -- covering 15 +widely-used formats. Our evaluation on eight generation tasks uncovers +significant format bias across state-of-the-art LLMs. We further discover that +improving the format-instruction following capabilities of LLMs across formats +potentially reduces format bias. Based on our evaluation findings, we study +prompting and fine-tuning with synthesized format data techniques to mitigate +format bias. Our methods successfully reduce the variance in ChatGPT's +performance among wrapping formats from 235.33 to 0.71 (%$^2$). + +
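+ The variance figure quoted above (in %$^2$) is simply the spread of task scores across formats; a sketch with hypothetical accuracies:
+
+ import statistics
+
+ # Hypothetical accuracies (%) of one model on one task, varying only the
+ # requested answer-wrapping format
+ accuracy_by_format = {
+     "quotes": 71.2, "brackets": 55.4, "bold": 68.0, "plain": 62.9, "json": 49.5,
+ }
+ scores = list(accuracy_by_format.values())
+ print(f"Mean accuracy: {statistics.mean(scores):.1f} %")
+ print(f"Format variance: {statistics.pvariance(scores):.2f} %^2")
+ # A mitigation method is judged by how far it drives this variance toward
+ # zero while keeping the mean accuracy high.
+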
+
+
+
+
+ + ☆ Reasoning Beyond Bias: A Study on Counterfactual Prompting and Chain of + Thought Reasoning + + +
+ Language models are known to absorb biases from their training data, leading +to predictions driven by statistical regularities rather than semantic +relevance. We investigate the impact of these biases on answer choice +preferences in the Massive Multi-Task Language Understanding (MMLU) task. Our +findings reveal that differences in learned regularities across answer options +are predictive of model preferences and mirror human test-taking strategies. To +address this issue, we introduce two novel methods: Counterfactual Prompting +with Chain of Thought (CoT) and Counterfactual Prompting with Agnostically +Primed CoT (APriCoT). We demonstrate that while Counterfactual Prompting with +CoT alone is insufficient to mitigate bias, our novel Primed Counterfactual +Prompting with CoT approach effectively reduces the influence of base-rate +probabilities while improving overall accuracy. Our results suggest that +mitigating bias requires a "System-2" like process and that CoT reasoning is +susceptible to confirmation bias under some prompting methodologies. Our +contributions offer practical solutions for developing more robust and fair +language models. + +
+
+
+
+
+ + ☆ An End-to-End Model for Photo-Sharing Multi-modal Dialogue Generation + + +
+ Photo-Sharing Multi-modal dialogue generation requires a dialogue agent not
+only to generate text responses but also to share photos at the proper moment.
+Using image text caption as the bridge, a pipeline model integrates an image
+caption model, a text generation model, and an image generation model to handle
+this complex multi-modal task. However, representing the images with text
+captions may lose important visual details and information and cause error
+propagation in the complex dialogue system. Besides, the pipeline model
+isolates the three models separately because discrete image text captions
+hinder end-to-end gradient propagation. We propose the first end-to-end model
+for photo-sharing multi-modal dialogue generation, which integrates an image
+perceptron and an image generator with a large language model. The large
+language model employs the Q-Former to perceive visual images in the input end.
+For image generation in the output end, we propose a dynamic vocabulary
+transformation matrix and use straight-through and gumbel-softmax techniques to
+align the large language model and stable diffusion model and achieve
+end-to-end gradient propagation. We perform experiments on PhotoChat and
+DialogCC datasets to evaluate our end-to-end model. Compared with pipeline
+models, the end-to-end model gains state-of-the-art performances on various
+metrics of text and image generation. More analysis experiments also verify the
+effectiveness of the end-to-end model for photo-sharing multi-modal dialogue
+generation.
+
+
+ comment: Work in progress +
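+ The straight-through Gumbel-softmax trick mentioned above can be shown in isolation; the shapes and module names below are toy assumptions, not the paper's architecture:
+
+ import torch
+ import torch.nn.functional as F
+
+ # Toy shapes: LLM hidden size 512, a 1024-token generator-side vocabulary,
+ # 256-dimensional conditioning embeddings.
+ llm_hidden = torch.randn(1, 8, 512, requires_grad=True)
+ to_vocab = torch.nn.Linear(512, 1024)            # "dynamic vocabulary" projection
+ generator_embed = torch.nn.Embedding(1024, 256)  # image-generator embeddings
+
+ logits = to_vocab(llm_hidden)                    # (1, 8, 1024)
+ # Straight-through Gumbel-softmax: discrete one-hot in the forward pass,
+ # soft gradients in the backward pass, so both models train end to end.
+ one_hot = F.gumbel_softmax(logits, tau=1.0, hard=True)
+ conditioning = one_hot @ generator_embed.weight  # (1, 8, 256)
+ conditioning.sum().backward()
+ print(llm_hidden.grad.shape)                     # gradients reach the LLM side
+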
+
+
+
+
+ + ☆ Understanding Enthymemes in Argument Maps: Bridging Argument Mining and + Logic-based Argumentation + + +
+ Argument mining is natural language processing technology aimed at
+identifying arguments in text. Furthermore, the approach is being developed to
+identify the premises and claims of those arguments, and to identify the
+relationships between arguments including support and attack relationships. In
+this paper, we assume that an argument map contains the premises and claims of
+arguments, and support and attack relationships between them, that have been
+identified by argument mining. So from a piece of text, we assume an argument
+map is obtained automatically by natural language processing. However, to
+understand and to automatically analyse that argument map, it would be
+desirable to instantiate that argument map with logical arguments. Once we have
+the logical representation of the arguments in an argument map, we can use
+automated reasoning to analyze the argumentation (e.g. check consistency of
+premises, check validity of claims, and check the labelling on each arc
+corresponds with the logical arguments). We address this need by using
+classical logic for representing the explicit information in the text, and
+using default logic for representing the implicit information in the text. In
+order to investigate our proposal, we consider some specific options for
+instantiation.
+
+
+ comment: Research note +
+
+
+
+
+ + ☆ Math-PUMA: Progressive Upward Multimodal Alignment to Enhance + Mathematical Reasoning + + +
+ Multimodal Large Language Models (MLLMs) excel in solving text-based
+mathematical problems, but they struggle with mathematical diagrams since they
+are primarily trained on natural scene images. For humans, visual aids
+generally enhance problem-solving, but MLLMs perform worse as information
+shifts from textual to visual modality. This decline is mainly due to their
+shortcomings in aligning images and text. To tackle the aforementioned
+challenges, we propose Math-PUMA, a methodology focused on Progressive Upward
+Multimodal Alignment. This approach is designed to improve the mathematical
+reasoning skills of MLLMs through a three-stage training process, with the
+second stage being the critical alignment stage. We first enhance the language
+model's mathematical reasoning capabilities with an extensive set of textual
+mathematical problems. We then construct a multimodal dataset with varying
+degrees of textual and visual information, creating data pairs by presenting
+each problem in at least two forms. By leveraging the Kullback-Leibler (KL)
+divergence of next-token prediction distributions to align visual and textual
+modalities, consistent problem-solving abilities are ensured. Finally, we
+utilize multimodal instruction tuning for MLLMs with high-quality multimodal
+data. Experimental results on multiple mathematical reasoning benchmarks
+demonstrate that the MLLMs trained with Math-PUMA surpass most open-source
+MLLMs. Our approach effectively narrows the performance gap for problems
+presented in different modalities.
+
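+ The alignment step hinges on a KL divergence between next-token distributions for the same problem in its textual and visual forms; a minimal sketch (the KL direction, temperature, and shapes are illustrative assumptions):
+
+ import torch
+ import torch.nn.functional as F
+
+ def alignment_loss(text_logits: torch.Tensor, visual_logits: torch.Tensor) -> torch.Tensor:
+     """KL divergence between next-token distributions for the same problem
+     given in textual form (reference) vs. visual form (to be aligned).
+     Shapes: (batch, vocab)."""
+     reference = F.softmax(text_logits, dim=-1)
+     to_align = F.log_softmax(visual_logits, dim=-1)
+     return F.kl_div(to_align, reference, reduction="batchmean")
+
+ text_logits = torch.randn(2, 32000)    # toy vocabulary size
+ visual_logits = torch.randn(2, 32000)
+ print(alignment_loss(text_logits, visual_logits).item())
+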
+
+
+
+
+ + ☆ A Survey on Benchmarks of Multimodal Large Language Models + + +
+ Multimodal Large Language Models (MLLMs) are gaining increasing popularity in
+both academia and industry due to their remarkable performance in various
+applications such as visual question answering, visual perception,
+understanding, and reasoning. Over the past few years, significant efforts have
+been made to examine MLLMs from multiple perspectives. This paper presents a
+comprehensive review of \textbf{180 benchmarks} and evaluations of MLLMs,
+focusing on (1) perception and understanding, (2) cognition and reasoning,
+(3) specific domains, (4) key capabilities, and (5) other modalities. Finally,
+we discuss the limitations of the current evaluation methods for MLLMs and
+explore promising future directions. Our key argument is that evaluation should
+be regarded as a crucial discipline to better support the development of MLLMs.
+For more details, please visit our GitHub repository:
+https://github.com/swordlidev/Evaluation-Multimodal-LLMs-Survey.
+
+
+
+
+
+ + ☆ Persona is a Double-edged Sword: Enhancing the Zero-shot Reasoning by + Ensembling the Role-playing and Neutral Prompts + + +
+ Recent studies demonstrate that prompting an appropriate role-playing persona +to an LLM improves its reasoning capability. However, assigning a proper +persona is difficult since an LLM's performance is extremely sensitive to +assigned prompts; therefore, personas sometimes hinder LLMs and degrade their +reasoning capabilities. In this paper, we propose a novel framework, Jekyll \& +Hyde, which ensembles the results of role-playing and neutral prompts to +eradicate performance degradation via unilateral use of role-playing prompted +LLM and enhance the robustness of an LLM's reasoning ability. Specifically, +Jekyll \& Hyde collects two potential solutions from both role-playing and +neutral prompts and selects a better solution after cross-checking via an LLM +evaluator. However, LLM-based evaluators tend to be affected by the order of +those potential solutions within the prompt when selecting the proper solution; +thus, we also propose a robust LLM evaluator to mitigate the position bias. The +experimental analysis demonstrates that role-playing prompts distract LLMs and +degrade their reasoning abilities in 4 out of 12 datasets, even when using +GPT-4. In addition, we reveal that Jekyll \& Hyde improves reasoning +capabilities by selecting better choices among the potential solutions on +twelve widely-used reasoning datasets. We further show that our proposed LLM +evaluator outperforms other baselines, proving the LLMs' position bias is +successfully mitigated. + +
+
+ comment: 13 pages, 4 figures +
+
+
+
+
+ + ☆ RealMedQA: A pilot biomedical question answering dataset containing + realistic clinical questions + + +
+ Clinical question answering systems have the potential to provide clinicians +with relevant and timely answers to their questions. Nonetheless, despite the +advances that have been made, adoption of these systems in clinical settings +has been slow. One issue is a lack of question-answering datasets which reflect +the real-world needs of health professionals. In this work, we present +RealMedQA, a dataset of realistic clinical questions generated by humans and an +LLM. We describe the process for generating and verifying the QA pairs and +assess several QA models on BioASQ and RealMedQA to assess the relative +difficulty of matching answers to questions. We show that the LLM is more +cost-efficient for generating "ideal" QA pairs. Additionally, we achieve a +lower lexical similarity between questions and answers than BioASQ which +provides an additional challenge to the top two QA models, as per the results. +We release our code and our dataset publicly to encourage further research. + +
+
+ comment: Accepted at AMIA Annual Symposium 2024 +
+
+
+
+
+ + ☆ A Mechanistic Interpretation of Syllogistic Reasoning in Auto-Regressive + Language Models + + +
+ Recent studies on logical reasoning in auto-regressive Language Models (LMs) +have sparked a debate on whether such models can learn systematic reasoning +principles during pre-training or merely exploit superficial patterns in the +training data. This paper presents a mechanistic interpretation of syllogistic +reasoning in LMs to further enhance our understanding of internal dynamics. +Specifically, we present a methodology for circuit discovery aimed at +disentangling content-independent reasoning mechanisms from world knowledge +acquired during pre-training. Through two distinct intervention methods, we +uncover a sufficient and necessary circuit involving middle-term suppression +that elucidates how LMs transfer information to derive valid conclusions from +premises. Furthermore, we investigate how belief biases manifest in syllogistic +reasoning, finding evidence of partial contamination from additional attention +heads responsible for encoding commonsense and contextualized knowledge. +Finally, we explore the generalization of the discovered mechanisms across +various syllogistic schemes and model sizes, finding that the identified +circuit is sufficient and necessary for all the schemes on which the model +achieves high downstream accuracy ($\geq$ 60\%). Overall, our findings suggest +that LMs indeed learn transferable content-independent reasoning mechanisms, +but that, at the same time, such mechanisms do not involve generalisable and +abstract logical primitives, being susceptible to contamination by the same +world knowledge acquired during pre-training. + +
+
+
+
+
+ + ☆ Overview of the BioLaySumm 2024 Shared Task on the Lay Summarization of + Biomedical Research Articles + + +
+ This paper presents the setup and results of the second edition of the +BioLaySumm shared task on the Lay Summarisation of Biomedical Research +Articles, hosted at the BioNLP Workshop at ACL 2024. In this task edition, we +aim to build on the first edition's success by further increasing research +interest in this important task and encouraging participants to explore novel +approaches that will help advance the state-of-the-art. Encouragingly, we found +research interest in the task to be high, with this edition of the task +attracting a total of 53 participating teams, a significant increase in +engagement from the previous edition. Overall, our results show that a broad +range of innovative approaches were adopted by task participants, with a +predictable shift towards the use of Large Language Models (LLMs). + +
+
+ comment: Published in: Proceedings of the 23rd Workshop on Biomedical Natural + Language Processing +
+
+
+
+
+ + ☆ Collaborative Cross-modal Fusion with Large Language Model for + Recommendation CIKM 2024 + + +
+ Despite the success of conventional collaborative filtering (CF) approaches +for recommendation systems, they exhibit limitations in leveraging semantic +knowledge within the textual attributes of users and items. Recent focus on the +application of large language models for recommendation (LLM4Rec) has +highlighted their capability for effective semantic knowledge capture. However, +these methods often overlook the collaborative signals in user behaviors. Some +simply instruct-tune a language model, while others directly inject the +embeddings of a CF-based model, lacking a synergistic fusion of different +modalities. To address these issues, we propose a framework of Collaborative +Cross-modal Fusion with Large Language Models, termed CCF-LLM, for +recommendation. In this framework, we translate the user-item interactions into +a hybrid prompt to encode both semantic knowledge and collaborative signals, +and then employ an attentive cross-modal fusion strategy to effectively fuse +latent embeddings of both modalities. Extensive experiments demonstrate that +CCF-LLM outperforms existing methods by effectively utilizing semantic and +collaborative signals in the LLM4Rec context. + +
+
+ comment: 10 pages, 4 figures, accepted by CIKM 2024 +
+
+
+
+
+ + ☆ Integrating Multi-view Analysis: Multi-view Mixture-of-Expert for + Textual Personality Detection NLPCC 2024 + + +
+ Textual personality detection aims to identify personality traits by +analyzing user-generated content. To achieve this effectively, it is essential +to thoroughly examine user-generated content from various perspectives. +However, previous studies have struggled with automatically extracting and +effectively integrating information from multiple perspectives, thereby +limiting their performance on personality detection. To address these +challenges, we propose the Multi-view Mixture-of-Experts Model for Textual +Personality Detection (MvP). MvP introduces a Multi-view Mixture-of-Experts +(MoE) network to automatically analyze user posts from various perspectives. +Additionally, it employs User Consistency Regularization to mitigate conflicts +among different perspectives and learn a multi-view generic user +representation. The model's training is optimized via a multi-task joint +learning strategy that balances supervised personality detection with +self-supervised user consistency constraints. Experimental results on two +widely-used personality detection datasets demonstrate the effectiveness of the +MvP model and the benefits of automatically analyzing user posts from diverse +perspectives for textual personality detection. + +
+
+ comment: Accepted by NLPCC 2024 +
+
+
+
+
+ + ☆ SelectLLM: Query-Aware Efficient Selection Algorithm for Large Language + Models + + +
+ Large language models (LLMs) have gained increased popularity due to their +remarkable success across various tasks, which has led to the active +development of a large set of diverse LLMs. However, individual LLMs have +limitations when applied to complex tasks because of such factors as training +biases, model sizes, and the datasets used. A promising approach is to +efficiently harness the diverse capabilities of LLMs to overcome these +individual limitations. Towards this goal, we introduce a novel LLM selection +algorithm called SelectLLM. This algorithm directs input queries to the most +suitable subset of LLMs from a large pool, ensuring they collectively provide +the correct response efficiently. SelectLLM uses a multi-label classifier, +utilizing the classifier's predictions and confidence scores to design optimal +policies for selecting an optimal, query-aware, and lightweight subset of LLMs. +Our findings show that the proposed model outperforms individual LLMs and +achieves competitive performance compared to similarly sized, computationally +expensive top-performing LLM subsets. Specifically, with a similarly sized +top-performing LLM subset, we achieve a significant reduction in latency on two +standard reasoning benchmarks: 13% lower latency for GSM8K and 70% lower +latency for MMLU. Additionally, we conduct comprehensive analyses and ablation +studies, which validate the robustness of the proposed model. + +
+
+
+
+
+ + ☆ Where is the signal in tokenization space? + + +
+ Large Language Models (LLMs) are typically shipped with tokenizers that +deterministically encode text into so-called canonical token sequences, to +which the LLMs assign probability values. One common assumption is that the +probability of a piece of text is the probability of its canonical token +sequence. However, the tokenization of a string is not unique: e.g., the Llama2 +tokenizer encodes Tokens as [Tok,ens], but [Tok,en,s] also represents the same +text. In this paper, we study non-canonical tokenizations. We prove that, given +a string, it is computationally hard to find the most likely tokenization for +an autoregressive LLM, as well as to compute the marginal probability over all +possible tokenizations. We then show how the marginal is, in most cases, +indistinguishable from the canonical probability. Surprisingly, we then +empirically demonstrate the existence of a significant amount of signal hidden +within tokenization space. Notably, by simply aggregating the probabilities of +non-canonical tokenizations, we achieve improvements across a range of LLM +evaluation benchmarks for a variety of architectures, including transformers +and state space models. + +
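+ A minimal sketch of scoring alternative tokenizations and aggregating them (GPT-2 as a stand-in model; enumerating non-canonical segmentations efficiently is the hard part the paper analyzes, so only the canonical one is listed here):
+
+ import torch
+ from transformers import AutoModelForCausalLM, AutoTokenizer
+
+ tok = AutoTokenizer.from_pretrained("gpt2")
+ lm = AutoModelForCausalLM.from_pretrained("gpt2")
+
+ def sequence_logprob(ids: list[int]) -> float:
+     """Log-probability of a token-id sequence under GPT-2, conditioned on BOS."""
+     t = torch.tensor([[tok.bos_token_id] + ids])
+     with torch.no_grad():
+         logprobs = lm(t).logits.log_softmax(-1)
+     # each token after BOS is scored given its prefix
+     return logprobs[0, :-1].gather(1, t[0, 1:, None]).sum().item()
+
+ text = "Tokens are not unique"
+ canonical = tok(text).input_ids
+ # Non-canonical segmentations decode to the same string through different ids;
+ # discovering and scoring them all is exactly what the paper shows to be hard.
+ tokenizations = [canonical]            # append alternative id lists here
+ scores = torch.tensor([sequence_logprob(s) for s in tokenizations])
+ print(torch.logsumexp(scores, dim=0).item())   # (partial) marginal log-probability
+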
+
+
+
+
+ + ☆ CommunityKG-RAG: Leveraging Community Structures in Knowledge Graphs for + Advanced Retrieval-Augmented Generation in Fact-Checking + + +
+ Despite advancements in Large Language Models (LLMs) and Retrieval-Augmented +Generation (RAG) systems, their effectiveness is often hindered by a lack of +integration with entity relationships and community structures, limiting their +ability to provide contextually rich and accurate information retrieval for +fact-checking. We introduce CommunityKG-RAG (Community Knowledge +Graph-Retrieval Augmented Generation), a novel zero-shot framework that +integrates community structures within Knowledge Graphs (KGs) with RAG systems +to enhance the fact-checking process. Capable of adapting to new domains and +queries without additional training, CommunityKG-RAG utilizes the multi-hop +nature of community structures within KGs to significantly improve the accuracy +and relevance of information retrieval. Our experimental results demonstrate +that CommunityKG-RAG outperforms traditional methods, representing a +significant advancement in fact-checking by offering a robust, scalable, and +efficient solution. + +
+
+
+
+
+ + ☆ MuRAR: A Simple and Effective Multimodal Retrieval and Answer Refinement + Framework for Multimodal Question Answering + + +
+ Recent advancements in retrieval-augmented generation (RAG) have demonstrated +impressive performance in the question-answering (QA) task. However, most +previous works predominantly focus on text-based answers. While some studies +address multimodal data, they still fall short in generating comprehensive +multimodal answers, particularly for explaining concepts or providing +step-by-step tutorials on how to accomplish specific goals. This capability is +especially valuable for applications such as enterprise chatbots and settings +such as customer service and educational systems, where the answers are sourced +from multimodal data. In this paper, we introduce a simple and effective +framework named MuRAR (Multimodal Retrieval and Answer Refinement). MuRAR +enhances text-based answers by retrieving relevant multimodal data and refining +the responses to create coherent multimodal answers. This framework can be +easily extended to support multimodal answers in enterprise chatbots with +minimal modifications. Human evaluation results indicate that multimodal +answers generated by MuRAR are more useful and readable compared to plain text +answers. + +
+
+ comment: Preprint +
+
+
+
+
+ + ☆ Ex3: Automatic Novel Writing by Extracting, Excelsior and Expanding + + +
+ Generating long-term texts such as novels using artificial intelligence has +always been a challenge. A common approach is to use large language models +(LLMs) to construct a hierarchical framework that first plans and then writes. +Despite the fact that the generated novels reach a sufficient length, they +exhibit poor logical coherence and appeal in their plots and deficiencies in +character and event depiction, ultimately compromising the overall narrative +quality. In this paper, we propose a method named Extracting Excelsior and +Expanding. Ex3 initially extracts structure information from raw novel data. By +combining this structure information with the novel data, an +instruction-following dataset is meticulously crafted. This dataset is then +utilized to fine-tune the LLM, aiming for excelsior generation performance. In +the final stage, a tree-like expansion method is deployed to facilitate the +generation of arbitrarily long novels. Evaluation against previous methods +showcases Ex3's ability to produce higher-quality long-form novels. + +
+
+
+
+
+ + ♻ ☆ Self-Taught Optimizer (STOP): Recursively Self-Improving Code Generation + + +
+ Several recent advances in AI systems solve problems by providing a +"scaffolding" program that structures multiple calls to language models (LMs) +to generate better outputs. A scaffolding program is written in a programming +language such as Python. In this work, we use a language-model-infused +scaffolding program to improve itself. We start with a seed "improver" that +improves an input program according to a given utility function by querying an +LM several times and returning the best solution. We then run this seed +improver to improve itself. Across a small set of downstream tasks, the +resulting improved improver generates programs with significantly better +performance than its seed improver. A variety of self-improvement strategies +are proposed by the language model, including beam search, genetic algorithms, +and simulated annealing. Since the language models themselves are not altered, +this is not full recursive self-improvement. Nonetheless, it demonstrates that +a modern language model, GPT-4 in our experiments, is capable of writing code +that can call itself to improve itself. We consider concerns around the +development of self-improving technologies and evaluate the frequency with +which the generated code bypasses a sandbox. + +
+
+ comment: Published as a conference paper at COLM 2024 +
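+ The seed improver described above fits in a few lines; this sketch assumes a hypothetical call_lm helper wrapping whatever LM API is used and a caller-supplied utility function, and is not the paper's released code:
+
+ def call_lm(prompt: str) -> str:
+     """Hypothetical wrapper around an LM API; returns a candidate program."""
+     raise NotImplementedError
+
+ def seed_improver(program: str, utility, n_candidates: int = 4) -> str:
+     """Query the LM several times for rewrites of `program` and return the
+     candidate with the highest utility (keeping the original as a fallback)."""
+     prompt = f"Improve the following Python program:\n\n{program}"
+     candidates = [program] + [call_lm(prompt) for _ in range(n_candidates)]
+     return max(candidates, key=utility)
+
+ # Recursive step: the improver is itself a program, so its own source can be
+ # fed back in with a meta-utility that measures downstream task performance.
+ # import inspect
+ # improver_v2 = seed_improver(inspect.getsource(seed_improver), meta_utility)
+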
+
+
+
+
+ + ♻ ☆ Improving Sampling Methods for Fine-tuning SentenceBERT in Text Streams ICPR + + +
+ The proliferation of textual data on the Internet presents a unique +opportunity for institutions and companies to monitor public opinion about +their services and products. Given the rapid generation of such data, the text +stream mining setting, which handles sequentially arriving, potentially +infinite text streams, is often more suitable than traditional batch learning. +While pre-trained language models are commonly employed for their high-quality +text vectorization capabilities in streaming contexts, they face challenges +adapting to concept drift - the phenomenon where the data distribution changes +over time, adversely affecting model performance. Addressing the issue of +concept drift, this study explores the efficacy of seven text sampling methods +designed to selectively fine-tune language models, thereby mitigating +performance degradation. We precisely assess the impact of these methods on +fine-tuning the SBERT model using four different loss functions. Our +evaluation, focused on Macro F1-score and elapsed time, employs two text stream +datasets and an incremental SVM classifier to benchmark performance. Our +findings indicate that Softmax loss and Batch All Triplets loss are +particularly effective for text stream classification, demonstrating that +larger sample sizes generally correlate with improved macro F1-scores. Notably, +our proposed WordPieceToken ratio sampling method significantly enhances +performance with the identified loss functions, surpassing baseline results. + +
+
+ comment: Accepted for presentation at the 27th International Conference on + Pattern Recognition (ICPR) 2024 +
+
+
+
+
+ + ♻ ☆ Apollo: A Lightweight Multilingual Medical LLM towards Democratizing + Medical AI to 6B People + + +
+ Despite the vast repository of global medical knowledge predominantly being
+in English, local languages are crucial for delivering tailored healthcare
+services, particularly in areas with limited medical resources. To extend the
+reach of medical AI advancements to a broader population, we aim to develop
+medical LLMs across the six most widely spoken languages, encompassing a global
+population of 6.1 billion. This effort culminates in the creation of the
+ApolloCorpora multilingual medical dataset and the XMedBench benchmark. In the
+multilingual medical benchmark, the released Apollo models, at various
+relatively-small sizes (i.e., 0.5B, 1.8B, 2B, 6B, and 7B), achieve the best
+performance among models of equivalent size. In particular, Apollo-7B is the
+state-of-the-art multilingual medical LLM among models of up to 70B parameters.
+Additionally, these lite models could be used to improve the multi-lingual
+medical capabilities of larger models without fine-tuning, in a proxy-tuning
+fashion. We will open-source the training corpora, code, model weights and
+evaluation benchmark.
+
+
+ comment: Preprint +
+
+
+
+
+ + ♻ ☆ AI-as-exploration: Navigating intelligence space + + +
+ Artificial Intelligence is a field that lives many lives, and the term has
+come to encompass a motley collection of scientific and commercial endeavours.
+In this paper, I articulate the contours of a rather neglected but central
+scientific role that AI has to play, which I dub `AI-as-exploration'. The basic
+thrust of AI-as-exploration is that of creating and studying systems that can
+reveal candidate building blocks of intelligence that may differ from the forms
+of human and animal intelligence we are familiar with. In other words, I
+suggest that AI is one of the best tools we have for exploring intelligence
+space, namely the space of possible intelligent systems. I illustrate the value
+of AI-as-exploration by focusing on a specific case study, i.e., recent work on
+the capacity to combine novel and invented concepts in humans and Large
+Language Models. I show that the latter, despite showing human-level accuracy
+in such a task, probably solve it in ways radically different from those
+hypothesised for humans, but no less relevant to intelligence research.
+
+
+
+
+
+ + ♻ ☆ Self-Supervised Multimodal Learning: A Survey + + +
+ Multimodal learning, which aims to understand and analyze information from +multiple modalities, has achieved substantial progress in the supervised regime +in recent years. However, the heavy dependence on data paired with expensive +human annotations impedes scaling up models. Meanwhile, given the availability +of large-scale unannotated data in the wild, self-supervised learning has +become an attractive strategy to alleviate the annotation bottleneck. Building +on these two directions, self-supervised multimodal learning (SSML) provides +ways to learn from raw multimodal data. In this survey, we provide a +comprehensive review of the state-of-the-art in SSML, in which we elucidate +three major challenges intrinsic to self-supervised learning with multimodal +data: (1) learning representations from multimodal data without labels, (2) +fusion of different modalities, and (3) learning with unaligned data. We then +detail existing solutions to these challenges. Specifically, we consider (1) +objectives for learning from multimodal unlabeled data via self-supervision, +(2) model architectures from the perspective of different multimodal fusion +strategies, and (3) pair-free learning strategies for coarse-grained and +fine-grained alignment. We also review real-world applications of SSML +algorithms in diverse fields such as healthcare, remote sensing, and machine +translation. Finally, we discuss challenges and future directions for SSML. A +collection of related resources can be found at: +https://github.com/ys-zong/awesome-self-supervised-multimodal-learning. + +
+
+ comment: Accepted to IEEE T-PAMI +
+
+
+
+
+ + ♻ ☆ Multi-Hop Table Retrieval for Open-Domain Text-to-SQL + + +
+ Open-domain text-to-SQL is an important task that retrieves question-relevant +tables from massive databases and then generates SQL. However, existing +retrieval methods that retrieve in a single hop do not pay attention to the +text-to-SQL challenge of schema linking, which is aligning the entities in the +question with table entities, reflected in two aspects: similar irrelevant +entity and domain mismatch entity. Therefore, we propose our method, the +multi-hop table retrieval with rewrite and beam search (Murre). To reduce the +effect of the similar irrelevant entity, our method focuses on unretrieved +entities at each hop and considers the low-ranked tables by beam search. To +alleviate the limitation of domain mismatch entity, Murre rewrites the question +based on retrieved tables in multiple hops, decreasing the domain gap with +relevant tables. We conduct experiments on SpiderUnion and BirdUnion+, reaching +new state-of-the-art results with an average improvement of 6.38%. + +
+
+
+
+
+ + ♻ ☆ Fine-Tuned 'Small' LLMs (Still) Significantly Outperform Zero-Shot + Generative AI Models in Text Classification + + +
+ Generative AI offers a simple, prompt-based alternative to fine-tuning +smaller BERT-style LLMs for text classification tasks. This promises to +eliminate the need for manually labeled training data and task-specific model +training. However, it remains an open question whether tools like ChatGPT can +deliver on this promise. In this paper, we show that smaller, fine-tuned LLMs +(still) consistently and significantly outperform larger, zero-shot prompted +models in text classification. We compare three major generative AI models +(ChatGPT with GPT-3.5/GPT-4 and Claude Opus) with several fine-tuned LLMs +across a diverse set of classification tasks (sentiment, approval/disapproval, +emotions, party positions) and text categories (news, tweets, speeches). We +find that fine-tuning with application-specific training data achieves superior +performance in all cases. To make this approach more accessible to a broader +audience, we provide an easy-to-use toolkit alongside this paper. Our toolkit, +accompanied by non-technical step-by-step guidance, enables users to select and +fine-tune BERT-like LLMs for any classification task with minimal technical and +computational effort. + +
+
+
+
+
+ + ♻ ☆ Mind the Privacy Unit! User-Level Differential Privacy for Language + Model Fine-Tuning + + +
+ Large language models (LLMs) have emerged as powerful tools for tackling
+complex tasks across diverse domains, but they also raise privacy concerns when
+fine-tuned on sensitive data due to potential memorization. While differential
+privacy (DP) offers a promising solution by ensuring models are 'almost
+indistinguishable' with or without any particular privacy unit, current
+evaluations on LLMs mostly treat each example (text record) as the privacy
+unit. This leads to uneven user privacy guarantees when contributions per user
+vary. We therefore study user-level DP, motivated by applications where it is
+necessary to ensure uniform privacy protection across users. We present a
+systematic evaluation of user-level DP for LLM fine-tuning on natural language
+generation tasks. Focusing on two mechanisms for achieving user-level DP
+guarantees, Group Privacy and User-wise DP-SGD, we investigate design choices
+like data selection strategies and parameter tuning for the best
+privacy-utility tradeoff.
+
+
+ comment: Published as a conference paper at COLM 2024 +
+
+
+
+
+ + ♻ ☆ Transformers and Cortical Waves: Encoders for Pulling In Context Across + Time + + +
+ The capabilities of transformer networks such as ChatGPT and other Large +Language Models (LLMs) have captured the world's attention. The crucial +computational mechanism underlying their performance relies on transforming a +complete input sequence - for example, all the words in a sentence - into a +long "encoding vector" that allows transformers to learn long-range temporal +dependencies in naturalistic sequences. Specifically, "self-attention" applied +to this encoding vector enhances temporal context in transformers by computing +associations between pairs of words in the input sequence. We suggest that +waves of neural activity traveling across single cortical areas or multiple +regions at the whole-brain scale could implement a similar encoding principle. +By encapsulating recent input history into a single spatial pattern at each +moment in time, cortical waves may enable temporal context to be extracted from +sequences of sensory inputs, the same computational principle used in +transformers. + +
+
+ comment: 27 pages, 5 figures +
+
+
+
+
+ + ♻ ☆ No Language is an Island: Unifying Chinese and English in Financial + Large Language Models, Instruction Data, and Benchmarks + + +
+ While the progression of Large Language Models (LLMs) has notably propelled +financial analysis, their application has largely been confined to singular +language realms, leaving untapped the potential of bilingual Chinese-English +capacity. To bridge this chasm, we introduce ICE-PIXIU, seamlessly amalgamating +the ICE-INTENT model and ICE-FLARE benchmark for bilingual financial analysis. +ICE-PIXIU uniquely integrates a spectrum of Chinese tasks, alongside translated +and original English datasets, enriching the breadth and depth of bilingual +financial modeling. It provides unrestricted access to diverse model variants, +a substantial compilation of diverse cross-lingual and multi-modal instruction +data, and an evaluation benchmark with expert annotations, comprising 10 NLP +tasks, 20 bilingual specific tasks, totaling 95k datasets. Our thorough +evaluation emphasizes the advantages of incorporating these bilingual datasets, +especially in translation tasks and utilizing original English data, enhancing +both linguistic flexibility and analytical acuity in financial contexts. +Notably, ICE-INTENT distinguishes itself by showcasing significant enhancements +over conventional LLMs and existing financial LLMs in bilingual milieus, +underscoring the profound impact of robust bilingual data on the accuracy and +efficacy of financial NLP. + +
+
+ comment: 19 pages, 3 figures, 12 tables, including Appendix +
+
+
+
+
+ + ♻ ☆ Covert Bias: The Severity of Social Views' Unalignment in Language + Models Towards Implicit and Explicit Opinion + + +
+ While various approaches have recently been studied for bias identification, +little is known about how implicit language that does not explicitly convey a +viewpoint affects bias amplification in large language models. To examine the +severity of bias toward a view, we evaluated the performance of two downstream +tasks where the implicit and explicit knowledge of social groups were used. +First, we present a stress test evaluation by using a biased model in edge +cases of excessive bias scenarios. Then, we evaluate how LLMs calibrate +linguistically in response to both implicit and explicit opinions when they are +aligned with conflicting viewpoints. Our findings reveal a discrepancy in LLM +performance in identifying implicit and explicit opinions, with a general +tendency of bias toward explicit opinions of opposing stances. Moreover, the +bias-aligned models generate more cautious responses using uncertainty phrases +compared to the unaligned (zero-shot) base models. The direct, incautious +responses of the unaligned models suggest a need for further refinement of +decisiveness by incorporating uncertainty markers to enhance their reliability, +especially on socially nuanced topics with high subjectivity. + +
+
+ comment: This work is under review +
+
+
+
+
+ + ♻ ☆ Large Language Models Meet Text-Centric Multimodal Sentiment Analysis: A + Survey + + +
+ Compared to traditional sentiment analysis, which only considers text,
+multimodal sentiment analysis needs to consider emotional signals from
+multimodal sources simultaneously and is therefore more consistent with the
+way humans process sentiment in real-world scenarios. It involves processing
+emotional information from various sources such as natural language, images,
+videos, audio, physiological signals, etc. However, although other modalities
+also contain diverse emotional cues, natural language usually contains richer
+contextual information and therefore occupies a crucial position in
+multimodal sentiment analysis. The emergence of ChatGPT has opened up immense
+potential for applying large language models (LLMs) to text-centric multimodal
+tasks. However, it is still unclear how existing LLMs can best adapt to
+text-centric multimodal sentiment analysis tasks. This survey aims to (1)
+present a comprehensive review of recent research in text-centric multimodal
+sentiment analysis tasks, (2) examine the potential of LLMs for text-centric
+multimodal sentiment analysis, outlining their approaches, advantages, and
+limitations, (3) summarize the application scenarios of LLM-based multimodal
+sentiment analysis technology, and (4) explore the challenges and potential
+research directions for multimodal sentiment analysis in the future.
+
+
+ comment: arXiv admin note: text overlap with arXiv:2210.14556 by other authors +
+
+
+
+
+ + ♻ ☆ Unlocking the Non-Native Language Context Limitation: Native Language + Prompting Facilitates Knowledge Elicitation + + +
+ Multilingual large language models (MLLMs) struggle to answer questions posed +in non-dominant languages, even though they have acquired the relevant +knowledge from their dominant language corpus. In contrast, human multilinguals +can overcome such non-native language context limitations through Positive +Native Language Transfer (PNLT). Inspired by the process of PNLT, we analogize +the dominant language of MLLMs to the native language of human multilinguals, +and propose Native Language Prompting (NatLan) to simulate the PNLT observed in +human multilinguals. It explicitly creates native language contexts for MLLMs +to facilitate the elicitation of the rich native language knowledge during +question-answering, unlocking the limitations imposed by non-native language +contexts. By employing multi-MLLM collaboration, NatLan reduces the workload on +each MLLM in simulating PNLT and refines semantic transfer. On the C-Eval +benchmark, NatLan provides up to a 10.1% average accuracy improvement and up to +a 5.0% increase in the hard-level subset across five MLLMs, surpassing all +top-notch related methods. Our code is available at +https://github.com/AnonyNLP/NatLan. + +
+
+
+
+
+ + ♻ ☆ Emphasising Structured Information: Integrating Abstract Meaning + Representation into LLMs for Enhanced Open-Domain Dialogue Evaluation + + +
+ Automatic open-domain dialogue evaluation has attracted increasing attention.
+Trainable evaluation metrics, typically trained with true positive and randomly
+selected negative responses, tend to assign higher scores to responses that
+share greater content similarity with a given context. However, adversarial
+negative responses, despite possessing high content similarity with the
+contexts, are semantically different. Consequently, existing evaluation metrics
+are not robust enough to evaluate such responses, resulting in low correlations
+with human judgments. While recent studies have demonstrated the effectiveness
+of Large Language Models (LLMs) for open-domain dialogue evaluation, they still
+face challenges in effectively handling adversarial negative examples. In this
+paper, we propose an effective framework for open-domain dialogue evaluation
+that combines LLMs with domain-specific language models (SLMs) enhanced with
+Abstract Meaning Representation (AMR) knowledge. The SLMs can explicitly
+incorporate AMR graph information of the dialogue through a gating mechanism
+for enhanced dialogue semantic representation learning. Both the evaluation
+result from the SLMs and the AMR graph information are incorporated into the
+LLM's prompt for enhanced evaluation performance. Experimental results on
+open-domain dialogue evaluation tasks demonstrate the superiority of our method
+compared to a wide range of state-of-the-art baselines, especially in
+discriminating adversarial negative responses. Our code and data are publicly
+available at https://github.com/Bernard-Yang/SIMAMR.
+
+
+
+
+
+ + ♻ ☆ MathBridge: A Large Corpus Dataset for Translating Spoken Mathematical + Expressions into $LaTeX$ Formulas for Improved Readability + + +
+ Improving the readability of mathematical expressions in text-based
+documents, such as subtitles of mathematical videos, is a significant task. To
+achieve this, mathematical expressions should be converted into compiled
+formulas. For instance, the spoken expression ``x equals minus b plus or minus
+the square root of b squared minus four a c, all over two a'' from automatic
+speech recognition is more readily comprehensible when displayed as the
+compiled formula $x = \frac{-b \pm \sqrt{b^2 - 4ac}}{2a}$. To convert
+mathematical spoken sentences into compiled formulas, two processes are
+required: spoken sentences are converted into LaTeX formulas, and LaTeX
+formulas are converted into compiled formulas. The latter can be handled by
+LaTeX engines. However, there has been no effective way to do the former. Even
+if we try to solve this with language models, there is no paired data between
+spoken sentences and LaTeX formulas on which to train them. In this paper, we
+introduce MathBridge, the first extensive dataset for translating mathematical
+spoken sentences into LaTeX formulas. MathBridge comprises approximately 23
+million LaTeX formulas paired with the corresponding mathematical spoken
+sentences. Through comprehensive evaluations, including fine-tuning with the
+proposed data, we find that MathBridge significantly enhances the capabilities
+of pretrained language models for converting mathematical spoken sentences
+into LaTeX formulas. Specifically, for the T5-large model, the sacreBLEU score
+increased from 4.77 to 46.8, demonstrating substantial enhancement.
+
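+ As a hedged illustration of how such a dataset would be used, the sketch below queries a seq2seq model fine-tuned on spoken-sentence/LaTeX pairs; the checkpoint name is hypothetical and the Hugging Face Transformers API is assumed.

```python
# Sketch of spoken-math -> LaTeX inference with a fine-tuned seq2seq model.
# "your-org/t5-large-mathbridge" is a hypothetical checkpoint name; substitute
# whatever model you fine-tune on the MathBridge pairs.
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

checkpoint = "your-org/t5-large-mathbridge"   # hypothetical fine-tuned weights
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

spoken = ("x equals minus b plus or minus the square root of "
          "b squared minus four a c, all over two a")
inputs = tokenizer(spoken, return_tensors="pt")
ids = model.generate(**inputs, max_new_tokens=64)
print(tokenizer.decode(ids[0], skip_special_tokens=True))
# expected after fine-tuning: x = \frac{-b \pm \sqrt{b^2 - 4ac}}{2a}
```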
+
+ comment: 9 pages, 6 figures +
+
+
+
+
+ + ♻ ☆ Crafting Customisable Characters with LLMs: Introducing SimsChat, a + Persona-Driven Role-Playing Agent Framework + + +
+ Large Language Models (LLMs) demonstrate a remarkable ability to comprehend
+human instructions and generate high-quality text. This capability allows LLMs
+to function as agents that can emulate human beings at a more sophisticated
+level, beyond the mere replication of basic human behaviours. However, there
+has been little exploration of leveraging LLMs to craft characters with
+diverse attributes. In this work, we introduce the Customisable Conversation
+Agent Framework, which leverages LLMs to simulate real-world characters that
+can be freely customised according to various user preferences. This adaptable
+framework is beneficial for the design of customisable characters and
+role-playing agents aligned with human preferences. We propose the SimsConv
+dataset, which encompasses 68 different customised characters, 1,360 multi-turn
+role-playing dialogues, and a total of 13,971 interaction dialogues. The
+characters are created from several real-world elements, such as career,
+aspiration, trait, and skill. Building upon these foundations, we present
+SimsChat, a freely customisable role-playing agent. It incorporates diverse
+real-world scenes and topic-specific character interaction dialogues, thereby
+simulating characters' life experiences in various scenarios and topic-specific
+interactions with specific emotions. Experimental results indicate that our
+proposed framework achieves desirable performance and provides a valuable
+guideline for the construction of more accurate human simulacra in the future.
+Our data and code are publicly available at
+https://github.com/Bernard-Yang/SimsChat.
+
+
+
+
+
+ + ♻ ☆ Ada-KV: Optimizing KV Cache Eviction by Adaptive Budget Allocation for + Efficient LLM Inference + + +
+ Large Language Models have excelled in various fields but encounter
+challenges in memory and time efficiency due to the expanding Key-Value (KV)
+cache required for long-sequence inference. Recent efforts try to reduce KV
+cache size to a given memory budget by evicting large numbers of non-critical
+cache elements at runtime while preserving generation quality. Our revisiting
+of current eviction methods reveals that they fundamentally minimize an upper
+bound of the $L_1$ eviction loss between the pre- and post-eviction outputs of
+multi-head self-attention mechanisms. Moreover, our analysis indicates that the
+common practice of uniformly assigning budgets across attention heads harms
+post-eviction generation quality. In light of these findings, we propose a
+simple yet effective adaptive budget allocation algorithm. This algorithm not
+only optimizes the theoretical loss upper bound but also reduces the $L_1$
+eviction loss in practice by aligning with the varied characteristics across
+different heads. By integrating this algorithm into two state-of-the-art
+methods, we demonstrate the effectiveness of using adaptive budget allocation
+to optimize KV cache eviction. Extensive evaluations on 16 datasets and the
+Needle-in-a-Haystack test confirm significant performance improvements across
+various tasks.
+
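+ The following sketch illustrates the general idea of non-uniform, per-head KV-cache budgets (heads with more concentrated attention keep more entries); it is a simplified stand-in, not the paper's exact allocation rule or eviction loss.

```python
import torch

def allocate_and_evict(attn, total_budget):
    """Schematic per-head KV-cache eviction with a non-uniform budget.

    attn: (num_heads, seq_len) aggregated attention mass each head places on
    each cached position. total_budget: total number of KV entries to keep.
    Returns a boolean keep-mask of shape (num_heads, seq_len).
    """
    num_heads, seq_len = attn.shape
    # Give heads with more concentrated attention a larger share of the budget.
    concentration = attn.max(dim=-1).values / attn.sum(dim=-1).clamp_min(1e-8)
    share = concentration / concentration.sum()
    budgets = (share * total_budget).round().long().clamp(min=1, max=seq_len)

    keep = torch.zeros(num_heads, seq_len, dtype=torch.bool)
    for h in range(num_heads):
        topk = attn[h].topk(int(budgets[h])).indices   # keep the most-attended entries
        keep[h, topk] = True
    return keep

attn = torch.rand(8, 128)              # toy attention mass: 8 heads, 128 cached tokens
mask = allocate_and_evict(attn, total_budget=8 * 32)
print(mask.sum(dim=-1))                # per-head kept entries, non-uniform across heads
```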
+
+
+
+
+ + ♻ ☆ Robust Neural Information Retrieval: An Adversarial and + Out-of-distribution Perspective + + +
+ Recent advances in neural information retrieval (IR) models have +significantly enhanced their effectiveness over various IR tasks. The +robustness of these models, essential for ensuring their reliability in +practice, has also garnered significant attention. With a wide array of +research on robust IR being proposed, we believe it is the opportune moment to +consolidate the current status, glean insights from existing methodologies, and +lay the groundwork for future development. We view the robustness of IR to be a +multifaceted concept, emphasizing its necessity against adversarial attacks, +out-of-distribution (OOD) scenarios and performance variance. With a focus on +adversarial and OOD robustness, we dissect robustness solutions for dense +retrieval models (DRMs) and neural ranking models (NRMs), respectively, +recognizing them as pivotal components of the neural IR pipeline. We provide an +in-depth discussion of existing methods, datasets, and evaluation metrics, +shedding light on challenges and future directions in the era of large language +models. To the best of our knowledge, this is the first comprehensive survey on +the robustness of neural IR models, and we will also be giving our first +tutorial presentation at SIGIR 2024 +\url{https://sigir2024-robust-information-retrieval.github.io}. Along with the +organization of existing work, we introduce a Benchmark for robust IR (BestIR), +a heterogeneous evaluation benchmark for robust neural information retrieval, +which is publicly available at \url{https://github.com/Davion-Liu/BestIR}. We +hope that this study provides useful clues for future research on the +robustness of IR models and helps to develop trustworthy search engines +\url{https://github.com/Davion-Liu/Awesome-Robustness-in-Information-Retrieval}. + +
+
+ comment: Survey paper +
+
+
+
+
+ + ♻ ☆ What Do Language Models Hear? Probing for Auditory Representations in + Language Models + + +
+ This work explores whether language models encode meaningfully grounded +representations of sounds of objects. We learn a linear probe that retrieves +the correct text representation of an object given a snippet of audio related +to that object, where the sound representation is given by a pretrained audio +model. This probe is trained via a contrastive loss that pushes the language +representations and sound representations of an object to be close to one +another. After training, the probe is tested on its ability to generalize to +objects that were not seen during training. Across different language models +and audio models, we find that the probe generalization is above chance in many +cases, indicating that despite being trained only on raw text, language models +encode grounded knowledge of sounds for some objects. + +
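+ A minimal sketch of the probing setup described above, assuming precomputed audio and text embeddings and an InfoNCE-style contrastive loss; the paper's exact probe and loss details may differ.

```python
import torch
import torch.nn.functional as F

# Schematic contrastive training of a linear probe that maps audio-model
# embeddings into a language model's text-embedding space.
d_audio, d_text, batch = 512, 768, 32
probe = torch.nn.Linear(d_audio, d_text, bias=False)
opt = torch.optim.Adam(probe.parameters(), lr=1e-3)

for step in range(100):
    audio = torch.randn(batch, d_audio)   # placeholder: pretrained audio-model features
    text = torch.randn(batch, d_text)     # placeholder: LM embeddings of the object names
    a = F.normalize(probe(audio), dim=-1)
    t = F.normalize(text, dim=-1)
    logits = a @ t.T / 0.07               # similarity of every audio clip to every object
    labels = torch.arange(batch)          # matching pairs lie on the diagonal
    loss = F.cross_entropy(logits, labels)
    opt.zero_grad(); loss.backward(); opt.step()
```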
+
+
+
+
+ + ♻ ☆ Natural Language Interaction with a Household Electricity + Knowledge-based Digital Twin + + +
+ Domain-specific digital twins, representing a digital replica of various
+segments of the smart grid, are envisioned to model, simulate, and control the
+respective segments. At the same time, knowledge-based digital twins, coupled
+with AI, may also empower humans to understand aspects of the system through
+natural language interaction in view of planning and policy making. This paper
+is the first to assess and report on the potential of Retrieval Augmented
+Generation (RAG) for question answering about household electrical energy
+measurement, leveraging a knowledge-based energy digital twin. Relying on the
+recently published electricity consumption knowledge graph, which effectively
+represents a knowledge-based digital twin, we study the capabilities of
+ChatGPT, Gemini and Llama in answering electricity-related questions.
+Furthermore, we compare the answers with the ones generated through a RAG
+technique that leverages an existing electricity knowledge-based digital twin.
+Our findings illustrate that the RAG approach not only reduces the incidence of
+incorrect information typically generated by LLMs but also significantly
+improves the quality of the output by grounding responses in verifiable data.
+This paper details our methodology, presents a comparative analysis of
+responses with and without RAG, and discusses the implications of our findings
+for future applications of AI in specialized sectors like energy data analysis.
+
+
+ comment: Accepted at IEEE SmartGridComm'24 +
+
+
+
+
+ + ♻ ☆ RT-Surv: Improving Mortality Prediction After Radiotherapy with Large + Language Model Structuring of Large-Scale Unstructured Electronic Health + Records + + +
+ Accurate patient selection is critical in radiotherapy (RT) to prevent +ineffective treatments. Traditional survival prediction models, relying on +structured data, often lack precision. This study explores the potential of +large language models (LLMs) to structure unstructured electronic health record +(EHR) data, thereby improving survival prediction accuracy through +comprehensive clinical information integration. Data from 34,276 patients +treated with RT at Yonsei Cancer Center between 2013 and 2023 were analyzed, +encompassing both structured and unstructured data. An open-source LLM was used +to structure the unstructured EHR data via single-shot learning, with its +performance compared against a domain-specific medical LLM and a smaller +variant. Survival prediction models were developed using statistical, machine +learning, and deep learning approaches, incorporating both structured and +LLM-structured data. Clinical experts evaluated the accuracy of the +LLM-structured data. The open-source LLM achieved 87.5% accuracy in structuring +unstructured EHR data without additional training, significantly outperforming +the domain-specific medical LLM, which reached only 35.8% accuracy. Larger LLMs +were more effective, particularly in extracting clinically relevant features +like general condition and disease extent, which closely correlated with +patient survival. Incorporating LLM-structured clinical features into survival +prediction models significantly improved accuracy, with the C-index of deep +learning models increasing from 0.737 to 0.820. These models also became more +interpretable by emphasizing clinically significant factors. This study shows +that general-domain LLMs, even without specific medical training, can +effectively structure large-scale unstructured EHR data, substantially +enhancing the accuracy and interpretability of clinical predictive models. + +
+
+ comment: 23 pages, 2 tables, 4 figures +
+
+
+
+
+ + ♻ ☆ SCENE: Evaluating Explainable AI Techniques Using Soft Counterfactuals + + +
+ Explainable Artificial Intelligence (XAI) plays a crucial role in enhancing
+the transparency and accountability of AI models, particularly in natural
+language processing (NLP) tasks. However, popular XAI methods such as LIME and
+SHAP have been found to be unstable and potentially misleading, underscoring
+the need for a standardized evaluation approach. This paper introduces SCENE
+(Soft Counterfactual Evaluation for Natural language Explainability), a novel
+evaluation method that leverages large language models (LLMs) to generate Soft
+Counterfactual explanations in a zero-shot manner. By focusing on token-based
+substitutions, SCENE creates contextually appropriate and semantically
+meaningful Soft Counterfactuals without extensive fine-tuning. SCENE adopts
+$\text{Validity}_{\text{soft}}$ and $C_{\text{soft}}$ metrics to assess the
+effectiveness of model-agnostic XAI methods in text classification tasks.
+Applied to CNN, RNN, and Transformer architectures, SCENE provides valuable
+insights into the strengths and limitations of various XAI techniques.
+
+
+
+
+
+ + ♻ ☆ Labeling supervised fine-tuning data with the scaling law + + +
+ This paper introduces a multi-stage manual annotation calibrated by the
+scaling law, offering a high-quality Supervised Fine-Tuning data acquisition
+method for environments with constrained resources, such as limited GPUs,
+restricted GPT access, and funding constraints. We preprocessed 58k authentic
+chat records and manually annotated 2.3k questions. After this, we fine-tuned
+Qwen models ranging from 0.5B to 32B parameters. The optimal version improved
+the F1 score by 29.07 points. This confirms the viability of fine-tuning Large
+Language Models (LLMs) for downstream Natural Language Processing (NLP) tasks.
+Our contributions are: 1) Supervised Fine-Tuning (SFT) training data in alpaca
+format, along with a set of Low-Rank Adaptation (LoRA) weights, and 2) a
+method for acquiring high-quality data leveraging the scaling law principle.
+The script, raw data in alpaca format, and experiment tracking are
+open-sourced on GitHub
+(https://github.com/InternLM/HuixiangDou/tree/main/web/tools), HuggingFace
+(https://huggingface.co/tpoisonooo) and WandB
+(https://wandb.ai/tpoisonooo/huixiangdou-cr/table?nw=nwusertpoisonooo). The
+privacy of the data involved has been authorized by users. The SFT data and
+license come from the ncnn contributors group.
+
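+ For reference, one SFT record in the alpaca format mentioned above could look like the following; the field names follow the common alpaca convention, and the content is invented for illustration.

```python
import json

# Append one alpaca-format SFT record to a JSON-lines file.
record = {
    "instruction": "Answer the user's question about deploying the assistant.",
    "input": "How do I enable GPU inference?",
    "output": "Install the CUDA-enabled build and load the model on the GPU device.",
}
with open("sft_data.jsonl", "a", encoding="utf-8") as f:
    f.write(json.dumps(record, ensure_ascii=False) + "\n")
```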
+
+ comment: 5 pages, 3 tables, 3 figures +
+
+
+
+
+ + ♻ ☆ ToolSword: Unveiling Safety Issues of Large Language Models in Tool + Learning Across Three Stages ACL 2024 + + +
+ Tool learning is widely acknowledged as a foundational approach for deploying
+large language models (LLMs) in real-world scenarios. While current research
+primarily emphasizes leveraging tools to augment LLMs, it frequently neglects
+emerging safety considerations tied to their application. To fill this gap, we
+present *ToolSword*, a comprehensive framework dedicated to meticulously
+investigating safety issues linked to LLMs in tool learning. Specifically,
+ToolSword delineates six safety scenarios for LLMs in tool learning,
+encompassing **malicious queries** and **jailbreak attacks** in the input
+stage, **noisy misdirection** and **risky cues** in the execution stage, and
+**harmful feedback** and **error conflicts** in the output stage. Experiments
+conducted on 11 open-source and closed-source LLMs reveal enduring safety
+challenges in tool learning, such as handling harmful queries, employing risky
+tools, and delivering detrimental feedback, to which even GPT-4 is susceptible.
+Moreover, we conduct further studies with the aim of fostering research on tool
+learning safety. The data is released at
+https://github.com/Junjie-Ye/ToolSword.
+
+
+ comment: Accepted by ACL 2024 Main Conference +
+
+
+
+
+ + ♻ ☆ MAG-SQL: Multi-Agent Generative Approach with Soft Schema Linking and + Iterative Sub-SQL Refinement for Text-to-SQL + + +
+ Recent In-Context Learning based methods have achieved remarkable success in
+the Text-to-SQL task. However, there is still a large gap between the
+performance of these models and human performance on datasets with complex
+database schemas and difficult questions, such as BIRD. Besides, existing work
+has neglected to supervise intermediate steps when solving questions
+iteratively with question decomposition methods, and the schema linking
+methods used in these works are very rudimentary. To address these issues, we
+propose MAG-SQL, a multi-agent generative approach with soft schema linking
+and iterative Sub-SQL refinement. In our framework, an entity-based method
+with table summaries is used to select the columns in the database, and a
+novel targets-conditions decomposition method is introduced to decompose those
+complex questions. Additionally, we build an iterative generation module which
+includes a Sub-SQL Generator and a Sub-SQL Refiner, introducing external
+oversight for each step of generation. Through a series of ablation studies,
+the effectiveness of each agent in our framework has been demonstrated. When
+evaluated on the BIRD benchmark with GPT-4, MAG-SQL achieves an execution
+accuracy of 61.08%, compared to the baseline accuracy of 46.35% for vanilla
+GPT-4 and the baseline accuracy of 57.56% for MAC-SQL. Besides, our approach
+makes similar progress on Spider.
+
+
+ comment: 22 pages, 14 figures +
+
+
+
+
+ + ♻ ☆ Distilling Reasoning Ability from Large Language Models with Adaptive + Thinking + + +
+ Chain of thought finetuning (cot-finetuning) aims to endow small language
+models (SLMs) with reasoning ability to improve their performance on specific
+tasks by allowing them to imitate the reasoning procedure of large language
+models (LLMs) beyond simply predicting the answers. Most existing
+cot-finetuning methods adopt a pre-thinking mechanism, allowing the SLM to
+generate a rationale before providing an answer. This mechanism enables the SLM
+to analyze and think about complex questions, but it also makes answer
+correctness highly sensitive to minor errors in the rationale. Therefore, we
+propose a robust post-thinking mechanism that generates answers before the
+rationale. Thanks to this answer-first setting, 1) the answer can escape from
+the adverse effects caused by minor errors in the rationale; 2) the rationale
+serves as an error amplifier to the answer, which makes the SLM focus on
+learning hard samples; 3) inference efficiency also benefits from this setting,
+since users can stop generation as soon as the answer is produced. However,
+although the post-thinking mechanism brings many advantages and improves the
+overall performance of the SLM on specific tasks, it may lose the ability to
+think about questions and decompose complex questions into simple
+sub-questions, compared to the pre-thinking mechanism. Therefore, a
+plug-and-play adaptive-thinking mechanism is proposed with the aid of soft
+prompt tuning to integrate the merits of the pre-thinking and post-thinking
+mechanisms; a perception module is introduced to adaptively prompt the SLM to
+answer or think first based on the perceived complexity of the question.
+Extensive experiments are conducted across 12 reasoning tasks and 2
+representative language models to demonstrate the effectiveness of the
+proposed mechanism.
+
+
+
+
+
+ + ♻ ☆ MKRAG: Medical Knowledge Retrieval Augmented Generation for Medical + Question Answering + + +
+ Large Language Models (LLMs), although powerful in general domains, often +perform poorly on domain-specific tasks such as medical question answering +(QA). In addition, LLMs tend to function as "black-boxes", making it +challenging to modify their behavior. To address the problem, our work employs +a transparent process of retrieval augmented generation (RAG), aiming to +improve LLM responses without the need for fine-tuning or retraining. +Specifically, we propose a comprehensive retrieval strategy to extract medical +facts from an external knowledge base, and then inject them into the LLM's +query prompt. Focusing on medical QA, we evaluate the impact of different +retrieval models and the number of facts on LLM performance using the +MedQA-SMILE dataset. Notably, our retrieval-augmented Vicuna-7B model exhibited +an accuracy improvement from 44.46% to 48.54%. This work underscores the +potential of RAG to enhance LLM performance, offering a practical approach to +mitigate the challenges posed by black-box LLMs. + +
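+ A minimal sketch of the retrieve-then-inject pattern described above: rank external medical facts by embedding similarity to the question and prepend the top matches to the LLM prompt. The embedding function and facts here are invented placeholders, not the paper's retrieval strategy.

```python
import numpy as np

def embed(texts):
    """Stand-in for any sentence-embedding model; returns deterministic vectors."""
    rng = np.random.default_rng(abs(hash(tuple(texts))) % (2**32))
    return rng.normal(size=(len(texts), 384))

facts = [
    "Metformin is a first-line therapy for type 2 diabetes.",
    "ACE inhibitors can cause a persistent dry cough.",
    "Warfarin requires regular INR monitoring.",
]
question = "Which diabetes drug is usually tried first?"

F_emb, q_emb = embed(facts), embed([question])[0]
sims = F_emb @ q_emb / (np.linalg.norm(F_emb, axis=1) * np.linalg.norm(q_emb))
top_k = [facts[i] for i in sims.argsort()[::-1][:2]]

prompt = "Relevant facts:\n- " + "\n- ".join(top_k) + f"\n\nQuestion: {question}\nAnswer:"
print(prompt)   # pass this augmented prompt to the LLM instead of the bare question
```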
+
+ comment: Accepted by AMIA 2024 Annual Symposium +
+
+
+
+
+ + ♻ ☆ A Data Generation Perspective to the Mechanism of In-Context Learning + + +
+ In-Context Learning (ICL) empowers Large Language Models (LLMs) with the
+capacity to learn in context, achieving downstream generalization without
+gradient updates but with a few in-context examples. Despite the encouraging
+empirical success, the underlying mechanism of ICL remains unclear, and
+existing research offers various viewpoints of understanding. These studies
+propose intuition-driven and ad-hoc technical solutions for interpreting ICL,
+illustrating an ambiguous road map. In this paper, we leverage a data
+generation perspective to reinterpret recent efforts and demonstrate the
+potential broader usage of popular technical solutions, approaching the
+question from a systematic angle. For a conceptual definition, we rigorously
+adopt the terms of skill learning and skill recognition. The difference
+between them is that skill learning can learn new data generation functions
+from in-context data. We also provide a comprehensive study on the merits and
+weaknesses of different solutions, and highlight the uniformity among them
+given the perspective of data generation, establishing a technical foundation
+for future research to incorporate the strengths of different lines of
+research.
+
+
+ comment: 11 pages, 1 figure +
+
+
+
+
+
+
+
+ + Computer Vision and Pattern Recognition 95 + +
+
+
+ + ☆ xGen-MM (BLIP-3): A Family of Open Large Multimodal Models + + +
+ This report introduces xGen-MM (also known as BLIP-3), a framework for +developing Large Multimodal Models (LMMs). The framework comprises meticulously +curated datasets, a training recipe, model architectures, and a resulting suite +of LMMs. xGen-MM, short for xGen-MultiModal, expands the Salesforce xGen +initiative on foundation AI models. Our models undergo rigorous evaluation +across a range of tasks, including both single and multi-image benchmarks. Our +pre-trained base model exhibits strong in-context learning capabilities and the +instruction-tuned model demonstrates competitive performance among open-source +LMMs with similar model sizes. In addition, we introduce a safety-tuned model +with DPO, aiming to mitigate harmful behaviors such as hallucinations and +improve safety. We open-source our models, curated large-scale datasets, and +our fine-tuning codebase to facilitate further advancements in LMM research. +Associated resources will be available on our project page above. + +
+
+
+
+
+ + ☆ SAM2-UNet: Segment Anything 2 Makes Strong Encoder for Natural and + Medical Image Segmentation + + +
+ Image segmentation plays an important role in vision understanding. Recently, +the emerging vision foundation models continuously achieved superior +performance on various tasks. Following such success, in this paper, we prove +that the Segment Anything Model 2 (SAM2) can be a strong encoder for U-shaped +segmentation models. We propose a simple but effective framework, termed +SAM2-UNet, for versatile image segmentation. Specifically, SAM2-UNet adopts the +Hiera backbone of SAM2 as the encoder, while the decoder uses the classic +U-shaped design. Additionally, adapters are inserted into the encoder to allow +parameter-efficient fine-tuning. Preliminary experiments on various downstream +tasks, such as camouflaged object detection, salient object detection, marine +animal segmentation, mirror detection, and polyp segmentation, demonstrate that +our SAM2-UNet can simply beat existing specialized state-of-the-art methods +without bells and whistles. Project page: +\url{https://github.com/WZH0120/SAM2-UNet}. + +
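+ The sketch below shows a typical bottleneck adapter of the kind inserted into a frozen encoder for parameter-efficient fine-tuning; the exact adapter design used in SAM2-UNet may differ.

```python
import torch
import torch.nn as nn

class Adapter(nn.Module):
    """Residual bottleneck adapter: the only trainable part of a frozen block."""
    def __init__(self, dim, bottleneck=64):
        super().__init__()
        self.down = nn.Linear(dim, bottleneck)
        self.up = nn.Linear(bottleneck, dim)
        self.act = nn.GELU()

    def forward(self, x):
        return x + self.up(self.act(self.down(x)))   # residual keeps the frozen path intact

# Usage: freeze the pretrained encoder and train only adapters (and the decoder).
dim = 256
block_out = torch.randn(2, 196, dim)    # tokens from one frozen encoder block
adapter = Adapter(dim)
print(adapter(block_out).shape)         # torch.Size([2, 196, 256])
```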
+
+ comment: Technical Report +
+
+
+
+
+ + ☆ DPA: Dual Prototypes Alignment for Unsupervised Adaptation of + Vision-Language Models + + +
+ Vision-language models (VLMs), e.g., CLIP, have shown remarkable potential in +zero-shot image classification. However, adapting these models to new domains +remains challenging, especially in unsupervised settings where labelled data is +unavailable. Recent research has proposed pseudo-labelling approaches to adapt +CLIP in an unsupervised manner using unlabelled target data. Nonetheless, these +methods struggle due to noisy pseudo-labels resulting from the misalignment +between CLIP's visual and textual representations. This study introduces DPA, +an unsupervised domain adaptation method for VLMs. DPA introduces the concept +of dual prototypes, acting as distinct classifiers, along with the convex +combination of their outputs, thereby leading to accurate pseudo-label +construction. Next, it ranks pseudo-labels to facilitate robust self-training, +particularly during early training. Finally, it addresses visual-textual +misalignment by aligning textual prototypes with image prototypes to further +improve the adaptation performance. Experiments on 13 downstream vision tasks +demonstrate that DPA significantly outperforms zero-shot CLIP and the +state-of-the-art unsupervised adaptation baselines. + +
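+ A schematic of pseudo-labelling with two prototype classifiers whose softmax outputs are convexly combined, as described above; DPA's prototype construction, ranking, and alignment steps are omitted here.

```python
import torch
import torch.nn.functional as F

def pseudo_labels(feats, text_protos, img_protos, alpha=0.5, tau=0.01):
    """feats: (N, d) image features; *_protos: (C, d) class prototypes."""
    feats = F.normalize(feats, dim=-1)
    p_text = F.softmax(feats @ F.normalize(text_protos, dim=-1).T / tau, dim=-1)
    p_img = F.softmax(feats @ F.normalize(img_protos, dim=-1).T / tau, dim=-1)
    p = alpha * p_text + (1 - alpha) * p_img     # convex combination of the two classifiers
    conf, labels = p.max(dim=-1)
    return labels, conf                          # confidence can be used to rank pseudo-labels

feats = torch.randn(100, 512)
text_protos, img_protos = torch.randn(10, 512), torch.randn(10, 512)
labels, conf = pseudo_labels(feats, text_protos, img_protos)
print(labels.shape, conf.shape)
```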
+
+
+
+
+ + ☆ HistoGym: A Reinforcement Learning Environment for Histopathological + Image Analysis + + +
+ In pathological research, education, and clinical practice, the +decision-making process based on pathological images is critically important. +This significance extends to digital pathology image analysis: its adequacy is +demonstrated by the extensive information contained within tissue structures, +which is essential for accurate cancer classification and grading. +Additionally, its necessity is highlighted by the inherent requirement for +interpretability in the conclusions generated by algorithms. For humans, +determining tumor type and grade typically involves multi-scale analysis, which +presents a significant challenge for AI algorithms. Traditional patch-based +methods are inadequate for modeling such complex structures, as they fail to +capture the intricate, multi-scale information inherent in whole slide images. +Consequently, there is a pressing need for advanced AI techniques capable of +efficiently and accurately replicating this complex analytical process. To +address this issue, we introduce HistoGym, an open-source reinforcement +learning environment for histopathological image analysis. Following OpenAI Gym +APIs, HistoGym aims to foster whole slide image diagnosis by mimicking the +real-life processes of doctors. Leveraging the pyramid feature of WSIs and the +OpenSlide API, HistoGym provides a unified framework for various clinical +tasks, including tumor detection and classification. We detail the observation, +action, and reward specifications tailored for the histopathological image +analysis domain and provide an open-source Python-based interface for both +clinicians and researchers. To accommodate different clinical demands, we offer +various scenarios for different organs and cancers, including both WSI-based +and selected region-based scenarios, showcasing several noteworthy results. + +
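+ Since HistoGym follows the OpenAI Gym API, interaction presumably looks like the standard reset/step loop below; the environment id and construction are hypothetical, so consult the repository for the actual registration names, spaces, and reward definition.

```python
import gym  # HistoGym follows the OpenAI Gym API

# "HistoGym-TumorDetection-v0" is an assumed id, used only for illustration.
env = gym.make("HistoGym-TumorDetection-v0")

obs = env.reset()
done, total_reward = False, 0.0
while not done:
    action = env.action_space.sample()            # replace with an RL agent's policy
    obs, reward, done, info = env.step(action)    # classic Gym step signature
    total_reward += reward
print("episode return:", total_reward)
```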
+
+
+
+
+ + ☆ RGBT Tracking via All-layer Multimodal Interactions with Progressive + Fusion Mamba + + +
+ Existing RGBT tracking methods often design various interaction models to
+perform cross-modal fusion at each layer, but cannot execute feature
+interactions among all layers, which play a critical role in robust multimodal
+representation, due to the large computational burden. To address this issue,
+this paper presents a novel All-layer multimodal Interaction Network, named
+AINet, which performs efficient and effective feature interactions of all
+modalities and layers in a progressive fusion Mamba, for robust RGBT tracking.
+Even though modality features in different layers are known to contain
+different cues, it is challenging to build multimodal interactions in each
+layer because of the difficulty of balancing interaction capability and
+efficiency. Meanwhile, considering that the feature discrepancy between RGB and
+thermal modalities reflects their complementary information to some extent, we
+design a Difference-based Fusion Mamba (DFM) to achieve enhanced fusion of
+different modalities with linear complexity. When interacting with features
+from all layers, a huge number of tokens (3840 in this work) is involved, and
+the computational burden is thus large. To handle this problem, we design an
+Order-dynamic Fusion Mamba (OFM) to execute efficient and effective feature
+interactions of all layers by dynamically adjusting the scan order of
+different layers in Mamba. Extensive experiments on four public RGBT tracking
+datasets show that AINet achieves leading performance against existing
+state-of-the-art methods.
+
+
+
+
+
+ + ☆ PFDiff: Training-free Acceleration of Diffusion Models through the + Gradient Guidance of Past and Future + + +
+ Diffusion Probabilistic Models (DPMs) have shown remarkable potential in
+image generation, but their sampling efficiency is hindered by the need for
+numerous denoising steps. Most existing solutions accelerate the sampling
+process by proposing fast ODE solvers. However, the inevitable discretization
+errors of the ODE solvers are significantly magnified when the number of
+function evaluations (NFE) is small. In this work, we propose PFDiff, a novel
+training-free and orthogonal timestep-skipping strategy, which enables existing
+fast ODE solvers to operate with fewer NFE. PFDiff builds on two key
+observations: the model's outputs are highly similar across time steps of
+moderate size during the denoising process of existing ODE solvers, and the
+denoising process closely resembles SGD. By employing gradient replacement from
+past time steps and foresight updates inspired by Nesterov momentum, PFDiff
+rapidly updates intermediate states, thereby reducing unnecessary NFE while
+correcting for the discretization errors inherent in first-order ODE solvers.
+Experimental results demonstrate that PFDiff exhibits flexible applicability
+across various pre-trained DPMs, particularly excelling in conditional DPMs and
+surpassing previous state-of-the-art training-free methods. For instance, using
+DDIM as a baseline, we achieved 16.46 FID (4 NFE) compared to 138.81 FID with
+DDIM on ImageNet 64x64 with classifier guidance, and 13.06 FID (10 NFE) on
+Stable Diffusion with 7.5 guidance scale.
+
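+ A schematic of the timestep-skipping idea (reusing a cached denoiser output so the solver advances with fewer network evaluations); PFDiff's actual update additionally uses Nesterov-style foresight corrections, which are not reproduced here, and the denoiser and solver step below are stand-ins.

```python
import torch

def sample(denoiser, solver_step, x, timesteps, skip_every=2):
    """Advance the sampler while re-evaluating the network only on some steps."""
    eps_cache = None
    for i, t in enumerate(timesteps):
        if eps_cache is None or i % skip_every == 0:
            eps_cache = denoiser(x, t)          # real network call (counts as 1 NFE)
        x = solver_step(x, eps_cache, t)        # solver update reusing the cached output
    return x

denoiser = lambda x, t: torch.zeros_like(x)      # stand-in for a pretrained noise predictor
solver_step = lambda x, eps, t: x - 0.01 * eps   # stand-in for a DDIM-style update
x0 = sample(denoiser, solver_step, torch.randn(1, 3, 32, 32), range(50))
print(x0.shape)
```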
+
+
+
+
+ + ☆ Retrieval-augmented Few-shot Medical Image Segmentation with Foundation + Models + + +
+ Medical image segmentation is crucial for clinical decision-making, but the
+scarcity of annotated data presents significant challenges. Few-shot
+segmentation (FSS) methods show promise but often require retraining on the
+target domain and struggle to generalize across different modalities.
+Similarly, adapting foundation models like the Segment Anything Model (SAM) for
+medical imaging has limitations, including the need for finetuning and
+domain-specific adaptation. To address these issues, we propose a novel method
+that adapts DINOv2 and Segment Anything Model 2 (SAM 2) for retrieval-augmented
+few-shot medical image segmentation. Our approach uses DINOv2 features as
+queries to retrieve similar samples from the limited annotated data, which are
+then encoded as memories and stored in a memory bank. With the memory attention
+mechanism of SAM 2, the model leverages these memories as conditions to
+generate accurate segmentation of the target image. We evaluated our framework
+on three medical image segmentation tasks, demonstrating superior performance
+and generalizability across various modalities without the need for any
+retraining or finetuning. Overall, this method offers a practical and effective
+solution for few-shot medical image segmentation and holds significant
+potential as a valuable annotation tool in clinical applications.
+
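+ A minimal sketch of the retrieval step, assuming DINOv2 global features have already been computed: the annotated support samples most similar to the query are selected by cosine similarity and then serve as SAM 2 memory frames.

```python
import torch
import torch.nn.functional as F

# Stand-in features; in practice they would come from a pretrained DINOv2 encoder.
support_feats = F.normalize(torch.randn(20, 768), dim=-1)   # 20 annotated samples
query_feat = F.normalize(torch.randn(768), dim=-1)          # the image to segment

sims = support_feats @ query_feat
topk = sims.topk(3).indices            # indices of the most similar annotated samples
print(topk.tolist())                   # these samples (and their masks) become SAM 2 memories
```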
+
+
+
+
+ + ☆ PriorMapNet: Enhancing Online Vectorized HD Map Construction with Priors + + +
+ Online vectorized High-Definition (HD) map construction is crucial for +subsequent prediction and planning tasks in autonomous driving. Following MapTR +paradigm, recent works have made noteworthy achievements. However, reference +points are randomly initialized in mainstream methods, leading to unstable +matching between predictions and ground truth. To address this issue, we +introduce PriorMapNet to enhance online vectorized HD map construction with +priors. We propose the PPS-Decoder, which provides reference points with +position and structure priors. Fitted from the map elements in the dataset, +prior reference points lower the learning difficulty and achieve stable +matching. Furthermore, we propose the PF-Encoder to enhance the image-to-BEV +transformation with BEV feature priors. Besides, we propose the DMD +cross-attention, which decouples cross-attention along multi-scale and +multi-sample respectively to achieve efficiency. Our proposed PriorMapNet +achieves state-of-the-art performance in the online vectorized HD map +construction task on nuScenes and Argoverse2 datasets. The code will be +released publicly soon. + +
+
+
+
+
+ + ☆ Backward-Compatible Aligned Representations via an Orthogonal + Transformation Layer ECCV2024 + + +
+ Visual retrieval systems face significant challenges when updating models +with improved representations due to misalignment between the old and new +representations. The costly and resource-intensive backfilling process involves +recalculating feature vectors for images in the gallery set whenever a new +model is introduced. To address this, prior research has explored +backward-compatible training methods that enable direct comparisons between new +and old representations without backfilling. Despite these advancements, +achieving a balance between backward compatibility and the performance of +independently trained models remains an open problem. In this paper, we address +it by expanding the representation space with additional dimensions and +learning an orthogonal transformation to achieve compatibility with old models +and, at the same time, integrate new information. This transformation preserves +the original feature space's geometry, ensuring that our model aligns with +previous versions while also learning new data. Our Orthogonal Compatible +Aligned (OCA) approach eliminates the need for re-indexing during model updates +and ensures that features can be compared directly across different model +updates without additional mapping functions. Experimental results on CIFAR-100 +and ImageNet-1k demonstrate that our method not only maintains compatibility +with previous models but also achieves state-of-the-art accuracy, outperforming +several existing methods. + +
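+ The sketch below shows one way to keep the learned mapping orthogonal using PyTorch's orthogonal parametrization; it mirrors the idea described above but is not necessarily the authors' exact layer or training setup.

```python
import torch
import torch.nn as nn
from torch.nn.utils.parametrizations import orthogonal

d_new = 512
# A learnable linear map whose weight is constrained to stay orthogonal.
transform = orthogonal(nn.Linear(d_new, d_new, bias=False))

new_feats = torch.randn(8, d_new)
compat_feats = transform(new_feats)     # features intended to be comparable with the old gallery

# Orthogonality preserves inner products, hence the original retrieval geometry:
w = transform.weight
print((w @ w.T - torch.eye(d_new)).abs().max())   # ~0
```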
+
+ comment: Accepted at BEW2024 Workshop at ECCV2024 +
+
+
+
+
+ + ☆ Assessing Generalization Capabilities of Malaria Diagnostic Models from + Thin Blood Smears MICCAI 2024 + + +
+ Malaria remains a significant global health challenge, necessitating rapid +and accurate diagnostic methods. While computer-aided diagnosis (CAD) tools +utilizing deep learning have shown promise, their generalization to diverse +clinical settings remains poorly assessed. This study evaluates the +generalization capabilities of a CAD model for malaria diagnosis from thin +blood smear images across four sites. We explore strategies to enhance +generalization, including fine-tuning and incremental learning. Our results +demonstrate that incorporating site-specific data significantly improves model +performance, paving the way for broader clinical application. + +
+
+ comment: MICCAI 2024 AMAI Workshop, Accepted for presentation, Submitted + Manuscript Version, 10 pages +
+
+
+
+
+ + ☆ A Disease-Specific Foundation Model Using Over 100K Fundus Images: + Release and Validation for Abnormality and Multi-Disease Classification on + Downstream Tasks + + +
+ Artificial intelligence applied to retinal images offers significant +potential for recognizing signs and symptoms of retinal conditions and +expediting the diagnosis of eye diseases and systemic disorders. However, +developing generalized artificial intelligence models for medical data often +requires a large number of labeled images representing various disease signs, +and most models are typically task-specific, focusing on major retinal +diseases. In this study, we developed a Fundus-Specific Pretrained Model +(Image+Fundus), a supervised artificial intelligence model trained to detect +abnormalities in fundus images. A total of 57,803 images were used to develop +this pretrained model, which achieved superior performance across various +downstream tasks, indicating that our proposed model outperforms other general +methods. Our Image+Fundus model offers a generalized approach to improve model +performance while reducing the number of labeled datasets required. +Additionally, it provides more disease-specific insights into fundus images, +with visualizations generated by our model. These disease-specific foundation +models are invaluable in enhancing the performance and efficiency of deep +learning models in the field of fundus imaging. + +
+
+ comment: 10 pages, 4 figures +
+
+
+
+
+ + ☆ Multi-task Learning Approach for Intracranial Hemorrhage Prognosis + + +
+ Prognosis after intracranial hemorrhage (ICH) is influenced by a complex +interplay between imaging and tabular data. Rapid and reliable prognosis are +crucial for effective patient stratification and informed treatment +decision-making. In this study, we aim to enhance image-based prognosis by +learning a robust feature representation shared between prognosis and the +clinical and demographic variables most highly correlated with it. Our approach +mimics clinical decision-making by reinforcing the model to learn valuable +prognostic data embedded in the image. We propose a 3D multi-task image model +to predict prognosis, Glasgow Coma Scale and age, improving accuracy and +interpretability. Our method outperforms current state-of-the-art baseline +image models, and demonstrates superior performance in ICH prognosis compared +to four board-certified neuroradiologists using only CT scans as input. We +further validate our model with interpretability saliency maps. Code is +available at https://github.com/MiriamCobo/MultitaskLearning_ICH_Prognosis.git. + +
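+ A schematic of the multi-task arrangement described above: a shared 3D encoder with separate heads for prognosis, Glasgow Coma Scale, and age; the backbone here is a stand-in, not the authors' architecture.

```python
import torch
import torch.nn as nn

class MultiTaskICH(nn.Module):
    def __init__(self, feat_dim=256):
        super().__init__()
        self.backbone = nn.Sequential(            # stand-in 3D encoder
            nn.Conv3d(1, 16, 3, stride=2, padding=1), nn.ReLU(),
            nn.AdaptiveAvgPool3d(1), nn.Flatten(), nn.Linear(16, feat_dim), nn.ReLU(),
        )
        self.prognosis = nn.Linear(feat_dim, 2)   # e.g. favourable vs. unfavourable outcome
        self.gcs = nn.Linear(feat_dim, 1)         # Glasgow Coma Scale (regression)
        self.age = nn.Linear(feat_dim, 1)         # age (regression)

    def forward(self, ct):
        z = self.backbone(ct)
        return self.prognosis(z), self.gcs(z), self.age(z)

model = MultiTaskICH()
out = model(torch.randn(2, 1, 32, 64, 64))        # a toy batch of CT volumes
print([o.shape for o in out])
```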
+
+ comment: 16 pages +
+
+
+
+
+ + ☆ VF-NeRF: Learning Neural Vector Fields for Indoor Scene Reconstruction + + +
+ Implicit surfaces via neural radiance fields (NeRF) have shown surprising
+accuracy in surface reconstruction. Despite their success in reconstructing
+richly textured surfaces, existing methods struggle with planar regions with
+weak textures, which account for the majority of indoor scenes. In this paper,
+we address indoor dense surface reconstruction by revisiting key aspects of
+NeRF in order to use the recently proposed Vector Field (VF) as the implicit
+representation. VF is defined by the unit vector directed to the nearest
+surface point. It therefore flips direction at the surface and equals the
+explicit surface normal. Except for this flip, VF remains constant along
+planar surfaces and provides a strong inductive bias in representing planar
+surfaces. Concretely, we develop a novel density-VF relationship and a training
+scheme that allows us to learn VF via volume rendering. By doing this, VF-NeRF
+can model large planar surfaces and sharp corners accurately. We show that,
+when depth cues are available, our method further improves and achieves
+state-of-the-art results in reconstructing indoor scenes and rendering novel
+views. We extensively evaluate VF-NeRF on indoor datasets and run ablations of
+its components.
+
+
+ comment: 15 pages +
+
+
+
+
+ + ☆ PCP-MAE: Learning to Predict Centers for Point Masked Autoencoders + + +
+ Masked autoencoder has been widely explored in point cloud self-supervised +learning, whereby the point cloud is generally divided into visible and masked +parts. These methods typically include an encoder accepting visible patches +(normalized) and corresponding patch centers (position) as input, with the +decoder accepting the output of the encoder and the centers (position) of the +masked parts to reconstruct each point in the masked patches. Then, the +pre-trained encoders are used for downstream tasks. In this paper, we show a +motivating empirical result that when directly feeding the centers of masked +patches to the decoder without information from the encoder, it still +reconstructs well. In other words, the centers of patches are important and the +reconstruction objective does not necessarily rely on representations of the +encoder, thus preventing the encoder from learning semantic representations. +Based on this key observation, we propose a simple yet effective method, i.e., +learning to Predict Centers for Point Masked AutoEncoders (PCP-MAE) which +guides the model to learn to predict the significant centers and use the +predicted centers to replace the directly provided centers. Specifically, we +propose a Predicting Center Module (PCM) that shares parameters with the +original encoder with extra cross-attention to predict centers. Our method is +of high pre-training efficiency compared to other alternatives and achieves +great improvement over Point-MAE, particularly outperforming it by 5.50%, +6.03%, and 5.17% on three variants of ScanObjectNN. The code will be made +publicly available. + +
+
+
+
+
+ + ☆ Comparative Analysis of Generative Models: Enhancing Image Synthesis + with VAEs, GANs, and Stable Diffusion + + +
+ This paper examines three major generative modelling frameworks: Variational
+Autoencoders (VAEs), Generative Adversarial Networks (GANs), and Stable
+Diffusion models. VAEs are effective at learning latent representations but
+frequently yield blurry results. GANs can generate realistic images but face
+issues such as mode collapse. Stable Diffusion models, while producing
+high-quality images with strong semantic coherence, are demanding in terms of
+computational resources. Additionally, the paper explores how incorporating
+Grounding DINO and Grounded SAM with Stable Diffusion improves image accuracy
+by utilising sophisticated segmentation and inpainting techniques. The analysis
+provides guidance on selecting suitable models for various applications and
+highlights areas for further research.
+
+
+
+
+
+ + ☆ MicroSSIM: Improved Structural Similarity for Comparing Microscopy Data ECCV 24 + + +
+ Microscopy is routinely used to image biological structures of interest. Due
+to imaging constraints, acquired images are typically low-SNR and contain
+noise. Over the last few years, regression-based tasks like unsupervised
+denoising and splitting have found utility in working with such noisy
+micrographs. For evaluation, Structural Similarity (SSIM) is one of the most
+popular measures used in the field. For such tasks, the best evaluation would
+be when both low-SNR noisy images and corresponding high-SNR clean images are
+obtained directly from a microscope. However, due to the following three
+peculiar properties of the microscopy data, we observe that SSIM is not well
+suited to this data regime: (a) high-SNR micrographs have higher intensity
+pixels as compared to low-SNR micrographs, (b) high-SNR micrographs have
+higher intensity pixels than those found in natural images, for which SSIM was
+developed, and (c) a digitally configurable offset is added by the detector
+present inside the microscope. We show that SSIM components behave unexpectedly
+when the prediction generated from low-SNR input is compared with the
+corresponding high-SNR data. We explain this behavior by introducing the
+phenomenon of saturation, where the value of SSIM components becomes less
+sensitive to (dis)similarity between the images. We introduce MicroSSIM, a
+variant of SSIM, which overcomes the above-discussed issues. We justify the
+soundness and utility of MicroSSIM using theoretical and empirical arguments
+and show the utility of MicroSSIM on two tasks: unsupervised denoising and
+joint image splitting with unsupervised denoising. Since our formulation can be
+applied to a broad family of SSIM-based measures, we also introduce MicroMS3IM,
+a microscopy-specific variation of MS-SSIM. The source code and Python package
+are available at https://github.com/juglab/MicroSSIM.
+
+
+ comment: Accepted at BIC workshop, ECCV 24 +
+
+
+
+
+ + ☆ A lifted Bregman strategy for training unfolded proximal neural network + Gaussian denoisers + + +
+ Unfolded proximal neural networks (PNNs) form a family of methods that
+combines deep learning and proximal optimization approaches. They consist in
+designing a neural network for a specific task by unrolling a proximal
+algorithm for a fixed number of iterations, where linearities can be learned
+from a prior training procedure. PNNs have been shown to be more robust than
+traditional deep learning approaches while achieving at least comparable
+performance, in particular in computational imaging. However, training PNNs
+still depends on the efficiency of available training algorithms. In this work,
+we propose a lifted training formulation based on Bregman distances for
+unfolded PNNs. Leveraging the deterministic mini-batch block-coordinate
+forward-backward method, we design a bespoke computational strategy beyond
+traditional back-propagation methods for solving the resulting learning problem
+efficiently. We assess the behaviour of the proposed training approach for PNNs
+through numerical simulations on image denoising, considering a denoising PNN
+whose structure is based on dual proximal-gradient iterations.
+
+
+ comment: 2024 IEEE International Workshop on Machine Learning for Signal + Processing, Sept. 22--25, 2024, London, UK +
+
+
+
+
+ + ☆ Task-Aware Dynamic Transformer for Efficient Arbitrary-Scale Image + Super-Resolution ECAI 2024 + + +
+ Arbitrary-scale super-resolution (ASSR) aims to learn a single model for +image super-resolution at arbitrary magnifying scales. Existing ASSR networks +typically comprise an off-the-shelf scale-agnostic feature extractor and an +arbitrary scale upsampler. These feature extractors often use fixed network +architectures to address different ASSR inference tasks, each of which is +characterized by an input image and an upsampling scale. However, this +overlooks the difficulty variance of super-resolution on different inference +scenarios, where simple images or small SR scales could be resolved with less +computational effort than difficult images or large SR scales. To tackle this +difficulty variability, in this paper, we propose a Task-Aware Dynamic +Transformer (TADT) as an input-adaptive feature extractor for efficient image +ASSR. Our TADT consists of a multi-scale feature extraction backbone built upon +groups of Multi-Scale Transformer Blocks (MSTBs) and a Task-Aware Routing +Controller (TARC). The TARC predicts the inference paths within feature +extraction backbone, specifically selecting MSTBs based on the input images and +SR scales. The prediction of inference path is guided by a new loss function to +trade-off the SR accuracy and efficiency. Experiments demonstrate that, when +working with three popular arbitrary-scale upsamplers, our TADT achieves +state-of-the-art ASSR performance when compared with mainstream feature +extractors, but with relatively fewer computational costs. The code will be +publicly released. + +
+
+ comment: ECAI 2024 +
+
+
+
+
+ + ☆ Correspondence-Guided SfM-Free 3D Gaussian Splatting for NVS + + +
+ Novel View Synthesis (NVS) without Structure-from-Motion (SfM) pre-processed +camera poses--referred to as SfM-free methods--is crucial for promoting rapid +response capabilities and enhancing robustness against variable operating +conditions. Recent SfM-free methods have integrated pose optimization, +designing end-to-end frameworks for joint camera pose estimation and NVS. +However, most existing works rely on per-pixel image loss functions, such as L2 +loss. In SfM-free methods, inaccurate initial poses lead to misalignment issue, +which, under the constraints of per-pixel image loss functions, results in +excessive gradients, causing unstable optimization and poor convergence for +NVS. In this study, we propose a correspondence-guided SfM-free 3D Gaussian +splatting for NVS. We use correspondences between the target and the rendered +result to achieve better pixel alignment, facilitating the optimization of +relative poses between frames. We then apply the learned poses to optimize the +entire scene. Each 2D screen-space pixel is associated with its corresponding +3D Gaussians through approximated surface rendering to facilitate gradient back +propagation. Experimental results underline the superior performance and time +efficiency of the proposed approach compared to the state-of-the-art baselines. + +
+
+ comment: arXiv admin note: text overlap with arXiv:2312.07504 by other authors +
+
+
+
+
+ + ☆ Decoupling Feature Representations of Ego and Other Modalities for + Incomplete Multi-modal Brain Tumor Segmentation + + +
+ Multi-modal brain tumor segmentation typically involves four magnetic +resonance imaging (MRI) modalities, while incomplete modalities significantly +degrade performance. Existing solutions employ explicit or implicit modality +adaptation, aligning features across modalities or learning a fused feature +robust to modality incompleteness. They share a common goal of encouraging each +modality to express both itself and the others. However, the two expression +abilities are entangled as a whole in a seamless feature space, resulting in +prohibitive learning burdens. In this paper, we propose DeMoSeg to enhance the +modality adaptation by Decoupling the task of representing the ego and other +Modalities for robust incomplete multi-modal Segmentation. The decoupling is +super lightweight by simply using two convolutions to map each modality onto +four feature sub-spaces. The first sub-space expresses itself (Self-feature), +while the remaining sub-spaces substitute for other modalities +(Mutual-features). The Self- and Mutual-features interactively guide each other +through a carefully-designed Channel-wised Sparse Self-Attention (CSSA). After +that, a Radiologist-mimic Cross-modality expression Relationships (RCR) is +introduced to have available modalities provide Self-feature and also `lend' +their Mutual-features to compensate for the absent ones by exploiting the +clinical prior knowledge. The benchmark results on BraTS2020, BraTS2018 and +BraTS2015 verify the DeMoSeg's superiority thanks to the alleviated modality +adaptation difficulty. Concretely, for BraTS2020, DeMoSeg increases Dice by at +least 0.92%, 2.95% and 4.95% on whole tumor, tumor core and enhanced tumor +regions, respectively, compared to other state-of-the-arts. Codes are at +https://github.com/kk42yy/DeMoSeg + +
+
+ comment: 8 pages, 4 figures +
+
+
+
+
+ + ☆ Beyond the Hype: A dispassionate look at vision-language models in + medical scenario + + +
+ Recent advancements in Large Vision-Language Models (LVLMs) have demonstrated +remarkable capabilities across diverse tasks, garnering significant attention +in AI communities. However, their performance and reliability in specialized +domains such as medicine remain insufficiently assessed. In particular, most +assessments over-concentrate in evaluating VLMs based on simple Visual Question +Answering (VQA) on multi-modality data, while ignoring the in-depth +characteristic of LVLMs. In this study, we introduce RadVUQA, a novel +Radiological Visual Understanding and Question Answering benchmark, to +comprehensively evaluate existing LVLMs. RadVUQA mainly validates LVLMs across +five dimensions: 1) Anatomical understanding, assessing the models' ability to +visually identify biological structures; 2) Multimodal comprehension, which +involves the capability of interpreting linguistic and visual instructions to +produce desired outcomes; 3) Quantitative and spatial reasoning, evaluating the +models' spatial awareness and proficiency in combining quantitative analysis +with visual and linguistic information; 4) Physiological knowledge, measuring +the models' capability to comprehend functions and mechanisms of organs and +systems; and 5) Robustness, which assesses the models' capabilities against +unharmonised and synthetic data. The results indicate that both generalized +LVLMs and medical-specific LVLMs have critical deficiencies with weak +multimodal comprehension and quantitative reasoning capabilities. Our findings +reveal the large gap between existing LVLMs and clinicians, highlighting the +urgent need for more robust and intelligent LVLMs. The code and dataset will be +available after the acceptance of this paper. + +
+
+ comment: 10 pages +
+
+
+
+
+ + ☆ TsCA: On the Semantic Consistency Alignment via Conditional Transport + for Compositional Zero-Shot Learning + + +
+ Compositional Zero-Shot Learning (CZSL) aims to recognize novel +\textit{state-object} compositions by leveraging the shared knowledge of their +primitive components. Despite considerable progress, effectively calibrating +the bias between semantically similar multimodal representations, as well as +generalizing pre-trained knowledge to novel compositional contexts, remains an +enduring challenge. In this paper, our interest is to revisit the conditional +transport (CT) theory and its homology to the visual-semantics interaction in +CZSL and further, propose a novel Trisets Consistency Alignment framework +(dubbed TsCA) that well-addresses these issues. Concretely, we utilize three +distinct yet semantically homologous sets, i.e., patches, primitives, and +compositions, to construct pairwise CT costs to minimize their semantic +discrepancies. To further ensure the consistency transfer within these sets, we +implement a cycle-consistency constraint that refines the learning by +guaranteeing the feature consistency of the self-mapping during transport flow, +regardless of modality. Moreover, we extend the CT plans to an open-world +setting, which enables the model to effectively filter out unfeasible pairs, +thereby speeding up the inference as well as increasing the accuracy. Extensive +experiments are conducted to verify the effectiveness of the proposed method. + +
+
+
+
+
+ + ☆ HyCoT: Hyperspectral Compression Transformer with an Efficient Training + Strategy + + +
+ The development of learning-based hyperspectral image (HSI) compression +models has recently attracted significant interest. Existing models +predominantly utilize convolutional filters, which capture only local +dependencies. Furthermore, they often incur high training costs and exhibit +substantial computational complexity. To address these limitations, in this +paper we propose Hyperspectral Compression Transformer (HyCoT) that is a +transformer-based autoencoder for pixelwise HSI compression. Additionally, we +introduce an efficient training strategy to accelerate the training process. +Experimental results on the HySpecNet-11k dataset demonstrate that HyCoT +surpasses the state-of-the-art across various compression ratios by over 1 dB +with significantly reduced computational requirements. Our code and pre-trained +weights are publicly available at https://git.tu-berlin.de/rsim/hycot . + +
+
+
+
+
+ + ☆ LLM-PCGC: Large Language Model-based Point Cloud Geometry Compression + + +
+ The key to effective point cloud compression is to obtain a robust context +model consistent with complex 3D data structures. Recently, the advancement of +large language models (LLMs) has highlighted their capabilities not only as +powerful generators for in-context learning and generation but also as +effective compressors. These dual attributes of LLMs make them particularly +well-suited to meet the demands of data compression. Therefore, this paper +explores the potential of using LLM for compression tasks, focusing on lossless +point cloud geometry compression (PCGC) experiments. However, applying LLM +directly to PCGC tasks presents some significant challenges, i.e., LLM does not +understand the structure of the point cloud well, and it is a difficult task to +fill the gap between text and point cloud through text description, especially +for large complicated and small shapeless point clouds. To address these +problems, we introduce a novel architecture, namely the Large Language +Model-based Point Cloud Geometry Compression (LLM-PCGC) method, using LLM to +compress point cloud geometry information without any text description or +aligning operation. By utilizing different adaptation techniques for +cross-modality representation alignment and semantic consistency, including +clustering, K-tree, token mapping invariance, and Low Rank Adaptation (LoRA), +the proposed method can translate LLM to a compressor/generator for point +cloud. To the best of our knowledge, this is the first structure to employ LLM +as a compressor for point cloud data. Experiments demonstrate that the LLM-PCGC +outperforms the other existing methods significantly, by achieving -40.213% bit +rate reduction compared to the reference software of MPEG Geometry-based Point +Cloud Compression (G-PCC) standard, and by achieving -2.267% bit rate reduction +compared to the state-of-the-art learning-based method. + +
+
+
+
+
+ + ☆ Towards Physical World Backdoor Attacks against Skeleton Action + Recognition ECCV 2024 + + +
+ Skeleton Action Recognition (SAR) has attracted significant interest for its
+efficient representation of the human skeletal structure. Despite its
+advancements, recent studies have raised security concerns in SAR models,
+particularly their vulnerability to adversarial attacks. However, such
+strategies are limited to digital scenarios and ineffective in physical
+attacks, limiting their real-world applicability. To investigate the
+vulnerabilities of SAR in the physical world, we introduce the Physical
+Skeleton Backdoor Attacks (PSBA), the first exploration of physical backdoor
+attacks against SAR. Considering the practicalities of physical execution, we
+introduce a novel trigger implantation method that integrates infrequent and
+imperceptible actions as triggers into the original skeleton data. By
+incorporating a minimal amount of this manipulated data into the training set,
+PSBA enables the system to misclassify any skeleton sequence into the target
+class when the trigger action is present. We examine the resilience of PSBA in
+both poisoned and clean-label scenarios, demonstrating its efficacy across a
+range of datasets, poisoning ratios, and model architectures. Additionally, we
+introduce a trigger-enhancing strategy to strengthen attack performance in the
+clean-label setting. The robustness of PSBA is tested against three distinct
+backdoor defenses, and the stealthiness of PSBA is evaluated using two
+quantitative metrics. Furthermore, by employing a Kinect V2 camera, we compile
+a dataset of human actions from the real world to mimic physical attack
+situations, with our findings confirming the effectiveness of our proposed
+attacks. Our project website can be found at
+https://qichenzheng.github.io/psba-website.
+
+
+ comment: Accepted by ECCV 2024 +
+
+
+
+
+ + ☆ Adaptive Layer Selection for Efficient Vision Transformer Fine-Tuning + + +
+ Recently, foundation models based on Vision Transformers (ViTs) have become +widely available. However, their fine-tuning process is highly +resource-intensive, and it hinders their adoption in several edge or low-energy +applications. To this end, in this paper we introduce an efficient fine-tuning +method for ViTs called $\textbf{ALaST}$ ($\textit{Adaptive Layer Selection +Fine-Tuning for Vision Transformers}$) to speed up the fine-tuning process +while reducing computational cost, memory load, and training time. Our approach +is based on the observation that not all layers are equally critical during +fine-tuning, and their importance varies depending on the current mini-batch. +Therefore, at each fine-tuning step, we adaptively estimate the importance of +all layers and we assign what we call ``compute budgets'' accordingly. Layers +that were allocated lower budgets are either trained with a reduced number of +input tokens or kept frozen. Freezing a layer reduces the computational cost +and memory usage by preventing updates to its weights, while discarding tokens +removes redundant data, speeding up processing and reducing memory +requirements. We show that this adaptive compute allocation enables a +nearly-optimal schedule for distributing computational resources across layers, +resulting in substantial reductions in training time (up to 1.5x), FLOPs (up to +2x), and memory load (up to 2x) compared to traditional full fine-tuning +approaches. Additionally, it can be successfully combined with other +parameter-efficient fine-tuning methods, such as LoRA. + +
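+ The per-step freezing mechanism can be pictured with the short sketch below,
+assuming a torchvision ViT; the importance score here is a random placeholder
+rather than the paper's estimator, and token dropping is omitted.
+
+import torch
+from torchvision.models import vit_b_16
+
+def allocate_budgets(blocks, scores, keep_ratio=0.5):
+    """Keep the most important encoder blocks trainable for this mini-batch
+    and freeze the rest by toggling requires_grad."""
+    k = max(1, int(len(blocks) * keep_ratio))
+    keep = set(torch.topk(scores, k).indices.tolist())
+    for i, blk in enumerate(blocks):
+        for p in blk.parameters():
+            p.requires_grad_(i in keep)
+
+model = vit_b_16()
+blocks = list(model.encoder.layers)
+scores = torch.rand(len(blocks))  # placeholder for a real importance estimate
+allocate_budgets(blocks, scores, keep_ratio=0.5)
+print(sum(p.requires_grad for p in model.parameters()), "trainable tensors")
+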
+
+
+
+
+ + ☆ QMambaBSR: Burst Image Super-Resolution with Query State Space Model + + +
+ Burst super-resolution aims to reconstruct high-resolution images with higher
+quality and richer details by fusing the sub-pixel information from multiple
+burst low-resolution frames. In burst super-resolution, the key challenge lies
+in extracting sub-pixel details complementary to the base frame's content while
+simultaneously suppressing high-frequency noise disturbance. Existing methods
+attempt to extract sub-pixels by modeling inter-frame relationships frame by
+frame while overlooking the mutual correlations among multiple current frames
+and neglecting the intra-frame interactions, leading to inaccurate and noisy
+sub-pixels for base frame super-resolution. Further, existing methods mainly
+employ static upsampling with fixed parameters to improve spatial resolution
+for all scenes, failing to perceive the sub-pixel distribution difference
+across multiple frames and to balance the fusion weights of different frames,
+resulting in over-smoothed details and artifacts. To address these limitations,
+we introduce a novel Query Mamba Burst Super-Resolution (QMambaBSR) network,
+which incorporates a Query State Space Model (QSSM) and an Adaptive Up-sampling
+module (AdaUp). Specifically, based on the observation that sub-pixels have a
+consistent spatial distribution while random noise is inconsistently
+distributed, a novel QSSM is proposed to efficiently extract sub-pixels through
+inter-frame querying and intra-frame scanning while mitigating noise
+interference in a single step. Moreover, AdaUp is designed to dynamically
+adjust the upsampling kernel based on the spatial distribution of multi-frame
+sub-pixel information in different burst scenes, thereby facilitating the
+reconstruction of the spatial arrangement of high-resolution details. Extensive
+experiments on four popular synthetic and real-world benchmarks demonstrate
+that our method achieves a new state-of-the-art performance.
+
+
+
+
+
+ + ☆ Modeling the Neonatal Brain Development Using Implicit Neural + Representations MICCAI 2024 + + +
+ The human brain undergoes rapid development during the third trimester of +pregnancy. In this work, we model the neonatal development of the infant brain +in this age range. As a basis, we use MR images of preterm- and term-birth +neonates from the developing human connectome project (dHCP). We propose a +neural network, specifically an implicit neural representation (INR), to +predict 2D- and 3D images of varying time points. In order to model a +subject-specific development process, it is necessary to disentangle the age +from the subjects' identity in the latent space of the INR. We propose two +methods, Subject Specific Latent Vectors (SSL) and Stochastic Global Latent +Augmentation (SGLA), enabling this disentanglement. We perform an analysis of +the results and compare our proposed model to an age-conditioned denoising +diffusion model as a baseline. We also show that our method can be applied in a +memory-efficient way, which is especially important for 3D data. + +
+
+ comment: Preprint, Accepted for PRIME MICCAI 2024 +
+
+
+
+
+ + ☆ Extracting polygonal footprints in off-nadir images with Segment + Anything Model + + +
+ Building Footprint Extraction (BFE) in off-nadir aerial images often relies
+on roof segmentation and roof-to-footprint offset prediction, then dragging
+the roof to the footprint via the offset. However, the results from this
+multi-stage inference are not applicable in data production, because of the
+low quality of masks given by prediction. To solve this problem, we propose
+OBMv2 in this paper, which supports both end-to-end and promptable polygonal
+footprint prediction. Different from OBM, OBMv2 uses a newly proposed Self
+Offset Attention (SOFA) to bridge the performance gap between bungalows and
+skyscrapers, realizing a real end-to-end footprint polygon prediction without
+postprocessing such as Non-Maximum Suppression (NMS) and Distance NMS (DNMS).
+To fully use the information contained in roof masks, building masks and
+offsets, we propose a Multi-level Information SyStem (MISS) for footprint
+prediction, with which OBMv2 can predict footprints even with insufficient
+predictions. Additionally, to squeeze information from the same model, we were
+inspired by Retrieval-Augmented Generation (RAG) in Natural Language Processing
+and propose the "RAG in BFE" problem. To verify the effectiveness of the
+proposed method, experiments were conducted on the open datasets BONAI and
+OmniCity-view3. A generalization test was also conducted on the Huizhou test
+set. The code will be available at \url{https://github.com/likaiucas/OBM}.
+
+
+
+
+
+ + ☆ Historical Printed Ornaments: Dataset and Tasks + + +
+ This paper aims to develop the study of historical printed ornaments with +modern unsupervised computer vision. We highlight three complex tasks that are +of critical interest to book historians: clustering, element discovery, and +unsupervised change localization. For each of these tasks, we introduce an +evaluation benchmark, and we adapt and evaluate state-of-the-art models. Our +Rey's Ornaments dataset is designed to be a representative example of a set of +ornaments historians would be interested in. It focuses on an XVIIIth century +bookseller, Marc-Michel Rey, providing a consistent set of ornaments with a +wide diversity and representative challenges. Our results highlight the +limitations of state-of-the-art models when faced with real data and show +simple baselines such as k-means or congealing can outperform more +sophisticated approaches on such data. Our dataset and code can be found at +https://printed-ornaments.github.io/. + +
+
+
+
+
+ + ☆ A Survey on Benchmarks of Multimodal Large Language Models + + +
+ Multimodal Large Language Models (MLLMs) are gaining increasing popularity in
+both academia and industry due to their remarkable performance in various
+applications such as visual question answering, visual perception,
+understanding, and reasoning. Over the past few years, significant efforts have
+been made to examine MLLMs from multiple perspectives. This paper presents a
+comprehensive review of \textbf{180 benchmarks} and evaluations for MLLMs,
+focusing on (1) perception and understanding, (2) cognition and reasoning,
+(3) specific domains, (4) key capabilities, and (5) other modalities. Finally,
+we discuss the limitations of the current evaluation methods for MLLMs and
+explore promising future directions. Our key argument is that evaluation should
+be regarded as a crucial discipline to better support the development of MLLMs.
+For more details, please visit our GitHub repository:
+https://github.com/swordlidev/Evaluation-Multimodal-LLMs-Survey.
+
+
+
+
+
+ + ☆ SketchRef: A Benchmark Dataset and Evaluation Metrics for Automated + Sketch Synthesis + + +
+ Sketch, a powerful artistic technique to capture essential visual information
+about real-world objects, is increasingly gaining attention in the image
+synthesis field. However, evaluating the quality of synthesized sketches
+presents unique unsolved challenges. Current evaluation methods for sketch
+synthesis are inadequate due to the lack of a unified benchmark dataset,
+over-reliance on classification accuracy for recognizability, and unfair
+evaluation of sketches with different levels of simplification. To address
+these issues, we introduce SketchRef, a benchmark dataset comprising 4
+categories of reference photos--animals, human faces, human bodies, and common
+objects--alongside novel evaluation metrics. Considering that classification
+accuracy is insufficient to measure the structural consistency between a sketch
+and its reference photo, we propose the mean Object Keypoint Similarity (mOKS)
+metric, utilizing pose estimation to assess structure-level recognizability. To
+ensure fair evaluation of sketches with different simplification levels, we
+propose a recognizability calculation method constrained by simplicity. We also
+collect 8K responses from art enthusiasts, validating the effectiveness of our
+proposed evaluation methods. We hope this work can provide a comprehensive
+evaluation of sketch synthesis algorithms, thereby aligning their performance
+more closely with human understanding.
+
+
+
+
+
+ + ☆ Reference-free Axial Super-resolution of 3D Microscopy Images using + Implicit Neural Representation with a 2D Diffusion Prior MICCAI2024 + + +
+ Analysis and visualization of 3D microscopy images pose challenges due to +anisotropic axial resolution, demanding volumetric super-resolution along the +axial direction. While training a learning-based 3D super-resolution model +seems to be a straightforward solution, it requires ground truth isotropic +volumes and suffers from the curse of dimensionality. Therefore, existing +methods utilize 2D neural networks to reconstruct each axial slice, eventually +piecing together the entire volume. However, reconstructing each slice in the +pixel domain fails to give consistent reconstruction in all directions leading +to misalignment artifacts. In this work, we present a reconstruction framework +based on implicit neural representation (INR), which allows 3D coherency even +when optimized by independent axial slices in a batch-wise manner. Our method +optimizes a continuous volumetric representation from low-resolution axial +slices, using a 2D diffusion prior trained on high-resolution lateral slices +without requiring isotropic volumes. Through experiments on real and synthetic +anisotropic microscopy images, we demonstrate that our method surpasses other +state-of-the-art reconstruction methods. The source code is available on +GitHub: https://github.com/hvcl/INR-diffusion. + +
+
+ comment: MICCAI2024 accepted +
+
+
+
+
+ + ☆ Generative Dataset Distillation Based on Diffusion Model ECCV 2024 + + +
+ This paper presents our method for the generative track of The First Dataset
+Distillation Challenge at ECCV 2024. Since the diffusion model has become the
+mainstay of generative models because of its high-quality generative effects,
+we focus on distillation methods based on the diffusion model. Considering that
+the track only allows a fixed number of images to be generated in 10 minutes
+using a generative model for the CIFAR-100 and Tiny-ImageNet datasets, we need
+to use a generative model that can generate images at high speed. In this
+study, we propose a novel generative dataset distillation method based on
+Stable Diffusion. Specifically, we use the SDXL-Turbo model, which can generate
+images at high speed and quality. Compared to other diffusion models that can
+only reach an images-per-class (IPC) value of 1, our method achieves an IPC of
+10 for Tiny-ImageNet and an IPC of 20 for CIFAR-100. Additionally, to generate
+high-quality distilled datasets for CIFAR-100 and Tiny-ImageNet, we use the
+class information as text prompts and apply post-generation data augmentation
+for the SDXL-Turbo model. Experimental results show the effectiveness of the
+proposed method, and we achieved third place in the generative track of the
+ECCV 2024 DD Challenge. Codes are available at
+https://github.com/Guang000/BANKO.
+
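+ The generation loop can be approximated with the diffusers library as in the
+sketch below. The prompt template, output resolution, and per-class loop are
+illustrative assumptions; the authors' prompt design and data augmentation are
+not reproduced.
+
+import torch
+from diffusers import AutoPipelineForText2Image
+
+pipe = AutoPipelineForText2Image.from_pretrained(
+    "stabilityai/sdxl-turbo", torch_dtype=torch.float16
+).to("cuda")
+
+def distill_class(class_name, ipc=10):
+    """Generate `ipc` distilled images for one class, using the class name as
+    the text prompt; SDXL-Turbo needs only a single denoising step."""
+    prompt = f"a photo of a {class_name}"
+    images = []
+    for _ in range(ipc):
+        out = pipe(prompt=prompt, num_inference_steps=1, guidance_scale=0.0)
+        images.append(out.images[0].resize((64, 64)))  # Tiny-ImageNet size
+    return images
+
+samples = distill_class("golden retriever", ipc=10)
+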
+
+ comment: The Third Place Winner in Generative Track of the ECCV 2024 DD + Challenge +
+
+
+
+
+ + ☆ Bi-Directional Deep Contextual Video Compression + + +
+ Deep video compression has made remarkable progress in recent years, with the
+majority of advancements concentrated on P-frame coding. Although efforts to
+enhance B-frame coding are ongoing, their compression performance is still far
+behind that of traditional bi-directional video codecs. In this paper, we
+introduce a bi-directional deep contextual video compression scheme tailored
+for B-frames, termed DCVC-B, to improve the compression performance of deep
+B-frame coding. Our scheme mainly has three key innovations. First, we develop
+a bi-directional motion difference context propagation method for effective
+motion difference coding, which significantly reduces the bit cost of
+bi-directional motions. Second, we propose a bi-directional contextual
+compression model and a corresponding bi-directional temporal entropy model, to
+make better use of the multi-scale temporal contexts. Third, we propose a
+hierarchical quality structure-based training strategy, leading to an effective
+bit allocation across large groups of pictures (GOP). Experimental results show
+that our DCVC-B achieves an average reduction of 26.6% in BD-Rate compared to
+the reference software for H.265/HEVC under random access conditions.
+Remarkably, it surpasses the performance of the H.266/VVC reference software on
+certain test datasets under the same configuration.
+
+
+
+
+
+ + ☆ Learning A Low-Level Vision Generalist via Visual Task Prompt + + +
+ Building a unified model for general low-level vision tasks holds significant +research and practical value. Current methods encounter several critical +issues. Multi-task restoration approaches can address multiple +degradation-to-clean restoration tasks, while their applicability to tasks with +different target domains (e.g., image stylization) is limited. Methods like +PromptGIP can handle multiple input-target domains but rely on the Masked +Autoencoder (MAE) paradigm. Consequently, they are tied to the ViT +architecture, resulting in suboptimal image reconstruction quality. In +addition, these methods are sensitive to prompt image content and often +struggle with low-frequency information processing. In this paper, we propose a +Visual task Prompt-based Image Processing (VPIP) framework to overcome these +challenges. VPIP employs visual task prompts to manage tasks with different +input-target domains and allows flexible selection of backbone network suitable +for general tasks. Besides, a new prompt cross-attention is introduced to +facilitate interaction between the input and prompt information. Based on the +VPIP framework, we train a low-level vision generalist model, namely GenLV, on +30 diverse tasks. Experimental results show that GenLV can successfully address +a variety of low-level tasks, significantly outperforming existing methods both +quantitatively and qualitatively. Codes are available at +https://github.com/chxy95/GenLV. + +
+
+ comment: Accepted to ACMMM24 +
+
+
+
+
+ + ☆ MM-UNet: A Mixed MLP Architecture for Improved Ophthalmic Image + Segmentation + + +
+ Ophthalmic image segmentation serves as a critical foundation for ocular +disease diagnosis. Although fully convolutional neural networks (CNNs) are +commonly employed for segmentation, they are constrained by inductive biases +and face challenges in establishing long-range dependencies. Transformer-based +models address these limitations but introduce substantial computational +overhead. Recently, a simple yet efficient Multilayer Perceptron (MLP) +architecture was proposed for image classification, achieving competitive +performance relative to advanced transformers. However, its effectiveness for +ophthalmic image segmentation remains unexplored. In this paper, we introduce +MM-UNet, an efficient Mixed MLP model tailored for ophthalmic image +segmentation. Within MM-UNet, we propose a multi-scale MLP (MMLP) module that +facilitates the interaction of features at various depths through a grouping +strategy, enabling simultaneous capture of global and local information. We +conducted extensive experiments on both a private anterior segment optical +coherence tomography (AS-OCT) image dataset and a public fundus image dataset. +The results demonstrated the superiority of our MM-UNet model in comparison to +state-of-the-art deep segmentation networks. + +
+
+ comment: OMIA2024 +
+
+
+
+
+ + ☆ Zero-Shot Dual-Path Integration Framework for Open-Vocabulary 3D + Instance Segmentation CVPR 2024 + + +
+ Open-vocabulary 3D instance segmentation transcends traditional +closed-vocabulary methods by enabling the identification of both previously +seen and unseen objects in real-world scenarios. It leverages a dual-modality +approach, utilizing both 3D point clouds and 2D multi-view images to generate +class-agnostic object mask proposals. Previous efforts predominantly focused on +enhancing 3D mask proposal models; consequently, the information that could +come from 2D association to 3D was not fully exploited. This bias towards 3D +data, while effective for familiar indoor objects, limits the system's +adaptability to new and varied object types, where 2D models offer greater +utility. Addressing this gap, we introduce Zero-Shot Dual-Path Integration +Framework that equally values the contributions of both 3D and 2D modalities. +Our framework comprises three components: 3D pathway, 2D pathway, and Dual-Path +Integration. 3D pathway generates spatially accurate class-agnostic mask +proposals of common indoor objects from 3D point cloud data using a pre-trained +3D model, while 2D pathway utilizes pre-trained open-vocabulary instance +segmentation model to identify a diverse array of object proposals from +multi-view RGB-D images. In Dual-Path Integration, our Conditional Integration +process, which operates in two stages, filters and merges the proposals from +both pathways adaptively. This process harmonizes output proposals to enhance +segmentation capabilities. Our framework, utilizing pre-trained models in a +zero-shot manner, is model-agnostic and demonstrates superior performance on +both seen and unseen data, as evidenced by comprehensive evaluations on the +ScanNet200 and qualitative results on ARKitScenes datasets. + +
+
+ comment: OpenSUN 3D: 2nd Workshop on Open-Vocabulary 3D Scene Understanding + (CVPR 2024) +
+
+
+
+
+ + ☆ S-RAF: A Simulation-Based Robustness Assessment Framework for + Responsible Autonomous Driving + + +
+ As artificial intelligence (AI) technology advances, ensuring the robustness +and safety of AI-driven systems has become paramount. However, varying +perceptions of robustness among AI developers create misaligned evaluation +metrics, complicating the assessment and certification of safety-critical and +complex AI systems such as autonomous driving (AD) agents. To address this +challenge, we introduce Simulation-Based Robustness Assessment Framework +(S-RAF) for autonomous driving. S-RAF leverages the CARLA Driving simulator to +rigorously assess AD agents across diverse conditions, including faulty +sensors, environmental changes, and complex traffic situations. By quantifying +robustness and its relationship with other safety-critical factors, such as +carbon emissions, S-RAF aids developers and stakeholders in building safe and +responsible driving agents, and streamlining safety certification processes. +Furthermore, S-RAF offers significant advantages, such as reduced testing +costs, and the ability to explore edge cases that may be unsafe to test in the +real world. The code for this framework is available here: +https://github.com/cognitive-robots/rai-leaderboard + +
+
+
+
+
+ + ☆ TAMER: Tree-Aware Transformer for Handwritten Mathematical Expression + Recognition + + +
+ Handwritten Mathematical Expression Recognition (HMER) has extensive +applications in automated grading and office automation. However, existing +sequence-based decoding methods, which directly predict $\LaTeX$ sequences, +struggle to understand and model the inherent tree structure of $\LaTeX$ and +often fail to ensure syntactic correctness in the decoded results. To address +these challenges, we propose a novel model named TAMER (Tree-Aware Transformer) +for handwritten mathematical expression recognition. TAMER introduces an +innovative Tree-aware Module while maintaining the flexibility and efficient +training of Transformer. TAMER combines the advantages of both sequence +decoding and tree decoding models by jointly optimizing sequence prediction and +tree structure prediction tasks, which enhances the model's understanding and +generalization of complex mathematical expression structures. During inference, +TAMER employs a Tree Structure Prediction Scoring Mechanism to improve the +structural validity of the generated $\LaTeX$ sequences. Experimental results +on CROHME datasets demonstrate that TAMER outperforms traditional sequence +decoding and tree decoding models, especially in handling complex mathematical +structures, achieving state-of-the-art (SOTA) performance. + +
+
+
+
+
+ + ☆ Tuning a SAM-Based Model with Multi-Cognitive Visual Adapter to Remote + Sensing Instance Segmentation + + +
+ The Segment Anything Model (SAM), a foundational model designed for
+promptable segmentation tasks, demonstrates exceptional generalization
+capabilities, making it highly promising for natural scene image segmentation.
+However, SAM's lack of pretraining on massive remote sensing images and its
+interactive structure limit its automatic mask prediction capabilities. In this
+paper, a Multi-Cognitive SAM-Based Instance Segmentation Model (MC-SAM SEG) is
+introduced to adapt SAM to the remote sensing domain. A SAM-Mona encoder
+utilizing the Multi-cognitive Visual Adapter (Mona) is employed to facilitate
+SAM's transfer learning in remote sensing applications. The proposed method,
+named MC-SAM SEG, extracts high-quality features by fine-tuning the SAM-Mona
+encoder along with a feature aggregator. Subsequently, a pixel decoder and a
+transformer decoder are designed for prompt-free mask generation and instance
+classification. Comprehensive experiments are conducted on the HRSID and WHU
+datasets for instance segmentation tasks on Synthetic Aperture Radar (SAR)
+images and optical remote sensing images, respectively. The evaluation results
+indicate that the proposed method surpasses other deep learning algorithms,
+verifying its effectiveness and generalization.
+
+
+
+
+
+ + ☆ Tell Codec What Worth Compressing: Semantically Disentangled Image + Coding for Machine with LMMs + + +
+ We present a new image compression paradigm to achieve ``intelligently coding
+for machine'' by cleverly leveraging the common sense of Large Multimodal
+Models (LMMs). We are motivated by the evidence that large language/multimodal
+models are powerful general-purpose semantics predictors for understanding the
+real world. Different from traditional image compression typically optimized
+for human eyes, the image coding for machines (ICM) framework we focus on
+requires the compressed bitstream to comply better with different downstream
+intelligent analysis tasks. To this end, we employ the LMM to tell the codec
+what to compress: 1) we first utilize the powerful semantic understanding
+capability of LMMs w.r.t object grounding, identification, and importance
+ranking via prompts, to disentangle image content before compression, 2) and
+then based on these semantic priors we accordingly encode and transmit objects
+of the image in order with a structured bitstream. In this way, diverse vision
+benchmarks including image classification, object detection, instance
+segmentation, etc., can be well supported with such a semantically structured
+bitstream. We dub our method ``\textit{SDComp}'' for ``\textit{S}emantically
+\textit{D}isentangled \textit{Comp}ression'', and compare it with
+state-of-the-art codecs on a wide variety of different vision tasks. The SDComp
+codec leads to more flexible reconstruction results, promising decoded visual
+quality, and a more generic/satisfactory intelligent task-supporting ability.
+
+
+
+
+
+ + ☆ EraW-Net: Enhance-Refine-Align W-Net for Scene-Associated Driver + Attention Estimation + + +
+ Associating driver attention with driving scene across two fields of views +(FOVs) is a hard cross-domain perception problem, which requires comprehensive +consideration of cross-view mapping, dynamic driving scene analysis, and driver +status tracking. Previous methods typically focus on a single view or map +attention to the scene via estimated gaze, failing to exploit the implicit +connection between them. Moreover, simple fusion modules are insufficient for +modeling the complex relationships between the two views, making information +integration challenging. To address these issues, we propose a novel method for +end-to-end scene-associated driver attention estimation, called EraW-Net. This +method enhances the most discriminative dynamic cues, refines feature +representations, and facilitates semantically aligned cross-domain integration +through a W-shaped architecture, termed W-Net. Specifically, a Dynamic Adaptive +Filter Module (DAF-Module) is proposed to address the challenges of frequently +changing driving environments by extracting vital regions. It suppresses the +indiscriminately recorded dynamics and highlights crucial ones by innovative +joint frequency-spatial analysis, enhancing the model's ability to parse +complex dynamics. Additionally, to track driver states during non-fixed facial +poses, we propose a Global Context Sharing Module (GCS-Module) to construct +refined feature representations by capturing hierarchical features that adapt +to various scales of head and eye movements. Finally, W-Net achieves systematic +cross-view information integration through its "Encoding-Independent Partial +Decoding-Fusion Decoding" structure, addressing semantic misalignment in +heterogeneous data integration. Experiments demonstrate that the proposed +method robustly and accurately estimates the mapping of driver attention in +scene on large public datasets. + +
+
+ comment: 13 pages, 9 figures
+
+
+
+
+
+ + ☆ Unsupervised Non-Rigid Point Cloud Matching through Large Vision Models + + +
+ In this paper, we propose a novel learning-based framework for non-rigid
+point cloud matching, which can be trained purely on point clouds without any
+correspondence annotation but also be extended naturally to partial-to-full
+matching. Our key insight is to incorporate semantic features derived from
+large vision models (LVMs) into geometry-based shape feature learning. Our
+framework effectively leverages the structural information contained in the
+semantic features to address ambiguities arising from self-similarities among
+local geometries. Furthermore, our framework also enjoys the strong
+generalizability and robustness of LVMs regarding partial observations, leading
+to improvements in the corresponding point cloud matching tasks. In order to
+achieve the above, we propose a pixel-to-point feature aggregation module, a
+local and global attention network as well as a geometrical similarity loss
+function. Experimental results show that our method achieves state-of-the-art
+results in matching non-rigid point clouds in both near-isometric and
+heterogeneous shape collections as well as more realistic partial and noisy
+data.
+
+
+ comment: 12 pages, 4 figures +
+
+
+
+
+ + ☆ S$^3$Attention: Improving Long Sequence Attention with Smoothed Skeleton + Sketching + + +
+ Attention based models have achieved many remarkable breakthroughs in +numerous applications. However, the quadratic complexity of Attention makes the +vanilla Attention based models hard to apply to long sequence tasks. Various +improved Attention structures are proposed to reduce the computation cost by +inducing low rankness and approximating the whole sequence by sub-sequences. +The most challenging part of those approaches is maintaining the proper balance +between information preservation and computation reduction: the longer +sub-sequences used, the better information is preserved, but at the price of +introducing more noise and computational costs. In this paper, we propose a +smoothed skeleton sketching based Attention structure, coined S$^3$Attention, +which significantly improves upon the previous attempts to negotiate this +trade-off. S$^3$Attention has two mechanisms to effectively minimize the impact +of noise while keeping the linear complexity to the sequence length: a +smoothing block to mix information over long sequences and a matrix sketching +method that simultaneously selects columns and rows from the input matrix. We +verify the effectiveness of S$^3$Attention both theoretically and empirically. +Extensive studies over Long Range Arena (LRA) datasets and six time-series +forecasting show that S$^3$Attention significantly outperforms both vanilla +Attention and other state-of-the-art variants of Attention structures. + +
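+ The two ingredients (smoothing, then attending over a sketched subset of the
+sequence) can be mimicked in a few lines. Uniform column sampling below is only
+a stand-in for the paper's skeleton sketching, and the smoothing block is a
+plain moving average; neither is the actual S$^3$Attention implementation.
+
+import torch
+import torch.nn.functional as F
+
+def sketched_attention(q, k, v, m=64):
+    """Linear-cost toy attention: attend only to m sampled key/value columns."""
+    n = k.shape[1]
+    idx = torch.linspace(0, n - 1, steps=min(m, n)).long()
+    k_s, v_s = k[:, idx], v[:, idx]
+    attn = F.softmax(q @ k_s.transpose(-2, -1) / k.shape[-1] ** 0.5, dim=-1)
+    return attn @ v_s
+
+def smooth(x, win=5):
+    """Moving-average smoothing along the sequence to mix long-range content."""
+    return F.avg_pool1d(x.transpose(1, 2), win, stride=1,
+                        padding=win // 2).transpose(1, 2)
+
+q = k = v = torch.randn(2, 1024, 32)
+out = sketched_attention(smooth(q), smooth(k), v, m=64)
+print(out.shape)  # torch.Size([2, 1024, 32])
+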
+
+
+
+
+ + ☆ A New Chinese Landscape Paintings Generation Model based on Stable + Diffusion using DreamBooth HPCA + + +
+ This study mainly introduces a method combining the Stable Diffusion Model +(SDM) and Parameter-Efficient Fine-Tuning method for generating Chinese +Landscape Paintings. This training process is accelerated by combining LoRA +with pre-trained SDM and DreamBooth with pre-trained SDM, respectively. On the +Chinese Landscape Paintings Internet dataset used in this paper, this study +finds that SDM combined with DreamBooth exhibits superior performance, +outperforming other models, including the generic pre-trained SDM and +LoRA-based fine-tuning SDM. The SDM combined with DreamBooth achieves a FID of +12.75 on the dataset and outperforms all other models in terms of expert +evaluation, highlighting the model's versatility in the field of Chinese +Landscape Paintings given the unique identifier, high fidelity and high +quality. This study illustrates the potential of specialised fine-tuning method +to improve the performance of SDM on domain-specific tasks, particularly in the +domain of Landscape Paintings. + +
+
+ comment: accepted by AHPCAI +
+
+
+
+
+ + ☆ A training regime to learn unified representations from complementary + breast imaging modalities + + +
+ Full Field Digital Mammograms (FFDMs) and Digital Breast Tomosynthesis (DBT) +are the two most widely used imaging modalities for breast cancer screening. +Although DBT has increased cancer detection compared to FFDM, its widespread +adoption in clinical practice has been slowed by increased interpretation times +and a perceived decrease in the conspicuity of specific lesion types. +Specifically, the non-inferiority of DBT for microcalcifications remains under +debate. Due to concerns about the decrease in visual acuity, combined DBT-FFDM +acquisitions remain popular, leading to overall increased exam times and +radiation dosage. Enabling DBT to provide diagnostic information present in +both FFDM and DBT would reduce reliance on FFDM, resulting in a reduction in +both quantities. We propose a machine learning methodology that learns +high-level representations leveraging the complementary diagnostic signal from +both DBT and FFDM. Experiments on a large-scale data set validate our claims +and show that our representations enable more accurate breast lesion detection +than any DBT- or FFDM-based model. + +
+
+
+
+
+ + ☆ Detection and tracking of MAVs using a LiDAR with rosette scanning + pattern + + +
+ The usage of commercial Micro Aerial Vehicles (MAVs) has increased +drastically during the last decade. While the added value of MAVs to society is +apparent, their growing use is also coming with increasing risks like violating +public airspace at airports or committing privacy violations. To mitigate these +issues it is becoming critical to develop solutions that incorporate the +detection and tracking of MAVs with autonomous systems. This work presents a +method for the detection and tracking of MAVs using a novel, low-cost rosette +scanning LiDAR on a pan-tilt turret. Once the static background is captured, a +particle filter is utilized to detect a possible target and track its position +with a physical, programmable pan-tilt system. The tracking makes it possible +to keep the MAV in the center, maximizing the density of 3D points measured on +the target by the LiDAR sensor. The developed algorithm was evaluated within +the indoor MIcro aerial vehicle and MOtion capture (MIMO) arena and has +state-of-the-art tracking accuracy, stability, and fast re-detection time in +case of tracking loss. Based on the outdoor tests, it was possible to +significantly increase the detection distance and number of returned points +compared to other similar methods using LiDAR. + +
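+ The core detect-and-track loop resembles a standard particle filter. The toy
+sketch below keeps only that idea: particles are diffused, reweighted by their
+distance to the measured LiDAR points on the target, and resampled. All numbers
+and the Gaussian likelihood are illustrative, and the pan-tilt control loop is
+omitted.
+
+import numpy as np
+
+def particle_filter_step(particles, weights, points, motion_std=0.2, meas_std=0.5):
+    """One predict/update/resample step of a toy 3D particle filter."""
+    particles = particles + np.random.normal(0, motion_std, particles.shape)
+    if len(points):
+        d = np.linalg.norm(particles[:, None, :] - points[None, :, :],
+                           axis=-1).min(axis=1)
+        w = weights * np.exp(-0.5 * (d / meas_std) ** 2)
+        if w.sum() > 0:                      # guard against numerical underflow
+            weights = w / w.sum()
+    idx = np.random.choice(len(particles), len(particles), p=weights)
+    return particles[idx], np.full(len(particles), 1.0 / len(particles))
+
+particles = np.random.uniform(-5, 5, (500, 3))
+weights = np.full(500, 1.0 / 500)
+scan = np.array([1.0, 2.0, 3.0]) + np.random.normal(0, 0.1, (30, 3))  # MAV returns
+for _ in range(10):
+    particles, weights = particle_filter_step(particles, weights, scan)
+print(particles.mean(axis=0))  # estimate should approach the target position
+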
+
+
+
+
+ + ☆ Scaling up Multimodal Pre-training for Sign Language Understanding + + +
+ Sign language serves as the primary meaning of communication for the +deaf-mute community. Different from spoken language, it commonly conveys +information by the collaboration of manual features, i.e., hand gestures and +body movements, and non-manual features, i.e., facial expressions and mouth +cues. To facilitate communication between the deaf-mute and hearing people, a +series of sign language understanding (SLU) tasks have been studied in recent +years, including isolated/continuous sign language recognition (ISLR/CSLR), +gloss-free sign language translation (GF-SLT) and sign language retrieval +(SL-RT). Sign language recognition and translation aims to understand the +semantic meaning conveyed by sign languages from gloss-level and +sentence-level, respectively. In contrast, SL-RT focuses on retrieving sign +videos or corresponding texts from a closed-set under the query-by-example +search paradigm. These tasks investigate sign language topics from diverse +perspectives and raise challenges in learning effective representation of sign +language videos. To advance the development of sign language understanding, +exploring a generalized model that is applicable across various SLU tasks is a +profound research direction. + +
+
+ comment: Sign language recognition; Sign language translation; Sign language + retrieval +
+
+
+
+
+ + ☆ Language-Driven Interactive Shadow Detection ACM MM 2024 + + +
+ Traditional shadow detectors often identify all shadow regions of static +images or video sequences. This work presents the Referring Video Shadow +Detection (RVSD), which is an innovative task that rejuvenates the classic +paradigm by facilitating the segmentation of particular shadows in videos based +on descriptive natural language prompts. This novel RVSD not only achieves +segmentation of arbitrary shadow areas of interest based on descriptions +(flexibility) but also allows users to interact with visual content more +directly and naturally by using natural language prompts (interactivity), +paving the way for abundant applications ranging from advanced video editing to +virtual reality experiences. To pioneer the RVSD research, we curated a +well-annotated RVSD dataset, which encompasses 86 videos and a rich set of +15,011 paired textual descriptions with corresponding shadows. To the best of +our knowledge, this dataset is the first one for addressing RVSD. Based on this +dataset, we propose a Referring Shadow-Track Memory Network (RSM-Net) for +addressing the RVSD task. In our RSM-Net, we devise a Twin-Track Synergistic +Memory (TSM) to store intra-clip memory features and hierarchical inter-clip +memory features, and then pass these memory features into a memory read module +to refine features of the current video frame for referring shadow detection. +We also develop a Mixed-Prior Shadow Attention (MSA) to utilize physical priors +to obtain a coarse shadow map for learning more visual features by weighting it +with the input video frame. Experimental results show that our RSM-Net achieves +state-of-the-art performance for RVSD with a notable Overall IOU increase of +4.4\%. Our code and dataset are available at https://github.com/whq-xxh/RVSD. + +
+
+ comment: ACM MM 2024 +
+
+
+
+
+ + ☆ Privacy-Preserving Vision Transformer Using Images Encrypted with + Restricted Random Permutation Matrices + + +
+ We propose a novel method for the privacy-preserving fine-tuning of vision
+transformers (ViTs) with encrypted images. Conventional methods using encrypted
+images degrade model performance compared with that of using plain images due
+to the influence of image encryption. In contrast, the proposed encryption
+method using restricted random permutation matrices provides higher performance
+than the conventional ones.
+
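+ One simple way to picture permutation-based image encryption for ViTs is the
+block-scrambling sketch below: the image is split into ViT-sized patches and a
+keyed permutation, restricted here to small groups of neighbouring patches,
+reorders them. The exact form of the restriction used in the paper may differ;
+this is only an assumed illustration.
+
+import torch
+
+def encrypt_patches(img, patch=16, group=4, seed=0):
+    """Scramble an image by permuting its patches within small groups,
+    using a seeded (key-like) random generator."""
+    g = torch.Generator().manual_seed(seed)
+    c, h, w = img.shape
+    patches = img.unfold(1, patch, patch).unfold(2, patch, patch)
+    patches = patches.reshape(c, -1, patch, patch)      # (C, N, p, p)
+    n = patches.shape[1]
+    perm = torch.arange(n)
+    for start in range(0, n - n % group, group):        # restricted permutation
+        perm[start:start + group] = start + torch.randperm(group, generator=g)
+    return patches[:, perm], perm
+
+enc, key = encrypt_patches(torch.rand(3, 224, 224))
+print(enc.shape)  # torch.Size([3, 196, 16, 16]) of scrambled patches
+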
+
+ comment: 4 pages, 9 figures +
+
+
+
+
+ + ☆ Focus on Focus: Focus-oriented Representation Learning and Multi-view + Cross-modal Alignment for Glioma Grading + + +
+ Recently, multimodal deep learning, which integrates histopathology slides +and molecular biomarkers, has achieved a promising performance in glioma +grading. Despite great progress, due to the intra-modality complexity and +inter-modality heterogeneity, existing studies suffer from inadequate +histopathology representation learning and inefficient molecular-pathology +knowledge alignment. These two issues hinder existing methods to precisely +interpret diagnostic molecular-pathology features, thereby limiting their +grading performance. Moreover, the real-world applicability of existing +multimodal approaches is significantly restricted as molecular biomarkers are +not always available during clinical deployment. To address these problems, we +introduce a novel Focus on Focus (FoF) framework with paired pathology-genomic +training and applicable pathology-only inference, enhancing molecular-pathology +representation effectively. Specifically, we propose a Focus-oriented +Representation Learning (FRL) module to encourage the model to identify regions +positively or negatively related to glioma grading and guide it to focus on the +diagnostic areas with a consistency constraint. To effectively link the +molecular biomarkers to morphological features, we propose a Multi-view +Cross-modal Alignment (MCA) module that projects histopathology representations +into molecular subspaces, aligning morphological features with corresponding +molecular biomarker status by supervised contrastive learning. Experiments on +the TCGA GBM-LGG dataset demonstrate that our FoF framework significantly +improves the glioma grading. Remarkably, our FoF achieves superior performance +using only histopathology slides compared to existing multimodal methods. The +source code is available at https://github.com/peterlipan/FoF. + +
+
+
+
+
+ + ☆ GS-ID: Illumination Decomposition on Gaussian Splatting via Diffusion + Prior and Parametric Light Source Optimization + + +
+ We present GS-ID, a novel framework for illumination decomposition on +Gaussian Splatting, achieving photorealistic novel view synthesis and intuitive +light editing. Illumination decomposition is an ill-posed problem facing three +main challenges: 1) priors for geometry and material are often lacking; 2) +complex illumination conditions involve multiple unknown light sources; and 3) +calculating surface shading with numerous light sources is computationally +expensive. To address these challenges, we first introduce intrinsic diffusion +priors to estimate the attributes for physically based rendering. Then we +divide the illumination into environmental and direct components for joint +optimization. Last, we employ deferred rendering to reduce the computational +load. Our framework uses a learnable environment map and Spherical Gaussians +(SGs) to represent light sources parametrically, therefore enabling +controllable and photorealistic relighting on Gaussian Splatting. Extensive +experiments and applications demonstrate that GS-ID produces state-of-the-art +illumination decomposition results while achieving better geometry +reconstruction and rendering performance. + +
+
+ comment: 15 pages, 13 figures +
+
+
+
+
+ + ☆ Visual-Friendly Concept Protection via Selective Adversarial + Perturbations + + +
+ Personalized concept generation by tuning diffusion models with a few images +raises potential legal and ethical concerns regarding privacy and intellectual +property rights. Researchers attempt to prevent malicious personalization using +adversarial perturbations. However, previous efforts have mainly focused on the +effectiveness of protection while neglecting the visibility of perturbations. +They utilize global adversarial perturbations, which introduce noticeable +alterations to original images and significantly degrade visual quality. In +this work, we propose the Visual-Friendly Concept Protection (VCPro) framework, +which prioritizes the protection of key concepts chosen by the image owner +through adversarial perturbations with lower perceptibility. To ensure these +perturbations are as inconspicuous as possible, we introduce a relaxed +optimization objective to identify the least perceptible yet effective +adversarial perturbations, solved using the Lagrangian multiplier method. +Qualitative and quantitative experiments validate that VCPro achieves a better +trade-off between the visibility of perturbations and protection effectiveness, +effectively prioritizing the protection of target concepts in images with less +perceptible perturbations. + +
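+ A toy version of the mask-restricted, visibility-penalized optimization is
+sketched below. The small CNN encoder, loss weights and optimizer are stand-ins
+chosen for illustration; the actual method optimizes against a diffusion
+personalization objective and uses a Lagrangian multiplier update rather than a
+fixed penalty weight.
+
+import torch
+import torch.nn as nn
+
+feat = nn.Sequential(nn.Conv2d(3, 8, 3, padding=1), nn.ReLU(),
+                     nn.AdaptiveAvgPool2d(4), nn.Flatten())  # stand-in encoder
+
+def protect(img, mask, steps=50, lam=10.0, lr=0.01):
+    """Find a small perturbation, confined to the user-chosen concept region
+    (mask), that pushes encoder features away from the original image."""
+    delta = torch.zeros_like(img, requires_grad=True)
+    opt = torch.optim.Adam([delta], lr=lr)
+    target = feat(img).detach()
+    for _ in range(steps):
+        adv = (img + delta * mask).clamp(0, 1)
+        protection = -torch.norm(feat(adv) - target)  # more distance = stronger
+        visibility = torch.norm(delta * mask)         # smaller = less visible
+        loss = protection + lam * visibility
+        opt.zero_grad()
+        loss.backward()
+        opt.step()
+    return (img + delta.detach() * mask).clamp(0, 1)
+
+img = torch.rand(1, 3, 64, 64)
+mask = torch.zeros_like(img)
+mask[..., 16:48, 16:48] = 1.0                          # key concept region
+protected = protect(img, mask)
+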
+
+ comment: Under Review +
+
+
+
+
+ + ☆ Efficient Image-to-Image Diffusion Classifier for Adversarial Robustness + + +
+ Diffusion models (DMs) have demonstrated great potential in the field of +adversarial robustness, where DM-based defense methods can achieve superior +defense capability without adversarial training. However, they all require huge +computational costs due to the usage of large-scale pre-trained DMs, making it +difficult to conduct full evaluation under strong attacks and compare with +traditional CNN-based methods. Simply reducing the network size and timesteps +in DMs could significantly harm the image generation quality, which invalidates +previous frameworks. To alleviate this issue, we redesign the diffusion +framework from generating high-quality images to predicting distinguishable +image labels. Specifically, we employ an image translation framework to learn +many-to-one mapping from input samples to designed orthogonal image labels. +Based on this framework, we introduce an efficient Image-to-Image diffusion +classifier with a pruned U-Net structure and reduced diffusion timesteps. +Besides the framework, we redesign the optimization objective of DMs to fit the +target of image classification, where a new classification loss is incorporated +in the DM-based image translation framework to distinguish the generated label +from those of other classes. We conduct sufficient evaluations of the proposed +classifier under various attacks on popular benchmarks. Extensive experiments +show that our method achieves better adversarial robustness with fewer +computational costs than DM-based and CNN-based methods. The code is available +at https://github.com/hfmei/IDC. + +
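+ The "orthogonal image labels" idea can be seen in isolation with the toy
+sketch below: each class is assigned a fixed label image whose flattened
+vectors are mutually orthogonal, and a prediction is read off as the nearest
+label image. The diffusion/translation network itself is omitted, and the
+stripe-pattern labels are a hypothetical design, not the paper's.
+
+import torch
+
+def make_orthogonal_labels(n_classes=10, size=32):
+    """One fixed label image per class; disjoint stripes keep them orthogonal."""
+    labels = torch.zeros(n_classes, 1, size, size)
+    stripe = size // n_classes
+    for c in range(n_classes):
+        labels[c, :, c * stripe:(c + 1) * stripe, :] = 1.0
+    return labels
+
+def classify(generated, labels):
+    """Assign the class whose label image is closest to the generated output."""
+    d = ((generated.flatten(1).unsqueeze(1) -
+          labels.flatten(1).unsqueeze(0)) ** 2).sum(-1)
+    return d.argmin(dim=1)
+
+labels = make_orthogonal_labels()
+fake_output = labels[3:4] + 0.1 * torch.randn_like(labels[3:4])
+print(classify(fake_output, labels))  # tensor([3])
+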
+
+
+
+
+ + ☆ CoSEC: A Coaxial Stereo Event Camera Dataset for Autonomous Driving + + +
+ Conventional frame camera is the mainstream sensor of the autonomous driving +scene perception, while it is limited in adverse conditions, such as low light. +Event camera with high dynamic range has been applied in assisting frame camera +for the multimodal fusion, which relies heavily on the pixel-level spatial +alignment between various modalities. Typically, existing multimodal datasets +mainly place event and frame cameras in parallel and directly align them +spatially via warping operation. However, this parallel strategy is less +effective for multimodal fusion, since the large disparity exacerbates spatial +misalignment due to the large event-frame baseline. We argue that baseline +minimization can reduce alignment error between event and frame cameras. In +this work, we introduce hybrid coaxial event-frame devices to build the +multimodal system, and propose a coaxial stereo event camera (CoSEC) dataset +for autonomous driving. As for the multimodal system, we first utilize the +microcontroller to achieve time synchronization, and then spatially calibrate +different sensors, where we perform intra- and inter-calibration of stereo +coaxial devices. As for the multimodal dataset, we filter LiDAR point clouds to +generate depth and optical flow labels using reference depth, which is further +improved by fusing aligned event and frame data in nighttime conditions. With +the help of the coaxial device, the proposed dataset can promote the all-day +pixel-level multimodal fusion. Moreover, we also conduct experiments to +demonstrate that the proposed dataset can improve the performance and +generalization of the multimodal fusion. + +
+
+ comment: This work has been submitted to the IEEE for possible publication +
+
+
+
+
+ + ☆ Achieving Complex Image Edits via Function Aggregation with Diffusion + Models + + +
+ Diffusion models have demonstrated strong performance in generative tasks,
+making them ideal candidates for image editing. Recent studies highlight their
+ability to apply desired edits effectively by following textual instructions,
+yet two key challenges persist. First, these models struggle to apply multiple
+edits simultaneously, resulting in computational inefficiencies due to their
+reliance on sequential processing. Second, relying on textual prompts to
+determine the editing region can lead to unintended alterations in other parts
+of the image. In this work, we introduce FunEditor, an efficient diffusion
+model designed to learn atomic editing functions and perform complex edits by
+aggregating simpler functions. This approach enables complex editing tasks,
+such as object movement, by aggregating multiple functions and applying them
+simultaneously to specific areas. FunEditor achieves 5 to 24 times faster
+inference than existing methods on complex tasks like object movement. Our
+experiments demonstrate that FunEditor significantly outperforms recent
+baselines, including both inference-time optimization methods and fine-tuned
+models, across various metrics, such as image quality assessment (IQA) and
+object-background consistency.
+
+
+
+
+
+ + ☆ DFT-Based Adversarial Attack Detection in MRI Brain Imaging: Enhancing + Diagnostic Accuracy in Alzheimer's Case Studies + + +
+ Recent advancements in deep learning, particularly in medical imaging, have +significantly propelled the progress of healthcare systems. However, examining +the robustness of medical images against adversarial attacks is crucial due to +their real-world applications and profound impact on individuals' health. These +attacks can result in misclassifications in disease diagnosis, potentially +leading to severe consequences. Numerous studies have explored both the +implementation of adversarial attacks on medical images and the development of +defense mechanisms against these threats, highlighting the vulnerabilities of +deep neural networks to such adversarial activities. In this study, we +investigate adversarial attacks on images associated with Alzheimer's disease +and propose a defensive method to counteract these attacks. Specifically, we +examine adversarial attacks that employ frequency domain transformations on +Alzheimer's disease images, along with other well-known adversarial attacks. +Our approach utilizes a convolutional neural network (CNN)-based autoencoder +architecture in conjunction with the two-dimensional Fourier transform of +images for detection purposes. The simulation results demonstrate that our +detection and defense mechanism effectively mitigates several adversarial +attacks, thereby enhancing the robustness of deep neural networks against such +vulnerabilities. + +
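+ The detection pipeline (2D Fourier transform of the image, followed by a
+convolutional autoencoder and a reconstruction-error threshold) can be sketched
+as below. The network sizes and threshold are illustrative assumptions; in
+practice the autoencoder is trained on clean spectra and the threshold is
+calibrated on held-out data.
+
+import torch
+import torch.nn as nn
+
+class FreqAutoencoder(nn.Module):
+    """Autoencode log-magnitude FFT spectra of (single-channel) MRI slices."""
+    def __init__(self):
+        super().__init__()
+        self.enc = nn.Sequential(
+            nn.Conv2d(1, 8, 3, stride=2, padding=1), nn.ReLU(),
+            nn.Conv2d(8, 16, 3, stride=2, padding=1), nn.ReLU())
+        self.dec = nn.Sequential(
+            nn.ConvTranspose2d(16, 8, 4, stride=2, padding=1), nn.ReLU(),
+            nn.ConvTranspose2d(8, 1, 4, stride=2, padding=1))
+
+    def forward(self, x):
+        return self.dec(self.enc(x))
+
+def to_spectrum(img):
+    return torch.log1p(torch.abs(torch.fft.fftshift(torch.fft.fft2(img))))
+
+def is_adversarial(model, img, threshold=0.05):
+    spec = to_spectrum(img)
+    err = torch.mean((model(spec) - spec) ** 2).item()
+    return err > threshold  # flag inputs the autoencoder reconstructs poorly
+
+model = FreqAutoencoder()
+print(is_adversarial(model, torch.rand(1, 1, 128, 128)))
+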
+
+ comment: 10 pages, 4 figures, conference +
+
+
+
+
+ + ☆ TEXTOC: Text-driven Object-Centric Style Transfer + + +
+ We present Text-driven Object-Centric Style Transfer (TEXTOC), a novel method +that guides style transfer at an object-centric level using textual inputs. The +core of TEXTOC is our Patch-wise Co-Directional (PCD) loss, meticulously +designed for precise object-centric transformations that are closely aligned +with the input text. This loss combines a patch directional loss for +text-guided style direction and a patch distribution consistency loss for even +CLIP embedding distribution across object regions. It ensures a seamless and +harmonious style transfer across object regions. Key to our method are the +Text-Matched Patch Selection (TMPS) and Pre-fixed Region Selection (PRS) +modules for identifying object locations via text, eliminating the need for +segmentation masks. Lastly, we introduce an Adaptive Background Preservation +(ABP) loss to maintain the original style and structural essence of the image's +background. This loss is applied to dynamically identified background areas. +Extensive experiments underline the effectiveness of our approach in creating +visually coherent and textually aligned style transfers. + +
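+ The patch directional term builds on the widely used CLIP directional loss:
+the change from the source to the stylized image embedding should point in the
+same direction as the change from the source to the target text embedding. A
+global (non-patch) version is sketched below with Hugging Face transformers;
+patch sampling, the distribution-consistency term, and background preservation
+are omitted.
+
+import numpy as np
+import torch
+import torch.nn.functional as F
+from PIL import Image
+from transformers import CLIPModel, CLIPProcessor
+
+model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
+proc = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
+
+def directional_loss(src_img, out_img, src_text, tgt_text):
+    """1 - cos(image-embedding change, text-embedding change)."""
+    imgs = proc(images=[src_img, out_img], return_tensors="pt")
+    txts = proc(text=[src_text, tgt_text], return_tensors="pt", padding=True)
+    img_e = model.get_image_features(**imgs)
+    txt_e = model.get_text_features(**txts)
+    d_img = img_e[1] - img_e[0]
+    d_txt = txt_e[1] - txt_e[0]
+    return 1 - F.cosine_similarity(d_img, d_txt, dim=0)
+
+src = Image.fromarray(np.uint8(np.random.rand(224, 224, 3) * 255))
+out = Image.fromarray(np.uint8(np.random.rand(224, 224, 3) * 255))
+print(directional_loss(src, out, "a photo", "a watercolor painting").item())
+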
+
+
+
+
+ + ♻ ☆ DivCon: Divide and Conquer for Progressive Text-to-Image Generation + + +
+ Diffusion-driven text-to-image (T2I) generation has achieved remarkable
+advancements. To further improve T2I models' capability in numerical and
+spatial reasoning, the layout is employed as an intermediary to bridge large
+language models and layout-based diffusion models. However, these methods still
+struggle with generating images from textual prompts with multiple objects and
+complicated spatial relationships. To tackle this challenge, we introduce a
+divide-and-conquer approach which decouples the T2I generation task into simple
+subtasks. Our approach divides the layout prediction stage into numerical &
+spatial reasoning and bounding box prediction. Then, the layout-to-image
+generation stage is conducted in an iterative manner to reconstruct objects
+from easy ones to difficult ones. We conduct experiments on the HRS and NSR-1K
+benchmarks and our approach outperforms previous state-of-the-art models with
+notable margins. In addition, visual results demonstrate that our approach
+significantly improves the controllability and consistency in generating
+multiple objects from complex textual prompts.
+
+
+
+
+
+ + ♻ ☆ DopQ-ViT: Towards Distribution-Friendly and Outlier-Aware Post-Training + Quantization for Vision Transformers + + +
+ Vision transformers (ViTs) have garnered significant attention for their +performance in vision tasks, but the high computational cost and significant +latency issues have hindered widespread adoption. Post-training quantization +(PTQ), a promising method for model compression, still faces accuracy +degradation challenges with ViTs. There are two reasons for this: the existing +quantization paradigm does not fit the power-law distribution of post-Softmax +activations well, and accuracy inevitably decreases after reparameterizing +post-LayerNorm activations. We propose a Distribution-Friendly and +Outlier-Aware Post-training Quantization method for Vision Transformers, named +DopQ-ViT. DopQ-ViT analyzes the inefficiencies of current quantizers and +introduces a distribution-friendly Tan Quantizer called TanQ. TanQ focuses more +on values near 1, more accurately preserving the power-law distribution of +post-Softmax activations, and achieves favorable results. Besides, during the +reparameterization of post-LayerNorm activations from channel-wise to +layer-wise quantization, the accuracy degradation is mainly due to the +significant impact of outliers in the scaling factors. Therefore, DopQ-ViT +proposes a method to select Median as the Optimal Scaling Factor, denoted as +MOSF, which compensates for the influence of outliers and preserves the +performance of the quantization model. DopQ-ViT has been extensively validated +and significantly improves the performance of quantization models, especially +in low-bit settings. + +
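+ As a rough illustration of the MOSF idea (a reading of the abstract, not the
+released implementation): when per-channel scaling factors are collapsed into a
+single layer-wise factor, taking their median keeps a few outlier channels from
+dominating the shared scale.
+
+    import torch
+
+    def layerwise_scale(channel_scales: torch.Tensor) -> torch.Tensor:
+        # channel_scales: per-channel quantization scaling factors, shape (C,).
+        # The median is insensitive to the handful of outlier channels that
+        # would otherwise inflate a mean- or max-based layer-wise scale.
+        return channel_scales.median()
+
+    scales = torch.tensor([0.021, 0.019, 0.020, 0.350, 0.022])
+    print(layerwise_scale(scales))  # tensor(0.0210); the 0.350 outlier is ignored
+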
+
+
+
+
+ + ♻ ☆ ChemVLM: Exploring the Power of Multimodal Large Language Models in + Chemistry Area + + +
+ Large Language Models (LLMs) have achieved remarkable success and have been +applied across various scientific fields, including chemistry. However, many +chemical tasks require the processing of visual information, which cannot be +successfully handled by existing chemical LLMs. This brings a growing need for +models capable of integrating multimodal information in the chemical domain. In +this paper, we introduce \textbf{ChemVLM}, an open-source chemical multimodal +large language model specifically designed for chemical applications. ChemVLM +is trained on a carefully curated bilingual multimodal dataset that enhances +its ability to understand both textual and visual chemical information, +including molecular structures, reactions, and chemistry examination questions. +We develop three datasets for comprehensive evaluation, tailored to Chemical +Optical Character Recognition (OCR), Multimodal Chemical Reasoning (MMCR), and +Multimodal Molecule Understanding tasks. We benchmark ChemVLM against a range +of open-source and proprietary multimodal large language models on various +tasks. Experimental results demonstrate that ChemVLM achieves competitive +performance across all evaluated tasks. Our model can be found at +https://huggingface.co/AI4Chem/ChemVLM-26B. + +
+
+ comment: 11 pages, updated version +
+
+
+
+
+ + ♻ ☆ SLAM for Visually Impaired People: a Survey + + +
+ In recent decades, several assistive technologies have been developed to +improve the ability of blind and visually impaired (BVI) individuals to +navigate independently and safely. At the same time, simultaneous localization +and mapping (SLAM) techniques have become sufficiently robust and efficient to +be adopted in developing these assistive technologies. We present the first +systematic literature review of 54 recent studies on SLAM-based solutions for +blind and visually impaired people, focusing on literature published from 2017 +onward. This review explores various localization and mapping techniques +employed in this context. We systematically identified and categorized diverse +SLAM approaches and analyzed their localization and mapping techniques, sensor +types, computing resources, and machine-learning methods. We discuss the +advantages and limitations of these techniques for blind and visually impaired +navigation. Moreover, we examine the major challenges described across studies, +including practical challenges and considerations that affect usability and +adoption. Our analysis also evaluates the effectiveness of these SLAM-based +solutions in real-world scenarios and user satisfaction, providing insights +into their practical impact on BVI mobility. The insights derived from this +review identify critical gaps and opportunities for future research activities, +particularly in addressing the challenges presented by dynamic and complex +environments. We explain how SLAM technology offers the potential to improve +the ability of visually impaired individuals to navigate effectively. Finally, +we present future opportunities and challenges in this domain. + +
+
+ comment: 47 pages, 42 tables, 6 figures +
+
+
+
+
+ + ♻ ☆ CeCNN: Copula-enhanced convolutional neural networks in joint prediction + of refraction error and axial length based on ultra-widefield fundus images + + +
+ The ultra-widefield (UWF) fundus image is an attractive 3D biomarker in
+AI-aided myopia screening because it provides much richer myopia-related
+information. Though axial length (AL) has been acknowledged to be highly
+related to the two key targets of myopia screening, Spherical Equivalence (SE)
+measurement and high myopia diagnosis, its prediction based on the UWF fundus
+image is rarely considered. To save the high expense and time costs of
+measuring SE and AL, we propose the Copula-enhanced Convolutional Neural
+Network (CeCNN), a one-stop UWF-based ophthalmic AI framework to jointly
+predict SE, AL, and myopia status. The CeCNN formulates a multiresponse
+regression that relates multiple dependent discrete-continuous responses and
+the image covariate, where the nonlinearity of the association is modeled by a
+backbone CNN. To thoroughly describe the dependence structure among the
+responses, we model and incorporate the conditional dependence among responses
+in a CNN through a new copula-likelihood loss. We provide statistical
+interpretations of the conditional dependence among responses, and reveal that
+such dependence is beyond the dependence explained by the image covariate. We
+heuristically justify that the proposed loss can enhance the estimation
+efficiency of the CNN weights. We apply the CeCNN to the UWF dataset collected
+by us and demonstrate that the CeCNN sharply enhances the predictive capability
+of various backbone CNNs. Our study evidences the ophthalmology view that
+besides SE, AL is also an important measure of myopia.
+
</p>
+
+
+
+
+ + ♻ ☆ Multi-task Image Restoration Guided By Robust DINO Features + + +
+ Multi-task image restoration has gained significant interest due to its
+inherent versatility and efficiency compared to its single-task counterpart.
+However, performance decline is observed with an increase in the number of
+tasks, primarily attributed to the restoration model's challenge in handling
+different tasks with distinct natures at the same time. Thus, a perspective
+emerged aiming to explore the degradation-insensitive semantic commonalities
+among different degradation tasks. In this paper, we observe that the features
+of DINOv2 can effectively model semantic information and are independent of
+degradation factors. Motivated by this observation, we propose DINO-IR, a
+multi-task image restoration approach leveraging robust features extracted from
+DINOv2 to solve multi-task image restoration simultaneously. We first propose a
+pixel-semantic fusion (PSF) module to dynamically fuse DINOv2's shallow
+features containing pixel-level information and deep features containing
+degradation-independent semantic information. To guide the restoration model
+with the features of DINOv2, we develop a DINO-Restore adaptation and fusion
+module to adjust the channel of fused features from PSF and then integrate them
+with the features from the restoration model. By formulating these modules into
+a unified deep model, we propose a DINO perception contrastive loss to
+constrain the model training. Extensive experimental results demonstrate that
+our DINO-IR performs favorably against existing multi-task image restoration
+approaches in various tasks by a large margin. The source codes and trained
+models will be made available.
+
</p>
+
+
+
+
+ + ♻ ☆ GLDiTalker: Speech-Driven 3D Facial Animation with Graph Latent + Diffusion Transformer + + +
+ Speech-driven talking head generation is an important but challenging task
+for many downstream applications such as augmented reality. Existing methods
+have achieved remarkable performance by utilizing autoregressive models or
+diffusion models. However, most still suffer from modality inconsistencies,
+specifically the misalignment between audio and mesh modalities, which causes
+inconsistencies in motion diversity and lip-sync accuracy. To address this
+issue, this paper introduces GLDiTalker, a novel speech-driven 3D facial
+animation model that employs a Graph Latent Diffusion Transformer. The core
+idea behind GLDiTalker is that the audio-mesh modality misalignment can be
+resolved by diffusing the signal in a latent quantized spatial-temporal
+space. To achieve this, GLDiTalker builds upon a quantized space-time
+diffusion training pipeline, which consists of a Graph Enhanced Quantized
+Space Learning Stage and a Space-Time Powered Latent Diffusion Stage. The first
+stage ensures lip-sync accuracy, while the second stage enhances motion
+diversity. Together, these stages enable GLDiTalker to generate temporally and
+spatially stable, realistic models. Extensive evaluations on several widely
+used benchmarks demonstrate that our method achieves superior performance
+compared to existing methods.
+
</p>
+
+ comment: 9 pages, 5 figures +
+
+
+
+
+ + ♻ ☆ Gaussian Pancakes: Geometrically-Regularized 3D Gaussian Splatting for + Realistic Endoscopic Reconstruction + + +
+ Within colorectal cancer diagnostics, conventional colonoscopy techniques
+face critical limitations, including a limited field of view and a lack of
+depth information, which can impede the detection of precancerous lesions.
+Current methods struggle to provide comprehensive and accurate 3D
+reconstructions of the colonic surface which can help minimize the missing
+regions and reinspection for pre-cancerous polyps. Addressing this, we
+introduce 'Gaussian Pancakes', a method that leverages 3D Gaussian Splatting
+(3D GS) combined with a Recurrent Neural Network-based Simultaneous
+Localization and Mapping (RNNSLAM) system. By introducing geometric and depth
+regularization into the 3D GS framework, our approach ensures more accurate
+alignment of Gaussians with the colon surface, resulting in smoother 3D
+reconstructions with novel views of detailed textures and structures.
+Evaluations across three diverse datasets show that Gaussian Pancakes enhances
+novel view synthesis quality, surpassing current leading methods with an 18%
+boost in PSNR and a 16% improvement in SSIM. It also delivers over 100X faster
+rendering and more than 10X shorter training times, making it a practical tool
+for real-time applications. Hence, it holds promise for clinical translation
+toward better detection and diagnosis of colorectal cancer.
+
</p>
+
+ comment: 12 pages, 5 figures +
+
+
+
+
+ + ♻ ☆ A Medical Data-Effective Learning Benchmark for Highly Efficient + Pre-training of Foundation Models + + +
+ Foundation models, pre-trained on massive datasets, have achieved
+unprecedented generalizability. However, is it truly necessary to involve such
+vast amounts of data in pre-training, consuming extensive computational
+resources? This paper introduces data-effective learning, aiming to use data in
+the most impactful way to pre-train foundation models. This involves strategies
+that focus on data quality rather than quantity, ensuring the data used for
+training has high informational value. Data-effective learning plays a profound
+role in accelerating foundation model training, reducing computational costs,
+and saving data storage, which is very important as the volume of medical data
+in recent years has grown beyond many people's expectations. However, due to
+the lack of standards and comprehensive benchmarks, medical data-effective
+learning remains under-explored. To address this gap, our paper
+introduces a comprehensive benchmark specifically for evaluating data-effective
+learning in the medical field. This benchmark includes a dataset with millions
+of data samples from 31 medical centers (DataDEL), a baseline method for
+comparison (MedDEL), and a new evaluation metric (NormDEL) to objectively
+measure data-effective learning performance. Our extensive experimental results
+show the baseline MedDEL can achieve performance comparable to the original
+large dataset with only 5% of the data. Establishing such an open
+data-effective learning benchmark is crucial for the medical foundation model
+research community because it facilitates efficient data use, promotes
+collaborative breakthroughs, and fosters the development of cost-effective,
+scalable, and impactful healthcare solutions.
+
</p>
+
+
+
+
+ + ♻ ☆ MIMIR: Masked Image Modeling for Mutual Information-based Adversarial + Robustness + + +
+ Vision Transformers (ViTs) achieve excellent performance in various tasks, +but they are also vulnerable to adversarial attacks. Building robust ViTs is +highly dependent on dedicated Adversarial Training (AT) strategies. However, +current ViTs' adversarial training only employs well-established training +approaches from convolutional neural network (CNN) training, where pre-training +provides the basis for AT fine-tuning with the additional help of tailored data +augmentations. In this paper, we take a closer look at the adversarial +robustness of ViTs by providing a novel theoretical Mutual Information (MI) +analysis in its autoencoder-based self-supervised pre-training. Specifically, +we show that MI between the adversarial example and its latent representation +in ViT-based autoencoders should be constrained by utilizing the MI bounds. +Based on this finding, we propose a masked autoencoder-based pre-training +method, MIMIR, that employs an MI penalty to facilitate the adversarial +training of ViTs. Extensive experiments show that MIMIR outperforms +state-of-the-art adversarially trained ViTs on benchmark datasets with higher +natural and robust accuracy, indicating that ViTs can substantially benefit +from exploiting MI. In addition, we consider two adaptive attacks by assuming +that the adversary is aware of the MIMIR design, which further verifies the +provided robustness. + +
+
+
+
+
+ + ♻ ☆ Motion-compensated MR CINE reconstruction with reconstruction-driven + motion estimation + + +
+ In cardiac CINE, motion-compensated MR reconstruction (MCMR) is an effective +approach to address highly undersampled acquisitions by incorporating motion +information between frames. In this work, we propose a novel perspective for +addressing the MCMR problem and a more integrated and efficient solution to the +MCMR field. Contrary to state-of-the-art (SOTA) MCMR methods which break the +original problem into two sub-optimization problems, i.e. motion estimation and +reconstruction, we formulate this problem as a single entity with one single +optimization. Our approach is unique in that the motion estimation is directly +driven by the ultimate goal, reconstruction, but not by the canonical +motion-warping loss (similarity measurement between motion-warped images and +target images). We align the objectives of motion estimation and +reconstruction, eliminating the drawbacks of artifacts-affected motion +estimation and therefore error-propagated reconstruction. Further, we can +deliver high-quality reconstruction and realistic motion without applying any +regularization/smoothness loss terms, circumventing the non-trivial weighting +factor tuning. We evaluate our method on two datasets: 1) an in-house acquired +2D CINE dataset for the retrospective study and 2) the public OCMR cardiac +dataset for the prospective study. The conducted experiments indicate that the +proposed MCMR framework can deliver artifact-free motion estimation and +high-quality MR images even for imaging accelerations up to 20x, outperforming +SOTA non-MCMR and MCMR methods in both qualitative and quantitative evaluation +across all experiments. The code is available at +https://github.com/JZPeterPan/MCMR-Recon-Driven-Motion. + +
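+ A very rough sketch of the single-optimization idea (all module names are
+placeholders; the actual networks, sampling patterns, and coil handling are far
+more involved): the motion estimate receives gradients only through the
+reconstruction's data-consistency error, so no separate motion-warping loss or
+smoothness weight is needed.
+
+    import torch
+
+    def joint_mcmr_loss(recon_net, motion_net, undersampled_kspace, mask):
+        # Motion and reconstruction are trained as one objective: the motion
+        # field is only supervised through the final reconstruction quality.
+        motion = motion_net(undersampled_kspace)
+        recon = recon_net(undersampled_kspace, motion)
+        pred_kspace = torch.fft.fft2(recon)
+        # Data consistency enforced on the acquired k-space samples only.
+        return ((pred_kspace - undersampled_kspace).abs() * mask).mean()
+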
+
+
+
+
+ + ♻ ☆ Beyond Full Label: Single-Point Prompt for Infrared Small Target Label + Generation + + +
+ In this work, we make the first attempt to construct a learning-based
+single-point annotation paradigm for infrared small target label generation
+(IRSTLG). Our intuition is that label generation requires just one more point
+prompt than target detection: IRSTLG can be regarded as an infrared small
+target detection (IRSTD) task with the target location hint. Based on this
+insight, we introduce an energy double guided single-point prompt (EDGSP)
+framework, which adeptly transforms the target detection network into a refined
+label generation method. Specifically, the proposed EDGSP includes: 1) target
+energy initialization (TEI) to create a foundational outline for sufficient
+shape evolution of the pseudo label, 2) double prompt embedding (DPE) for rapid
+localization of regions of interest and reinforcement of individual differences
+to avoid label adhesion, and 3) bounding box-based matching (BBM) to eliminate
+false alarms. Experimental results show that pseudo labels generated by three
+baselines equipped with EDGSP achieve 100% object-level probability of
+detection (Pd) and 0% false-alarm rate (Fa) on SIRST, NUDT-SIRST, and IRSTD-1k
+datasets, with a pixel-level intersection over union (IoU) improvement of
+13.28% over state-of-the-art (SOTA) label generation methods. In the practical
+application of downstream IRSTD, EDGSP realizes, for the first time, a
+single-point-generated pseudo mask that surpasses the full label. Even with
+coarse single-point annotations, it still achieves 99.5% of the performance of
+full labeling.
+
</p>
+
+
+
+
+ + ♻ ☆ CLIP-EBC: CLIP Can Count Accurately through Enhanced Blockwise + Classification + + +
+ We propose CLIP-EBC, the first fully CLIP-based model for accurate crowd +density estimation. While the CLIP model has demonstrated remarkable success in +addressing recognition tasks such as zero-shot image classification, its +potential for counting has been largely unexplored due to the inherent +challenges in transforming a regression problem, such as counting, into a +recognition task. In this work, we investigate and enhance CLIP's ability to +count, focusing specifically on the task of estimating crowd sizes from images. +Existing classification-based crowd-counting frameworks have significant +limitations, including the quantization of count values into bordering +real-valued bins and the sole focus on classification errors. These practices +result in label ambiguity near the shared borders and inaccurate prediction of +count values. Hence, directly applying CLIP within these frameworks may yield +suboptimal performance. + To address these challenges, we first propose the Enhanced Blockwise +Classification (EBC) framework. Unlike previous methods, EBC utilizes +integer-valued bins, effectively reducing ambiguity near bin boundaries. +Additionally, it incorporates a regression loss based on density maps to +improve the prediction of count values. Within our backbone-agnostic EBC +framework, we then introduce CLIP-EBC to fully leverage CLIP's recognition +capabilities for this task. Extensive experiments demonstrate the effectiveness +of EBC and the competitive performance of CLIP-EBC. Specifically, our EBC +framework can improve existing classification-based methods by up to 44.5% on +the UCF-QNRF dataset, and CLIP-EBC achieves state-of-the-art performance on the +NWPU-Crowd test set, with an MAE of 58.2 and an RMSE of 268.5, representing +improvements of 8.6% and 13.3% over the previous best method, STEERER. The code +and weights are available at https://github.com/Yiming-M/CLIP-EBC. + +
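+ A simplified sketch of the enhanced blockwise classification objective as
+described above (integer bins plus a count-regression term); the bin set, loss
+weights, and exact regression target here are assumptions, not the released
+code.
+
+    import torch
+    import torch.nn.functional as F
+
+    def ebc_style_loss(logits, block_counts, num_bins, reg_weight=1.0):
+        # logits: (B, K) scores over K integer-valued count bins {0, 1, ..., K-1}.
+        # block_counts: (B,) ground-truth people count in each image block.
+        target = block_counts.long().clamp(max=num_bins - 1)
+        cls_loss = F.cross_entropy(logits, target)
+        # Expected count under the predicted bin distribution, regressed to truth.
+        bin_values = torch.arange(num_bins, device=logits.device).float()
+        expected = (logits.softmax(dim=1) * bin_values).sum(dim=1)
+        reg_loss = F.l1_loss(expected, block_counts.float())
+        return cls_loss + reg_weight * reg_loss
+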
+
+
+
+
+ + ♻ ☆ DualFocus: Integrating Plausible Descriptions in Text-based Person + Re-identification + + +
+ Text-based Person Re-identification (TPR) aims to retrieve specific
+individual images from datasets based on textual descriptions. Existing TPR
+methods primarily focus on recognizing explicit and positive characteristics,
+often overlooking the role of negative descriptions. This oversight can lead to
+false positives: images that meet positive criteria but should be excluded
+based on negative descriptions. To address these limitations, we introduce
+DualFocus, a unified framework that integrates plausible descriptions to
+enhance the interpretative accuracy of vision-language models in TPR tasks.
+DualFocus leverages Dual (Positive/Negative) Attribute Prompt Learning (DAPL),
+which incorporates Dual Image-Attribute Contrastive (DIAC) Learning and
+Sensitive Image-Attributes Matching (SIAM) Learning, enabling the detection of
+non-existent attributes and reducing false positives. To achieve a balance
+between coarse and fine-grained alignment of visual and textual embeddings, we
+propose the Dynamic Tokenwise Similarity (DTS) loss, which refines the
+representation of both matching and non-matching descriptions, thereby
+improving the matching process through detailed and adaptable similarity
+assessments. In comprehensive experiments on CUHK-PEDES, ICFG-PEDES, and
+RSTPReid, DualFocus demonstrates superior performance over state-of-the-art
+methods, significantly enhancing both precision and robustness in TPR.
+
</p>
+
+
+
+
+ + ♻ ☆ Distilling High Diagnostic Value Patches for Whole Slide Image + Classification Using Attention Mechanism + + +
+ Multiple Instance Learning (MIL) has garnered widespread attention in the
+field of Whole Slide Image (WSI) classification as it replaces pixel-level
+manual annotation with diagnostic reports as labels, significantly reducing
+labor costs. Recent research has shown that bag-level MIL methods often yield
+better results because they can consider all patches of the WSI as a whole.
+However, a drawback of such methods is the incorporation of more redundant
+patches, leading to interference. To address this issue by extracting patches
+with high diagnostic value while excluding interfering ones, we developed an
+attention-based feature distillation multi-instance learning (AFD-MIL)
+approach. This approach proposes the exclusion of redundant patches as a
+preprocessing operation in weakly supervised learning, directly mitigating
+interference from extensive noise. It also pioneers the use of attention
+mechanisms to distill features with high diagnostic value, as opposed to the
+traditional practice of indiscriminately and forcibly integrating all patches.
+Additionally, we introduced global loss optimization to finely control the
+feature distillation module. AFD-MIL is orthogonal to many existing MIL
+methods, leading to consistent performance improvements. This approach has
+surpassed the current state-of-the-art method, achieving 91.47% ACC (accuracy)
+and 94.29% AUC (area under the curve) on Camelyon16 (Camelyon Challenge
+2016, breast cancer), and 93.33% ACC and 98.17% AUC on TCGA-NSCLC (The
+Cancer Genome Atlas Program: non-small cell lung cancer). Different feature
+distillation methods were used for the two datasets, tailored to the specific
+diseases, thereby improving performance and interpretability.
+
</p>
+
+
+
+
+ + ♻ ☆ Adaptive Learning of Consistency and Inconsistency Information for Fake + News Detection + + +
+ The rapid advancement of social media platforms has significantly reduced the
+cost of information dissemination, yet it has also led to a proliferation of
+fake news, posing a threat to societal trust and credibility. Most fake news
+detection research has focused on integrating text and image information to
+represent the consistency of multiple modes in news content, while paying less
+attention to inconsistent information. Besides, existing methods that leveraged
+inconsistent information often let one mode overshadow another, leading
+to ineffective use of inconsistent clues. To address these issues, we propose an
+adaptive multi-modal feature fusion network (MFF-Net). Inspired by human
+judgment processes for determining truth and falsity in news, MFF-Net focuses
+on inconsistent parts when news content is generally consistent and consistent
+parts when it is generally inconsistent. Specifically, MFF-Net extracts
+semantic and global features from images and texts respectively, and learns
+consistency information between modes through a multiple feature fusion module.
+To deal with the problem of modal information being easily masked, we design a
+single modal feature filtering strategy to capture inconsistent information
+from corresponding modes separately. Finally, similarity scores are calculated
+based on global features with adaptive adjustments made to achieve weighted
+fusion of consistent and inconsistent features. Extensive experimental results
+demonstrate that MFF-Net outperforms state-of-the-art methods across three
+public news datasets derived from real social media platforms.
+
</p>
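+ The weighting intuition in the final fusion step can be written down in a few
+lines (a paraphrase of the abstract's idea, not the released code): the more
+consistent the two modalities look globally, the more weight is shifted toward
+the inconsistency features, and vice versa.
+
+    import torch
+
+    def adaptive_fusion(consistent_feat, inconsistent_feat, similarity):
+        # similarity: (B,) cross-modal similarity scores squashed into [0, 1].
+        # Largely consistent news -> focus on the inconsistent parts; largely
+        # inconsistent news -> focus on the consistent parts.
+        w = similarity.clamp(0.0, 1.0).view(-1, 1)
+        return w * inconsistent_feat + (1.0 - w) * consistent_feat
+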
+
+
+
+
+ + ♻ ☆ Gradient Alignment Improves Test-Time Adaptation for Medical Image + Segmentation + + +
+ Although recent years have witnessed significant advancements in medical +image segmentation, the pervasive issue of domain shift among medical images +from diverse centres hinders the effective deployment of pre-trained models. +Many Test-time Adaptation (TTA) methods have been proposed to address this +issue by fine-tuning pre-trained models with test data during inference. These +methods, however, often suffer from less-satisfactory optimization due to +suboptimal optimization direction (dictated by the gradient) and fixed +step-size (predicated on the learning rate). In this paper, we propose the +Gradient alignment-based Test-time adaptation (GraTa) method to improve both +the gradient direction and learning rate in the optimization procedure. Unlike +conventional TTA methods, which primarily optimize the pseudo gradient derived +from a self-supervised objective, our method incorporates an auxiliary gradient +with the pseudo one to facilitate gradient alignment. Such gradient alignment +enables the model to excavate the similarities between different gradients and +correct the gradient direction to approximate the empirical gradient related to +the current segmentation task. Additionally, we design a dynamic learning rate +based on the cosine similarity between the pseudo and auxiliary gradients, +thereby empowering the adaptive fine-tuning of pre-trained models on diverse +test data. Extensive experiments establish the effectiveness of the proposed +gradient alignment and dynamic learning rate and substantiate the superiority +of our GraTa method over other state-of-the-art TTA methods on a benchmark +medical image segmentation task. The code and weights of pre-trained source +models will be available. + +
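+ The dynamic learning rate can be sketched as follows (an illustration under
+stated assumptions; the construction of GraTa's auxiliary objective is not
+shown): the step size is scaled by the cosine similarity between the pseudo and
+auxiliary gradients, so poorly aligned updates are damped.
+
+    import torch
+
+    def grad_cosine(grads_a, grads_b):
+        # Flatten per-parameter gradients into single vectors, compare direction.
+        va = torch.cat([g.reshape(-1) for g in grads_a])
+        vb = torch.cat([g.reshape(-1) for g in grads_b])
+        return torch.nn.functional.cosine_similarity(va, vb, dim=0)
+
+    def adaptive_step(params, pseudo_grads, aux_grads, base_lr=1e-3):
+        # Damp the update when the two gradients disagree on direction.
+        lr = base_lr * grad_cosine(pseudo_grads, aux_grads).clamp(min=0.0)
+        with torch.no_grad():
+            for p, g in zip(params, pseudo_grads):
+                p.sub_(lr * g)
+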
+
+
+
+
+ + ♻ ☆ HCS-TNAS: Hybrid Constraint-driven Semi-supervised Transformer-NAS for + Ultrasound Image Segmentation + + +
+ Precise ultrasound segmentation is vital for clinicians to provide +comprehensive diagnoses. However, developing a model that accurately segments +ultrasound images is challenging due to the images' low quality and the +scarcity of extensive labeled data. This results in two main solutions: (1) +optimizing multi-scale feature representations, and (2) increasing resistance +to data dependency. The first approach necessitates an advanced network +architecture, but a handcrafted network is knowledge-intensive and often yields +limited improvement. In contrast, neural architecture search (NAS) can more +easily attain optimal performance, albeit with significant computational costs. +Regarding the second issue, semi-supervised learning (SSL) is an established +method, but combining it with complex NAS faces the risk of overfitting to a +few labeled samples without extra constraints. Therefore, we introduce a hybrid +constraint-driven semi-supervised Transformer-NAS (HCS-TNAS), balancing both +solutions for segmentation. HCS-TNAS includes an Efficient NAS-ViT module for +multi-scale token search before ViT's attention calculation, effectively +capturing contextual and local information with lower computational costs, and +a hybrid SSL framework that adds network independence and contrastive learning +to the optimization for solving data dependency. By further developing a +stage-wise optimization strategy, a rational network structure is identified. +Experiments on public datasets show that HCS-TNAS achieves state-of-the-art +performance, pushing the limit of ultrasound segmentation. + +
+
+
+
+
+ + ♻ ☆ ICAL: Implicit Character-Aided Learning for Enhanced Handwritten + Mathematical Expression Recognition ICDAR 2024 + + +
+ Significant progress has been made in the field of handwritten mathematical
+expression recognition, yet existing encoder-decoder methods usually struggle
+to model global information in LaTeX. Therefore, this paper
+introduces a novel approach, Implicit Character-Aided Learning (ICAL), to mine
+the global expression information and enhance handwritten mathematical
+expression recognition. Specifically, we propose the Implicit Character
+Construction Module (ICCM) to predict implicit character sequences and use a
+Fusion Module to merge the outputs of the ICCM and the decoder, thereby
+producing corrected predictions. By modeling and utilizing implicit character
+information, ICAL achieves a more accurate and context-aware interpretation of
+handwritten mathematical expressions. Experimental results demonstrate that
+ICAL notably surpasses the state-of-the-art (SOTA) models, improving the
+expression recognition rate (ExpRate) by 2.25%/1.81%/1.39% on the CROHME
+2014/2016/2019 datasets respectively, and achieves a remarkable 69.06% on the
+challenging HME100k test set. We make our code available on GitHub:
+https://github.com/qingzhenduyu/ICAL
+
</p>
+
+ comment: ICDAR 2024 Oral Paper +
+
+
+
+
+ + ♻ ☆ FancyVideo: Towards Dynamic and Consistent Video Generation via + Cross-frame Textual Guidance + + +
+ Synthesizing motion-rich and temporally consistent videos remains a challenge +in artificial intelligence, especially when dealing with extended durations. +Existing text-to-video (T2V) models commonly employ spatial cross-attention for +text control, equivalently guiding different frame generations without +frame-specific textual guidance. Thus, the model's capacity to comprehend the +temporal logic conveyed in prompts and generate videos with coherent motion is +restricted. To tackle this limitation, we introduce FancyVideo, an innovative +video generator that improves the existing text-control mechanism with the +well-designed Cross-frame Textual Guidance Module (CTGM). Specifically, CTGM +incorporates the Temporal Information Injector (TII), Temporal Affinity Refiner +(TAR), and Temporal Feature Booster (TFB) at the beginning, middle, and end of +cross-attention, respectively, to achieve frame-specific textual guidance. +Firstly, TII injects frame-specific information from latent features into text +conditions, thereby obtaining cross-frame textual conditions. Then, TAR refines +the correlation matrix between cross-frame textual conditions and latent +features along the time dimension. Lastly, TFB boosts the temporal consistency +of latent features. Extensive experiments comprising both quantitative and +qualitative evaluations demonstrate the effectiveness of FancyVideo. Our video +demo, code and model are available at https://360cvgroup.github.io/FancyVideo/. + +
+
+
+
+
+ + ♻ ☆ PEANO-ViT: Power-Efficient Approximations of Non-Linearities in Vision + Transformers + + +
+ The deployment of Vision Transformers (ViTs) on hardware platforms, especially
+Field-Programmable Gate Arrays (FPGAs), presents many challenges, which are
+mainly due to the substantial computational and power requirements of their
+non-linear functions, notably layer normalization, softmax, and Gaussian Error
+Linear Unit (GELU). These critical functions pose significant obstacles to
+efficient hardware implementation due to their complex mathematical operations
+and the inherent resource count and architectural limitations of FPGAs.
+PEANO-ViT offers a novel approach to streamlining the implementation of the
+layer normalization layer by introducing a division-free technique that
+simultaneously approximates the division and square root function.
+Additionally, PEANO-ViT provides a multi-scale division strategy to eliminate
+division operations in the softmax layer, aided by a Pade-based approximation
+for the exponential function. Finally, PEANO-ViT introduces a piece-wise linear
+approximation for the GELU function, carefully designed to bypass the
+computationally intensive operations associated with GELU. In our comprehensive
+evaluations, PEANO-ViT exhibits minimal accuracy degradation (<= 0.5% for
+DeiT-B) while significantly enhancing power efficiency, achieving improvements
+of 1.91x, 1.39x, 8.01x for layer normalization, softmax, and GELU,
+respectively. This improvement is achieved through substantial reductions in
+DSP, LUT, and register counts for these non-linear operations. Consequently,
+PEANO-ViT enables efficient deployment of Vision Transformers on resource- and
+power-constrained FPGA platforms.
+
</p>
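+ The flavour of the GELU approximation can be conveyed with a generic
+knot-based piecewise-linear surrogate (the knots below are illustrative;
+PEANO-ViT's segments are chosen for FPGA resource constraints rather than
+reproduced here).
+
+    import torch
+
+    # (x, GELU(x)) knots; the surrogate interpolates linearly between them,
+    # returning ~0 far to the left and ~x far to the right.
+    KNOTS_X = torch.tensor([-4.0, -2.0, -1.0, -0.5, 0.0, 0.5, 1.0, 2.0, 4.0])
+    KNOTS_Y = torch.tensor([-0.0001, -0.0455, -0.1587, -0.1543, 0.0,
+                            0.3457, 0.8413, 1.9545, 3.9999])
+
+    def piecewise_linear_gelu(x):
+        xc = x.clamp(-4.0, 4.0)
+        idx = torch.bucketize(xc, KNOTS_X).clamp(1, KNOTS_X.numel() - 1)
+        x0, x1 = KNOTS_X[idx - 1], KNOTS_X[idx]
+        y0, y1 = KNOTS_Y[idx - 1], KNOTS_Y[idx]
+        return y0 + (xc - x0) / (x1 - x0) * (y1 - y0)
+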
+
+
+
+
+ + ♻ ☆ VersusDebias: Universal Zero-Shot Debiasing for Text-to-Image Models via + SLM-Based Prompt Engineering and Generative Adversary + + +
+ With the rapid development of Text-to-Image (T2I) models, biases in human
+image generation against demographic social groups have become a significant
+concern, impacting fairness and ethical standards in AI. Some researchers
+have proposed methods to tackle the issue. However, existing methods are
+designed for specific models with fixed prompts, limiting their adaptability to
+the fast-evolving models and diverse practical scenarios. Moreover, they
+neglect the impact of hallucinations, leading to discrepancies between expected
+and actual results. To address these issues, we introduce VersusDebias, a novel
+and universal debiasing framework for biases in arbitrary T2I models,
+consisting of an array generation (AG) module and an image generation (IG)
+module. The self-adaptive AG module generates specialized attribute arrays to
+post-process hallucinations and debias multiple attributes simultaneously. The
+IG module employs a small language model to modify prompts according to the
+arrays and drives the T2I model to generate debiased images, enabling zero-shot
+debiasing. Extensive experiments demonstrate VersusDebias's capability to
+debias arbitrary models across gender, race, and age simultaneously. In both
+zero-shot and few-shot scenarios, VersusDebias outperforms existing methods,
+showcasing its exceptional utility. Our work is accessible at
+https://github.com/VersusDebias/VersusDebias to ensure reproducibility and
+facilitate further research.
+
</p>
+
+
+
+
+ + ♻ ☆ Self-Learning Symmetric Multi-view Probabilistic Clustering + + +
+ Multi-view Clustering (MVC) has achieved significant progress, with many
+efforts dedicated to learning knowledge from multiple views. However, most
+existing methods are either not applicable or require additional steps for
+incomplete MVC. Such a limitation results in poor-quality clustering
+performance and poor missing view adaptation. Besides, noise or outliers might
+significantly degrade the overall clustering performance, which are not handled
+well by most existing methods. In this paper, we propose a novel unified
+framework for incomplete and complete MVC named self-learning symmetric
+multi-view probabilistic clustering (SLS-MPC). SLS-MPC proposes a novel
+symmetric multi-view probability estimation and equivalently transforms
+multi-view pairwise posterior matching probability into a composition of each
+view's individual distribution, which tolerates missing data and can extend
+to any number of views. Then, SLS-MPC proposes a novel self-learning
+probability function without any prior knowledge and hyper-parameters to learn
+each view's individual distribution. Next, graph-context-aware refinement with
+path propagation and co-neighbor propagation is used to refine pairwise
+probability, which alleviates the impact of noise and outliers. Finally,
+SLS-MPC proposes a probabilistic clustering algorithm to adjust clustering
+assignments by maximizing the joint probability iteratively without category
+information. Extensive experiments on multiple benchmarks show that SLS-MPC
+outperforms previous state-of-the-art methods.
+
</p>
+
+ comment: accepted by IEEE Transactions on Knowledge and Data Engineering(TKDE) +
+
+
+
+
+ + ♻ ☆ BIGbench: A Unified Benchmark for Social Bias in Text-to-Image + Generative Models Based on Multi-modal LLM + + +
+ Text-to-Image (T2I) generative models are becoming increasingly crucial due
+to their ability to generate high-quality images, which also raises concerns
+about the social biases in their outputs, especially in human image generation.
+Sociological research has established systematic classifications of bias.
+However, existing bias research about T2I models conflates different types of
+bias, impeding methodological progress. In this paper, we introduce BIGbench, a
+unified benchmark for Biases of Image Generation, featuring a meticulously
+designed dataset. Unlike existing benchmarks, BIGbench classifies and evaluates
+biases across four dimensions: manifestation of bias, visibility of bias,
+acquired attributes, and protected attributes, which ensures exceptional
+accuracy for analysis. Furthermore, BIGbench applies advanced multi-modal large
+language models to achieve fully automated and highly accurate evaluations. We
+apply BIGbench to evaluate eight representative general T2I models and three
+debiased methods. Our human evaluation results underscore BIGbench's
+effectiveness in aligning images and identifying various biases. Besides, our
+study also reveals new research directions about biases, such as the effect of
+distillation and irrelevant protected attributes. Our benchmark is openly
+accessible at https://github.com/BIGbench2024/BIGbench2024/ to ensure
+reproducibility.
+
</p>
+
+ comment: arXiv admin note: substantial text overlap with arXiv:2405.17814 +
+
+
+
+
+ + ♻ ☆ Metric3D v2: A Versatile Monocular Geometric Foundation Model for + Zero-shot Metric Depth and Surface Normal Estimation + + +
+ We introduce Metric3D v2, a geometric foundation model for zero-shot metric
+depth and surface normal estimation from a single image, which is crucial for
+metric 3D recovery. While depth and normal are geometrically related and highly
+complementary, they present distinct challenges. SoTA monocular depth methods
+achieve zero-shot generalization by learning affine-invariant depths, which
+cannot recover real-world metrics. Meanwhile, SoTA normal estimation methods
+have limited zero-shot performance due to the lack of large-scale labeled data.
+To tackle these issues, we propose solutions for both metric depth estimation
+and surface normal estimation. For metric depth estimation, we show that the
+key to a zero-shot single-view model lies in resolving the metric ambiguity
+from various camera models and large-scale data training. We propose a
+canonical camera space transformation module, which explicitly addresses the
+ambiguity problem and can be effortlessly plugged into existing monocular
+models. For surface normal estimation, we propose a joint depth-normal
+optimization module to distill diverse data knowledge from metric depth,
+enabling normal estimators to learn beyond normal labels. Equipped with these
+modules, our depth-normal models can be stably trained with over 16 million
+images from thousands of camera models with different types of annotations,
+resulting in zero-shot generalization to in-the-wild images with unseen camera
+settings. Our method enables the accurate recovery of metric 3D structures on
+randomly collected internet images, paving the way for plausible single-image
+metrology. Our project page is at https://JUGGHM.github.io/Metric3Dv2.
+
</p>
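+ One simple way to picture the canonical camera space idea (a shorthand
+illustration, not the paper's exact transformation module): metric depth scales
+with focal length, so labels from arbitrary cameras can be mapped into a shared
+canonical camera for training and mapped back at inference.
+
+    def to_canonical_depth(depth_m, focal_px, canonical_focal_px=1000.0):
+        # The same pixels observed with a longer focal length imply larger metric
+        # depth; rescaling by f_canonical / f removes that camera dependence.
+        return depth_m * (canonical_focal_px / focal_px)
+
+    def from_canonical_depth(canonical_depth_m, focal_px, canonical_focal_px=1000.0):
+        # Undo the transformation with the test camera's intrinsics.
+        return canonical_depth_m * (focal_px / canonical_focal_px)
+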
+
+ comment: Our project page is at https://JUGGHM.github.io/Metric3Dv2. Accepted
+ to TPAMI. arXiv admin note: text overlap with arXiv:2307.10984
+
</p>
+
+
+
+
+ + ♻ ☆ HandRefiner: Refining Malformed Hands in Generated Images by + Diffusion-based Conditional Inpainting + + +
+ Diffusion models have achieved remarkable success in generating realistic +images but suffer from generating accurate human hands, such as incorrect +finger counts or irregular shapes. This difficulty arises from the complex task +of learning the physical structure and pose of hands from training images, +which involves extensive deformations and occlusions. For correct hand +generation, our paper introduces a lightweight post-processing solution called +$\textbf{HandRefiner}$. HandRefiner employs a conditional inpainting approach +to rectify malformed hands while leaving other parts of the image untouched. We +leverage the hand mesh reconstruction model that consistently adheres to the +correct number of fingers and hand shape, while also being capable of fitting +the desired hand pose in the generated image. Given a generated failed image +due to malformed hands, we utilize ControlNet modules to re-inject such correct +hand information. Additionally, we uncover a phase transition phenomenon within +ControlNet as we vary the control strength. It enables us to take advantage of +more readily available synthetic data without suffering from the domain gap +between realistic and synthetic hands. Experiments demonstrate that HandRefiner +can significantly improve the generation quality quantitatively and +qualitatively. The code is available at +https://github.com/wenquanlu/HandRefiner . + +
+
+
+
+
+ + ♻ ☆ AdaDiff: Accelerating Diffusion Models through Step-Wise Adaptive + Computation + + +
+ Diffusion models achieve great success in generating diverse and
+high-fidelity images, yet their widespread application, especially in real-time
+scenarios, is hampered by their inherently slow generation speed. The slow
+generation stems from the necessity of multi-step network inference. While
+certain predictions benefit from the full computation of the model in each
+sampling iteration, not every iteration requires the same amount of
+computation, potentially leading to inefficient computation. Unlike typical
+adaptive computation challenges that deal with single-step generation problems,
+diffusion processes with multi-step generation need to dynamically adjust
+their computational resource allocation based on the ongoing assessment of each
+step's importance to the final image output, presenting a unique set of
+challenges. In this work, we propose AdaDiff, an adaptive framework that
+dynamically allocates computation resources in each sampling step to improve
+the generation efficiency of diffusion models. To assess the effects of changes
+in computational effort on image quality, we present a timestep-aware
+uncertainty estimation module (UEM). Integrated at each intermediate layer, the
+UEM evaluates the predictive uncertainty. This uncertainty measurement serves
+as an indicator for determining whether to terminate the inference process.
+Additionally, we introduce an uncertainty-aware layer-wise loss aimed at
+bridging the performance gap between full models and their adaptive
+counterparts.
+
</p>
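+ The step-wise early-exit idea reads roughly like the loop below (all module
+names are hypothetical; the actual UEM design, thresholding rule, and
+layer-wise loss are in the paper): each intermediate layer gets a chance to
+emit the noise prediction once its uncertainty estimate is low enough.
+
+    import torch
+
+    @torch.no_grad()
+    def denoise_step_with_early_exit(blocks, exit_heads, uem_heads, x_t, t,
+                                     threshold=0.1):
+        h = x_t
+        eps_hat = None
+        for block, exit_head, uem in zip(blocks, exit_heads, uem_heads):
+            h = block(h, t)
+            eps_hat = exit_head(h)          # intermediate noise prediction
+            if uem(h).mean() < threshold:   # confident enough: stop early
+                break
+        return eps_hat                      # full depth only for hard steps
+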
+
+
+
+
+ + ♻ ☆ Incomplete Multimodal Industrial Anomaly Detection via Cross-Modal + Distillation + + +
+ Recent studies of multimodal industrial anomaly detection (IAD) based on 3D +point clouds and RGB images have highlighted the importance of exploiting the +redundancy and complementarity among modalities for accurate classification and +segmentation. However, achieving multimodal IAD in practical production lines +remains a work in progress. It is essential to consider the trade-offs between +the costs and benefits associated with the introduction of new modalities while +ensuring compatibility with current processes. Existing quality control +processes combine rapid in-line inspections, such as optical and infrared +imaging with high-resolution but time-consuming near-line characterization +techniques, including industrial CT and electron microscopy to manually or +semi-automatically locate and analyze defects in the production of Li-ion +batteries and composite materials. Given the cost and time limitations, only a +subset of the samples can be inspected by all in-line and near-line methods, +and the remaining samples are only evaluated through one or two forms of +in-line inspection. To fully exploit data for deep learning-driven automatic +defect detection, the models must have the ability to leverage multimodal +training and handle incomplete modalities during inference. In this paper, we +propose CMDIAD, a Cross-Modal Distillation framework for IAD to demonstrate the +feasibility of a Multi-modal Training, Few-modal Inference (MTFI) pipeline. Our +findings show that the MTFI pipeline can more effectively utilize incomplete +multimodal information compared to applying only a single modality for training +and inference. Moreover, we investigate the reasons behind the asymmetric +performance improvement using point clouds or RGB images as the main modality +of inference. This provides a foundation for our future multimodal dataset +construction with additional modalities from manufacturing scenarios. + +
+
+
+
+
+ + ♻ ☆ OC3D: Weakly Supervised Outdoor 3D Object Detection with Only Coarse + Click Annotation + + +
+ LiDAR-based outdoor 3D object detection has received widespread attention. +However, training 3D detectors from the LiDAR point cloud typically relies on +expensive bounding box annotations. This paper presents OC3D, an innovative +weakly supervised method requiring only coarse clicks on the bird's eye view of +the 3D point cloud. A key challenge here is the absence of complete geometric +descriptions of the target objects from such simple click annotations. To +address this problem, our proposed OC3D adopts a two-stage strategy. In the +first stage, we initially design a novel dynamic and static classification +strategy and then propose the Click2Box and Click2Mask modules to generate +box-level and mask-level pseudo-labels for static and dynamic instances, +respectively. In the second stage, we design a Mask2Box module, leveraging the +learning capabilities of neural networks to update mask-level pseudo-labels, +which contain less information, to box-level pseudo-labels. Experimental +results on the widely used KITTI and nuScenes datasets demonstrate that our +OC3D with only coarse clicks achieves state-of-the-art performance compared to +weakly-supervised 3D detection methods. Combining OC3D with a missing click +mining strategy, we propose an OC3D++ pipeline, which requires only 0.2% +annotation cost in the KITTI dataset to achieve performance comparable to fully +supervised methods. The code will be made publicly available. + +
+
+
+
+
+ + ♻ ☆ Enhanced Self-Checkout System for Retail Based on Improved YOLOv10 + + +
+ With the rapid advancement of deep learning technologies, computer vision has
+shown immense potential in retail automation. This paper presents a novel
+self-checkout system for retail based on an improved YOLOv10 network, aimed at
+enhancing checkout efficiency and reducing labor costs. We propose targeted
+optimizations to the YOLOv10 model by incorporating the detection head
+structure from YOLOv8, which significantly improves product recognition
+accuracy. Additionally, we develop a post-processing algorithm tailored for
+self-checkout scenarios, to further enhance the applicability of the system.
+Experimental results demonstrate that our system outperforms existing methods
+in both product recognition accuracy and checkout speed. This research not only
+provides a new technical solution for retail automation but also offers
+valuable insights into optimizing deep learning models for real-world
+applications.
+
</p>
+
+
+
+
+
+ ♻ ☆ Exploring learning environments for label-efficient cancer diagnosis
+
+
</p>
+ Despite significant research efforts and advancements, cancer remains a
+leading cause of mortality. Early cancer prediction has become a crucial focus
+in cancer research to streamline patient care and improve treatment outcomes.
+Manual tumor detection by histopathologists can be time-consuming, prompting
+the need for computerized methods to expedite treatment planning. Traditional
+approaches to tumor detection rely on supervised learning, necessitating a
+large amount of annotated data for model training. However, acquiring such
+extensive labeled data can be laborious and time-intensive. This research
+examines three learning environments, supervised learning (SL), semi-supervised
+learning (Semi-SL), and self-supervised learning (Self-SL), to predict
+kidney, lung, and breast cancer. Three pre-trained deep learning models
+(Residual Network-50, Visual Geometry Group-16, and EfficientNetB0) are
+evaluated based on these learning settings using seven carefully curated
+training sets. To create the first training set (TS1), SL is applied to all
+annotated image samples. Five training sets (TS2-TS6) with different ratios of
+labeled and unlabeled cancer images are used to evaluate Semi-SL. Unlabeled
+cancer images from the final training set (TS7) are utilized for Self-SL
+assessment. Among different learning environments, outcomes from the Semi-SL
+setting show a strong degree of agreement with the outcomes achieved in the SL
+setting. The uniform pattern of observations from the pre-trained models
+across all three datasets validates the methodology and techniques of the
+research. Based on a modest number of labeled samples and minimal computing
+cost, our study suggests that the Semi-SL option can be a highly viable
+replacement for the SL option under label annotation constraints.
+
</p>
+
+ comment: Submitted to the journal +
+
+
+
+
+ + ♻ ☆ Multistatic-Radar RCS-Signature Recognition of Aerial Vehicles: A + Bayesian Fusion Approach + + +
+ Radar Automated Target Recognition (RATR) for Unmanned Aerial Vehicles (UAVs) +involves transmitting Electromagnetic Waves (EMWs) and performing target type +recognition on the received radar echo, crucial for defense and aerospace +applications. Previous studies highlighted the advantages of multistatic radar +configurations over monostatic ones in RATR. However, fusion methods in +multistatic radar configurations often suboptimally combine classification +vectors from individual radars probabilistically. To address this, we propose a +fully Bayesian RATR framework employing Optimal Bayesian Fusion (OBF) to +aggregate classification probability vectors from multiple radars. OBF, based +on expected 0-1 loss, updates a Recursive Bayesian Classification (RBC) +posterior distribution for target UAV type, conditioned on historical +observations across multiple time steps. We evaluate the approach using +simulated random walk trajectories for seven drones, correlating target aspect +angles to Radar Cross Section (RCS) measurements in an anechoic chamber. +Comparing against single radar Automated Target Recognition (ATR) systems and +suboptimal fusion methods, our empirical results demonstrate that the OBF +method integrated with RBC significantly enhances classification accuracy +compared to other fusion methods and single radar configurations. + +
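+ To make the recursive update concrete, here is a naive
+conditional-independence version of fusing per-radar classification vectors
+over time (a generic stand-in only; the paper's OBF rule is derived from
+expected 0-1 loss and is not reproduced here).
+
+    import numpy as np
+
+    def recursive_fuse(prior, radar_probs):
+        # prior: (K,) posterior over target types after previous time steps.
+        # radar_probs: (R, K) classification probability vectors, one per radar,
+        # for the current time step. Radars are treated as conditionally
+        # independent given the target type.
+        posterior = prior * np.prod(radar_probs, axis=0)
+        return posterior / posterior.sum()
+
+    posterior = np.full(3, 1.0 / 3.0)            # uniform prior over 3 UAV types
+    steps = [np.array([[0.6, 0.3, 0.1], [0.5, 0.4, 0.1]]),
+             np.array([[0.7, 0.2, 0.1], [0.6, 0.3, 0.1]])]
+    for radar_obs in steps:
+        posterior = recursive_fuse(posterior, radar_obs)
+    print(posterior)                              # mass concentrates on type 0
+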
+
+ comment: Accepted to IEEE Transactions on Aerospace and Electronic Systems +
+
+
+
+
+ + ♻ ☆ Novel-View Acoustic Synthesis from 3D Reconstructed Rooms + + +
+ We investigate the benefit of combining blind audio recordings with 3D scene +information for novel-view acoustic synthesis. Given audio recordings from 2-4 +microphones and the 3D geometry and material of a scene containing multiple +unknown sound sources, we estimate the sound anywhere in the scene. We identify +the main challenges of novel-view acoustic synthesis as sound source +localization, separation, and dereverberation. While naively training an +end-to-end network fails to produce high-quality results, we show that +incorporating room impulse responses (RIRs) derived from 3D reconstructed rooms +enables the same network to jointly tackle these tasks. Our method outperforms +existing methods designed for the individual tasks, demonstrating its +effectiveness at utilizing 3D visual information. In a simulated study on the +Matterport3D-NVAS dataset, our model achieves near-perfect accuracy on source +localization, a PSNR of 26.44dB and a SDR of 14.23dB for source separation and +dereverberation, resulting in a PSNR of 25.55 dB and a SDR of 14.20 dB on +novel-view acoustic synthesis. We release our code and model on our project +website at https://github.com/apple/ml-nvas3d. Please wear headphones when +listening to the results. + +
+
+ comment: Interspeech 2024 +
+
+
+
+
+ + ♻ ☆ Component Selection for Craft Assembly Tasks + + +
+ Inspired by traditional handmade crafts, where a person improvises assemblies +based on the available objects, we formally introduce the Craft Assembly Task. +It is a robotic assembly task that involves building an accurate representation +of a given target object using the available objects, which do not directly +correspond to its parts. In this work, we focus on selecting the subset of +available objects for the final craft, when the given input is an RGB image of +the target in the wild. We use a mask segmentation neural network to identify +visible parts, followed by retrieving labelled template meshes. These meshes +undergo pose optimization to determine the most suitable template. Then, we +propose to simplify the parts of the transformed template mesh to primitive +shapes like cuboids or cylinders. Finally, we design a search algorithm to find +correspondences in the scene based on local and global proportions. We develop +baselines for comparison that consider all possible combinations, and choose +the highest scoring combination for common metrics used in foreground maps and +mask accuracy. Our approach achieves comparable results to the baselines for +two different scenes, and we show qualitative results for an implementation in +a real-world scenario. + +
+
+ comment: Published on IEEE RA-L +
+
+
+
+
+ + ♻ ☆ Interactive Character Control with Auto-Regressive Motion Diffusion + Models + + +
+ Real-time character control is an essential component for interactive
+experiences, with a broad range of applications, including physics simulations,
+video games, and virtual reality. The success of diffusion models for image
+synthesis has led to the use of these models for motion synthesis. However, the
+majority of these motion diffusion models are primarily designed for offline
+applications, where space-time models are used to synthesize an entire sequence
+of frames simultaneously with a pre-specified length. To enable real-time
+motion synthesis with diffusion models that allow time-varying controls, we
+propose A-MDM (Auto-regressive Motion Diffusion Model). Our conditional
+diffusion model takes an initial pose as input, and auto-regressively generates
+successive motion frames conditioned on the previous frame. Despite its
+streamlined network architecture, which uses simple MLPs, our framework is
+capable of generating diverse, long-horizon, and high-fidelity motion
+sequences. Furthermore, we introduce a suite of techniques for incorporating
+interactive controls into A-MDM, such as task-oriented sampling, in-painting,
+and hierarchical reinforcement learning. These techniques enable a pre-trained
+A-MDM to be efficiently adapted for a variety of new downstream tasks. We
+conduct a comprehensive suite of experiments to demonstrate the effectiveness
+of A-MDM, and compare its performance against state-of-the-art auto-regressive
+methods.
+
</p>
+
+
+
+
+ + ♻ ☆ Relative-Interior Solution for the (Incomplete) Linear Assignment + Problem with Applications to the Quadratic Assignment Problem + + +
+ We study the set of optimal solutions of the dual linear programming +formulation of the linear assignment problem (LAP) to propose a method for +computing a solution from the relative interior of this set. Assuming that an +arbitrary dual-optimal solution and an optimal assignment are available (for +which many efficient algorithms already exist), our method computes a +relative-interior solution in linear time. Since the LAP occurs as a subproblem +in the linear programming (LP) relaxation of the quadratic assignment problem +(QAP), we employ our method as a new component in the family of dual-ascent +algorithms that provide bounds on the optimal value of the QAP. To make our +results applicable to the incomplete QAP, which is of interest in practical +use-cases, we also provide a linear-time reduction from the incomplete LAP to +the complete LAP along with a mapping that preserves optimality and membership +in the relative interior. Our experiments on publicly available benchmarks +indicate that our approach with relative-interior solution can frequently +provide bounds near the optimum of the LP relaxation and its runtime is much +lower when compared to a commercial LP solver. + +
+
+
+
+
+
+
+
+ + Information Retrieval 12 + +
+
+
+ + ☆ EasyRec: Simple yet Effective Language Models for Recommendation + + +
+ Deep neural networks have become a powerful technique for learning representations from user-item interaction data in collaborative filtering (CF) for recommender systems. However, many existing methods heavily rely on unique user and item IDs, which limits their ability to perform well in practical zero-shot learning scenarios where sufficient training data may be unavailable. Inspired by the success of language models (LMs) and their strong generalization capabilities, a crucial question arises: How can we harness the potential of language models to empower recommender systems and elevate their generalization capabilities to new heights? In this study, we propose EasyRec - an effective and easy-to-use approach that seamlessly integrates text-based semantic understanding with collaborative signals. EasyRec employs a text-behavior alignment framework, which combines contrastive learning with collaborative language model tuning, to ensure a strong alignment between the text-enhanced semantic space and the collaborative behavior information. Extensive empirical evaluations across diverse real-world datasets demonstrate the superior performance of EasyRec compared to state-of-the-art alternative models, particularly in the challenging text-based zero-shot recommendation scenarios. Furthermore, the study highlights the potential of seamlessly integrating EasyRec as a plug-and-play component into text-enhanced collaborative filtering frameworks, thereby empowering existing recommender systems to elevate their recommendation performance and adapt to the evolving user preferences in dynamic environments. For better result reproducibility of our EasyRec framework, the model implementation details, source code, and datasets are available at the link: https://github.com/HKUDS/EasyRec.
+ 
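A minimal sketch of a text-behavior alignment objective of the kind described above, assuming a symmetric InfoNCE loss between item text embeddings and collaborative embeddings; EasyRec's actual loss and tuning procedure may differ.

```python
import torch
import torch.nn.functional as F

def text_behavior_alignment_loss(text_emb, cf_emb, temperature=0.1):
    """Symmetric InfoNCE loss: pull each item's text embedding toward its
    collaborative-filtering embedding and push apart mismatched pairs.
    `text_emb` and `cf_emb` are (batch, dim) tensors for the same items."""
    text_emb = F.normalize(text_emb, dim=-1)
    cf_emb = F.normalize(cf_emb, dim=-1)
    logits = text_emb @ cf_emb.t() / temperature          # (batch, batch)
    labels = torch.arange(text_emb.size(0), device=text_emb.device)
    return 0.5 * (F.cross_entropy(logits, labels) + F.cross_entropy(logits.t(), labels))

# toy usage: 8 items, 64-dimensional embeddings
loss = text_behavior_alignment_loss(torch.randn(8, 64), torch.randn(8, 64))
print(float(loss))
```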
+
+
+
+
+ + ☆ Beyond KAN: Introducing KarSein for Adaptive High-Order Feature + Interaction Modeling in CTR Prediction + + +
+ Modeling feature interactions is crucial for click-through rate (CTR) +prediction, particularly when it comes to high-order explicit interactions. +Traditional methods struggle with this task because they often predefine a +maximum interaction order, which relies heavily on prior knowledge and can +limit the model's effectiveness. Additionally, modeling high-order interactions +typically leads to increased computational costs. Therefore, the challenge lies +in adaptively modeling high-order feature interactions while maintaining +efficiency. To address this issue, we introduce Kolmogorov-Arnold Represented +Sparse Efficient Interaction Network (KarSein), designed to optimize both +predictive accuracy and computational efficiency. We firstly identify +limitations of directly applying Kolmogorov-Arnold Networks (KAN) to CTR and +then introduce KarSein to overcome these issues. It features a novel +architecture that reduces the computational costs of KAN and supports embedding +vectors as feature inputs. Additionally, KarSein employs guided symbolic +regression to address the challenge of KAN in spontaneously learning +multiplicative relationships. Extensive experiments demonstrate KarSein's +superior performance, achieving significant predictive accuracy with minimal +computational overhead. Furthermore, KarSein maintains strong global +explainability while enabling the removal of redundant features, resulting in a +sparse network structure. These advantages also position KarSein as a promising +method for efficient inference. + +
+
+ comment: KarSein for CTR +
+
+
+
+
+ + ☆ Multimodal Relational Triple Extraction with Query-based Entity Object + Transformer + + +
+ Multimodal Relation Extraction is crucial for constructing flexible and realistic knowledge graphs. Recent studies focus on extracting the relation type with entity pairs present in different modalities, such as one entity in the text and another in the image. However, existing approaches require entities and objects to be given beforehand, which is costly and impractical. To address this limitation, we propose a novel task, Multimodal Entity-Object Relational Triple Extraction, which aims to extract all triples (entity span, relation, object region) from image-text pairs. To facilitate this study, we modified a multimodal relation extraction dataset MORE, which includes 21 relation types, to create a new dataset containing 20,264 triples, averaging 5.75 triples per image-text pair. Moreover, we propose QEOT, a query-based model with a selective attention mechanism, to dynamically explore the interaction and fusion of textual and visual information. In particular, the proposed method can simultaneously accomplish entity extraction, relation classification, and object detection with a set of queries. Our method is suitable for downstream applications and avoids the error accumulation of pipeline-style approaches. Extensive experimental results demonstrate that our proposed method outperforms the existing baselines by 8.06% and achieves state-of-the-art performance.
+ 
+
+ comment: 15 pages, 7 figures, preprint +
+
+
+
+
+ + ☆ SC-Rec: Enhancing Generative Retrieval with Self-Consistent Reranking for Sequential Recommendation + + +
+ Language Models (LMs) are increasingly employed in recommendation systems due +to their advanced language understanding and generation capabilities. Recent +recommender systems based on generative retrieval have leveraged the +inferential abilities of LMs to directly generate the index tokens of the next +item, based on item sequences within the user's interaction history. Previous +studies have mostly focused on item indices based solely on textual semantic or +collaborative information. However, although the standalone effectiveness of +these aspects has been demonstrated, the integration of this information has +remained unexplored. Our in-depth analysis finds that there is a significant +difference in the knowledge captured by the model from heterogeneous item +indices and diverse input prompts, which can have a high potential for +complementarity. In this paper, we propose SC-Rec, a unified recommender system +that learns diverse preference knowledge from two distinct item indices and +multiple prompt templates. Furthermore, SC-Rec adopts a novel reranking +strategy that aggregates a set of ranking results, inferred based on different +indices and prompts, to achieve the self-consistency of the model. Our +empirical evaluation on three real-world datasets demonstrates that SC-Rec +considerably outperforms the state-of-the-art methods for sequential +recommendation, effectively incorporating complementary knowledge from varied +outputs of the model. + +
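The abstract does not spell out SC-Rec's reranking rule, so the sketch below uses a simple Borda-style aggregation over the ranked lists produced by different item indices and prompt templates, purely to illustrate the self-consistency idea of rewarding items that rank high across many outputs.

```python
from collections import defaultdict

def aggregate_rankings(ranked_lists, top_k=10):
    """Borda-style aggregation of several ranked item lists (e.g., one list per
    item-index / prompt-template combination). Items that consistently rank
    high across lists end up on top of the final ranking."""
    scores = defaultdict(float)
    for ranking in ranked_lists:
        for pos, item in enumerate(ranking):
            scores[item] += len(ranking) - pos   # higher position => more points
    return sorted(scores, key=scores.get, reverse=True)[:top_k]

# toy usage: three rankings from different index/prompt combinations
print(aggregate_rankings([["a", "b", "c"], ["b", "a", "d"], ["a", "d", "b"]], top_k=3))
```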
+
+
+
+
+ + ☆ OptDist: Learning Optimal Distribution for Customer Lifetime Value + Prediction CIKM 2024 + + +
+ Customer Lifetime Value (CLTV) prediction is a critical task in business applications. Accurately predicting CLTV is challenging in real-world business scenarios, as the distribution of CLTV is complex and mutable. First, a large number of users have no consumption at all, forming a long-tailed segment that is difficult to fit. Second, the small set of high-value users spends orders of magnitude more than a typical user, leading to a wide range of CLTV values that is hard to capture with a single distribution. Existing approaches for CLTV estimation either assume a prior probability distribution and fit a single group of distribution-related parameters for all samples, or directly learn from the posterior distribution with manually predefined buckets in a heuristic manner. However, all these methods fail to handle complex and mutable distributions. In this paper, we propose a novel optimal distribution selection model OptDist for CLTV prediction, which utilizes an adaptive optimal sub-distribution selection mechanism to improve the accuracy of complex distribution modeling. Specifically, OptDist trains several candidate sub-distribution networks in the distribution learning module (DLM) for modeling the probability distribution of CLTV. Then, a distribution selection module (DSM) is proposed to select the sub-distribution for each sample, making the selection automatic and adaptive. Besides, we design an alignment mechanism that connects both modules, which effectively guides the optimization. We conduct extensive experiments on two public datasets and one private dataset to verify that OptDist outperforms state-of-the-art baselines. Furthermore, OptDist has been deployed on a large-scale financial platform for customer acquisition marketing campaigns and the online experiments also demonstrate its effectiveness.
+ 
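A toy rendering of the DLM/DSM split described above: several candidate sub-distribution heads plus a selection module that picks one per sample. The log-normal heads, the hard argmax selection, and the omission of the alignment mechanism are simplifications of ours, not OptDist's actual design.

```python
import torch
import torch.nn as nn

class OptDistSketch(nn.Module):
    """Several candidate sub-distribution heads (here: log-normal parameter
    heads) and a selector that picks one sub-distribution per sample."""
    def __init__(self, in_dim=32, num_heads=4):
        super().__init__()
        self.heads = nn.ModuleList(
            [nn.Linear(in_dim, 2) for _ in range(num_heads)]  # (mu, log_sigma)
        )
        self.selector = nn.Linear(in_dim, num_heads)

    def forward(self, x):
        params = torch.stack([h(x) for h in self.heads], dim=1)   # (B, H, 2)
        choice = self.selector(x).argmax(dim=-1)                  # (B,) chosen head
        picked = params[torch.arange(x.size(0)), choice]          # (B, 2)
        mu, log_sigma = picked.unbind(-1)
        # expected CLTV under the selected log-normal sub-distribution
        return torch.exp(mu + 0.5 * torch.exp(log_sigma) ** 2)

model = OptDistSketch()
print(model(torch.randn(5, 32)).shape)  # torch.Size([5])
```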
+
+ comment: CIKM 2024 +
+
+
+
+
+ + ☆ Collaborative Cross-modal Fusion with Large Language Model for + Recommendation CIKM 2024 + + +
+ Despite the success of conventional collaborative filtering (CF) approaches +for recommendation systems, they exhibit limitations in leveraging semantic +knowledge within the textual attributes of users and items. Recent focus on the +application of large language models for recommendation (LLM4Rec) has +highlighted their capability for effective semantic knowledge capture. However, +these methods often overlook the collaborative signals in user behaviors. Some +simply instruct-tune a language model, while others directly inject the +embeddings of a CF-based model, lacking a synergistic fusion of different +modalities. To address these issues, we propose a framework of Collaborative +Cross-modal Fusion with Large Language Models, termed CCF-LLM, for +recommendation. In this framework, we translate the user-item interactions into +a hybrid prompt to encode both semantic knowledge and collaborative signals, +and then employ an attentive cross-modal fusion strategy to effectively fuse +latent embeddings of both modalities. Extensive experiments demonstrate that +CCF-LLM outperforms existing methods by effectively utilizing semantic and +collaborative signals in the LLM4Rec context. + +
+
+ comment: 10 pages, 4 figures, accepted by CIKM 2024 +
+
+
+
+
+ + ☆ Don't Click the Bait: Title Debiasing News Recommendation via + Cross-Field Contrastive Learning + + +
+ News recommendation has emerged as a primary means for users to access content of interest from the vast amount of news. Clickbait titles are pervasive in the news domain and make it harder for news recommendation to offer satisfactory services to users. Fortunately, we find that the news abstract, as a critical field of news, aligns cohesively with the news authenticity. To this end, we propose a Title Debiasing News Recommendation with Cross-field Contrastive learning (TDNR-C2) to overcome the title bias by incorporating the news abstract. Specifically, a multi-field knowledge extraction module is devised to extract multi-view knowledge about news from various fields. Afterwards, we present a cross-field contrastive learning module to conduct bias removal by contrasting the knowledge learned from the title and abstract fields. Experimental results on a real-world dataset demonstrate the superiority of the proposed TDNR-C2 over existing state-of-the-art methods. Further analysis also indicates the significance of the news abstract for title debiasing.
+ 
+
+
+
+
+ + ☆ MuRAR: A Simple and Effective Multimodal Retrieval and Answer Refinement + Framework for Multimodal Question Answering + + +
+ Recent advancements in retrieval-augmented generation (RAG) have demonstrated +impressive performance in the question-answering (QA) task. However, most +previous works predominantly focus on text-based answers. While some studies +address multimodal data, they still fall short in generating comprehensive +multimodal answers, particularly for explaining concepts or providing +step-by-step tutorials on how to accomplish specific goals. This capability is +especially valuable for applications such as enterprise chatbots and settings +such as customer service and educational systems, where the answers are sourced +from multimodal data. In this paper, we introduce a simple and effective +framework named MuRAR (Multimodal Retrieval and Answer Refinement). MuRAR +enhances text-based answers by retrieving relevant multimodal data and refining +the responses to create coherent multimodal answers. This framework can be +easily extended to support multimodal answers in enterprise chatbots with +minimal modifications. Human evaluation results indicate that multimodal +answers generated by MuRAR are more useful and readable compared to plain text +answers. + +
+
+ comment: Preprint +
+
+
+
+
+ + ♻ ☆ Address-Specific Sustainable Accommodation Choice Through Real-World + Data Integration + + +
+ Consumers wish to choose sustainable accommodation for their travels, and in +the case of corporations, may be required to do so. Yet accommodation +marketplaces provide no meaningful capability for sustainable choice: typically +CO2 estimates are provided that are identical for all accommodation of the same +type across an entire country. We propose a decision support system that +enables real choice of sustainable accommodation. We develop a data-driven +address-specific metric called EcoGrade, which integrates government approved +datasets and uses interpolation where data is sparse. We validate the metric on +10,000 UK addresses in 10 cities, showing the match of our interpolations to +reality is statistically significant. We show how the metric has been embedded +into a decision support system for a global accommodation marketplace and +tested by real users over several months with positive user feedback. In the +EU, forty percent of final energy consumption is from buildings. We need to +encourage all building owners to make their accommodation more efficient. The +rental sector is one area where change can occur rapidly, as rented +accommodation is renovated frequently. We anticipate our decision support +system using EcoGrade will encourage this positive change. + +
+
+ comment: 8 pages +
+
+
+
+
+ + ♻ ☆ Guaranteeing Accuracy and Fairness under Fluctuating User Traffic: A + Bankruptcy-Inspired Re-ranking Approach + + +
+ Out of sustainability and economic considerations, two-sided recommendation platforms must satisfy the needs of both users and providers. Previous studies show that the two sides' needs differ in urgency: providers have a relatively long-term exposure demand, while users want more short-term and accurate service. However, our empirical study reveals that previous methods for trading off fairness and accuracy often fail to guarantee long-term fairness and short-term accuracy simultaneously in real applications with fluctuating user traffic. In particular, when user traffic is low, the user experience often drops considerably. Our theoretical analysis also confirms that user traffic is a key factor in such a trade-off problem. How to guarantee accuracy and fairness under fluctuating user traffic remains an open problem. Inspired by the bankruptcy problem in economics, we propose a novel fairness-aware re-ranking approach named BankFair. Intuitively, BankFair employs the Talmud rule to leverage periods of abundant user traffic to offset periods of user traffic scarcity, ensuring consistent user service in every period while upholding long-term fairness. Specifically, BankFair consists of two modules: (1) employing the Talmud rule to determine the required fairness degree under varying periods of user traffic; and (2) conducting an online re-ranking algorithm based on the fairness degree determined by the Talmud rule. Experiments on two real-world recommendation datasets show that BankFair outperforms all baselines regarding accuracy and provider fairness.
+ 
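BankFair's key ingredient is the Talmud rule from the bankruptcy literature. For reference, here is a generic implementation of the Aumann-Maschler Talmud rule; how BankFair maps user traffic and provider exposure demands onto the estate and claims is defined in the paper, not here.

```python
def talmud_rule(estate, claims, iters=100):
    """Aumann-Maschler Talmud rule: divide `estate` among claimants with the
    given `claims`. If the estate is at most half the total claims, apply
    constrained equal awards (CEA) to the half-claims; otherwise give everyone
    half their claim and apply constrained equal losses via CEA on the rest."""
    total = sum(claims)
    assert 0 <= estate <= total

    def cea(amount, caps):
        # constrained equal awards: everyone receives min(cap_i, lam)
        lo, hi = 0.0, max(caps)
        for _ in range(iters):
            lam = (lo + hi) / 2
            if sum(min(c, lam) for c in caps) < amount:
                lo = lam
            else:
                hi = lam
        return [min(c, (lo + hi) / 2) for c in caps]

    half = [c / 2 for c in claims]
    if estate <= total / 2:
        return cea(estate, half)               # CEA on half-claims
    losses = cea(total - estate, half)         # losses capped at half-claims
    return [c - l for c, l in zip(claims, losses)]

# classic Talmud example: estate 200, claims 100/200/300 -> [50.0, 75.0, 75.0]
print(talmud_rule(200, [100, 200, 300]))
```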
+
+
+
+
+ + ♻ ☆ Robust Neural Information Retrieval: An Adversarial and + Out-of-distribution Perspective + + +
+ Recent advances in neural information retrieval (IR) models have +significantly enhanced their effectiveness over various IR tasks. The +robustness of these models, essential for ensuring their reliability in +practice, has also garnered significant attention. With a wide array of +research on robust IR being proposed, we believe it is the opportune moment to +consolidate the current status, glean insights from existing methodologies, and +lay the groundwork for future development. We view the robustness of IR to be a +multifaceted concept, emphasizing its necessity against adversarial attacks, +out-of-distribution (OOD) scenarios and performance variance. With a focus on +adversarial and OOD robustness, we dissect robustness solutions for dense +retrieval models (DRMs) and neural ranking models (NRMs), respectively, +recognizing them as pivotal components of the neural IR pipeline. We provide an +in-depth discussion of existing methods, datasets, and evaluation metrics, +shedding light on challenges and future directions in the era of large language +models. To the best of our knowledge, this is the first comprehensive survey on +the robustness of neural IR models, and we will also be giving our first +tutorial presentation at SIGIR 2024 +\url{https://sigir2024-robust-information-retrieval.github.io}. Along with the +organization of existing work, we introduce a Benchmark for robust IR (BestIR), +a heterogeneous evaluation benchmark for robust neural information retrieval, +which is publicly available at \url{https://github.com/Davion-Liu/BestIR}. We +hope that this study provides useful clues for future research on the +robustness of IR models and helps to develop trustworthy search engines +\url{https://github.com/Davion-Liu/Awesome-Robustness-in-Information-Retrieval}. + +
+
+ comment: Survey paper +
+
+
+
+
+ + ♻ ☆ TWIN V2: Scaling Ultra-Long User Behavior Sequence Modeling for Enhanced + CTR Prediction at Kuaishou CIKM 2024 + + +
+ The significance of modeling long-term user interests for CTR prediction +tasks in large-scale recommendation systems is progressively gaining attention +among researchers and practitioners. Existing work, such as SIM and TWIN, +typically employs a two-stage approach to model long-term user behavior +sequences for efficiency concerns. The first stage rapidly retrieves a subset +of sequences related to the target item from a long sequence using a +search-based mechanism namely the General Search Unit (GSU), while the second +stage calculates the interest scores using the Exact Search Unit (ESU) on the +retrieved results. Given the extensive length of user behavior sequences +spanning the entire life cycle, potentially reaching up to 10^6 in scale, there +is currently no effective solution for fully modeling such expansive user +interests. To overcome this issue, we introduced TWIN-V2, an enhancement of +TWIN, where a divide-and-conquer approach is applied to compress life-cycle +behaviors and uncover more accurate and diverse user interests. Specifically, a +hierarchical clustering method groups items with similar characteristics in +life-cycle behaviors into a single cluster during the offline phase. By +limiting the size of clusters, we can compress behavior sequences well beyond +the magnitude of 10^5 to a length manageable for online inference in GSU +retrieval. Cluster-aware target attention extracts comprehensive and +multi-faceted long-term interests of users, thereby making the final +recommendation results more accurate and diverse. Extensive offline experiments +on a multi-billion-scale industrial dataset and online A/B tests have +demonstrated the effectiveness of TWIN-V2. Under an efficient deployment +framework, TWIN-V2 has been successfully deployed to the primary traffic that +serves hundreds of millions of daily active users at Kuaishou. + +
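The offline compression step described above can be illustrated as follows. TWIN-V2 uses hierarchical clustering over life-cycle behaviors; this sketch uses plain k-means purely for brevity, and the embedding dimensions and cluster budget are arbitrary.

```python
import numpy as np
from sklearn.cluster import KMeans

def compress_behavior_sequence(item_embs, max_clusters=1000):
    """Group a very long behavior sequence (one embedding per interacted item)
    into clusters of similar items and keep one representative per cluster,
    so a 10^5-10^6 length sequence shrinks to something retrievable online."""
    n_clusters = min(max_clusters, len(item_embs))
    km = KMeans(n_clusters=n_clusters, n_init=10).fit(item_embs)
    sizes = np.bincount(km.labels_, minlength=n_clusters)
    return km.cluster_centers_, sizes  # representatives + cluster sizes

reps, sizes = compress_behavior_sequence(np.random.rand(5000, 16), max_clusters=64)
print(reps.shape, sizes.sum())  # (64, 16) 5000
```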
+
+ comment: Accepted by CIKM 2024 +
+
+
+
+
+
+
+
+ + Machine Learning 131 + +
+
+
+ + ☆ Accelerating Giant Impact Simulations with Machine Learning + + +
+ Constraining planet formation models based on the observed exoplanet +population requires generating large samples of synthetic planetary systems, +which can be computationally prohibitive. A significant bottleneck is +simulating the giant impact phase, during which planetary embryos evolve +gravitationally and combine to form planets, which may themselves experience +later collisions. To accelerate giant impact simulations, we present a machine +learning (ML) approach to predicting collisional outcomes in multiplanet +systems. Trained on more than 500,000 $N$-body simulations of three-planet +systems, we develop an ML model that can accurately predict which two planets +will experience a collision, along with the state of the post-collision +planets, from a short integration of the system's initial conditions. Our model +greatly improves on non-ML baselines that rely on metrics from dynamics theory, +which struggle to accurately predict which pair of planets will experience a +collision. By combining with a model for predicting long-term stability, we +create an efficient ML-based giant impact emulator, which can predict the +outcomes of giant impact simulations with a speedup of up to four orders of +magnitude. We expect our model to enable analyses that would not otherwise be +computationally feasible. As such, we release our full training code, along +with an easy-to-use API for our collision outcome model and giant impact +emulator. + +
+
+ comment: 15 pages, 7 figures, 1 table. Easy-to-use API available at + https://github.com/dtamayo/spock +
+
+
+
+
+ + ☆ PEDAL: Enhancing Greedy Decoding with Large Language Models using + Diverse Exemplars + + +
+ Self-ensembling techniques with diverse reasoning paths, such as Self-Consistency, have demonstrated remarkable gains in accuracy for Large Language Models (LLMs). However, such techniques depend on the availability of an accurate answer extraction process to aggregate across multiple outputs. Moreover, they incur a higher inference cost than Greedy Decoding, due to the generation of a relatively larger number of output tokens. Research has shown that the free-form text outputs from Self-Consistency can be aggregated reliably using LLMs to produce the final output. Additionally, recent advancements in LLM inference have demonstrated that using diverse exemplars in prompts can induce diversity in the LLM outputs. Such proven techniques can be easily extended to self-ensembling based approaches to achieve enhanced results in text generation. In this paper, we introduce PEDAL (Prompts based on Exemplar Diversity Aggregated using LLMs), a hybrid self-ensembling approach that combines the strengths of diverse exemplar-based prompts and LLM-based aggregation to achieve an improvement in overall performance. On the publicly available SVAMP and ARC datasets, our experiments reveal that PEDAL can achieve better accuracy than Greedy Decoding based strategies with lower inference cost compared to Self-Consistency based approaches.
+ 
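A sketch of the PEDAL loop described above, with `llm_generate` and `llm_aggregate` as placeholders for whatever LLM API is available; the prompt formats and sampling of exemplars are illustrative only, not the paper's exact recipe.

```python
import random

def pedal(question, exemplar_pool, llm_generate, llm_aggregate, k=3, shots=4):
    """Build k prompts that differ only in which exemplars they contain,
    greedily decode one answer per prompt, then ask an LLM to aggregate the
    candidate answers into a final one."""
    candidates = []
    for _ in range(k):
        exemplars = random.sample(exemplar_pool, shots)   # diversity via exemplars
        prompt = "\n\n".join(exemplars + [question])
        candidates.append(llm_generate(prompt))           # greedy decoding per prompt
    aggregation_prompt = (
        "Given the question and candidate answers below, output the best final answer.\n"
        f"Question: {question}\nCandidates:\n" + "\n".join(candidates)
    )
    return llm_aggregate(aggregation_prompt)

# toy usage with stub LLM calls
answer = pedal("Q: 2+3=?", [f"Example {i}" for i in range(10)],
               llm_generate=lambda p: "5", llm_aggregate=lambda p: "5")
print(answer)
```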
+
+
+
+
+ + ☆ A Hassle-free Algorithm for Private Learning in Practice: Don't Use Tree + Aggregation, Use BLTs + + +
+ The state-of-the-art for training on-device language models for mobile keyboard applications combines federated learning (FL) with differential privacy (DP) via the DP-Follow-the-Regularized-Leader (DP-FTRL) algorithm. Two variants of DP-FTRL are used in practice: tree aggregation and matrix factorization. However, tree aggregation suffers from significantly suboptimal privacy/utility tradeoffs, while matrix mechanisms require expensive optimization parameterized by hard-to-estimate-in-advance constants, and high runtime memory costs. This paper extends the recently introduced Buffered Linear Toeplitz (BLT) mechanism to multi-participation scenarios. Our BLT-DP-FTRL maintains the ease-of-use advantages of tree aggregation, while essentially matching matrix factorization in terms of utility and privacy. We evaluate BLT-DP-FTRL on the StackOverflow dataset, serving as a reproducible simulation benchmark, and across four on-device language model tasks in a production FL system. Our empirical results highlight the advantages of the BLT mechanism and elevate the practicality and effectiveness of DP in real-world scenarios.
+ 
+
+
+
+
+ + ☆ Visual Agents as Fast and Slow Thinkers + + +
+ Achieving human-level intelligence requires refining cognitive distinctions between System 1 and System 2 thinking. While contemporary AI, driven by large language models, demonstrates human-like traits, it falls short of genuine cognition. Transitioning from structured benchmarks to real-world scenarios presents challenges for visual agents, often leading to inaccurate and overly confident responses. To address this challenge, we introduce FaST, which incorporates the Fast and Slow Thinking mechanism into visual agents. FaST employs a switch adapter to dynamically select between System 1/2 modes, tailoring the problem-solving approach to different task complexities. It tackles uncertain and unseen objects by adjusting model confidence and integrating new contextual data. With this novel design, we advocate a flexible system, hierarchical reasoning capabilities, and a transparent decision-making pipeline, all of which contribute to its ability to emulate human-like cognitive processes in visual intelligence. Empirical results demonstrate that FaST outperforms various well-known baselines, achieving 80.8% accuracy on VQA^{v2} for visual question answering and a 48.7% GIoU score on ReasonSeg for reasoning segmentation. Extensive testing validates the efficacy and robustness of FaST's core components, showcasing its potential to advance the development of cognitive visual agents in AI systems.
+ 
+
+
+
+
+ + ☆ Stochastic Bandits Robust to Adversarial Attacks + + +
+ This paper investigates stochastic multi-armed bandit algorithms that are +robust to adversarial attacks, where an attacker can first observe the +learner's action and {then} alter their reward observation. We study two cases +of this model, with or without the knowledge of an attack budget $C$, defined +as an upper bound of the summation of the difference between the actual and +altered rewards. For both cases, we devise two types of algorithms with regret +bounds having additive or multiplicative $C$ dependence terms. For the known +attack budget case, we prove our algorithms achieve the regret bound of +${O}((K/\Delta)\log T + KC)$ and $\tilde{O}(\sqrt{KTC})$ for the additive and +multiplicative $C$ terms, respectively, where $K$ is the number of arms, $T$ is +the time horizon, $\Delta$ is the gap between the expected rewards of the +optimal arm and the second-best arm, and $\tilde{O}$ hides the logarithmic +factors. For the unknown case, we prove our algorithms achieve the regret bound +of $\tilde{O}(\sqrt{KT} + KC^2)$ and $\tilde{O}(KC\sqrt{T})$ for the additive +and multiplicative $C$ terms, respectively. In addition to these upper bound +results, we provide several lower bounds showing the tightness of our bounds +and the optimality of our algorithms. These results delineate an intrinsic +separation between the bandits with attacks and corruption models [Lykouris et +al., 2018]. + +
+
+
+
+
+ + ☆ GeoTransformer: Enhancing Urban Forecasting with Geospatial Attention + Mechanisms + + +
+ Recent advancements have focused on encoding urban spatial information into +high-dimensional spaces, with notable efforts dedicated to integrating +sociodemographic data and satellite imagery. These efforts have established +foundational models in this field. However, the effective utilization of these +spatial representations for urban forecasting applications remains +under-explored. To address this gap, we introduce GeoTransformer, a novel +structure that synergizes the Transformer architecture with geospatial +statistics prior. GeoTransformer employs an innovative geospatial attention +mechanism to incorporate extensive urban information and spatial dependencies +into a unified predictive model. Specifically, we compute geospatial weighted +attention scores between the target region and surrounding regions and leverage +the integrated urban information for predictions. Extensive experiments on GDP +and ride-share demand prediction tasks demonstrate that GeoTransformer +significantly outperforms existing baseline models, showcasing its potential to +enhance urban forecasting tasks. + +
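One plausible reading of the geospatial weighted attention described above is ordinary scaled dot-product attention whose scores are modulated by a distance-decay kernel between the target region and its surrounding regions. The Gaussian kernel and additive log-weighting below are assumptions of this sketch, not GeoTransformer's exact formulation.

```python
import torch
import torch.nn.functional as F

def geospatial_attention(q, k, v, distances, bandwidth=5.0):
    """Scaled dot-product attention over regions, re-weighted so that spatially
    closer regions contribute more. `distances` holds pairwise region
    distances (e.g., in km) between the target and surrounding regions."""
    d = q.size(-1)
    scores = q @ k.transpose(-2, -1) / d ** 0.5                 # (regions, regions)
    spatial_weight = torch.exp(-(distances / bandwidth) ** 2)   # Gaussian distance decay
    scores = scores + torch.log(spatial_weight + 1e-9)          # multiply softmax weights
    return F.softmax(scores, dim=-1) @ v

regions, dim = 6, 32
x = torch.randn(regions, dim)
dist = torch.rand(regions, regions) * 10
print(geospatial_attention(x, x, x, dist).shape)  # torch.Size([6, 32])
```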
+
+
+
+
+ + ☆ HistoGym: A Reinforcement Learning Environment for Histopathological + Image Analysis + + +
+ In pathological research, education, and clinical practice, the +decision-making process based on pathological images is critically important. +This significance extends to digital pathology image analysis: its adequacy is +demonstrated by the extensive information contained within tissue structures, +which is essential for accurate cancer classification and grading. +Additionally, its necessity is highlighted by the inherent requirement for +interpretability in the conclusions generated by algorithms. For humans, +determining tumor type and grade typically involves multi-scale analysis, which +presents a significant challenge for AI algorithms. Traditional patch-based +methods are inadequate for modeling such complex structures, as they fail to +capture the intricate, multi-scale information inherent in whole slide images. +Consequently, there is a pressing need for advanced AI techniques capable of +efficiently and accurately replicating this complex analytical process. To +address this issue, we introduce HistoGym, an open-source reinforcement +learning environment for histopathological image analysis. Following OpenAI Gym +APIs, HistoGym aims to foster whole slide image diagnosis by mimicking the +real-life processes of doctors. Leveraging the pyramid feature of WSIs and the +OpenSlide API, HistoGym provides a unified framework for various clinical +tasks, including tumor detection and classification. We detail the observation, +action, and reward specifications tailored for the histopathological image +analysis domain and provide an open-source Python-based interface for both +clinicians and researchers. To accommodate different clinical demands, we offer +various scenarios for different organs and cancers, including both WSI-based +and selected region-based scenarios, showcasing several noteworthy results. + +
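Since HistoGym follows the OpenAI Gym API, interacting with it should look roughly like the sketch below. The environment id, the contents of the observation, and the exact step signature are hypothetical placeholders here; consult the HistoGym repository for the actually registered environments and spaces.

```python
import gym  # HistoGym follows the OpenAI Gym API

# Hypothetical environment id; the real registered names are defined by HistoGym.
env = gym.make("HistoGym-TumorDetection-v0")

obs = env.reset()                       # e.g., a view into the WSI pyramid
done, total_reward = False, 0.0
while not done:
    action = env.action_space.sample()  # random policy over pan/zoom/classify actions
    obs, reward, done, info = env.step(action)  # classic 4-tuple Gym API
    total_reward += reward
env.close()
print("episode return:", total_reward)
```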
+
+
+
+
+ + ☆ Shapley Marginal Surplus for Strong Models + + +
+ Shapley values have seen widespread use in machine learning as a way to explain model predictions and estimate the importance of covariates. Accurately explaining models is critical in real-world applications, both to aid in decision making and to infer the properties of the true data-generating process (DGP). In this paper, we demonstrate that while model-based Shapley values might be accurate explainers of model predictions, machine learning models themselves are often poor explainers of the DGP even if the model is highly accurate. Particularly in the presence of interrelated or noisy variables, the output of a highly predictive model may fail to account for these relationships. This implies explanations of a trained model's behavior may fail to provide meaningful insight into the DGP. In this paper we introduce a novel variable importance algorithm, Shapley Marginal Surplus for Strong Models, that samples the space of possible models to come up with an inferential measure of feature importance. We compare this method to other popular feature importance methods, both Shapley-based and non-Shapley-based, and demonstrate significant outperformance in inferential capabilities relative to other methods.
+ 
+
+
+
+
+ + ☆ Entropy Coding of Unordered Data Structures ICLR 2024 + + +
+ We present shuffle coding, a general method for optimal compression of +sequences of unordered objects using bits-back coding. Data structures that can +be compressed using shuffle coding include multisets, graphs, hypergraphs, and +others. We release an implementation that can easily be adapted to different +data types and statistical models, and demonstrate that our implementation +achieves state-of-the-art compression rates on a range of graph datasets +including molecular data. + +
+
+ comment: Published at ICLR 2024 +
+
+
+
+
+ + ☆ LEVIS: Large Exact Verifiable Input Spaces for Neural Networks + + +
+ The robustness of neural networks is paramount in safety-critical +applications. While most current robustness verification methods assess the +worst-case output under the assumption that the input space is known, +identifying a verifiable input space $\mathcal{C}$, where no adversarial +examples exist, is crucial for effective model selection, robustness +evaluation, and the development of reliable control strategies. To address this +challenge, we introduce a novel framework, $\texttt{LEVIS}$, comprising +$\texttt{LEVIS}$-$\alpha$ and $\texttt{LEVIS}$-$\beta$. +$\texttt{LEVIS}$-$\alpha$ locates the largest possible verifiable ball within +the central region of $\mathcal{C}$ that intersects at least two boundaries. In +contrast, $\texttt{LEVIS}$-$\beta$ integrates multiple verifiable balls to +encapsulate the entirety of the verifiable space comprehensively. Our +contributions are threefold: (1) We propose $\texttt{LEVIS}$ equipped with +three pioneering techniques that identify the maximum verifiable ball and the +nearest adversarial point along collinear or orthogonal directions. (2) We +offer a theoretical analysis elucidating the properties of the verifiable balls +acquired through $\texttt{LEVIS}$-$\alpha$ and $\texttt{LEVIS}$-$\beta$. (3) We +validate our methodology across diverse applications, including electrical +power flow regression and image classification, showcasing performance +enhancements and visualizations of the searching characteristics. + +
+
+
+
+
+ + ☆ Optimal Symmetries in Binary Classification + + +
+ We explore the role of group symmetries in binary classification tasks, +presenting a novel framework that leverages the principles of Neyman-Pearson +optimality. Contrary to the common intuition that larger symmetry groups lead +to improved classification performance, our findings show that selecting the +appropriate group symmetries is crucial for optimising generalisation and +sample efficiency. We develop a theoretical foundation for designing group +equivariant neural networks that align the choice of symmetries with the +underlying probability distributions of the data. Our approach provides a +unified methodology for improving classification accuracy across a broad range +of applications by carefully tailoring the symmetry group to the specific +characteristics of the problem. Theoretical analysis and experimental results +demonstrate that optimal classification performance is not always associated +with the largest equivariant groups possible in the domain, even when the +likelihood ratio is invariant under one of its proper subgroups, but rather +with those subgroups themselves. This work offers insights and practical +guidelines for constructing more effective group equivariant architectures in +diverse machine-learning contexts. + +
+
+ comment: 13 pages, 1 figure, 2 tables +
+
+
+
+
+ + ☆ An Empirical Examination of Balancing Strategy for Counterfactual + Estimation on Time Series ICML 2024 + + +
+ Counterfactual estimation from observations represents a critical endeavor in +numerous application fields, such as healthcare and finance, with the primary +challenge being the mitigation of treatment bias. The balancing strategy aimed +at reducing covariate disparities between different treatment groups serves as +a universal solution. However, when it comes to the time series data, the +effectiveness of balancing strategies remains an open question, with a thorough +analysis of the robustness and applicability of balancing strategies still +lacking. This paper revisits counterfactual estimation in the temporal setting +and provides a brief overview of recent advancements in balancing strategies. +More importantly, we conduct a critical empirical examination for the +effectiveness of the balancing strategies within the realm of temporal +counterfactual estimation in various settings on multiple datasets. Our +findings could be of significant interest to researchers and practitioners and +call for a reexamination of the balancing strategy in time series settings. + +
+
+ comment: ICML 2024 Camera Ready Version. 20 Pages, 12 Figures, 10 Tables +
+
+
+
+
+ + ☆ CAT: Caution Aware Transfer in Reinforcement Learning via Distributional + Risk + + +
+ Transfer learning in reinforcement learning (RL) has become a pivotal strategy for improving data efficiency in new, unseen tasks by utilizing knowledge from previously learned tasks. This approach is especially beneficial in real-world deployment scenarios where computational resources are constrained and agents must adapt rapidly to novel environments. However, current state-of-the-art methods often fall short in ensuring safety during the transfer process, particularly when unforeseen risks emerge in the deployment phase. In this work, we address these limitations by introducing a novel Caution-Aware Transfer Learning (CAT) framework. Unlike traditional approaches that limit risk considerations to mean-variance, we define "caution" as a more generalized and comprehensive notion of risk. Our core innovation lies in optimizing, during the transfer process, a weighted sum of the reward return and a caution term defined on state-action occupancy measures, allowing for a rich representation of diverse risk factors. To the best of our knowledge, this is the first work to explore the optimization of such a generalized risk notion within the context of transfer RL. Our contributions are threefold: (1) We propose a Caution-Aware Transfer (CAT) framework that evaluates source policies within the test environment and constructs a new policy that balances reward maximization and caution. (2) We derive theoretical sub-optimality bounds for our method, providing rigorous guarantees of its efficacy. (3) We empirically validate CAT, demonstrating that it consistently outperforms existing methods by delivering safer policies under varying risk conditions in the test tasks.
+ 
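The transfer objective sketched in the abstract can be written, in notation chosen here rather than taken from the paper, as:

```latex
\max_{\pi} \; J(\pi) \;=\;
\mathbb{E}_{\pi}\!\Big[\textstyle\sum_{t=0}^{\infty} \gamma^{t} r(s_t, a_t)\Big]
\;-\; \lambda \, C\!\big(d_{\pi}\big),
\qquad
d_{\pi}(s,a) \;=\; (1-\gamma) \sum_{t=0}^{\infty} \gamma^{t}\,
\Pr\nolimits_{\pi}\!\big(s_t = s,\, a_t = a\big),
```

where $d_{\pi}$ is the discounted state-action occupancy measure, $C$ is a caution functional (for example, a distributional risk measure) evaluated on $d_{\pi}$, and $\lambda$ trades off reward against caution.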
+
+
+
+
+ + ☆ Constructing Domain-Specific Evaluation Sets for LLM-as-a-judge + + +
+ Large Language Models (LLMs) have revolutionized the landscape of machine learning, yet current benchmarks often fall short in capturing the diverse behavior of these models in real-world applications. A benchmark's usefulness is determined by its ability to clearly differentiate between models of varying capabilities (separability) and closely align with human preferences. Existing frameworks like Alpaca-Eval 2.0 LC \cite{dubois2024lengthcontrolledalpacaevalsimpleway} and Arena-Hard v0.1 \cite{li2024crowdsourced} are limited by their focus on general-purpose queries and lack of diversity across domains such as law, medicine, and multilingual contexts. In this paper, we address these limitations by introducing a novel data pipeline that curates diverse, domain-specific evaluation sets tailored for LLM-as-a-Judge frameworks. Our approach leverages a combination of manual curation, semi-supervised learning to generate clusters, and stratified sampling to ensure balanced representation across a wide range of domains and languages. The resulting evaluation set, which includes 1573 samples across 14 categories, demonstrates high separability (84\%) across ten top-ranked models, 84\% agreement with Chatbot Arena, and a Spearman correlation of 0.915. The agreement values are 9\% better than Arena Hard and 20\% better than AlpacaEval 2.0 LC, while the Spearman coefficient is 0.7 higher than the next best benchmark, showcasing a significant improvement in the usefulness of the benchmark. We further provide an open-source evaluation tool that enables fine-grained analysis of model performance across user-defined categories, offering valuable insights for practitioners. This work contributes to the ongoing effort to enhance the transparency, diversity, and effectiveness of LLM evaluation methodologies.
+ 
+
+ comment: 14 pages, 8 figures +
+
+
+
+
+ + ☆ Representation Learning of Geometric Trees + + +
+ Geometric trees are characterized by their tree-structured layout and +spatially constrained nodes and edges, which significantly impacts their +topological attributes. This inherent hierarchical structure plays a crucial +role in domains such as neuron morphology and river geomorphology, but +traditional graph representation methods often overlook these specific +characteristics of tree structures. To address this, we introduce a new +representation learning framework tailored for geometric trees. It first +features a unique message passing neural network, which is both provably +geometrical structure-recoverable and rotation-translation invariant. To +address the data label scarcity issue, our approach also includes two +innovative training targets that reflect the hierarchical ordering and +geometric structure of these geometric trees. This enables fully +self-supervised learning without explicit labels. We validate our method's +effectiveness on eight real-world datasets, demonstrating its capability to +represent geometric trees. + +
+
+
+
+
+ + ☆ Neighbor Overlay-Induced Graph Attention Network + + +
+ Graph neural networks (GNNs) have garnered significant attention due to their +ability to represent graph data. Among various GNN variants, graph attention +network (GAT) stands out since it is able to dynamically learn the importance +of different nodes. However, present GATs heavily rely on the smoothed node +features to obtain the attention coefficients rather than graph structural +information, which fails to provide crucial contextual cues for node +representations. To address this issue, this study proposes a neighbor +overlay-induced graph attention network (NO-GAT) with the following two-fold +ideas: a) learning favorable structural information, i.e., overlaid neighbors, +outside the node feature propagation process from an adjacency matrix; b) +injecting the information of overlaid neighbors into the node feature +propagation process to compute the attention coefficient jointly. Empirical +studies on graph benchmark datasets indicate that the proposed NO-GAT +consistently outperforms state-of-the-art models. + +
+
+
+
+
+ + ☆ A Transparency Paradox? Investigating the Impact of Explanation + Specificity and Autonomous Vehicle Perceptual Inaccuracies on Passengers + + +
+ Transparency in automated systems could be afforded through the provision of +intelligible explanations. While transparency is desirable, might it lead to +catastrophic outcomes (such as anxiety), that could outweigh its benefits? It's +quite unclear how the specificity of explanations (level of transparency) +influences recipients, especially in autonomous driving (AD). In this work, we +examined the effects of transparency mediated through varying levels of +explanation specificity in AD. We first extended a data-driven explainer model +by adding a rule-based option for explanation generation in AD, and then +conducted a within-subject lab study with 39 participants in an immersive +driving simulator to study the effect of the resulting explanations. +Specifically, our investigation focused on: (1) how different types of +explanations (specific vs. abstract) affect passengers' perceived safety, +anxiety, and willingness to take control of the vehicle when the vehicle +perception system makes erroneous predictions; and (2) the relationship between +passengers' behavioural cues and their feelings during the autonomous drives. +Our findings showed that passengers felt safer with specific explanations when +the vehicle's perception system had minimal errors, while abstract explanations +that hid perception errors led to lower feelings of safety. Anxiety levels +increased when specific explanations revealed perception system errors (high +transparency). We found no significant link between passengers' visual patterns +and their anxiety levels. Our study suggests that passengers prefer clear and +specific explanations (high transparency) when they originate from autonomous +vehicles (AVs) with optimal perceptual accuracy. + +
+
+ comment: Submitted to Transportation Research Part F: Traffic Psychology and + Behaviour. arXiv admin note: text overlap with arXiv:2307.00633 +
+
+
+
+
+ + ☆ NEAR: A Training-Free Pre-Estimator of Machine Learning Model + Performance + + +
+ Artificial neural networks have been shown to be state-of-the-art machine +learning models in a wide variety of applications, including natural language +processing and image recognition. However, building a performant neural network +is a laborious task and requires substantial computing power. Neural +Architecture Search (NAS) addresses this issue by an automatic selection of the +optimal network from a set of potential candidates. While many NAS methods +still require training of (some) neural networks, zero-cost proxies promise to +identify the optimal network without training. In this work, we propose the +zero-cost proxy Network Expressivity by Activation Rank (NEAR). It is based on +the effective rank of the pre- and post-activation matrix, i.e., the values of +a neural network layer before and after applying its activation function. We +demonstrate the cutting-edge correlation between this network score and the +model accuracy on NAS-Bench-101 and NATS-Bench-SSS/TSS. In addition, we present +a simple approach to estimate the optimal layer sizes in multi-layer +perceptrons. Furthermore, we show that this score can be utilized to select +hyperparameters such as the activation function and the neural network weight +initialization scheme. + +
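The per-layer ingredient of NEAR, the effective rank of the pre- and post-activation matrices, can be computed as below (effective rank in the sense of Roy and Vetterli, 2007: the exponential of the entropy of the normalized singular values). How NEAR combines per-layer scores into a network-level score and uses it for layer sizing and hyperparameter selection is described in the paper; the layer and mini-batch here are synthetic.

```python
import numpy as np

def effective_rank(matrix, eps=1e-12):
    """Effective rank: exp of the Shannon entropy of the normalized singular values."""
    s = np.linalg.svd(matrix, compute_uv=False)
    p = s / (s.sum() + eps)
    entropy = -np.sum(p * np.log(p + eps))
    return float(np.exp(entropy))

# Sketch of a per-layer score: feed a mini-batch through one dense layer and
# sum the effective ranks of the pre- and post-activation matrices.
rng = np.random.default_rng(0)
x = rng.standard_normal((128, 64))            # mini-batch of inputs
w, b = rng.standard_normal((64, 32)), rng.standard_normal(32)
pre_act = x @ w + b
post_act = np.maximum(pre_act, 0.0)           # ReLU activation
score = effective_rank(pre_act) + effective_rank(post_act)
print(round(score, 2))
```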
+
+ comment: 12 pages, 4 figures, 10 tables +
+
+
+
+
+ + ☆ Speckle Noise Analysis for Synthetic Aperture Radar (SAR) Space Data + + +
+ This research tackles the challenge of speckle noise in Synthetic Aperture +Radar (SAR) space data, a prevalent issue that hampers the clarity and utility +of SAR images. The study presents a comparative analysis of six distinct +speckle noise reduction techniques: Lee Filtering, Frost Filtering, Kuan +Filtering, Gaussian Filtering, Median Filtering, and Bilateral Filtering. These +methods, selected for their unique approaches to noise reduction and image +preservation, were applied to SAR datasets sourced from the Alaska Satellite +Facility (ASF). The performance of each technique was evaluated using a +comprehensive set of metrics, including Peak Signal-to-Noise Ratio (PSNR), Mean +Squared Error (MSE), Structural Similarity Index (SSIM), Equivalent Number of +Looks (ENL), and Speckle Suppression Index (SSI). The study concludes that both +the Lee and Kuan Filters are effective, with the choice of filter depending on +the specific application requirements for image quality and noise suppression. +This work provides valuable insights into optimizing SAR image processing, with +significant implications for remote sensing, environmental monitoring, and +geological surveying. + +
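As a concrete example of the filters compared above, a basic Lee filter and a PSNR check can be sketched as follows. The window size, the crude global noise-variance estimate, and the synthetic multiplicative speckle model are illustrative choices, not the exact settings of the study.

```python
import numpy as np
from scipy.ndimage import uniform_filter

def lee_filter(img, size=7):
    """Classic Lee speckle filter: blend each pixel with its local mean,
    weighted by the ratio of local variance to an overall noise variance."""
    img = img.astype(np.float64)
    mean = uniform_filter(img, size)
    sq_mean = uniform_filter(img ** 2, size)
    var = np.maximum(sq_mean - mean ** 2, 0.0)    # local variance
    noise_var = var.mean()                        # crude global noise estimate
    weight = var / (var + noise_var + 1e-12)
    return mean + weight * (img - mean)

def psnr(reference, estimate, data_range=255.0):
    mse = np.mean((reference.astype(np.float64) - estimate) ** 2)
    return 10 * np.log10(data_range ** 2 / (mse + 1e-12))

clean = np.tile(np.linspace(50, 200, 128), (128, 1))              # toy "scene"
speckled = clean * np.random.gamma(shape=4.0, scale=0.25, size=clean.shape)
print(round(psnr(clean, speckled), 2), round(psnr(clean, lee_filter(speckled)), 2))
```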
+
+
+
+
+ + ☆ Pessimistic Iterative Planning for Robust POMDPs + + +
+ Robust partially observable Markov decision processes (robust POMDPs) extend +classical POMDPs to handle additional uncertainty on the transition and +observation probabilities via so-called uncertainty sets. Policies for robust +POMDPs must not only be memory-based to account for partial observability but +also robust against model uncertainty to account for the worst-case instances +from the uncertainty sets. We propose the pessimistic iterative planning (PIP) +framework, which finds robust memory-based policies for robust POMDPs. PIP +alternates between two main steps: (1) selecting an adversarial (non-robust) +POMDP via worst-case probability instances from the uncertainty sets; and (2) +computing a finite-state controller (FSC) for this adversarial POMDP. We +evaluate the performance of this FSC on the original robust POMDP and use this +evaluation in step (1) to select the next adversarial POMDP. Within PIP, we +propose the rFSCNet algorithm. In each iteration, rFSCNet finds an FSC through +a recurrent neural network trained using supervision policies optimized for the +adversarial POMDP. The empirical evaluation in four benchmark environments +showcases improved robustness against a baseline method in an ablation study +and competitive performance compared to a state-of-the-art robust POMDP solver. + +
+
+
+
+
+ + ☆ SYMPOL: Symbolic Tree-Based On-Policy Reinforcement Learning + + +
+ Reinforcement learning (RL) has seen significant success across various domains, but its adoption is often limited by the black-box nature of neural network policies, making them difficult to interpret. In contrast, symbolic policies allow representing decision-making strategies in a compact and interpretable way. However, learning symbolic policies directly within on-policy methods remains challenging. In this paper, we introduce SYMPOL, a novel method for SYMbolic tree-based on-POLicy RL. SYMPOL employs a tree-based model integrated with a policy gradient method, enabling the agent to learn and adapt its actions while maintaining a high level of interpretability. We evaluate SYMPOL on a set of benchmark RL tasks, demonstrating its superiority over alternative tree-based RL approaches in terms of performance and interpretability. To the best of our knowledge, this is the first method that allows gradient-based, end-to-end, on-policy learning of interpretable, axis-aligned decision trees. Therefore, SYMPOL can become the foundation for a new class of interpretable RL based on decision trees. Our implementation is available at: https://github.com/s-marton/SYMPOL
+ 
+
+
+
+
+ + ☆ SE-SGformer: A Self-Explainable Signed Graph Transformer for Link Sign + Prediction + + +
+ Signed Graph Neural Networks (SGNNs) have been shown to be effective in analyzing complex patterns in real-world situations where positive and negative links coexist. However, SGNN models suffer from poor explainability, which limits their adoption in critical scenarios that require understanding the rationale behind predictions. To the best of our knowledge, there is currently no research work on the explainability of SGNN models. Our goal is to address the explainability of decision-making for the downstream task of link sign prediction specific to signed graph neural networks. Since post-hoc explanations are not derived directly from the models, they may be biased and misrepresent the true explanations. Therefore, in this paper we introduce a Self-Explainable Signed Graph transformer (SE-SGformer) framework, which not only outputs explainable information but also ensures high prediction accuracy. Specifically, we propose a new Transformer architecture for signed graphs and theoretically demonstrate that using positional encoding based on signed random walks has greater expressive power than current SGNN methods and other positional encoding graph Transformer-based approaches. We construct a novel explainable decision process by discovering the $K$-nearest (farthest) positive (negative) neighbors of a node to replace the neural network-based decoder for predicting edge signs. These $K$ positive (negative) neighbors represent crucial information about the formation of positive (negative) edges between nodes and thus can serve as important explanatory information in the decision-making process. We conducted experiments on several real-world datasets to validate the effectiveness of SE-SGformer, which outperforms the state-of-the-art methods, improving prediction accuracy by 2.2\% and explainability accuracy by 73.1\% in the best-case scenario.
+ 
+
+
+
+
+ + ☆ ML Study of Malicious Transactions in Ethereum + + +
+ Smart contracts are a major tool in Ethereum transactions. Therefore, hackers can exploit them by adding code vulnerabilities to their sources and using these vulnerabilities to perform malicious transactions. This paper presents two successful approaches for detecting malicious contracts: one uses opcodes and relies on GPT-2, while the other uses the Solidity source and a LoRA fine-tuned CodeLlama. Finally, we present an XGBoost model that combines gas properties and hexadecimal signatures for detecting malicious transactions. This approach relies on the early assumption that maliciousness is manifested in uncommon usage of the contracts' functions and in the effort spent to pursue the transaction.
+ 
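The final detector described above can be sketched as an XGBoost classifier over gas-related and signature-derived features. The feature layout and labels below are synthetic placeholders invented for illustration, not the paper's feature engineering or data.

```python
import numpy as np
from xgboost import XGBClassifier

# Hypothetical feature layout inspired by the abstract: gas-related properties
# plus a few indicator features derived from the transaction's 4-byte
# hexadecimal function selector. Real feature engineering will differ.
rng = np.random.default_rng(0)
n = 1000
X = np.column_stack([
    rng.uniform(21_000, 1_000_000, n),   # gas limit
    rng.uniform(1, 500, n),              # gas price (gwei)
    rng.uniform(0, 1, n),                # gas used / gas limit ratio
    rng.integers(0, 2, (n, 4)),          # toy indicator buckets of the selector
])
y = rng.integers(0, 2, n)                # 1 = malicious, 0 = benign (synthetic labels)

clf = XGBClassifier(n_estimators=200, max_depth=4, learning_rate=0.1)
clf.fit(X[:800], y[:800])
print("held-out accuracy:", clf.score(X[800:], y[800:]))
```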
+
+
+
+
+ + ☆ Beyond KAN: Introducing KarSein for Adaptive High-Order Feature + Interaction Modeling in CTR Prediction + + +
+ Modeling feature interactions is crucial for click-through rate (CTR) +prediction, particularly when it comes to high-order explicit interactions. +Traditional methods struggle with this task because they often predefine a +maximum interaction order, which relies heavily on prior knowledge and can +limit the model's effectiveness. Additionally, modeling high-order interactions +typically leads to increased computational costs. Therefore, the challenge lies +in adaptively modeling high-order feature interactions while maintaining +efficiency. To address this issue, we introduce Kolmogorov-Arnold Represented +Sparse Efficient Interaction Network (KarSein), designed to optimize both +predictive accuracy and computational efficiency. We firstly identify +limitations of directly applying Kolmogorov-Arnold Networks (KAN) to CTR and +then introduce KarSein to overcome these issues. It features a novel +architecture that reduces the computational costs of KAN and supports embedding +vectors as feature inputs. Additionally, KarSein employs guided symbolic +regression to address the challenge of KAN in spontaneously learning +multiplicative relationships. Extensive experiments demonstrate KarSein's +superior performance, achieving significant predictive accuracy with minimal +computational overhead. Furthermore, KarSein maintains strong global +explainability while enabling the removal of redundant features, resulting in a +sparse network structure. These advantages also position KarSein as a promising +method for efficient inference. + +
+
+ comment: KarSein for CTR +
+
+
+
+
+ + ☆ Beam Prediction based on Large Language Models + + +
+ Millimeter-wave (mmWave) communication is promising for next-generation wireless networks but suffers from significant path loss, requiring extensive antenna arrays and frequent beam training. Traditional deep learning models, such as long short-term memory (LSTM), enhance beam tracking accuracy; however, they are limited by poor robustness and generalization. In this letter, we use large language models (LLMs) to improve the robustness of beam prediction. By converting time series data into text-based representations and employing the Prompt-as-Prefix (PaP) technique for contextual enrichment, our approach unleashes the strength of LLMs for time series forecasting. Simulation results demonstrate that our LLM-based method offers superior robustness and generalization compared to LSTM-based models, showcasing the potential of LLMs in wireless communications.
+ 
+
+
+
+
+ + ☆ Efficient Multi-Policy Evaluation for Reinforcement Learning + + +
+ To unbiasedly evaluate multiple target policies, the dominant approach among +RL practitioners is to run and evaluate each target policy separately. However, +this evaluation method is far from efficient because samples are not shared +across policies, and running target policies to evaluate themselves is actually +not optimal. In this paper, we address these two weaknesses by designing a +tailored behavior policy to reduce the variance of estimators across all target +policies. Theoretically, we prove that executing this behavior policy with +manyfold fewer samples outperforms on-policy evaluation on every target policy +under characterized conditions. Empirically, we show our estimator has a +substantially lower variance compared with previous best methods and achieves +state-of-the-art performance in a broad range of environments. + +
+
+
+
+
+ + ☆ RBLA: Rank-Based-LoRA-Aggregation for Fine-tuning Heterogeneous Models + in FLaaS + + +
+ Federated Learning (FL) is a promising privacy-aware distributed learning +framework that can be deployed on various devices, such as mobile phones, +desktops, and devices equipped with CPUs or GPUs. In the context of +server-based Federated Learning as a Service (FLaas), FL enables the central +server to coordinate the training process across multiple devices without +direct access to the local data, thereby enhancing privacy and data security. +Low-Rank Adaptation (LoRA) is a method that fine-tunes models efficiently by +focusing on a low-dimensional subspace of the model's parameters. This approach +significantly reduces computational and memory costs compared to fine-tuning +all parameters from scratch. When integrated with FL, especially in a FLaas +environment, LoRA allows for flexible and efficient deployment across diverse +hardware with varying computational capabilities by adjusting the local model's +rank. However, in LoRA-enabled FL, different clients may train models with +varying ranks, which poses a challenge for model aggregation on the server. +Current methods of aggregating models of different ranks require padding +weights to a uniform shape, which can degrade the global model's performance. +To address this issue, we propose Rank-Based LoRA Aggregation (RBLA), a novel +model aggregation method designed for heterogeneous LoRA structures. RBLA +preserves key features across models with different ranks. This paper analyzes +the issues with current padding methods that reshape models for aggregation in +a FLaas environment. Then, we introduce RBLA, a rank-based aggregation method +that maintains both low-rank and high-rank features. Finally, we demonstrate +the effectiveness of RBLA through comparative experiments with state-of-the-art +methods. + +
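The aggregation problem described above (clients submit LoRA factors of different ranks) can be illustrated with a rank-aware averaging sketch. This is not RBLA's exact rule, which the paper defines; it only shows one way to avoid naive zero-padding by averaging each rank slice over the clients that actually contain it.

```python
import numpy as np

def rank_based_aggregate(lora_As, weights=None):
    """Aggregate LoRA 'A' factors of shape (rank_i, d) from clients with
    heterogeneous ranks. Instead of zero-padding everything to the maximum
    rank and averaging (which dilutes high-rank rows), each rank slice is
    averaged only over the clients that actually have it."""
    d = lora_As[0].shape[1]
    max_rank = max(a.shape[0] for a in lora_As)
    weights = weights or [1.0] * len(lora_As)
    agg = np.zeros((max_rank, d))
    for r in range(max_rank):
        contributors = [(a[r], w) for a, w in zip(lora_As, weights) if a.shape[0] > r]
        total_w = sum(w for _, w in contributors)
        agg[r] = sum(row * w for row, w in contributors) / total_w
    return agg

clients = [np.random.rand(4, 16), np.random.rand(8, 16), np.random.rand(2, 16)]
print(rank_based_aggregate(clients).shape)  # (8, 16)
```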
+
+
+
+
+ + ☆ Turning Trash into Treasure: Accelerating Inference of Large Language + Models with Token Recycling + + +
+ The rapid growth in the parameters of large language models (LLMs) has made
+inference latency a fundamental bottleneck, limiting broader application of
+LLMs. Speculative decoding represents a lossless approach to accelerate
+inference through a guess-and-verify paradigm, leveraging the parallel
+capabilities of modern hardware. Some speculative decoding methods rely on
+additional structures to guess draft tokens, such as small models or
+parameter-efficient architectures, which need extra training before use.
+Alternatively, retrieval-based train-free techniques build libraries from
+pre-existing corpora or by n-gram generation. However, they face challenges
+like large storage requirements, time-consuming retrieval, and limited
+adaptability. Observing that candidate tokens generated during the decoding
+process are likely to reoccur in future sequences, we propose Token Recycling.
+This approach stores candidate tokens in an adjacency matrix and employs a
+breadth-first search (BFS)-like algorithm on the matrix to construct a draft
+tree. The tree is then validated through tree attention. New candidate tokens
+from the decoding process are then used to update the matrix. Token Recycling
+requires <2 MB of additional storage and achieves approximately 2x speedup
+across all sizes of LLMs. It significantly outperforms existing train-free
+methods by 30% and even a training method by 25%. It can be directly applied
+to any existing LLMs and tasks without the need for adaptation.
+
+ comment: under review +
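+ For the Token Recycling approach above, the toy sketch below shows the two
+ingredients the abstract names: an adjacency structure from tokens to recently
+seen candidate continuations (here a dict of lists rather than a dense matrix)
+and a BFS that turns it into a small draft. Verification by tree attention
+against the target LLM is omitted, and all identifiers are illustrative.
+
+```python
+# Toy sketch of the Token Recycling idea: recycle previously seen candidate
+# continuations to build a draft, which a target LLM would then verify.
+from collections import deque
+
+class RecyclingTable:
+    def __init__(self, branch=3):
+        self.branch = branch
+        self.next_tokens = {}          # token id -> recent candidate continuations
+
+    def update(self, token, candidates):
+        # Keep only the latest `branch` candidate continuations per token.
+        self.next_tokens[token] = list(candidates)[: self.branch]
+
+    def draft(self, root, max_nodes=7):
+        # BFS over the adjacency structure to collect a small set of draft tokens.
+        drafted, queue, seen = [], deque([root]), {root}
+        while queue and len(drafted) < max_nodes:
+            tok = queue.popleft()
+            for cand in self.next_tokens.get(tok, []):
+                if cand not in seen:
+                    drafted.append(cand)
+                    seen.add(cand)
+                    queue.append(cand)
+        return drafted
+
+if __name__ == "__main__":
+    table = RecyclingTable()
+    table.update(11, [42, 7, 99])   # candidates observed while decoding token 11
+    table.update(42, [13, 11])
+    print(table.draft(root=11))     # draft tokens to be verified by the target LLM
+```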
+
+
+
+
+ + ☆ Explore-then-Commit Algorithms for Decentralized Two-Sided Matching + Markets + + +
+ Online learning in decentralized two-sided matching markets, where the demand
+side (players) competes to match with the supply side (arms), has received
+substantial interest because it abstracts out the complex interactions in
+matching platforms (e.g. UpWork, TaskRabbit). However, past works assume that
+each arm knows its preference ranking over the players (one-sided learning),
+and that each player aims to learn its preferences over the arms through
+successive interactions. Moreover, several (impractical) assumptions are
+usually made for theoretical tractability, such as broadcast of the player-arm
+match (Liu et al., 2020; 2021; Kong & Li, 2023) or serial dictatorship
+(Sankararaman et al., 2021; Basu et al., 2021; Ghosh et al., 2022). In this
+paper, we study a decentralized two-sided matching market where we do not
+assume that the preference rankings over players are known to the arms a
+priori. Furthermore, we do not make any structural assumptions on the problem.
+We propose a multi-phase explore-then-commit type algorithm, namely epoch-based
+CA-ETC (collision avoidance explore-then-commit, \texttt{CA-ETC} in short), for
+this problem; it does not require any communication across agents (players and
+arms) and is hence decentralized. We show that for an initial epoch length of
+$T_{\circ}$ and subsequent epoch lengths of $2^{l/\gamma} T_{\circ}$ (for the
+$l$-th epoch, with $\gamma \in (0,1)$ as an input parameter to the algorithm),
+\texttt{CA-ETC} yields a player-optimal expected regret of
+$\mathcal{O}\left(T_{\circ} (\frac{K \log T}{T_{\circ} \Delta^2})^{1/\gamma} +
+T_{\circ} (\frac{T}{T_{\circ}})^\gamma\right)$ for the $i$-th player, where $T$
+is the learning horizon, $K$ is the number of arms and $\Delta$ is an
+appropriately defined problem gap. Furthermore, we propose a
+blackboard-communication-based baseline achieving logarithmic regret in $T$.
+
+ comment: Accepted at International Symposium of Information Theory (ISIT) 2024 +
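+ A small numeric illustration of the epoch schedule quoted above: the $l$-th
+epoch has length roughly $2^{l/\gamma} T_{\circ}$, so a handful of
+geometrically growing epochs covers the horizon. The values of $T_{\circ}$,
+$\gamma$ and $T$ below are arbitrary examples, not taken from the paper.
+
+```python
+# Epoch-length schedule of a CA-ETC style algorithm: the l-th epoch is
+# ceil(2**(l / gamma) * T0), truncated so the total does not exceed horizon T.
+import math
+
+def epoch_lengths(T0, gamma, T):
+    lengths, l, used = [], 0, 0
+    while used < T:
+        length = min(math.ceil(2 ** (l / gamma) * T0), T - used)
+        lengths.append(length)
+        used += length
+        l += 1
+    return lengths
+
+if __name__ == "__main__":
+    schedule = epoch_lengths(T0=100, gamma=0.5, T=100_000)
+    print("number of epochs:", len(schedule))
+    print("epoch lengths:", schedule)
+```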
+
+
+
+
+ + ☆ Can Large Language Models Improve the Adversarial Robustness of Graph + Neural Networks? + + +
+ Graph neural networks (GNNs) are vulnerable to adversarial perturbations,
+especially topology attacks, and many methods that improve the robustness of
+GNNs have received considerable attention. Recently, we have witnessed the
+significant success of large language models (LLMs), leading many to explore
+their great potential on GNNs. However, these efforts mainly focus on improving
+the performance of GNNs by utilizing LLMs to enhance node features. Therefore,
+we ask: Will the robustness of GNNs also be enhanced by the powerful
+understanding and inference capabilities of LLMs? Our empirical results show
+that, although LLMs can improve the robustness of GNNs, there is still an
+average decrease of 23.1% in accuracy, implying that GNNs remain extremely
+vulnerable to topology attacks. A further question is therefore how to extend
+the capabilities of LLMs to graph adversarial robustness. In this paper, we
+propose an LLM-based robust graph structure inference framework, LLM4RGNN,
+which distills the inference capabilities of GPT-4 into a local LLM for
+identifying malicious edges and an LM-based edge predictor for finding missing
+important edges, so as to recover a robust graph structure. Extensive
+experiments demonstrate that LLM4RGNN consistently improves the robustness
+across various GNNs. Even in some cases where the perturbation ratio increases
+to 40%, the accuracy of GNNs is still better than that on the clean graph.
+
+
+
+
+ + ☆ Research on Personalized Compression Algorithm for Pre-trained Models + Based on Homomorphic Entropy Increase + + +
+ In this article, we explore the challenges and evolution of two key
+technologies in the current field of AI: the Vision Transformer model and the
+Large Language Model (LLM). The Vision Transformer captures global information
+by splitting images into small patches and leveraging the Transformer's
+multi-head attention mechanism, but its large parameter count and computational
+overhead limit deployment on mobile devices. At the same time, the rapid
+development of LLMs has revolutionized natural language processing, but it also
+faces huge deployment challenges. To address these issues, we investigate model
+pruning techniques, with a particular focus on how to reduce redundant
+parameters without losing accuracy, so as to accommodate personalized data and
+resource-constrained environments. In this paper, a new layered pruning
+strategy is proposed to distinguish personalized layers from common layers via
+compressed sensing and random sampling, thus significantly reducing the number
+of model parameters. Our experimental results show that the introduced step
+buffering mechanism further improves the accuracy of the model after pruning,
+providing new directions and possibilities for the deployment of efficient and
+personalized AI models on mobile devices in the future.
+
+
+
+
+ + ☆ A Mean Field Ansatz for Zero-Shot Weight Transfer + + +
+ The pre-training cost of large language models (LLMs) is prohibitive. One
+cutting-edge approach to reduce the cost is zero-shot weight transfer, also
+known in some cases as model growth, which transfers the weights trained in a
+small model to a large model. However, there are still some theoretical
+mysteries behind the weight transfer. In this paper, inspired by prior
+applications of mean field theory to neural network dynamics, we introduce a
+mean field ansatz to provide a theoretical explanation for weight transfer.
+Specifically, we propose the row-column (RC) ansatz under the mean field point
+of view, which describes the measure structure of the weights in the neural
+network (NN) and admits a close measure dynamic. Thus, the weights of NNs of
+different sizes admit a common distribution under proper assumptions, and
+weight transfer methods can be viewed as sampling methods. We empirically
+validate the RC ansatz by exploring simple MLP examples and LLMs such as GPT-3
+and Llama-3.1. We show that the mean-field point of view is adequate under
+suitable assumptions, which provides theoretical support for zero-shot weight
+transfer.
+
+ comment: 40 pages, 6 Figures, 1 table +
+
+
+
+
+ + ☆ Neural Reward Machines + + +
+ Non-Markovian Reinforcement Learning (RL) tasks are very hard to solve,
+because agents must consider the entire history of state-action pairs to act
+rationally in the environment. Most works use symbolic formalisms (such as
+Linear Temporal Logic or automata) to specify the temporally-extended task.
+These approaches only work in finite and discrete state environments or in
+continuous problems for which a mapping between the raw state and a symbolic
+interpretation, known as a symbol grounding (SG) function, is available. Here,
+we define Neural Reward Machines (NRM), an automata-based neurosymbolic
+framework that can be used for both reasoning and learning in non-symbolic,
+non-Markovian RL domains, and that is based on the probabilistic relaxation of
+Moore Machines. We combine RL with semisupervised symbol grounding (SSSG) and
+we show that NRMs can exploit high-level symbolic knowledge in non-symbolic
+environments without any knowledge of the SG function, outperforming Deep RL
+methods which cannot incorporate prior knowledge. Moreover, we advance the
+research in SSSG, proposing an algorithm for analysing the groundability of
+temporal specifications, which is more efficient than baseline techniques by a
+factor of $10^3$.
+
+
+
+
+ + ☆ Misclassification excess risk bounds for PAC-Bayesian classification via + convexified loss + + +
+ PAC-Bayesian bounds have proven to be a valuable tool for deriving
+generalization bounds and for designing new learning algorithms in machine
+learning. However, they typically focus on providing generalization bounds with
+respect to a chosen loss function. In classification tasks, due to the
+non-convex nature of the 0-1 loss, a convex surrogate loss is often used, and
+thus current PAC-Bayesian bounds are primarily specified for this convex
+surrogate. This work shifts its focus to providing misclassification excess
+risk bounds for PAC-Bayesian classification when using a convex surrogate loss.
+Our key ingredient here is to leverage PAC-Bayesian relative bounds in
+expectation rather than relying on PAC-Bayesian bounds in probability. We
+demonstrate our approach in several important applications.
+
+
+
+
+ + ☆ A Multivocal Literature Review on Privacy and Fairness in Federated + Learning + + +
+ Federated Learning presents a way to revolutionize AI applications by +eliminating the necessity for data sharing. Yet, research has shown that +information can still be extracted during training, making additional +privacy-preserving measures such as differential privacy imperative. To +implement real-world federated learning applications, fairness, ranging from a +fair distribution of performance to non-discriminative behaviour, must be +considered. Particularly in high-risk applications (e.g. healthcare), avoiding +the repetition of past discriminatory errors is paramount. As recent research +has demonstrated an inherent tension between privacy and fairness, we conduct a +multivocal literature review to examine the current methods to integrate +privacy and fairness in federated learning. Our analyses illustrate that the +relationship between privacy and fairness has been neglected, posing a critical +risk for real-world applications. We highlight the need to explore the +relationship between privacy, fairness, and performance, advocating for the +creation of integrated federated learning frameworks. + +
+
+ comment: Accepted for publication at the Internationale Tagung + Wirtschaftsinformatik 2024 +
+
+
+
+
+ + ☆ A new perspective on Bayesian Operational Modal Analysis + + +
+ In the field of operational modal analysis (OMA), obtained modal information
+is frequently used to assess the current state of aerospace, mechanical,
+offshore and civil structures. However, the stochasticity of operational
+systems and the lack of forcing information can lead to inconsistent results.
+Quantifying the uncertainty of the recovered modal parameters through OMA is
+therefore of significant value. In this article, a new perspective on Bayesian
+OMA is proposed: a Bayesian stochastic subspace identification (SSI) algorithm.
+Distinct from existing approaches to Bayesian OMA, a hierarchical probabilistic
+model is embedded at the core of covariance-driven SSI. Through substitution of
+canonical correlation analysis with a Bayesian equivalent, posterior
+distributions over the modal properties are obtained. Two inference schemes are
+presented for the proposed Bayesian formulation: Markov Chain Monte Carlo and
+variational Bayes. Two case studies are then explored. The first is a benchmark
+study using data from a simulated, multi-degree-of-freedom, linear system.
+Following application of Bayesian SSI, it is shown that the same posterior is
+targeted and recovered by both inference schemes, with good agreement between
+the posterior mean and the conventional SSI result. The second study applies
+the variational form to data obtained from an in-service structure: the Z24
+bridge. The results of this study are presented at single model orders, and
+then using a stabilisation diagram. The recovered posterior uncertainty is
+presented and compared to the classic SSI result. It is observed that the
+posterior distributions with mean values coinciding with the natural
+frequencies exhibit lower variance than those situated away from the natural
+frequencies.
+
+
+
+
+ + ☆ MIA-Tuner: Adapting Large Language Models as Pre-training Text Detector + + +
+ The increasing parameter counts and expansive datasets of large language
+models (LLMs) highlight the urgent demand for a technical solution to audit the
+underlying privacy risks and copyright issues associated with LLMs. Existing
+studies have partially addressed this need through an exploration of the
+pre-training data detection problem, which is an instance of a membership
+inference attack (MIA). This problem involves determining whether a given piece
+of text has been used during the pre-training phase of the target LLM. Although
+existing methods have designed various sophisticated MIA score functions to
+achieve considerable detection performance in pre-trained LLMs, how to achieve
+high-confidence detection and how to perform MIA on aligned LLMs remain
+challenging. In this paper, we propose MIA-Tuner, a novel instruction-based MIA
+method, which instructs LLMs themselves to serve as a more precise pre-training
+data detector internally, rather than designing an external MIA score function.
+Furthermore, we design two instruction-based safeguards to respectively
+mitigate the privacy risks brought by the existing methods and MIA-Tuner. To
+comprehensively evaluate the most recent state-of-the-art LLMs, we collect a
+more up-to-date MIA benchmark dataset, named WIKIMIA-24, to replace the widely
+adopted benchmark WIKIMIA. We conduct extensive experiments across various
+aligned and unaligned LLMs over the two benchmark datasets. The results
+demonstrate that MIA-Tuner increases the AUC of MIAs from 0.7 to a
+significantly higher level of 0.9.
+
+ comment: code and dataset: https://github.com/wjfu99/MIA-Tuner +
+
+
+
+
+ + ☆ Mitigating Backdoor Attacks in Federated Learning via Flipping Weight + Updates of Low-Activation Input Neurons + + +
+ Federated learning enables multiple clients to collaboratively train machine +learning models under the overall planning of the server while adhering to +privacy requirements. However, the server cannot directly oversee the local +training process, creating an opportunity for malicious clients to introduce +backdoors. Existing research shows that backdoor attacks activate specific +neurons in the compromised model, which remain dormant when processing clean +data. Leveraging this insight, we propose a method called Flipping Weight +Updates of Low-Activation Input Neurons (FLAIN) to defend against backdoor +attacks in federated learning. Specifically, after completing global training, +we employ an auxiliary dataset to identify low-activation input neurons and +flip the associated weight updates. We incrementally raise the threshold for +low-activation inputs and flip the weight updates iteratively, until the +performance degradation on the auxiliary data becomes unacceptable. Extensive +experiments validate that our method can effectively reduce the success rate of +backdoor attacks to a low level in various attack scenarios including those +with non-IID data distribution or high MCRs, causing only minimal performance +degradation on clean data. + +
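+ The numpy sketch below illustrates the core FLAIN step described above: rank
+the input neurons of one layer by their mean activation on an auxiliary
+dataset and flip the sign of the aggregated weight updates feeding the least
+active ones, raising the threshold step by step. The one-layer model, the toy
+data, and the stopping rule are illustrative placeholders, not the paper's
+exact setup.
+
+```python
+# Flip weight updates of low-activation input neurons, with an increasing
+# activation-quantile threshold; in FLAIN the loop would stop once accuracy
+# on the auxiliary data degrades unacceptably.
+import numpy as np
+
+rng = np.random.default_rng(0)
+d_in, d_out, n_aux = 16, 8, 256
+
+W_old = rng.normal(size=(d_out, d_in))               # global weights before the round
+update = rng.normal(scale=0.1, size=(d_out, d_in))   # aggregated weight update
+X_aux = rng.normal(size=(n_aux, d_in))               # clean auxiliary inputs
+
+def flip_low_activation(W_old, update, X_aux, quantile):
+    mean_act = np.abs(X_aux).mean(axis=0)            # per-input-neuron activation level
+    low = mean_act <= np.quantile(mean_act, quantile) # suspect low-activation inputs
+    patched = update.copy()
+    patched[:, low] = -patched[:, low]                # flip their incoming updates
+    return W_old + patched, int(low.sum())
+
+for q in (0.1, 0.2, 0.3):                             # incrementally raise the threshold
+    W_new, n_flipped = flip_low_activation(W_old, update, X_aux, q)
+    print(f"quantile={q}: flipped updates for {n_flipped} input neurons")
+```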
+
+
+
+
+ + ☆ TextCAVs: Debugging vision models using text MICCAI 2024 + + +
+ Concept-based interpretability methods are a popular form of explanation for +deep learning models which provide explanations in the form of high-level human +interpretable concepts. These methods typically find concept activation vectors +(CAVs) using a probe dataset of concept examples. This requires labelled data +for these concepts -- an expensive task in the medical domain. We introduce +TextCAVs: a novel method which creates CAVs using vision-language models such +as CLIP, allowing for explanations to be created solely using text descriptions +of the concept, as opposed to image exemplars. This reduced cost in testing +concepts allows for many concepts to be tested and for users to interact with +the model, testing new ideas as they are thought of, rather than a delay caused +by image collection and annotation. In early experimental results, we +demonstrate that TextCAVs produces reasonable explanations for a chest x-ray +dataset (MIMIC-CXR) and natural images (ImageNet), and that these explanations +can be used to debug deep learning-based models. + +
+
+ comment: 11 pages, 2 figures. Accepted at iMIMIC Workshop at MICCAI 2024 +
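+ To illustrate the text-only concept direction described above, the sketch
+below embeds a concept prompt and a few neutral prompts and takes the
+normalized mean difference as the CAV. The embed_text function is a stand-in
+for a real CLIP-style text encoder, and the projection into the probed vision
+model's feature space used by TextCAVs is not shown.
+
+```python
+# Build a concept direction from text only; embed_text is a placeholder that
+# mimics a text encoder with a deterministic pseudo-embedding per prompt.
+import numpy as np
+
+rng = np.random.default_rng(0)
+
+def embed_text(prompt, dim=64):
+    # Placeholder for a CLIP-style text tower: deterministic pseudo-embedding.
+    seed = abs(hash(prompt)) % (2**32)
+    return np.random.default_rng(seed).normal(size=dim)
+
+def text_cav(concept, neutral_prompts):
+    concept_vec = embed_text(f"an x-ray showing {concept}")
+    neutral = np.mean([embed_text(p) for p in neutral_prompts], axis=0)
+    direction = concept_vec - neutral
+    return direction / np.linalg.norm(direction)   # unit-norm concept direction
+
+if __name__ == "__main__":
+    cav = text_cav("pleural effusion", ["an x-ray", "a chest radiograph"])
+    activation = rng.normal(size=64)                # a toy model activation
+    print("concept sensitivity score:", float(activation @ cav))
+```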
+
+
+
+
+ + ☆ Modeling the Neonatal Brain Development Using Implicit Neural + Representations MICCAI 2024 + + +
+ The human brain undergoes rapid development during the third trimester of +pregnancy. In this work, we model the neonatal development of the infant brain +in this age range. As a basis, we use MR images of preterm- and term-birth +neonates from the developing human connectome project (dHCP). We propose a +neural network, specifically an implicit neural representation (INR), to +predict 2D- and 3D images of varying time points. In order to model a +subject-specific development process, it is necessary to disentangle the age +from the subjects' identity in the latent space of the INR. We propose two +methods, Subject Specific Latent Vectors (SSL) and Stochastic Global Latent +Augmentation (SGLA), enabling this disentanglement. We perform an analysis of +the results and compare our proposed model to an age-conditioned denoising +diffusion model as a baseline. We also show that our method can be applied in a +memory-efficient way, which is especially important for 3D data. + +
+
+ comment: Preprint, Accepted for PRIME MICCAI 2024 +
+
+
+
+
+ + ☆ The Power of Bias: Optimizing Client Selection in Federated Learning + with Heterogeneous Differential Privacy + + +
+ To preserve data privacy, the federated learning (FL) paradigm has emerged, in
+which clients expose only model gradients rather than original data for model
+training. To enhance the protection of model gradients in FL, differentially
+private federated learning (DPFL) has been proposed, which incorporates
+differentially private (DP) noise to obfuscate gradients before they are
+exposed. Yet, an essential but largely overlooked problem in DPFL is the
+heterogeneity of clients' privacy requirements, which can vary significantly
+between clients and greatly complicates the client selection problem in DPFL.
+In other words, both the data quality and the influence of DP noise should be
+taken into account when selecting clients. To address this problem, we conduct
+convergence analysis of DPFL under heterogeneous privacy, a generic client
+selection strategy, popular DP mechanisms and convex loss. Based on the
+convergence analysis, we formulate the client selection problem to minimize the
+value of the loss function in DPFL with heterogeneous privacy, which is a
+convex optimization problem and can be solved efficiently. Accordingly, we
+propose the DPFL-BCS (biased client selection) algorithm. The extensive
+experiment results with real datasets under both convex and non-convex loss
+functions indicate that DPFL-BCS can remarkably improve model utility compared
+with the SOTA baselines.
+
+
+
+
+ + ☆ Solving The Quantum Many-Body Hamiltonian Learning Problem with Neural + Differential Equations + + +
+ Understanding and characterising quantum many-body dynamics remains a +significant challenge due to both the exponential complexity required to +represent quantum many-body Hamiltonians, and the need to accurately track +states in time under the action of such Hamiltonians. This inherent complexity +limits our ability to characterise quantum many-body systems, highlighting the +need for innovative approaches to unlock their full potential. To address this +challenge, we propose a novel method to solve the Hamiltonian Learning (HL) +problem-inferring quantum dynamics from many-body state trajectories-using +Neural Differential Equations combined with an Ansatz Hamiltonian. Our method +is reliably convergent, experimentally friendly, and interpretable, making it a +stable solution for HL on a set of Hamiltonians previously unlearnable in the +literature. In addition to this, we propose a new quantitative benchmark based +on power laws, which can objectively compare the reliability and generalisation +capabilities of any two HL algorithms. Finally, we benchmark our method against +state-of-the-art HL algorithms with a 1D spin-1/2 chain proof of concept. + +
+
+
+
+
+ + ☆ Navigating Uncertainties in Machine Learning for Structural Dynamics: A + Comprehensive Review of Probabilistic and Non-Probabilistic Approaches in + Forward and Inverse Problems + + +
+ In the era of big data, machine learning (ML) has become a powerful tool in +various fields, notably impacting structural dynamics. ML algorithms offer +advantages by modeling physical phenomena based on data, even in the absence of +underlying mechanisms. However, uncertainties such as measurement noise and +modeling errors can compromise the reliability of ML predictions, highlighting +the need for effective uncertainty awareness to enhance prediction robustness. +This paper presents a comprehensive review on navigating uncertainties in ML, +categorizing uncertainty-aware approaches into probabilistic methods (including +Bayesian and frequentist perspectives) and non-probabilistic methods (such as +interval learning and fuzzy learning). Bayesian neural networks, known for +their uncertainty quantification and nonlinear mapping capabilities, are +emphasized for their superior performance and potential. The review covers +various techniques and methodologies for addressing uncertainties in ML, +discussing fundamentals and implementation procedures of each method. While +providing a concise overview of fundamental concepts, the paper refrains from +in-depth critical explanations. Strengths and limitations of each approach are +examined, along with their applications in structural dynamic forward problems +like response prediction, sensitivity assessment, and reliability analysis, and +inverse problems like system identification, model updating, and damage +identification. Additionally, the review identifies research gaps and suggests +future directions for investigations, aiming to provide comprehensive insights +to the research community. By offering an extensive overview of both +probabilistic and non-probabilistic approaches, this review aims to assist +researchers and practitioners in making informed decisions when utilizing ML +techniques to address uncertainties in structural dynamic problems. + +
+
+ comment: 114 pages, 27 figures, 6 tables, references added +
+
+
+
+
+ + ☆ A survey on secure decentralized optimization and learning + + +
+ Decentralized optimization has become a standard paradigm for solving +large-scale decision-making problems and training large machine learning models +without centralizing data. However, this paradigm introduces new privacy and +security risks, with malicious agents potentially able to infer private data or +impair the model accuracy. Over the past decade, significant advancements have +been made in developing secure decentralized optimization and learning +frameworks and algorithms. This survey provides a comprehensive tutorial on +these advancements. We begin with the fundamentals of decentralized +optimization and learning, highlighting centralized aggregation and distributed +consensus as key modules exposed to security risks in federated and distributed +optimization, respectively. Next, we focus on privacy-preserving algorithms, +detailing three cryptographic tools and their integration into decentralized +optimization and learning systems. Additionally, we examine resilient +algorithms, exploring the design and analysis of resilient aggregation and +consensus protocols that support these systems. We conclude the survey by +discussing current trends and potential future directions. + +
+
+ comment: 38 pages +
+
+
+
+
+ + ☆ DeepDFA: Automata Learning through Neural Probabilistic Relaxations + + +
+ In this work, we introduce DeepDFA, a novel approach to identifying +Deterministic Finite Automata (DFAs) from traces, harnessing a differentiable +yet discrete model. Inspired by both the probabilistic relaxation of DFAs and +Recurrent Neural Networks (RNNs), our model offers interpretability +post-training, alongside reduced complexity and enhanced training efficiency +compared to traditional RNNs. Moreover, by leveraging gradient-based +optimization, our method surpasses combinatorial approaches in both scalability +and noise resilience. Validation experiments conducted on target regular +languages of varying size and complexity demonstrate that our approach is +accurate, fast, and robust to noise in both the input symbols and the output +labels of training data, integrating the strengths of both logical grammar +induction and deep learning. + +
+
+
+
+
+ + ☆ Generative Dataset Distillation Based on Diffusion Model ECCV 2024 + + +
+ This paper presents our method for the generative track of The First Dataset
+Distillation Challenge at ECCV 2024. Since the diffusion model has become the
+mainstay of generative models because of its high-quality generation, we focus
+on distillation methods based on the diffusion model. Considering that the
+track can only generate a fixed number of images in 10 minutes using a
+generative model for the CIFAR-100 and Tiny-ImageNet datasets, we need to use a
+generative model that can generate images at high speed. In this study, we
+propose a novel generative dataset distillation method based on Stable
+Diffusion. Specifically, we use the SDXL-Turbo model, which can generate images
+at high speed and with high quality. Compared to other diffusion models that
+can only achieve an images-per-class (IPC) of 1, our method can achieve an IPC
+of 10 for Tiny-ImageNet and an IPC of 20 for CIFAR-100, respectively.
+Additionally, to generate high-quality distilled datasets for CIFAR-100 and
+Tiny-ImageNet, we use the class information as text prompts and apply
+post-generation data augmentation for the SDXL-Turbo model. Experimental
+results show the effectiveness of the proposed method, and we achieved third
+place in the generative track of the ECCV 2024 DD Challenge. Codes are
+available at https://github.com/Guang000/BANKO.
+
+ comment: The Third Place Winner in Generative Track of the ECCV 2024 DD + Challenge +
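+ The sketch below shows what the class-prompted generation loop described
+above might look like, assuming the Hugging Face diffusers library and the
+public "stabilityai/sdxl-turbo" checkpoint; the prompt template, class names,
+and IPC value are illustrative, not the authors' exact configuration.
+
+```python
+# Generate a small per-class distilled set with SDXL-Turbo, using the class
+# label as the text prompt; SDXL-Turbo is designed for single-step sampling
+# without classifier-free guidance.
+import torch
+from diffusers import AutoPipelineForText2Image
+
+pipe = AutoPipelineForText2Image.from_pretrained(
+    "stabilityai/sdxl-turbo", torch_dtype=torch.float16, variant="fp16"
+).to("cuda")
+
+class_names = ["goldfish", "ladybug", "school bus"]   # e.g. Tiny-ImageNet classes
+images_per_class = 10                                  # IPC = 10
+
+distilled = {}
+for name in class_names:
+    prompt = f"a photo of a {name}"                    # class information as prompt
+    distilled[name] = [
+        pipe(prompt=prompt, num_inference_steps=1, guidance_scale=0.0).images[0]
+        for _ in range(images_per_class)
+    ]
+print({k: len(v) for k, v in distilled.items()})
+```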
+
+
+
+
+ + ☆ RadioDiff: An Effective Generative Diffusion Model for Sampling-Free + Dynamic Radio Map Construction + + +
+ Radio map (RM) is a promising technology that can obtain pathloss based on
+location alone, which is significant for 6G network applications in reducing
+the communication costs of pathloss estimation. However, traditional RM
+construction is either computationally intensive or depends on costly
+sampling-based pathloss measurements. Although the neural network (NN)-based
+method can efficiently construct the RM without sampling, its performance is
+still suboptimal. This is primarily due to the misalignment between the
+generative characteristics of the RM construction problem and the
+discriminative modeling exploited by existing NN-based methods. Thus, to
+enhance RM construction performance, in this paper, the sampling-free RM
+construction is modeled as a conditional generative problem, where a denoised
+diffusion-based method, named RadioDiff, is proposed to achieve high-quality RM
+construction. In addition, to enhance the diffusion model's capability of
+extracting features from dynamic environments, an attention U-Net with an
+adaptive fast Fourier transform module is employed as the backbone network.
+Meanwhile, the decoupled diffusion model is utilized to further enhance the
+construction performance of RMs. Moreover, a comprehensive theoretical analysis
+of why RM construction is a generative problem is provided for the first time,
+from both the perspectives of data features and NN training methods.
+Experimental results show that the proposed RadioDiff achieves state-of-the-art
+performance in all three metrics of accuracy, structural similarity, and peak
+signal-to-noise ratio. The code is available at
+https://github.com/UNIC-Lab/RadioDiff.
+
+
+
+
+ + ☆ A Mechanistic Interpretation of Syllogistic Reasoning in Auto-Regressive + Language Models + + +
+ Recent studies on logical reasoning in auto-regressive Language Models (LMs) +have sparked a debate on whether such models can learn systematic reasoning +principles during pre-training or merely exploit superficial patterns in the +training data. This paper presents a mechanistic interpretation of syllogistic +reasoning in LMs to further enhance our understanding of internal dynamics. +Specifically, we present a methodology for circuit discovery aimed at +disentangling content-independent reasoning mechanisms from world knowledge +acquired during pre-training. Through two distinct intervention methods, we +uncover a sufficient and necessary circuit involving middle-term suppression +that elucidates how LMs transfer information to derive valid conclusions from +premises. Furthermore, we investigate how belief biases manifest in syllogistic +reasoning, finding evidence of partial contamination from additional attention +heads responsible for encoding commonsense and contextualized knowledge. +Finally, we explore the generalization of the discovered mechanisms across +various syllogistic schemes and model sizes, finding that the identified +circuit is sufficient and necessary for all the schemes on which the model +achieves high downstream accuracy ($\geq$ 60\%). Overall, our findings suggest +that LMs indeed learn transferable content-independent reasoning mechanisms, +but that, at the same time, such mechanisms do not involve generalisable and +abstract logical primitives, being susceptible to contamination by the same +world knowledge acquired during pre-training. + +
+
+
+
+
+ + ☆ OptDist: Learning Optimal Distribution for Customer Lifetime Value + Prediction CIKM 2024 + + +
+ Customer Lifetime Value (CLTV) prediction is a critical task in business
+applications. Accurately predicting CLTV is challenging in real-world business
+scenarios, as the distribution of CLTV is complex and mutable. First, a large
+number of users have no consumption at all, forming a long-tailed part of the
+distribution that is difficult to fit. Second, a small set of high-value users
+spend orders of magnitude more than a typical user, leading to a wide range of
+CLTV values that is hard to capture in a single distribution. Existing
+approaches for CLTV estimation either assume a prior probability distribution
+and fit a single group of distribution-related parameters for all samples, or
+directly learn from the posterior distribution with manually predefined buckets
+in a heuristic manner. However, all these methods fail to handle complex and
+mutable distributions. In this paper, we propose a novel optimal distribution
+selection model, OptDist, for CLTV prediction, which utilizes an adaptive
+optimal sub-distribution selection mechanism to improve the accuracy of complex
+distribution modeling. Specifically, OptDist trains several candidate
+sub-distribution networks in the distribution learning module (DLM) for
+modeling the probability distribution of CLTV. Then, a distribution selection
+module (DSM) is proposed to select the sub-distribution for each sample, thus
+making the selection automatic and adaptive. Besides, we design an alignment
+mechanism that connects both modules, which effectively guides the
+optimization. We conduct extensive experiments on two public datasets and one
+private dataset to verify that OptDist outperforms state-of-the-art baselines.
+Furthermore, OptDist has been deployed on a large-scale financial platform for
+customer acquisition marketing campaigns, and the online experiments also
+demonstrate the effectiveness of OptDist.
+
+ comment: CIKM 2024 +
+
+
+
+
+ + ☆ S-RAF: A Simulation-Based Robustness Assessment Framework for + Responsible Autonomous Driving + + +
+ As artificial intelligence (AI) technology advances, ensuring the robustness +and safety of AI-driven systems has become paramount. However, varying +perceptions of robustness among AI developers create misaligned evaluation +metrics, complicating the assessment and certification of safety-critical and +complex AI systems such as autonomous driving (AD) agents. To address this +challenge, we introduce Simulation-Based Robustness Assessment Framework +(S-RAF) for autonomous driving. S-RAF leverages the CARLA Driving simulator to +rigorously assess AD agents across diverse conditions, including faulty +sensors, environmental changes, and complex traffic situations. By quantifying +robustness and its relationship with other safety-critical factors, such as +carbon emissions, S-RAF aids developers and stakeholders in building safe and +responsible driving agents, and streamlining safety certification processes. +Furthermore, S-RAF offers significant advantages, such as reduced testing +costs, and the ability to explore edge cases that may be unsafe to test in the +real world. The code for this framework is available here: +https://github.com/cognitive-robots/rai-leaderboard + +
+
+
+
+
+ + ☆ GrassNet: State Space Model Meets Graph Neural Network + + +
+ Designing spectral convolutional networks is a formidable task in graph
+learning. In traditional spectral graph neural networks (GNNs),
+polynomial-based methods are commonly used to design filters via the Laplacian
+matrix. In practical applications, however, these polynomial methods encounter
+inherent limitations, which primarily arise from the low-order truncation of
+polynomial filters and the lack of overall modeling of the graph spectrum. This
+leads to poor performance of existing spectral approaches on real-world graph
+data, especially when the spectrum is highly concentrated or contains many
+numerically identical values, as they tend to apply the exact same modulation
+to signals with the same frequencies. To overcome these issues, in this paper,
+we propose Graph State Space Network (GrassNet), a novel graph neural network
+with theoretical support that provides a simple yet effective scheme for
+designing and learning arbitrary graph spectral filters. In particular, our
+GrassNet introduces structured state space models (SSMs) to model the
+correlations of graph signals at different frequencies and derives a unique
+rectification for each frequency in the graph spectrum. To the best of our
+knowledge, our work is the first to employ SSMs for the design of GNN spectral
+filters, and it theoretically offers greater expressive power compared with
+polynomial filters. Extensive experiments on nine public benchmarks reveal that
+GrassNet achieves superior performance in real-world graph modeling tasks.
+
+
+
+
+ + ☆ S$^3$Attention: Improving Long Sequence Attention with Smoothed Skeleton + Sketching + + +
+ Attention based models have achieved many remarkable breakthroughs in +numerous applications. However, the quadratic complexity of Attention makes the +vanilla Attention based models hard to apply to long sequence tasks. Various +improved Attention structures are proposed to reduce the computation cost by +inducing low rankness and approximating the whole sequence by sub-sequences. +The most challenging part of those approaches is maintaining the proper balance +between information preservation and computation reduction: the longer +sub-sequences used, the better information is preserved, but at the price of +introducing more noise and computational costs. In this paper, we propose a +smoothed skeleton sketching based Attention structure, coined S$^3$Attention, +which significantly improves upon the previous attempts to negotiate this +trade-off. S$^3$Attention has two mechanisms to effectively minimize the impact +of noise while keeping the linear complexity to the sequence length: a +smoothing block to mix information over long sequences and a matrix sketching +method that simultaneously selects columns and rows from the input matrix. We +verify the effectiveness of S$^3$Attention both theoretically and empirically. +Extensive studies over Long Range Arena (LRA) datasets and six time-series +forecasting show that S$^3$Attention significantly outperforms both vanilla +Attention and other state-of-the-art variants of Attention structures. + +
+
+
+
+
+ + ☆ A training regime to learn unified representations from complementary + breast imaging modalities + + +
+ Full Field Digital Mammograms (FFDMs) and Digital Breast Tomosynthesis (DBT) +are the two most widely used imaging modalities for breast cancer screening. +Although DBT has increased cancer detection compared to FFDM, its widespread +adoption in clinical practice has been slowed by increased interpretation times +and a perceived decrease in the conspicuity of specific lesion types. +Specifically, the non-inferiority of DBT for microcalcifications remains under +debate. Due to concerns about the decrease in visual acuity, combined DBT-FFDM +acquisitions remain popular, leading to overall increased exam times and +radiation dosage. Enabling DBT to provide diagnostic information present in +both FFDM and DBT would reduce reliance on FFDM, resulting in a reduction in +both quantities. We propose a machine learning methodology that learns +high-level representations leveraging the complementary diagnostic signal from +both DBT and FFDM. Experiments on a large-scale data set validate our claims +and show that our representations enable more accurate breast lesion detection +than any DBT- or FFDM-based model. + +
+
+
+
+
+ + ☆ Linear combinations of latents in diffusion models: interpolation and + beyond + + +
+ Generative models are crucial for applications like data synthesis and +augmentation. Diffusion, Flow Matching and Continuous Normalizing Flows have +shown effectiveness across various modalities, and rely on Gaussian latent +variables for generation. As any generated object is directly associated with a +particular latent variable, we can manipulate the variables to exert control +over the generation process. However, standard approaches for combining latent +variables, such as spherical interpolation, only apply or work well in special +cases. Moreover, current methods for obtaining low-dimensional representations +of the data, important for e.g. surrogate models for search and creative +applications, are network and data modality specific. In this work we show that +the standard methods to combine variables do not yield intermediates following +the distribution the models are trained to expect. We propose Combination of +Gaussian variables (COG), a novel interpolation method that addresses this, is +easy to implement yet matches or improves upon current methods. COG addresses +linear combinations in general and, as we demonstrate, also supports other +operations including e.g. defining subspaces of the latent space, simplifying +the creation of expressive low-dimensional spaces of high-dimensional objects +using generative models based on Gaussian latents. + +
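+ The short numerical check below illustrates the distributional issue the
+abstract raises: a naive average of two standard-normal latents no longer has
+unit variance, so it falls off the prior the generative model was trained to
+expect, while rescaling by the l2 norm of the weights restores it. This shows
+only the problem and a simple variance-preserving fix, not COG's exact
+construction.
+
+```python
+# A naive linear mix of N(0, I) latents has per-dimension std sqrt(w1^2 + w2^2);
+# dividing by that factor puts the combination back on the standard normal prior.
+import numpy as np
+
+rng = np.random.default_rng(0)
+d = 4096
+z1, z2 = rng.standard_normal(d), rng.standard_normal(d)   # two Gaussian latents
+
+w1, w2 = 0.5, 0.5                                          # naive average weights
+naive = w1 * z1 + w2 * z2
+rescaled = naive / np.sqrt(w1**2 + w2**2)                  # variance-preserving mix
+
+print("per-dim std, naive mix:   ", naive.std())           # ~0.71, off-distribution
+print("per-dim std, rescaled mix:", rescaled.std())        # ~1.0, matches the prior
+```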
+
+
+
+
+ + ☆ ABQ-LLM: Arbitrary-Bit Quantized Inference Acceleration for Large + Language Models + + +
+ Large Language Models (LLMs) have revolutionized natural language processing
+tasks. However, their practical application is constrained by substantial
+memory and computational demands. Post-training quantization (PTQ) is
+considered an effective method to accelerate LLM inference. Despite its growing
+popularity in LLM model compression, PTQ deployment faces two major challenges.
+First, low-bit quantization leads to performance degradation. Second,
+restricted by the limited integer computing unit types on GPUs, quantized
+matrix operations with different precisions cannot be effectively accelerated.
+To address these issues, we introduce a novel arbitrary-bit quantization
+algorithm and inference framework, ABQ-LLM. It achieves superior performance
+across various quantization settings and enables efficient arbitrary-precision
+quantized inference on the GPU. ABQ-LLM introduces several key innovations: (1)
+a distribution correction method for transformer blocks to mitigate
+distribution differences caused by full quantization of weights and
+activations, improving performance at low bit-widths; (2) a bit balance
+strategy to counteract performance degradation from asymmetric distribution
+issues at very low bit-widths (e.g., 2-bit); (3) an innovative quantization
+acceleration framework that reconstructs the quantized matrix multiplication of
+arbitrary precision combinations based on BTC (Binary TensorCore) equivalents,
+removing the limitations of INT4/INT8 computing units. ABQ-LLM can convert each
+component bit-width gain into an actual acceleration gain, maximizing
+performance under mixed precision (e.g., W6A6, W2A8). Based on the W2*A8
+quantization configuration on the LLaMA-7B model, it achieved a WikiText2
+perplexity of 7.59 (2.17$\downarrow$ vs 9.76 in AffineQuant). Compared to
+SmoothQuant, we realized a 1.6$\times$ acceleration improvement and a
+2.7$\times$ memory compression gain.
+
+
+
+
+ + ☆ Where is the signal in tokenization space? + + +
+ Large Language Models (LLMs) are typically shipped with tokenizers that +deterministically encode text into so-called canonical token sequences, to +which the LLMs assign probability values. One common assumption is that the +probability of a piece of text is the probability of its canonical token +sequence. However, the tokenization of a string is not unique: e.g., the Llama2 +tokenizer encodes Tokens as [Tok,ens], but [Tok,en,s] also represents the same +text. In this paper, we study non-canonical tokenizations. We prove that, given +a string, it is computationally hard to find the most likely tokenization for +an autoregressive LLM, as well as to compute the marginal probability over all +possible tokenizations. We then show how the marginal is, in most cases, +indistinguishable from the canonical probability. Surprisingly, we then +empirically demonstrate the existence of a significant amount of signal hidden +within tokenization space. Notably, by simply aggregating the probabilities of +non-canonical tokenizations, we achieve improvements across a range of LLM +evaluation benchmarks for a variety of architectures, including transformers +and state space models. + +
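+ A toy worked example of the marginalization described above: with a tiny
+vocabulary and a made-up unigram "LLM", one can enumerate every segmentation of
+a string into vocabulary tokens and sum their probabilities, showing that the
+canonical tokenization carries only part of the total mass. Real LLM
+probabilities are context dependent; the vocabulary and numbers below are toy
+values.
+
+```python
+# Enumerate all tokenizations of "Tokens" over a toy vocabulary and compare the
+# canonical segmentation's probability with the marginal over all segmentations.
+vocab_prob = {"Tok": 0.10, "ens": 0.05, "en": 0.04, "s": 0.08, "T": 0.02, "ok": 0.03}
+
+def segmentations(text):
+    if not text:
+        yield []
+        return
+    for tok in vocab_prob:
+        if text.startswith(tok):
+            for rest in segmentations(text[len(tok):]):
+                yield [tok] + rest
+
+def seq_prob(tokens):
+    p = 1.0
+    for t in tokens:
+        p *= vocab_prob[t]
+    return p
+
+if __name__ == "__main__":
+    segs = list(segmentations("Tokens"))
+    for s in segs:
+        print(s, f"{seq_prob(s):.6f}")
+    canonical = seq_prob(["Tok", "ens"])
+    marginal = sum(seq_prob(s) for s in segs)
+    print(f"canonical mass: {canonical:.6f}, marginal over all: {marginal:.6f}")
+```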
+
+
+
+
+ + ☆ Blockchain-Enabled Accountability in Data Supply Chain: A Data Bill of + Materials Approach + + +
+ In the era of advanced artificial intelligence, highlighted by large-scale +generative models like GPT-4, ensuring the traceability, verifiability, and +reproducibility of datasets throughout their lifecycle is paramount for +research institutions and technology companies. These organisations +increasingly rely on vast corpora to train and fine-tune advanced AI models, +resulting in intricate data supply chains that demand effective data governance +mechanisms. In addition, the challenge intensifies as diverse stakeholders may +use assorted tools, often without adequate measures to ensure the +accountability of data and the reliability of outcomes. In this study, we adapt +the concept of ``Software Bill of Materials" into the field of data governance +and management to address the above challenges, and introduce ``Data Bill of +Materials" (DataBOM) to capture the dependency relationship between different +datasets and stakeholders by storing specific metadata. We demonstrate a +platform architecture for providing blockchain-based DataBOM services, present +the interaction protocol for stakeholders, and discuss the minimal requirements +for DataBOM metadata. The proposed solution is evaluated in terms of +feasibility and performance via case study and quantitative analysis +respectively. + +
+
+
+
+
+ + ☆ Unsupervised Transfer Learning via Adversarial Contrastive Training + + +
+ Learning a data representation for downstream supervised learning tasks in an
+unlabeled scenario is both critical and challenging. In this paper, we propose
+a novel unsupervised transfer learning approach using adversarial contrastive
+training (ACT). Our experimental results demonstrate outstanding classification
+accuracy with both a fine-tuned linear probe and the K-NN protocol across
+various datasets, showing competitiveness with existing state-of-the-art
+self-supervised learning methods. Moreover, we provide an end-to-end
+theoretical guarantee for downstream classification tasks in a misspecified,
+over-parameterized setting, highlighting how a large amount of unlabeled data
+contributes to prediction accuracy. Our theoretical findings suggest that the
+testing error of downstream tasks depends solely on the efficiency of data
+augmentation used in ACT when the unlabeled sample size is sufficiently large.
+This offers a theoretical understanding of learning downstream tasks with a
+small sample size.
+
+
+
+
+ + ☆ Detecting Unsuccessful Students in Cybersecurity Exercises in Two + Different Learning Environments + + +
+ This full paper in the research track evaluates the usage of data logged from +cybersecurity exercises in order to predict students who are potentially at +risk of performing poorly. Hands-on exercises are essential for learning since +they enable students to practice their skills. In cybersecurity, hands-on +exercises are often complex and require knowledge of many topics. Therefore, +students may miss solutions due to gaps in their knowledge and become +frustrated, which impedes their learning. Targeted aid by the instructor helps, +but since the instructor's time is limited, efficient ways to detect struggling +students are needed. This paper develops automated tools to predict when a +student is having difficulty. We formed a dataset with the actions of 313 +students from two countries and two learning environments: KYPO CRP and +EDURange. These data are used in machine learning algorithms to predict the +success of students in exercises deployed in these environments. After +extracting features from the data, we trained and cross-validated eight +classifiers for predicting the exercise outcome and evaluated their predictive +power. The contribution of this paper is comparing two approaches to feature +engineering, modeling, and classification performance on data from two learning +environments. Using the features from either learning environment, we were able +to detect and distinguish between successful and struggling students. A +decision tree classifier achieved the highest balanced accuracy and sensitivity +with data from both learning environments. The results show that activity data +from cybersecurity exercises are suitable for predicting student success. In a +potential application, such models can aid instructors in detecting struggling +students and providing targeted help. We publish data and code for building +these models so that others can adopt or adapt them. + +
+
+ comment: To appear for publication in the FIE 2024 conference proceedings +
+
+
+
+
+ + ☆ Inverse design with conditional cascaded diffusion models + + +
+ Adjoint-based design optimizations are usually computationally expensive and +those costs scale with resolution. To address this, researchers have proposed +machine learning approaches for inverse design that can predict +higher-resolution solutions from lower cost/resolution ones. Due to the recent +success of diffusion models over traditional generative models, we extend the +use of diffusion models for multi-resolution tasks by proposing the conditional +cascaded diffusion model (cCDM). Compared to GANs, cCDM is more stable to +train, and each diffusion model within the cCDM can be trained independently, +thus each model's parameters can be tuned separately to maximize the +performance of the pipeline. Our study compares cCDM against a cGAN model with +transfer learning. + Our results demonstrate that the cCDM excels in capturing finer details, +preserving volume fraction constraints, and minimizing compliance errors in +multi-resolution tasks when a sufficient amount of high-resolution training +data (more than 102 designs) is available. Furthermore, we explore the impact +of training data size on the performance of both models. While both models show +decreased performance with reduced high-resolution training data, the cCDM +loses its superiority to the cGAN model with transfer learning when training +data is limited (less than 102), and we show the break-even point for this +transition. Also, we highlight that while the diffusion model may achieve +better pixel-wise performance in both low-resolution and high-resolution +scenarios, this does not necessarily guarantee that the model produces optimal +compliance error or constraint satisfaction. + +
+
+ comment: Accepted for presentation at IDETC/CIE 2024 conference, Washington, + DC. 11 pages, 9 figures +
+
+
+
+
+ + ☆ Mitigating Degree Bias in Signed Graph Neural Networks AAAI + + +
+ Like Graph Neural Networks (GNNs), Signed Graph Neural Networks (SGNNs) also
+face fairness issues arising from the source data and the typical aggregation
+method. In this paper, we pioneer the investigation of fairness in SGNNs,
+extending it from GNNs. We identify the issue of degree bias within signed
+graphs, offering a new perspective on the fairness issues related to SGNNs. To
+handle this bias issue, inspired by previous work on degree bias, we propose a
+new model-agnostic method to enhance the representation of nodes with different
+degrees, named Degree Debiased Signed Graph Neural Network (DD-SGNN). More
+specifically, in each layer, we perform a transfer from high-degree nodes to
+low-degree nodes inside a head-to-tail triplet, which supplements the missing
+underlying structure of the tail nodes while maintaining the positive and
+negative semantics specified by balance theory in signed graphs. We conduct
+extensive experiments on four real-world datasets. The results verify the
+validity of the model; that is, our model mitigates the degree bias issue
+without compromising performance (i.e., AUC, F1). The code is provided in the
+supplementary material.
+
+ comment: 10 pages, 7 figures, The 39th Annual AAAI Conference on Artificial + Intelligence +
+
+
+
+
+ + ☆ The Limitations of Model Retraining in the Face of Performativity ICML + + +
+ We study stochastic optimization in the context of performative shifts, where +the data distribution changes in response to the deployed model. We demonstrate +that naive retraining can be provably suboptimal even for simple distribution +shifts. The issue worsens when models are retrained given a finite number of +samples at each retraining step. We show that adding regularization to +retraining corrects both of these issues, attaining provably optimal models in +the face of distribution shifts. Our work advocates rethinking how machine +learning models are retrained in the presence of performative effects. + +
+
+ comment: Accepted to 2024 ICML Workshop on Humans, Algorithmic Decision-Making + and Society +
+
+
+
+
+ + ☆ Optimal Sketching for Residual Error Estimation for Matrix and Vector + Norms ICLR 2024 + + +
+ We study the problem of residual error estimation for matrix and vector norms +using a linear sketch. Such estimates can be used, for example, to quickly +assess how useful a more expensive low-rank approximation computation will be. +The matrix case concerns the Frobenius norm and the task is to approximate the +$k$-residual $\|A - A_k\|_F$ of the input matrix $A$ within a +$(1+\epsilon)$-factor, where $A_k$ is the optimal rank-$k$ approximation. We +provide a tight bound of $\Theta(k^2/\epsilon^4)$ on the size of bilinear +sketches, which have the form of a matrix product $SAT$. This improves the +previous $O(k^2/\epsilon^6)$ upper bound in (Andoni et al. SODA 2013) and gives +the first non-trivial lower bound, to the best of our knowledge. In our +algorithm, our sketching matrices $S$ and $T$ can both be sparse matrices, +allowing for a very fast update time. We demonstrate that this gives a +substantial advantage empirically, for roughly the same sketch size and +accuracy as in previous work. + For the vector case, we consider the $\ell_p$-norm for $p>2$, where the task +is to approximate the $k$-residual $\|x - x_k\|_p$ up to a constant factor, +where $x_k$ is the optimal $k$-sparse approximation to $x$. Such vector norms +are frequently studied in the data stream literature and are useful for finding +frequent items or so-called heavy hitters. We establish an upper bound of +$O(k^{2/p}n^{1-2/p}\operatorname{poly}(\log n))$ for constant $\epsilon$ on the +dimension of a linear sketch for this problem. Our algorithm can be extended to +the $\ell_p$ sparse recovery problem with the same sketching dimension, which +seems to be the first such bound for $p > 2$. We also show an +$\Omega(k^{2/p}n^{1-2/p})$ lower bound for the sparse recovery problem, which +is tight up to a $\mathrm{poly}(\log n)$ factor. + +
+
+ comment: Published as a conference paper at ICLR 2024 +
+
+
+
+
+ + ☆ Fishers Harvest Parallel Unlearning in Inherited Model Networks + + +
+ Unlearning in various learning frameworks remains challenging, with the
+continuous growth and updates of models exhibiting complex inheritance
+relationships. This paper presents a novel unlearning framework, which enables
+fully parallel unlearning among models exhibiting inheritance. A key enabler is
+the new Unified Model Inheritance Graph (UMIG), which captures the inheritance
+using a Directed Acyclic Graph (DAG). Central to our framework is the new
+Fisher Inheritance Unlearning (FIUn) algorithm, which utilizes the Fisher
+Information Matrix (FIM) from initial unlearning models to pinpoint impacted
+parameters in inherited models. By employing the FIM, the FIUn method breaks
+the sequential dependencies among the models, facilitating simultaneous
+unlearning and reducing computational overhead. We further design a mechanism
+to merge disparate FIMs into a single matrix, synchronizing updates across
+inherited models. Experiments confirm the effectiveness of our unlearning
+framework. For single-class tasks, it achieves complete unlearning with 0%
+accuracy for unlearned labels while maintaining 94.53% accuracy for retained
+labels on average. For multi-class tasks, the accuracy is 1.07% for unlearned
+labels and 84.77% for retained labels on average. Our framework accelerates
+unlearning by 99% compared to alternative methods.
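+ As a toy illustration of how a Fisher Information Matrix can flag impacted
+parameters, the sketch below computes a diagonal FIM approximation (mean
+squared gradients) for a simple logistic model and thresholds it; merging
+several diagonals is shown as an elementwise maximum. The model, threshold,
+and merge rule are illustrative assumptions, not the FIUn algorithm itself.
+
+```python
+# Diagonal Fisher approximation from per-example gradients of a logistic model,
+# used here only to flag high-importance parameters for a toy "unlearning" step.
+import numpy as np
+
+rng = np.random.default_rng(0)
+d, n = 10, 500
+w = rng.normal(size=d)                                  # toy model parameters
+
+def diag_fisher(X, y, w):
+    p = 1.0 / (1.0 + np.exp(-X @ w))                    # logistic predictions
+    grads = (p - y)[:, None] * X                        # per-example gradients
+    return (grads ** 2).mean(axis=0)                    # diagonal FIM estimate
+
+X_forget = rng.normal(size=(n, d)); y_forget = rng.integers(0, 2, n)
+X_other = rng.normal(size=(n, d)); y_other = rng.integers(0, 2, n)
+
+fims = [diag_fisher(X_forget, y_forget, w), diag_fisher(X_other, y_other, w)]
+merged = np.maximum.reduce(fims)                        # one merged importance map
+impacted = merged > np.quantile(merged, 0.7)            # parameters to adjust
+print("parameters flagged for unlearning:", np.flatnonzero(impacted))
+```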
+
+
+
+
+ + ☆ Adversarial Contrastive Learning Based Physics-Informed Temporal + Networks for Cuffless Blood Pressure Estimation + + +
+ Time series data mining is immensely important in extensive applications,
+such as traffic, medical, and e-commerce. In this paper, we focus on medical
+temporal variation modeling, i.e., cuffless blood pressure (BP) monitoring,
+which has great value in cardiovascular healthcare. Although such methods
+provide a comfortable user experience, they suffer from the demand for a
+significant amount of realistic data to train an individual model for each
+subject, especially considering the invasive or obtrusive BP ground-truth
+measurements. To tackle this challenge, we introduce a novel physics-informed
+temporal network (PITN) with adversarial contrastive learning to enable precise
+BP estimation with very limited data. Specifically, we first enhance the
+physics-informed neural network (PINN) with a temporal block for investigating
+the multi-periodicity of BP dynamics for personal cardiovascular cycle modeling
+and temporal variation. We then employ adversarial training to generate extra
+physiological time series data, improving PITN's robustness in the face of
+sparse subject-specific training data. Furthermore, we utilize contrastive
+learning to capture the discriminative variations of cardiovascular physiologic
+phenomena. This approach aggregates physiological signals with similar blood
+pressure values in latent space while separating clusters of samples with
+dissimilar blood pressure values. Experiments on three widely-adopted datasets
+with different modalities (i.e., bioimpedance, PPG, millimeter-wave)
+demonstrate the superiority and effectiveness of the proposed methods over
+previous state-of-the-art approaches. The code is available at
+https://github.com/Zest86/ACL-PITN.
+
+ comment: 14 pages, 8 figures +
+
+
+
+
+ + ☆ An Unsupervised Learning Framework Combined with Heuristics for the + Maximum Minimal Cut Problem + + +
+ The Maximum Minimal Cut Problem (MMCP), an NP-hard combinatorial optimization
+(CO) problem, has not received much attention due to the demanding and
+challenging bi-connectivity constraint. Moreover, as a CO problem, it is also a
+daunting task for machine learning, especially without labeled instances. To
+deal with these problems, this work proposes an unsupervised learning framework
+combined with heuristics for MMCP that can provide valid and high-quality
+solutions. As far as we know, this is the first work that explores machine
+learning and heuristics to solve MMCP. The unsupervised solver is inspired by a
+relaxation-plus-rounding approach: the relaxed solution is parameterized by
+graph neural networks, and the cost and penalty of MMCP are written out
+explicitly, allowing the model to be trained end-to-end. A crucial observation is that each
+solution corresponds to at least one spanning tree. Based on this finding, a
+heuristic solver that implements tree transformations by adding vertices is
+utilized to repair and improve the solution quality of the unsupervised solver.
+Alternatively, the graph is simplified while guaranteeing solution consistency,
+which reduces the running time. We conduct extensive experiments to evaluate
+our framework and give a specific application. The results demonstrate the
+superiority of our method against two specifically designed techniques.
+
+
+
+
+
+ + ☆ Enhancing Events in Neutrino Telescopes through Deep Learning-Driven + Super-Resolution + + +
+ Recent discoveries by neutrino telescopes, such as the IceCube Neutrino
+Observatory, have relied extensively on machine learning (ML) tools to infer
+physical quantities from the raw photon hits detected. Neutrino telescope
+reconstruction algorithms are limited by the sparse sampling of photons by the
+optical modules due to the relatively large spacing ($10$--$100\,{\rm m}$) between
+them. In this letter, we propose a novel technique that learns photon transport
+through the detector medium via deep learning-driven
+super-resolution of data events. These ``improved'' events can then be
+reconstructed using traditional or ML techniques, resulting in improved
+resolution. Our strategy arranges additional ``virtual'' optical modules within
+an existing detector geometry and trains a convolutional neural network to
+predict the hits on these virtual optical modules. We show that this technique
+improves the angular reconstruction of muons in a generic ice-based neutrino
+telescope. Our results readily extend to water-based neutrino telescopes and
+other event morphologies.
+
+
+ comment: 5+1 pages, 4+1 figures +
+
+
+
+
+ + ☆ Context-Aware Assistant Selection for Improved Inference Acceleration + with Large Language Models + + +
+ Despite their widespread adoption, large language models (LLMs) remain
+prohibitive to use under resource constraints, with their ever-growing sizes
+only increasing the barrier to use. One noted issue is the high latency
+associated with auto-regressive generation, rendering the use of large LLMs
+dependent on advanced computing infrastructure. Assisted decoding, where a smaller draft
+model guides a larger target model's generation, has helped alleviate this, but
+remains dependent on alignment between the two models. Thus, if the draft model
+is insufficiently capable on some domain relative to the target model,
+performance can degrade. Alternatively, one can leverage multiple draft models
+to better cover the expertise of the target, but when multiple black-box draft
+models are available, selecting an assistant without details about its
+construction can be difficult. To better understand this decision-making
+problem, we frame it as a contextual bandit, where a policy must choose a
+draft model based on a context. We show that even without prior knowledge of
+the draft models, creating an offline dataset from only the outputs of independent
+draft/target models and training a policy over the alignment of these outputs
+can accelerate performance on multiple domains, provided the candidates are
+effective. Further results show this to hold in various settings with multiple
+assisted decoding candidates, highlighting its flexibility and the advantageous
+role that such decision making can play.
+
+
+ comment: 14 pages (9 pages main content + references + appendix) +
+
+
+
+
+ + ♻ ☆ Surprise-Adaptive Intrinsic Motivation for Unsupervised Reinforcement + Learning + + +
+ Both entropy-minimizing and entropy-maximizing (curiosity) objectives for +unsupervised reinforcement learning (RL) have been shown to be effective in +different environments, depending on the environment's level of natural +entropy. However, neither method alone results in an agent that will +consistently learn intelligent behavior across environments. In an effort to +find a single entropy-based method that will encourage emergent behaviors in +any environment, we propose an agent that can adapt its objective online, +depending on the entropy conditions by framing the choice as a multi-armed +bandit problem. We devise a novel intrinsic feedback signal for the bandit, +which captures the agent's ability to control the entropy in its environment. +We demonstrate that such agents can learn to control entropy and exhibit +emergent behaviors in both high- and low-entropy regimes and can learn skillful +behaviors in benchmark tasks. Videos of the trained agents and summarized +findings can be found on our project page +https://sites.google.com/view/surprise-adaptive-agents + +
+
+ comment: Published at the Reinforcement Learning Conference 2024 +
+
+
+
+
+ + ♻ ☆ Self-Taught Optimizer (STOP): Recursively Self-Improving Code Generation + + +
+ Several recent advances in AI systems solve problems by providing a +"scaffolding" program that structures multiple calls to language models (LMs) +to generate better outputs. A scaffolding program is written in a programming +language such as Python. In this work, we use a language-model-infused +scaffolding program to improve itself. We start with a seed "improver" that +improves an input program according to a given utility function by querying an +LM several times and returning the best solution. We then run this seed +improver to improve itself. Across a small set of downstream tasks, the +resulting improved improver generates programs with significantly better +performance than its seed improver. A variety of self-improvement strategies +are proposed by the language model, including beam search, genetic algorithms, +and simulated annealing. Since the language models themselves are not altered, +this is not full recursive self-improvement. Nonetheless, it demonstrates that +a modern language model, GPT-4 in our experiments, is capable of writing code +that can call itself to improve itself. We consider concerns around the +development of self-improving technologies and evaluate the frequency with +which the generated code bypasses a sandbox. + +
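+ A minimal sketch of the seed "improver" loop described above; `query_lm` and `utility` are placeholders, and the prompt is illustrative rather than the paper's.
+def improve(program: str, utility, query_lm, n_candidates: int = 4) -> str:
+    """Ask an LM for candidate rewrites and keep the highest-utility one."""
+    prompt = ("Improve the following program so that it scores higher "
+              "on its utility function:\n\n" + program)
+    candidates = [query_lm(prompt) for _ in range(n_candidates)]
+    candidates.append(program)   # never return something worse than the input
+    return max(candidates, key=utility)
+
+# Recursive self-improvement: the improver is itself a program, so its own
+# source can be passed through `improve` with a meta-utility, e.g.
+# better_improver_src = improve(improver_source, meta_utility, query_lm)
+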
+
+ comment: Published as a conference paper at COLM 2024 +
+
+
+
+
+ + ♻ ☆ Potion: Towards Poison Unlearning + + +
+ Adversarial attacks by malicious actors on machine learning systems, such as +introducing poison triggers into training datasets, pose significant risks. The +challenge in resolving such an attack arises in practice when only a subset of +the poisoned data can be identified. This necessitates the development of +methods to remove, i.e. unlearn, poison triggers from already trained models +with only a subset of the poison data available. The requirements for this task +significantly deviate from privacy-focused unlearning where all of the data to +be forgotten by the model is known. Previous work has shown that the +undiscovered poisoned samples lead to a failure of established unlearning +methods, with only one method, Selective Synaptic Dampening (SSD), showing +limited success. Even full retraining, after the removal of the identified +poison, cannot address this challenge as the undiscovered poison samples lead +to a reintroduction of the poison trigger in the model. Our work addresses two +key challenges to advance the state of the art in poison unlearning. First, we +introduce a novel outlier-resistant method, based on SSD, that significantly +improves model protection and unlearning performance. Second, we introduce +Poison Trigger Neutralisation (PTN) search, a fast, parallelisable, +hyperparameter search that utilises the characteristic "unlearning versus model +protection" trade-off to find suitable hyperparameters in settings where the +forget set size is unknown and the retain set is contaminated. We benchmark our +contributions using ResNet-9 on CIFAR10 and WideResNet-28x10 on CIFAR100. +Experimental results show that our method heals 93.72% of poison compared to +SSD with 83.41% and full retraining with 40.68%. We achieve this while also +lowering the average model accuracy drop caused by unlearning from 5.68% (SSD) +to 1.41% (ours). + +
+
+
+
+
+ + ♻ ☆ S-BDT: Distributed Differentially Private Boosted Decision Trees + + +
+ We introduce S-BDT: a novel $(\varepsilon,\delta)$-differentially private +distributed gradient boosted decision tree (GBDT) learner that improves the +protection of single training data points (privacy) while achieving meaningful +learning goals, such as accuracy or regression error (utility). S-BDT uses less +noise by relying on non-spherical multivariate Gaussian noise, for which we +show tight subsampling bounds for privacy amplification and incorporate that +into a R\'enyi filter for individual privacy accounting. We experimentally +reach the same utility while saving $50\%$ in terms of epsilon for $\varepsilon +\le 0.5$ on the Abalone regression dataset (dataset size $\approx 4K$), saving +$30\%$ in terms of epsilon for $\varepsilon \le 0.08$ for the Adult +classification dataset (dataset size $\approx 50K$), and saving $30\%$ in terms +of epsilon for $\varepsilon\leq0.03$ for the Spambase classification dataset +(dataset size $\approx 5K$). Moreover, we show that for situations where a GBDT +is learning a stream of data that originates from different subpopulations +(non-IID), S-BDT improves the saving of epsilon even further. + +
+
+ comment: The first two authors equally contributed to this work +
+
+
+
+
+ + ♻ ☆ Improving Sampling Methods for Fine-tuning SentenceBERT in Text Streams ICPR + + +
+ The proliferation of textual data on the Internet presents a unique +opportunity for institutions and companies to monitor public opinion about +their services and products. Given the rapid generation of such data, the text +stream mining setting, which handles sequentially arriving, potentially +infinite text streams, is often more suitable than traditional batch learning. +While pre-trained language models are commonly employed for their high-quality +text vectorization capabilities in streaming contexts, they face challenges +adapting to concept drift - the phenomenon where the data distribution changes +over time, adversely affecting model performance. Addressing the issue of +concept drift, this study explores the efficacy of seven text sampling methods +designed to selectively fine-tune language models, thereby mitigating +performance degradation. We precisely assess the impact of these methods on +fine-tuning the SBERT model using four different loss functions. Our +evaluation, focused on Macro F1-score and elapsed time, employs two text stream +datasets and an incremental SVM classifier to benchmark performance. Our +findings indicate that Softmax loss and Batch All Triplets loss are +particularly effective for text stream classification, demonstrating that +larger sample sizes generally correlate with improved macro F1-scores. Notably, +our proposed WordPieceToken ratio sampling method significantly enhances +performance with the identified loss functions, surpassing baseline results. + +
+
+ comment: Accepted for presentation at the 27th International Conference on + Pattern Recognition (ICPR) 2024 +
+
+
+
+
+ + ♻ ☆ ChemVLM: Exploring the Power of Multimodal Large Language Models in + Chemistry Area + + +
+ Large Language Models (LLMs) have achieved remarkable success and have been +applied across various scientific fields, including chemistry. However, many +chemical tasks require the processing of visual information, which cannot be +successfully handled by existing chemical LLMs. This brings a growing need for +models capable of integrating multimodal information in the chemical domain. In +this paper, we introduce \textbf{ChemVLM}, an open-source chemical multimodal +large language model specifically designed for chemical applications. ChemVLM +is trained on a carefully curated bilingual multimodal dataset that enhances +its ability to understand both textual and visual chemical information, +including molecular structures, reactions, and chemistry examination questions. +We develop three datasets for comprehensive evaluation, tailored to Chemical +Optical Character Recognition (OCR), Multimodal Chemical Reasoning (MMCR), and +Multimodal Molecule Understanding tasks. We benchmark ChemVLM against a range +of open-source and proprietary multimodal large language models on various +tasks. Experimental results demonstrate that ChemVLM achieves competitive +performance across all evaluated tasks. Our model can be found at +https://huggingface.co/AI4Chem/ChemVLM-26B. + +
+
+ comment: 11 pages, updated version +
+
+
+
+
+ + ♻ ☆ Active Learning with Weak Supervision for Gaussian Processes + + +
+ Annotating data for supervised learning can be costly. When the annotation +budget is limited, active learning can be used to select and annotate those +observations that are likely to give the most gain in model performance. We +propose an active learning algorithm that, in addition to selecting which +observation to annotate, selects the precision of the annotation that is +acquired. Assuming that annotations with low precision are cheaper to obtain, +this allows the model to explore a larger part of the input space, with the +same annotation budget. We build our acquisition function on the previously +proposed BALD objective for Gaussian Processes, and empirically demonstrate the +gains of being able to adjust the annotation precision in the active learning +loop. + +
+
+ comment: This version of the contribution has been accepted for publication, + after peer review but is not the Version of Record and does not reflect + post-acceptance improvements, or any corrections. The Version of Record is + available online at: http://dx.doi.org/10.1007/978-981-99-1642-9_17. Use of + this Accepted Version is subject to the publisher's Accepted Manuscript terms + of use +
+
+
+
+
+ + ♻ ☆ Prediction Instability in Machine Learning Ensembles ICML2024 + + +
+ In machine learning ensembles, predictions from multiple models are
+aggregated. Despite the widespread use and strong performance of ensembles in
+applied problems, little is known about the mathematical properties of
+aggregating models and the associated consequences for safe, explainable use of
+such models. In this paper, we prove a theorem that shows that any ensemble will
+exhibit at least one of the following forms of prediction instability. It will
+either ignore agreement among all underlying models, change its mind when none
+of the underlying models have done so, or be manipulable through inclusion or
+exclusion of options it would never actually predict. As a consequence,
+ensemble aggregation procedures will always need to balance the benefits of
+information use against the risk of these prediction instabilities. This
+analysis also sheds light on what specific forms of prediction instability to
+expect from particular ensemble algorithms; for example, popular tree ensembles
+such as random forests or XGBoost will violate basic, intuitive fairness
+properties. Finally, we show that this can be ameliorated by using consistent
+models in asymptotic conditions.
+
+
+ comment: 15 pages, uses a modified version of ICML2024.sty +
+
+
+
+
+ + ♻ ☆ Federated Natural Policy Gradient and Actor Critic Methods for + Multi-task Reinforcement Learning + + +
+ Federated reinforcement learning (RL) enables collaborative decision making +of multiple distributed agents without sharing local data trajectories. In this +work, we consider a multi-task setting, in which each agent has its own private +reward function corresponding to different tasks, while sharing the same +transition kernel of the environment. Focusing on infinite-horizon Markov +decision processes, the goal is to learn a globally optimal policy that +maximizes the sum of the discounted total rewards of all the agents in a +decentralized manner, where each agent only communicates with its neighbors +over some prescribed graph topology. + We develop federated vanilla and entropy-regularized natural policy gradient +(NPG) methods in the tabular setting under softmax parameterization, where +gradient tracking is applied to estimate the global Q-function to mitigate the +impact of imperfect information sharing. We establish non-asymptotic global +convergence guarantees under exact policy evaluation, where the rates are +nearly independent of the size of the state-action space and illuminate the +impacts of network size and connectivity. To the best of our knowledge, this is +the first time that near dimension-free global convergence is established for +federated multi-task RL using policy optimization. We further go beyond the +tabular setting by proposing a federated natural actor critic (NAC) method for +multi-task RL with function approximation, and establish its finite-time sample +complexity taking the errors of function approximation into account. + +
+
+
+
+
+ + ♻ ☆ Self-Supervised Multimodal Learning: A Survey + + +
+ Multimodal learning, which aims to understand and analyze information from +multiple modalities, has achieved substantial progress in the supervised regime +in recent years. However, the heavy dependence on data paired with expensive +human annotations impedes scaling up models. Meanwhile, given the availability +of large-scale unannotated data in the wild, self-supervised learning has +become an attractive strategy to alleviate the annotation bottleneck. Building +on these two directions, self-supervised multimodal learning (SSML) provides +ways to learn from raw multimodal data. In this survey, we provide a +comprehensive review of the state-of-the-art in SSML, in which we elucidate +three major challenges intrinsic to self-supervised learning with multimodal +data: (1) learning representations from multimodal data without labels, (2) +fusion of different modalities, and (3) learning with unaligned data. We then +detail existing solutions to these challenges. Specifically, we consider (1) +objectives for learning from multimodal unlabeled data via self-supervision, +(2) model architectures from the perspective of different multimodal fusion +strategies, and (3) pair-free learning strategies for coarse-grained and +fine-grained alignment. We also review real-world applications of SSML +algorithms in diverse fields such as healthcare, remote sensing, and machine +translation. Finally, we discuss challenges and future directions for SSML. A +collection of related resources can be found at: +https://github.com/ys-zong/awesome-self-supervised-multimodal-learning. + +
+
+ comment: Accepted to IEEE T-PAMI +
+
+
+
+
+ + ♻ ☆ Detecting Hidden Triggers: Mapping Non-Markov Reward Functions to Markov + + +
+ Many Reinforcement Learning algorithms assume a Markov reward function to +guarantee optimality. However, not all reward functions are Markov. This paper +proposes a framework for mapping non-Markov reward functions into equivalent +Markov ones by learning specialized reward automata, Reward Machines. Unlike +the general practice of learning Reward Machines, we do not require a set of +high-level propositional symbols from which to learn. Rather, we learn hidden +triggers, directly from data, that construct them. We demonstrate the +importance of learning Reward Machines over their Deterministic Finite-State +Automata counterparts given their ability to model reward dependencies. We +formalize this distinction in our learning objective. Our mapping process is +constructed as an Integer Linear Programming problem. We prove that our +mappings form a suitable proxy for maximizing reward expectations. We +empirically validate our approach by learning black-box, non-Markov reward +functions in the Officeworld domain. Additionally, we demonstrate the +effectiveness of learning reward dependencies in a new domain, Breakfastworld. + +
+
+
+
+
+ + ♻ ☆ Agentic Skill Discovery + + +
+ Language-conditioned robotic skills make it possible to apply the high-level +reasoning of Large Language Models (LLMs) to low-level robotic control. A +remaining challenge is to acquire a diverse set of fundamental skills. Existing +approaches either manually decompose a complex task into atomic robotic actions +in a top-down fashion, or bootstrap as many combinations as possible in a +bottom-up fashion to cover a wider range of task possibilities. These +decompositions or combinations, however, require an initial skill library. For +example, a ``grasping'' capability can never emerge from a skill library +containing only diverse ``pushing'' skills. Existing skill discovery techniques +with reinforcement learning acquire skills by an exhaustive exploration but +often yield non-meaningful behaviors. In this study, we introduce a novel +framework for skill discovery that is entirely driven by LLMs. The framework +begins with an LLM generating task proposals based on the provided scene +description and the robot's configurations, aiming to incrementally acquire new +skills upon task completion. For each proposed task, a series of reinforcement +learning processes are initiated, utilizing reward and success determination +functions sampled by the LLM to develop the corresponding policy. The +reliability and trustworthiness of learned behaviors are further ensured by an +independent vision-language model. We show that starting with zero skill, the +skill library emerges and expands to more and more meaningful and reliable +skills, enabling the robot to efficiently further propose and complete advanced +tasks. Project page: \url{https://agentic-skill-discovery.github.io}. + +
+
+ comment: Webpage see https://agentic-skill-discovery.github.io/ +
+
+
+
+
+ + ♻ ☆ Dataset-learning duality and emergent criticality + + +
+ In artificial neural networks, the activation dynamics of non-trainable +variables is strongly coupled to the learning dynamics of trainable variables. +During the activation pass, the boundary neurons (e.g., input neurons) are +mapped to the bulk neurons (e.g., hidden neurons), and during the learning +pass, both bulk and boundary neurons are mapped to changes in trainable +variables (e.g., weights and biases). For example, in feed-forward neural +networks, forward propagation is the activation pass and backward propagation +is the learning pass. We show that a composition of the two maps establishes a +duality map between a subspace of non-trainable boundary variables (e.g., +dataset) and a tangent subspace of trainable variables (i.e., learning). In +general, the dataset-learning duality is a complex non-linear map between +high-dimensional spaces, but in a learning equilibrium, the problem can be +linearized and reduced to many weakly coupled one-dimensional problems. We use +the duality to study the emergence of criticality, or the power-law +distributions of fluctuations of the trainable variables. In particular, we +show that criticality can emerge in the learning system even from the dataset +in a non-critical state, and that the power-law distribution can be modified by +changing either the activation function or the loss function. + +
+
+ comment: 29 pages, 9 figures, 1 table, minor corrections +
+
+
+
+
+ + ♻ ☆ Heavy-Ball Momentum Accelerated Actor-Critic With Function Approximation + + +
+ By using a parametric value function in place of Monte-Carlo rollouts for
+value estimation, actor-critic (AC) algorithms can reduce the variance of
+the stochastic policy gradient and thereby improve the convergence rate. While
+existing works mainly focus on analyzing the convergence rate of AC algorithms
+under Markovian noise, the impact of momentum on AC algorithms remains largely
+unexplored. In this work, we first propose a heavy-ball momentum-based
+advantage actor-critic (\mbox{HB-A2C}) algorithm by integrating heavy-ball
+momentum into the critic recursion that is parameterized by a linear function.
+When the sample trajectory follows a Markov decision process, we quantitatively
+certify the acceleration capability of the proposed HB-A2C algorithm. Our
+theoretical results demonstrate that the proposed HB-A2C finds an
+$\epsilon$-approximate stationary point within $O(\epsilon^{-2})$ iterations
+for reinforcement learning tasks with Markovian noise. Moreover, we also reveal
+the dependence of learning rates on the length of the sample trajectory. By
+carefully selecting the momentum factor of the critic recursion, the proposed
+HB-A2C can balance the errors introduced by the initialization and the
+stochastic approximation.
+
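+ A sketch of a heavy-ball TD(0) critic update with linear function approximation, in the spirit of the recursion described above; the step size and momentum factor are illustrative, not the paper's tuned values.
+import numpy as np
+
+def hb_td0_update(w, w_prev, phi_t, phi_next, reward,
+                  gamma=0.99, alpha=0.05, beta=0.5):
+    """One heavy-ball TD(0) step on linear critic weights w."""
+    td_error = reward + gamma * phi_next @ w - phi_t @ w
+    w_new = w + alpha * td_error * phi_t + beta * (w - w_prev)
+    return w_new, w   # new weights, plus the previous ones for the next step
+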
+
+
+
+
+ + ♻ ☆ Mind the Privacy Unit! User-Level Differential Privacy for Language + Model Fine-Tuning + + +
+ Large language models (LLMs) have emerged as powerful tools for tackling
+complex tasks across diverse domains, but they also raise privacy concerns when
+fine-tuned on sensitive data due to potential memorization. While differential
+privacy (DP) offers a promising solution by ensuring models are 'almost
+indistinguishable' with or without any particular privacy unit, current
+evaluations on LLMs mostly treat each example (text record) as the privacy
+unit. This leads to uneven user privacy guarantees when contributions per user
+vary. We therefore study user-level DP, motivated by applications where it is
+necessary to ensure uniform privacy protection across users. We present a
+systematic evaluation of user-level DP for LLM fine-tuning on natural language
+generation tasks. Focusing on two mechanisms for achieving user-level DP
+guarantees, Group Privacy and User-wise DP-SGD, we investigate design choices
+like data selection strategies and parameter tuning for the best
+privacy-utility tradeoff.
+
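+ A sketch of the user-wise clipping-and-noising step underlying user-level DP-SGD: each user's aggregated gradient is clipped before averaging, and Gaussian noise calibrated to the per-user clip norm is added. The noise multiplier and the user-sampling scheme are assumptions for illustration only.
+import numpy as np
+
+def user_level_dp_step(per_user_grads, clip_norm=1.0, noise_multiplier=1.0, rng=None):
+    """per_user_grads: list of each sampled user's summed gradient (1-D arrays)."""
+    rng = rng or np.random.default_rng(0)
+    clipped = []
+    for g in per_user_grads:
+        scale = min(1.0, clip_norm / (np.linalg.norm(g) + 1e-12))
+        clipped.append(g * scale)                 # per-user clipping
+    mean_grad = np.mean(clipped, axis=0)
+    noise = rng.normal(0.0, noise_multiplier * clip_norm / len(clipped),
+                       size=mean_grad.shape)
+    return mean_grad + noise
+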
+
+ comment: Published as a conference paper at COLM 2024 +
+
+
+
+
+ + ♻ ☆ Centralized and Federated Heart Disease Classification Models Using UCI + Dataset and their Shapley-value Based Interpretability + + +
+ Cardiovascular diseases are a leading cause of mortality worldwide, +highlighting the need for accurate diagnostic methods. This study benchmarks +centralized and federated machine learning algorithms for heart disease +classification using the UCI dataset which includes 920 patient records from +four hospitals in the USA, Hungary and Switzerland. Our benchmark is supported +by Shapley-value interpretability analysis to quantify features' importance for +classification. In the centralized setup, various binary classification +algorithms are trained on pooled data, with a support vector machine (SVM) +achieving the highest testing accuracy of 83.3\%, surpassing the established +benchmark of 78.7\% with logistic regression. Additionally, federated learning +algorithms with four clients (hospitals) are explored, leveraging the dataset's +natural partition to enhance privacy without sacrificing accuracy. Federated +SVM, an uncommon approach in the literature, achieves a top testing accuracy of +73.8\%. Our interpretability analysis aligns with existing medical knowledge of +heart disease indicators. Overall, this study establishes a benchmark for +efficient and interpretable pre-screening tools for heart disease while +maintaining patients' privacy. This work is available at +https://github.com/padillma1/Heart-Disease-Classification-on-UCI-dataset-and-Shapley-Interpretability-Analysis. + +
+
+
+
+
+ + ♻ ☆ Learning Diffusion Priors from Observations by Expectation Maximization + + +
+ Diffusion models recently proved to be remarkable priors for Bayesian inverse +problems. However, training these models typically requires access to large +amounts of clean data, which could prove difficult in some settings. In this +work, we present a novel method based on the expectation-maximization algorithm +for training diffusion models from incomplete and noisy observations only. +Unlike previous works, our method leads to proper diffusion models, which is +crucial for downstream tasks. As part of our method, we propose and motivate an +improved posterior sampling scheme for unconditional diffusion models. We +present empirical evidence supporting the effectiveness of our method. + +
+
+
+
+
+ + ♻ ☆ Automated Contrastive Learning Strategy Search for Time Series CIKM'2024 + + +
+ In recent years, Contrastive Learning (CL) has become a predominant +representation learning paradigm for time series. Most existing methods +manually build specific CL Strategies (CLS) by human heuristics for certain +datasets and tasks. However, manually developing CLS usually requires excessive +prior knowledge about the data, and massive experiments to determine the +detailed CL configurations. In this paper, we present an Automated Machine +Learning (AutoML) practice at Microsoft, which automatically learns CLS for +time series datasets and tasks, namely Automated Contrastive Learning (AutoCL). +We first construct a principled search space of size over $3\times10^{12}$, +covering data augmentation, embedding transformation, contrastive pair +construction, and contrastive losses. Further, we introduce an efficient +reinforcement learning algorithm, which optimizes CLS from the performance on +the validation tasks, to obtain effective CLS within the space. Experimental +results on various real-world datasets demonstrate that AutoCL could +automatically find the suitable CLS for the given dataset and task. From the +candidate CLS found by AutoCL on several public datasets/tasks, we compose a +transferable Generally Good Strategy (GGS), which has a strong performance for +other datasets. We also provide empirical analysis as a guide for the future +design of CLS. + +
+
+ comment: Accepted by CIKM'2024 +
+
+
+
+
+ + ♻ ☆ Causality-Aware Spatiotemporal Graph Neural Networks for Spatiotemporal + Time Series Imputation CIKM'2024 + + +
+ Spatiotemporal time series are usually collected via monitoring sensors +placed at different locations, which usually contain missing values due to +various mechanical failures. Imputing the missing values is crucial for +analyzing time series. When recovering a specific data point, most existing +methods consider all the information relevant to that point regardless of the +cause-and-effect relationship. During data collection, it is inevitable that +some unknown confounders are included, e.g., background noise in time series +and non-causal shortcut edges in the constructed sensor network. These +confounders could open backdoor paths and establish non-causal correlations +between the input and output. Over-exploiting these non-causal correlations +could cause overfitting. In this paper, we first revisit spatiotemporal time +series imputation from a causal perspective and show how to block the +confounders via the frontdoor adjustment. Based on the results of frontdoor +adjustment, we introduce a novel Causality-Aware Spatiotemporal Graph Neural +Network (Casper), which contains a novel Prompt Based Decoder (PBD) and a +Spatiotemporal Causal Attention (SCA). PBD could reduce the impact of +confounders and SCA could discover the sparse causal relationships among +embeddings. Theoretical analysis reveals that SCA discovers causal +relationships based on the values of gradients. We evaluate Casper on three +real-world datasets, and the experimental results show that Casper could +outperform the baselines and could effectively discover causal relationships. + +
+
+ comment: Accepted by CIKM'2024 +
+
+
+
+
+ + ♻ ☆ Multi-marginal Schrödinger Bridges with Iterative Reference Refinement + + +
+ Practitioners frequently aim to infer an unobserved population trajectory +using sample snapshots at multiple time points. For instance, in single-cell +sequencing, scientists would like to learn how gene expression evolves over +time. But sequencing any cell destroys that cell. So we cannot access any +cell's full trajectory, but we can access snapshot samples from many cells. +Stochastic differential equations are commonly used to analyze systems with +full individual-trajectory access; since here we have only sample snapshots, +these methods are inapplicable. The deep learning community has recently +explored using Schr\"odinger bridges (SBs) and their extensions to estimate +these dynamics. However, these methods either (1) interpolate between just two +time points or (2) require a single fixed reference dynamic within the SB, +which is often just set to be Brownian motion. But learning piecewise from +adjacent time points can fail to capture long-term dependencies. And +practitioners are typically able to specify a model class for the reference +dynamic but not the exact values of the parameters within it. So we propose a +new method that (1) learns the unobserved trajectories from sample snapshots +across multiple time points and (2) requires specification only of a class of +reference dynamics, not a single fixed one. In particular, we suggest an +iterative projection method inspired by Schr\"odinger bridges; we alternate +between learning a piecewise SB on the unobserved trajectories and using the +learned SB to refine our best guess for the dynamics within the reference +class. We demonstrate the advantages of our method via a well-known simulated +parametric model from ecology, simulated and real data from systems biology, +and real motion-capture data. + +
+
+ comment: Updated to fix title error +
+
+
+
+
+ + ♻ ☆ Kernel Density Estimators in Large Dimensions + + +
+ This paper studies Kernel density estimation for a high-dimensional +distribution $\rho(x)$. Traditional approaches have focused on the limit of +large number of data points $n$ and fixed dimension $d$. We analyze instead the +regime where both the number $n$ of data points $y_i$ and their dimensionality +$d$ grow with a fixed ratio $\alpha=(\log n)/d$. Our study reveals three +distinct statistical regimes for the kernel-based estimate of the density $\hat +\rho_h^{\mathcal {D}}(x)=\frac{1}{n h^d}\sum_{i=1}^n +K\left(\frac{x-y_i}{h}\right)$, depending on the bandwidth $h$: a classical +regime for large bandwidth where the Central Limit Theorem (CLT) holds, which +is akin to the one found in traditional approaches. Below a certain value of +the bandwidth, $h_{CLT}(\alpha)$, we find that the CLT breaks down. The +statistics of $\hat \rho_h^{\mathcal {D}}(x)$ for a fixed $x$ drawn from +$\rho(x)$ is given by a heavy-tailed distribution (an alpha-stable +distribution). In particular below a value $h_G(\alpha)$, we find that $\hat +\rho_h^{\mathcal {D}}(x)$ is governed by extreme value statistics: only a few +points in the database matter and give the dominant contribution to the density +estimator. We provide a detailed analysis for high-dimensional multivariate +Gaussian data. We show that the optimal bandwidth threshold based on +Kullback-Leibler divergence lies in the new statistical regime identified in +this paper. Our findings reveal limitations of classical approaches, show the +relevance of these new statistical regimes, and offer new insights for Kernel +density estimation in high-dimensional settings. + +
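+ For reference, a direct implementation of the quoted estimator with a Gaussian kernel; the bandwidth and the $\alpha=(\log n)/d$ values below are illustrative.
+import numpy as np
+
+def kde(x, data, h):
+    """hat rho_h(x) = 1/(n h^d) * sum_i K((x - y_i)/h) with a Gaussian kernel K."""
+    n, d = data.shape
+    u = (x - data) / h
+    K = np.exp(-0.5 * np.sum(u ** 2, axis=1)) / (2 * np.pi) ** (d / 2)
+    return K.sum() / (n * h ** d)
+
+rng = np.random.default_rng(0)
+d, n = 50, 10_000                 # alpha = log(n)/d stays moderate
+data = rng.standard_normal((n, d))
+print(kde(rng.standard_normal(d), data, h=1.0))
+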
+
+
+
+
+ + ♻ ☆ Resilience in Online Federated Learning: Mitigating Model-Poisoning + Attacks via Partial Sharing + + +
+ Federated learning (FL) allows training machine learning models on +distributed data without compromising privacy. However, FL is vulnerable to +model-poisoning attacks where malicious clients tamper with their local models +to manipulate the global model. In this work, we investigate the resilience of +the partial-sharing online FL (PSO-Fed) algorithm against such attacks. PSO-Fed +reduces communication overhead by allowing clients to share only a fraction of +their model updates with the server. We demonstrate that this partial sharing +mechanism has the added advantage of enhancing PSO-Fed's robustness to +model-poisoning attacks. Through theoretical analysis, we show that PSO-Fed +maintains convergence even under Byzantine attacks, where malicious clients +inject noise into their updates. Furthermore, we derive a formula for PSO-Fed's +mean square error, considering factors like stepsize, attack probability, and +the number of malicious clients. Interestingly, we find a non-trivial optimal +stepsize that maximizes PSO-Fed's resistance to these attacks. Extensive +numerical experiments confirm our theoretical findings and showcase PSO-Fed's +superior performance against model-poisoning attacks compared to other leading +FL algorithms. + +
+
+ comment: 13 pages, 9 figures, Submitted to TSIPN +
+
+
+
+
+ + ♻ ☆ A Medical Data-Effective Learning Benchmark for Highly Efficient + Pre-training of Foundation Models + + +
+ Foundation models, pre-trained on massive datasets, have achieved
+unprecedented generalizability. However, is it truly necessary to involve such
+vast amounts of data in pre-training, consuming extensive computational
+resources? This paper introduces data-effective learning, aiming to use data in
+the most impactful way to pre-train foundation models. This involves strategies
+that focus on data quality rather than quantity, ensuring the data used for
+training has high informational value. Data-effective learning plays a profound
+role in accelerating foundation model training, reducing computational costs,
+and saving data storage, which is very important as the volume of medical data
+in recent years has grown beyond many people's expectations. However, due to
+the lack of standards and comprehensive benchmarks, medical
+data-effective learning remains poorly studied. To address this gap, our paper
+introduces a comprehensive benchmark specifically for evaluating data-effective
+learning in the medical field. This benchmark includes a dataset with millions
+of data samples from 31 medical centers (DataDEL), a baseline method for
+comparison (MedDEL), and a new evaluation metric (NormDEL) to objectively
+measure data-effective learning performance. Our extensive experimental results
+show that the baseline MedDEL can achieve performance comparable to the original
+large dataset with only 5% of the data. Establishing such an open
+data-effective learning benchmark is crucial for the medical foundation model
+research community because it facilitates efficient data use, promotes
+collaborative breakthroughs, and fosters the development of cost-effective,
+scalable, and impactful healthcare solutions.
+
+
+
+
+
+ + ♻ ☆ Motion-compensated MR CINE reconstruction with reconstruction-driven + motion estimation + + +
+ In cardiac CINE, motion-compensated MR reconstruction (MCMR) is an effective +approach to address highly undersampled acquisitions by incorporating motion +information between frames. In this work, we propose a novel perspective for +addressing the MCMR problem and a more integrated and efficient solution to the +MCMR field. Contrary to state-of-the-art (SOTA) MCMR methods which break the +original problem into two sub-optimization problems, i.e. motion estimation and +reconstruction, we formulate this problem as a single entity with one single +optimization. Our approach is unique in that the motion estimation is directly +driven by the ultimate goal, reconstruction, but not by the canonical +motion-warping loss (similarity measurement between motion-warped images and +target images). We align the objectives of motion estimation and +reconstruction, eliminating the drawbacks of artifacts-affected motion +estimation and therefore error-propagated reconstruction. Further, we can +deliver high-quality reconstruction and realistic motion without applying any +regularization/smoothness loss terms, circumventing the non-trivial weighting +factor tuning. We evaluate our method on two datasets: 1) an in-house acquired +2D CINE dataset for the retrospective study and 2) the public OCMR cardiac +dataset for the prospective study. The conducted experiments indicate that the +proposed MCMR framework can deliver artifact-free motion estimation and +high-quality MR images even for imaging accelerations up to 20x, outperforming +SOTA non-MCMR and MCMR methods in both qualitative and quantitative evaluation +across all experiments. The code is available at +https://github.com/JZPeterPan/MCMR-Recon-Driven-Motion. + +
+
+
+
+
+ + ♻ ☆ Maximizing V-information for Pre-training Superior Foundation Models + + +
+ Pre-training foundation models on large-scale datasets demonstrates
+exceptional performance. However, recent research questions this traditional
+notion, exploring whether an increase in pre-training data always leads to
+enhanced model performance. To address this issue, data-effective learning
+approaches have been introduced. However, current methods in this area lack a
+clear standard for sample selection. Our experiments reveal that by maximizing
+V-information, sample selection can be framed as an optimization problem,
+enabling effective improvement in model performance even with fewer samples.
+Under this guidance, we develop an optimal data-effective learning method
+(OptiDEL) to maximize V-information. The OptiDEL method generates hard samples
+to achieve or even exceed the performance of models trained on the full dataset
+while using substantially less data. We compare the OptiDEL method with
+state-of-the-art approaches, finding that OptiDEL consistently outperforms
+existing approaches across different datasets, with foundation models trained
+on only 5% of the pre-training data surpassing the performance of those trained
+on the full dataset.
+
+
+
+
+
+ + ♻ ☆ Rethinking of Encoder-based Warm-start Methods in Hyperparameter + Optimization + + +
+ Effectively representing heterogeneous tabular datasets for meta-learning +purposes remains an open problem. Previous approaches rely on predefined +meta-features, for example, statistical measures or landmarkers. The emergence +of dataset encoders opens new possibilities for the extraction of meta-features +because they do not involve any handmade design. Moreover, they are proven to +generate dataset representations with desired spatial properties. In this +research, we evaluate an encoder-based approach to one of the most established +meta-tasks - warm-starting of the Bayesian Hyperparameter Optimization. To +broaden our analysis we introduce a new approach for representation learning on +tabular data based on [Tomoharu Iwata and Atsutoshi Kumagai. Meta-learning from +Tasks with Heterogeneous Attribute Spaces. In Advances in Neural Information +Processing Systems, 2020]. The validation on over 100 datasets from UCI and an +independent metaMIMIC set of datasets highlights the nuanced challenges in +representation learning. We show that general representations may not suffice +for some meta-tasks where requirements are not explicitly considered during +extraction. + +
+
+
+
+
+ + ♻ ☆ CollaFuse: Navigating Limited Resources and Privacy in Collaborative + Generative AI + + +
+ In the landscape of generative artificial intelligence, diffusion-based
+models present challenges for socio-technical systems in data requirements and
+privacy. Traditional approaches like federated learning distribute the learning
+process but strain individual clients, especially with constrained resources
+(e.g., edge devices). In response to these challenges, we introduce CollaFuse,
+a novel framework inspired by split learning. Tailored for efficient and
+collaborative use of denoising diffusion probabilistic models, CollaFuse
+enables shared server training and inference, alleviating client computational
+burdens. This is achieved by retaining data and computationally inexpensive GPU
+processes locally at each client while outsourcing the computationally
+expensive processes to the shared server. Demonstrated in a healthcare context,
+CollaFuse enhances privacy by greatly reducing the need for sensitive
+information sharing. These capabilities hold the potential to impact various
+application areas, such as the design of edge computing solutions, healthcare
+research, or autonomous driving. In essence, our work advances distributed
+machine learning, shaping the future of collaborative GenAI networks.
+
+
+ comment: Thirty-Second European Conference on Information Systems (ECIS 2024) +
+
+
+
+
+ + ♻ ☆ Active Sensing of Knee Osteoarthritis Progression with Reinforcement + Learning + + +
+ Osteoarthritis (OA) is the most common musculoskeletal disease, which has no +cure. Knee OA (KOA) is one of the highest causes of disability worldwide, and +it costs billions of United States dollars to the global community. Prediction +of KOA progression has been of high interest to the community for years, as it +can advance treatment development through more efficient clinical trials and +improve patient outcomes through more efficient healthcare utilization. +Existing approaches for predicting KOA, however, are predominantly static, i.e. +consider data from a single time point to predict progression many years into +the future, and knee level, i.e. consider progression in a single joint only. +Due to these and related reasons, these methods fail to deliver the level of +predictive performance, which is sufficient to result in cost savings and +better patient outcomes. Collecting extensive data from all patients on a +regular basis could address the issue, but it is limited by the high cost at a +population level. In this work, we propose to go beyond static prediction +models in OA, and bring a novel Active Sensing (AS) approach, designed to +dynamically follow up patients with the objective of maximizing the number of +informative data acquisitions, while minimizing their total cost over a period +of time. Our approach is based on Reinforcement Learning (RL), and it leverages +a novel reward function designed specifically for AS of disease progression in +more than one part of a human body. Our method is end-to-end, relies on +multi-modal Deep Learning, and requires no human input at inference time. +Throughout an exhaustive experimental evaluation, we show that using RL can +provide a higher monetary benefit when compared to state-of-the-art baselines. + +
+
+
+
+
+ + ♻ ☆ Model-agnostic variable importance for predictive uncertainty: an + entropy-based approach + + +
+ In order to trust the predictions of a machine learning algorithm, it is +necessary to understand the factors that contribute to those predictions. In +the case of probabilistic and uncertainty-aware models, it is necessary to +understand not only the reasons for the predictions themselves, but also the +reasons for the model's level of confidence in those predictions. In this +paper, we show how existing methods in explainability can be extended to +uncertainty-aware models and how such extensions can be used to understand the +sources of uncertainty in a model's predictive distribution. In particular, by +adapting permutation feature importance, partial dependence plots, and +individual conditional expectation plots, we demonstrate that novel insights +into model behaviour may be obtained and that these methods can be used to +measure the impact of features on both the entropy of the predictive +distribution and the log-likelihood of the ground truth labels under that +distribution. With experiments using both synthetic and real-world data, we +demonstrate the utility of these approaches to understand both the sources of +uncertainty and their impact on model performance. + +
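+ A sketch of one of the adapted tools: permutation feature importance measured on the mean entropy of the predictive distribution rather than on accuracy. `model` is assumed to expose a scikit-learn-style `predict_proba`; the repeat count is arbitrary.
+import numpy as np
+
+def mean_entropy(probs, eps=1e-12):
+    return float(np.mean(-np.sum(probs * np.log(probs + eps), axis=1)))
+
+def entropy_permutation_importance(model, X, n_repeats=5, seed=0):
+    rng = np.random.default_rng(seed)
+    base = mean_entropy(model.predict_proba(X))
+    importances = np.zeros(X.shape[1])
+    for j in range(X.shape[1]):
+        deltas = []
+        for _ in range(n_repeats):
+            Xp = X.copy()
+            Xp[:, j] = rng.permutation(Xp[:, j])   # break the feature's association
+            deltas.append(mean_entropy(model.predict_proba(Xp)) - base)
+        importances[j] = np.mean(deltas)           # change in predictive entropy
+    return importances
+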
+
+ comment: Data Mining and Knowledge Discovery. Springer +
+
+
+
+
+ + ♻ ☆ Efficient mapping of phase diagrams with conditional Boltzmann + Generators + + +
+ The accurate prediction of phase diagrams is of central importance for both +the fundamental understanding of materials as well as for technological +applications in material sciences. However, the computational prediction of the +relative stability between phases based on their free energy is a daunting +task, as traditional free energy estimators require a large amount of +simulation data to obtain uncorrelated equilibrium samples over a grid of +thermodynamic states. In this work, we develop deep generative machine learning +models based on the Boltzmann Generator approach for entire phase diagrams, +employing normalizing flows conditioned on the thermodynamic states, e.g., +temperature and pressure, that they map to. By training a single normalizing +flow to transform the equilibrium distribution sampled at only one reference +thermodynamic state to a wide range of target temperatures and pressures, we +can efficiently generate equilibrium samples across the entire phase diagram. +Using a permutation-equivariant architecture allows us, thereby, to treat solid +and liquid phases on the same footing. We demonstrate our approach by +predicting the solid-liquid coexistence line for a Lennard-Jones system in +excellent agreement with state-of-the-art free energy methods while +significantly reducing the number of energy evaluations needed. + +
+
+
+
+
+ + ♻ ☆ On the Overlooked Pitfalls of Weight Decay and How to Mitigate Them: A + Gradient-Norm Perspective NeurIPS 2023 + + +
+ Weight decay is a simple yet powerful regularization technique that has been
+widely used in training deep neural networks (DNNs). While weight decay
+has attracted much attention, previous studies have overlooked pitfalls
+related to the large gradient norms caused by weight decay. In this paper, we
+discover that weight decay can unfortunately lead to large gradient norms at
+the final phase (or the terminated solution) of training, which often indicates
+bad convergence and poor generalization. To mitigate these gradient-norm-centered
+pitfalls, we present the first practical scheduler for weight decay, called the
+Scheduled Weight Decay (SWD) method, which can dynamically adjust the weight
+decay strength according to the gradient norm and significantly penalize large
+gradient norms during training. Our experiments also show that SWD indeed
+mitigates large gradient norms and often significantly outperforms the
+conventional constant weight decay strategy for Adaptive Moment Estimation
+(Adam).
+
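+ One plausible gradient-norm-aware schedule, shown only to make the idea concrete (an assumption for illustration, not necessarily the paper's exact SWD rule): shrink the effective weight-decay coefficient when the current gradient norm exceeds its running average.
+import torch
+
+def scheduled_weight_decay_step(params, base_wd, grad_norm, avg_grad_norm, lr):
+    """Decoupled weight-decay step with a gradient-norm-dependent coefficient."""
+    wd_t = base_wd * avg_grad_norm / (grad_norm + 1e-12)
+    with torch.no_grad():
+        for p in params:
+            p.mul_(1.0 - lr * wd_t)   # shrink weights less when gradients are large... or more when small
+    return wd_t
+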
+
+ comment: NeurIPS 2023, 21 pages, 20 figures. Keywords: Weight Decay, + Regularization, Optimization, Deep Learning +
+
+
+
+
+ + ♻ ☆ Revisiting Score Function Estimators for $k$-Subset Sampling ICML 2024 + + +
+ Are score function estimators an underestimated approach to learning with +$k$-subset sampling? Sampling $k$-subsets is a fundamental operation in many +machine learning tasks that is not amenable to differentiable parametrization, +impeding gradient-based optimization. Prior work has focused on relaxed +sampling or pathwise gradient estimators. Inspired by the success of score +function estimators in variational inference and reinforcement learning, we +revisit them within the context of $k$-subset sampling. Specifically, we +demonstrate how to efficiently compute the $k$-subset distribution's score +function using a discrete Fourier transform, and reduce the estimator's +variance with control variates. The resulting estimator provides both exact +samples and unbiased gradient estimates while also applying to +non-differentiable downstream models, unlike existing methods. Experiments in +feature selection show results competitive with current methods, despite weaker +assumptions. + +
+
+ comment: ICML 2024 Workshop on Differentiable Almost Everything: + Differentiable Relaxations, Algorithms, Operators, and Simulators +
+
+
+
+
+ + ♻ ☆ MathBridge: A Large Corpus Dataset for Translating Spoken Mathematical + Expressions into $LaTeX$ Formulas for Improved Readability + + +
+ Improving the readability of mathematical expressions in text-based documents,
+such as the subtitles of mathematical videos, is a significant task. To achieve
+this, mathematical expressions should be converted to compiled formulas. For
+instance, the spoken expression ``x equals minus b plus or minus the square
+root of b squared minus four a c, all over two a'' from automatic speech
+recognition is more readily comprehensible when displayed as a compiled formula
+$x = \frac{-b \pm \sqrt{b^2 - 4ac}}{2a}$. To convert mathematical spoken
+sentences to compiled formulas, two processes are required: spoken sentences
+are converted into LaTeX formulas, and LaTeX formulas are converted into
+compiled formulas. The latter can be handled by LaTeX engines. However,
+there is no effective way to do the former. Even if we try to solve this
+using language models, there is no paired data of spoken sentences and
+LaTeX formulas with which to train them. In this paper, we introduce MathBridge, the first
+extensive dataset for translating mathematical spoken sentences into LaTeX
+formulas. MathBridge comprises approximately 23 million LaTeX formulas paired
+with the corresponding mathematical spoken sentences. Through comprehensive
+evaluations, including fine-tuning with the proposed data, we find that
+MathBridge significantly enhances the capabilities of pretrained language
+models for converting mathematical spoken sentences into LaTeX formulas.
+Specifically, for the T5-large model, the sacreBLEU score increased from 4.77
+to 46.8, demonstrating a substantial enhancement.
+
+
+ comment: 9 pages, 6 figures +
+
+
+
+
+ + ♻ ☆ Indirectly Parameterized Concrete Autoencoders ICML 2024 + + +
+ Feature selection is a crucial task in settings where data is +high-dimensional or acquiring the full set of features is costly. Recent +developments in neural network-based embedded feature selection show promising +results across a wide range of applications. Concrete Autoencoders (CAEs), +considered state-of-the-art in embedded feature selection, may struggle to +achieve stable joint optimization, hurting their training time and +generalization. In this work, we identify that this instability is correlated +with the CAE learning duplicate selections. To remedy this, we propose a simple +and effective improvement: Indirectly Parameterized CAEs (IP-CAEs). IP-CAEs +learn an embedding and a mapping from it to the Gumbel-Softmax distributions' +parameters. Despite being simple to implement, IP-CAE exhibits significant and +consistent improvements over CAE in both generalization and training time +across several datasets for reconstruction and classification. Unlike CAE, +IP-CAE effectively leverages non-linear relationships and does not require +retraining the jointly optimized decoder. Furthermore, our approach is, in +principle, generalizable to Gumbel-Softmax distributions beyond feature +selection. + +
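+ A sketch of the indirect parameterization idea: the Gumbel-Softmax logits of the k selection neurons are produced by a learned embedding passed through a linear map, rather than being free parameters. Sizes and temperature are arbitrary; this is not the authors' implementation.
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+class IndirectSelector(nn.Module):
+    def __init__(self, n_features, k, embed_dim=32):
+        super().__init__()
+        self.embedding = nn.Parameter(torch.randn(k, embed_dim))
+        self.to_logits = nn.Linear(embed_dim, n_features)
+
+    def forward(self, x, tau=1.0):
+        logits = self.to_logits(self.embedding)             # (k, n_features)
+        weights = F.gumbel_softmax(logits, tau=tau, hard=False)
+        return x @ weights.t()                              # (batch, k) selected features
+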
+
+ comment: ICML 2024 +
+
+
+
+
+ + ♻ ☆ Attacking Graph Neural Networks with Bit Flips: Weisfeiler and Lehman Go + Indifferent + + +
+ Prior attacks on graph neural networks have mostly focused on graph poisoning +and evasion, neglecting the network's weights and biases. Traditional +weight-based fault injection attacks, such as bit flip attacks used for +convolutional neural networks, do not consider the unique properties of graph +neural networks. We propose the Injectivity Bit Flip Attack, the first bit flip +attack designed specifically for graph neural networks. Our attack targets the +learnable neighborhood aggregation functions in quantized message passing +neural networks, degrading their ability to distinguish graph structures and +losing the expressivity of the Weisfeiler-Lehman test. Our findings suggest +that exploiting mathematical properties specific to certain graph neural +network architectures can significantly increase their vulnerability to bit +flip attacks. Injectivity Bit Flip Attacks can degrade the maximal expressive +Graph Isomorphism Networks trained on various graph property prediction +datasets to random output by flipping only a small fraction of the network's +bits, demonstrating its higher destructive power compared to a bit flip attack +transferred from convolutional neural networks. Our attack is transparent and +motivated by theoretical insights which are confirmed by extensive empirical +results. + +
+
+
+
+
+ + ♻ ☆ ExtremeCast: Boosting Extreme Value Prediction for Global Weather + Forecast + + +
+ Data-driven weather forecast based on machine learning (ML) has experienced +rapid development and demonstrated superior performance in the global +medium-range forecast compared to traditional physics-based dynamical models. +However, most of these ML models struggle with accurately predicting extreme +weather, which is related to training loss and the uncertainty of weather +systems. Through mathematical analysis, we prove that the use of symmetric +losses, such as the Mean Squared Error (MSE), leads to biased predictions and +underestimation of extreme values. To address this issue, we introduce Exloss, +a novel loss function that performs asymmetric optimization and highlights +extreme values to obtain accurate extreme weather forecast. Beyond the +evolution in training loss, we introduce a training-free extreme value +enhancement module named ExBooster, which captures the uncertainty in +prediction outcomes by employing multiple random samples, thereby increasing +the hit rate of low-probability extreme events. Combined with an advanced +global weather forecast model, extensive experiments show that our solution can +achieve state-of-the-art performance in extreme weather prediction, while +maintaining the overall forecast accuracy comparable to the top medium-range +forecast models. + +
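+ As a rough illustration of the asymmetric-optimization idea, the loss below
+penalizes under-prediction more heavily than over-prediction; it is a generic
+weighted squared error written for this summary, not the paper's Exloss, and
+the weight value is arbitrary.
+
+import torch
+
+def asymmetric_mse(pred: torch.Tensor, target: torch.Tensor,
+                   under_weight: float = 4.0) -> torch.Tensor:
+    err = pred - target
+    # Heavier penalty where the model under-predicts (e.g. misses an extreme peak).
+    weights = torch.where(err < 0, torch.full_like(err, under_weight),
+                          torch.ones_like(err))
+    return (weights * err ** 2).mean()
+
+print(asymmetric_mse(torch.tensor([0.9, 2.0]), torch.tensor([1.5, 1.8])))
+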
+
+
+
+
+ + ♻ ☆ DropKAN: Dropout Kolmogorov-Arnold Networks + + +
+ We propose DropKAN (Dropout Kolmogorov-Arnold Networks), a regularization
+method that prevents co-adaptation of activation function weights in
+Kolmogorov-Arnold Networks (KANs). DropKAN functions by embedding the drop mask
+directly within the KAN layer, randomly masking the outputs of some activations
+within the KANs' computation graph. We show that this simple procedure, which
+requires minimal coding effort, has a regularizing effect and consistently leads
+to better generalization of KANs. We analyze the adaptation of standard
+Dropout to KANs and demonstrate that Dropout applied to KANs' neurons can
+lead to unpredictable behavior in the feedforward pass. We carry out an empirical
+study with real-world machine learning datasets to validate our findings. Our
+results suggest that DropKAN is consistently a better alternative to
+standard Dropout with KANs, and improves the generalization performance of
+KANs. Our implementation of DropKAN is available at:
+\url{https://github.com/Ghaith81/dropkan}.
+
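+ The core operation is easy to state in code: during training, mask some
+activation outputs inside the layer and rescale the rest. The helper below is
+a minimal sketch of that masking step on a generic tensor of per-edge
+activation outputs; it does not depend on, or reproduce, any particular KAN
+implementation.
+
+import torch
+
+def dropkan_mask(activation_outputs: torch.Tensor, drop_rate: float,
+                 training: bool) -> torch.Tensor:
+    if not training or drop_rate == 0.0:
+        return activation_outputs
+    keep = (torch.rand_like(activation_outputs) > drop_rate).float()
+    return activation_outputs * keep / (1.0 - drop_rate)   # inverted-dropout rescaling
+
+acts = torch.randn(4, 16, 8)        # (batch, in_features, out_features) edge activations
+masked = dropkan_mask(acts, drop_rate=0.2, training=True)
+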
+
+
+
+
+ + ♻ ☆ Fair Sampling in Diffusion Models through Switching Mechanism AAAI 2024 + + +
+ Diffusion models have shown their effectiveness in generation tasks by +well-approximating the underlying probability distribution. However, diffusion +models are known to suffer from an amplified inherent bias from the training +data in terms of fairness. While the sampling process of diffusion models can +be controlled by conditional guidance, previous works have attempted to find +empirical guidance to achieve quantitative fairness. To address this +limitation, we propose a fairness-aware sampling method called +\textit{attribute switching} mechanism for diffusion models. Without additional +training, the proposed sampling can obfuscate sensitive attributes in generated +data without relying on classifiers. We mathematically prove and experimentally +demonstrate the effectiveness of the proposed method on two key aspects: (i) +the generation of fair data and (ii) the preservation of the utility of the +generated data. + +
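+ Conceptually, the sampler runs the usual reverse diffusion loop but swaps the
+conditioning attribute at a chosen transition step. The sketch below assumes a
+hypothetical denoise_step(x, t, attr) callable standing in for one conditional
+reverse step of a pretrained diffusion model; the dummy denoiser at the end
+only makes the sketch executable.
+
+import torch
+
+def sample_with_attribute_switch(denoise_step, x_T, attr_a, attr_b,
+                                 num_steps: int, switch_step: int):
+    x = x_T
+    for t in reversed(range(num_steps)):
+        attr = attr_a if t >= switch_step else attr_b   # switch the sensitive attribute
+        x = denoise_step(x, t, attr)
+    return x
+
+dummy = lambda x, t, attr: 0.99 * x
+sample = sample_with_attribute_switch(dummy, torch.randn(1, 3, 32, 32),
+                                      attr_a=0, attr_b=1,
+                                      num_steps=1000, switch_step=600)
+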
+
+ comment: AAAI 2024 +
+
+
+
+
+ + ♻ ☆ Handling Distribution Shifts on Graphs: An Invariance Perspective ICLR2022 + + +
+ There is increasing evidence suggesting neural networks' sensitivity to
+distribution shifts, so research on out-of-distribution (OOD) generalization
+comes into the spotlight. Nonetheless, current endeavors mostly focus on
+Euclidean data, and its formulation for graph-structured data is not clear and
+remains under-explored, given two-fold fundamental challenges: 1) the
+inter-connection among nodes in one graph, which induces non-IID generation of
+data points even under the same environment, and 2) the structural information
+in the input graph, which is also informative for prediction. In this paper, we
+formulate the OOD problem on graphs and develop a new invariant learning
+approach, Explore-to-Extrapolate Risk Minimization (EERM), that facilitates
+graph neural networks to leverage invariance principles for prediction. EERM
+resorts to multiple context explorers (specified as graph structure editors in
+our case) that are adversarially trained to maximize the variance of risks from
+multiple virtual environments. Such a design enables the model to extrapolate
+from a single observed environment, which is the common case for node-level
+prediction. We prove the validity of our method by theoretically showing its
+guarantee of a valid OOD solution and further demonstrate its power on various
+real-world datasets for handling distribution shifts from artificial spurious
+features, cross-domain transfers and dynamic graph evolution.
+
+
+ comment: ICLR2022, 30 pages +
+
+
+
+
+ + ♻ ☆ SGFormer: Simplifying and Empowering Transformers for Large-Graph + Representations NeurIPS 2023 + + +
+ Learning representations on large-sized graphs is a long-standing challenge
+due to the inter-dependence among massive data points.
+Transformers, as an emerging class of foundation encoders for graph-structured
+data, have shown promising performance on small graphs due to their global
+attention, which is capable of capturing all-pair influence beyond neighboring nodes.
+Even so, existing approaches tend to inherit the spirit of Transformers in
+language and vision tasks, and embrace complicated models by stacking deep
+multi-head attentions. In this paper, we critically demonstrate that even
+a one-layer attention can bring surprisingly competitive performance across
+node property prediction benchmarks where node numbers range from
+thousand-level to billion-level. This encourages us to rethink the design
+philosophy for Transformers on large graphs, where global attention is a
+computational overhead hindering scalability. We frame the proposed scheme as
+Simplified Graph Transformers (SGFormer), which is empowered by a simple
+attention model that can efficiently propagate information among arbitrary
+nodes in one layer. SGFormer requires no positional encodings,
+feature/graph pre-processing, or augmented losses. Empirically, SGFormer
+successfully scales to the web-scale graph ogbn-papers100M and yields up to
+141x inference acceleration over SOTA Transformers on medium-sized graphs.
+Beyond current results, we believe the proposed methodology alone enlightens a
+new technical path of independent interest for building Transformers on large
+graphs.
+
+
+ comment: Accepted to NeurIPS 2023, the codes are available at + https://github.com/qitianwu/SGFormer +
+
+
+
+
+ + ♻ ☆ Graph Out-of-Distribution Generalization via Causal Intervention WWW + + +
+ Out-of-distribution (OOD) generalization has gained increasing attention for
+learning on graphs, as graph neural networks (GNNs) often exhibit performance
+degradation with distribution shifts. The challenge is that distribution shifts
+on graphs involve intricate interconnections between nodes, and the environment
+labels are often absent in data. In this paper, we adopt a bottom-up
+data-generative perspective and reveal a key observation through causal
+analysis: the crux of GNNs' failure in OOD generalization lies in the latent
+confounding bias from the environment. The latter misguides the model to
+leverage environment-sensitive correlations between ego-graph features and
+target nodes' labels, resulting in undesirable generalization on new unseen
+nodes. Built upon this analysis, we introduce a conceptually simple yet
+principled approach for training robust GNNs under node-level distribution
+shifts, without prior knowledge of environment labels. Our method resorts to a
+new learning objective derived from causal inference that coordinates an
+environment estimator and a mixture-of-expert GNN predictor. The new approach
+can counteract the confounding bias in training data and facilitate learning
+generalizable predictive relations. Extensive experiments demonstrate that our
+model can effectively enhance generalization with various types of distribution
+shifts and yield up to 27.4\% accuracy improvement over state-of-the-art methods
+on graph OOD generalization benchmarks. Source codes are available at
+https://github.com/fannie1208/CaNet.
+
+
+ comment: Accepted by the research paper track of The Web Conference (WWW) + 2024. The codes are available at https://github.com/fannie1208/CaNet +
+
+
+
+
+ + ♻ ☆ Robust Neural Information Retrieval: An Adversarial and + Out-of-distribution Perspective + + +
+ Recent advances in neural information retrieval (IR) models have +significantly enhanced their effectiveness over various IR tasks. The +robustness of these models, essential for ensuring their reliability in +practice, has also garnered significant attention. With a wide array of +research on robust IR being proposed, we believe it is the opportune moment to +consolidate the current status, glean insights from existing methodologies, and +lay the groundwork for future development. We view the robustness of IR to be a +multifaceted concept, emphasizing its necessity against adversarial attacks, +out-of-distribution (OOD) scenarios and performance variance. With a focus on +adversarial and OOD robustness, we dissect robustness solutions for dense +retrieval models (DRMs) and neural ranking models (NRMs), respectively, +recognizing them as pivotal components of the neural IR pipeline. We provide an +in-depth discussion of existing methods, datasets, and evaluation metrics, +shedding light on challenges and future directions in the era of large language +models. To the best of our knowledge, this is the first comprehensive survey on +the robustness of neural IR models, and we will also be giving our first +tutorial presentation at SIGIR 2024 +\url{https://sigir2024-robust-information-retrieval.github.io}. Along with the +organization of existing work, we introduce a Benchmark for robust IR (BestIR), +a heterogeneous evaluation benchmark for robust neural information retrieval, +which is publicly available at \url{https://github.com/Davion-Liu/BestIR}. We +hope that this study provides useful clues for future research on the +robustness of IR models and helps to develop trustworthy search engines +\url{https://github.com/Davion-Liu/Awesome-Robustness-in-Information-Retrieval}. + +
+
+ comment: Survey paper +
+
+
+
+
+ + ♻ ☆ What Do Language Models Hear? Probing for Auditory Representations in + Language Models + + +
+ This work explores whether language models encode meaningfully grounded +representations of sounds of objects. We learn a linear probe that retrieves +the correct text representation of an object given a snippet of audio related +to that object, where the sound representation is given by a pretrained audio +model. This probe is trained via a contrastive loss that pushes the language +representations and sound representations of an object to be close to one +another. After training, the probe is tested on its ability to generalize to +objects that were not seen during training. Across different language models +and audio models, we find that the probe generalization is above chance in many +cases, indicating that despite being trained only on raw text, language models +encode grounded knowledge of sounds for some objects. + +
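+ A sketch of the probing setup under stated assumptions: the frozen text and
+audio features below are random stand-ins for the pretrained-model
+representations, and the probe is a linear map trained with an InfoNCE-style
+contrastive loss so that matched text/sound pairs sit on the diagonal of the
+similarity matrix.
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+text_feats = torch.randn(64, 768)    # placeholder language-model object representations
+audio_feats = torch.randn(64, 512)   # placeholder pretrained audio-model representations
+
+probe = nn.Linear(768, 512, bias=False)
+opt = torch.optim.Adam(probe.parameters(), lr=1e-3)
+
+for _ in range(100):
+    z_text = F.normalize(probe(text_feats), dim=-1)
+    z_audio = F.normalize(audio_feats, dim=-1)
+    logits = z_text @ z_audio.t() / 0.07        # text-to-sound similarities
+    labels = torch.arange(logits.size(0))
+    loss = F.cross_entropy(logits, labels)      # matched pairs are the targets
+    opt.zero_grad()
+    loss.backward()
+    opt.step()
+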
+
+
+
+
+ + ♻ ☆ Natural Language Interaction with a Household Electricity + Knowledge-based Digital Twin + + +
+ Domain-specific digital twins, representing a digital replica of various
+segments of the smart grid, are foreseen as able to model, simulate, and
+control the respective segments. At the same time, knowledge-based digital
+twins, coupled with AI, may also empower humans to understand aspects of the
+system through natural language interaction in view of planning and policy
+making. This paper is the first to assess and report on the potential of
+Retrieval Augmented Generation (RAG) question answering related to household
+electrical energy measurement aspects leveraging a knowledge-based energy
+digital twin. Relying on the recently published electricity consumption
+knowledge graph that actually represents a knowledge-based digital twin, we
+study the capabilities of ChatGPT, Gemini and Llama in answering electricity
+related questions. Furthermore, we compare the answers with the ones generated
+through a RAG technique that leverages an existing electricity knowledge-based
+digital twin. Our findings illustrate that the RAG approach not only reduces
+the incidence of incorrect information typically generated by LLMs but also
+significantly improves the quality of the output by grounding responses in
+verifiable data. This paper details our methodology, presents a comparative
+analysis of responses with and without RAG, and discusses the implications of
+our findings for future applications of AI in specialized sectors like energy
+data analysis.
+
+
+ comment: Accepted at IEEE SmartGridComm'24 +
+
+
+
+
+ + ♻ ☆ Reliable Generation of Privacy-preserving Synthetic EHR Time Series via + Diffusion Models + + +
+ Electronic Health Records (EHRs) are rich sources of patient-level data, +offering valuable resources for medical data analysis. However, privacy +concerns often restrict access to EHRs, hindering downstream analysis. Current +EHR de-identification methods are flawed and can lead to potential privacy +leakage. Additionally, existing publicly available EHR databases are limited, +preventing the advancement of medical research using EHR. This study aims to +overcome these challenges by generating realistic and privacy-preserving +synthetic electronic health records (EHRs) time series efficiently. We +introduce a new method for generating diverse and realistic synthetic EHR time +series data using Denoising Diffusion Probabilistic Models (DDPM). We conducted +experiments on six databases: Medical Information Mart for Intensive Care III +and IV (MIMIC-III/IV), the eICU Collaborative Research Database (eICU), and +non-EHR datasets on Stocks and Energy. We compared our proposed method with +eight existing methods. Our results demonstrate that our approach significantly +outperforms all existing methods in terms of data fidelity while requiring less +training effort. Additionally, data generated by our method yields a lower +discriminative accuracy compared to other baseline methods, indicating the +proposed method can generate data with less privacy risk. The proposed +diffusion-model-based method can reliably and efficiently generate synthetic +EHR time series, which facilitates the downstream medical data analysis. Our +numerical results show the superiority of the proposed method over all other +existing methods. + +
+
+
+
+
+ + ♻ ☆ Trading Devil Final: Backdoor attack via Stock market and Bayesian + Optimization + + +
+ Since the advent of generative artificial intelligence, every company and
+researcher has been rushing to develop their own generative models, whether
+commercial or not. Given the large number of users of these powerful new tools,
+there is currently no intrinsically verifiable way to explain from the ground
+up what happens when LLMs (large language models) learn. This holds, for
+example, for models based on automatic speech recognition systems, which have
+to rely on huge and astronomical amounts of data collected from all over the
+web to produce fast and efficient results. In this article, we develop a
+backdoor attack called MarketBackFinal 2.0, based on acoustic data poisoning;
+MarketBackFinal 2.0 mainly builds on modern stock market models. Our goal is to
+show the possible vulnerabilities of speech-based transformers that may rely on
+LLMs.
+
+
+ comment: END (will never be modified again) :Jumps-Diffusion and stock market: + Better quantify uncertainty in financial simulations +
+
+
+
+
+ + ♻ ☆ Data-driven identification of latent port-Hamiltonian systems + + +
+ Conventional physics-based modeling techniques involve high effort, e.g., +time and expert knowledge, while data-driven methods often lack +interpretability, structure, and sometimes reliability. To mitigate this, we +present a data-driven system identification framework that derives models in +the port-Hamiltonian (pH) formulation. This formulation is suitable for +multi-physical systems while guaranteeing the useful system theoretical +properties of passivity and stability. Our framework combines linear and +nonlinear reduction with structured, physics-motivated system identification. +In this process, high-dimensional state data obtained from possibly nonlinear +systems serves as input for an autoencoder, which then performs two tasks: (i) +nonlinearly transforming and (ii) reducing this data onto a low-dimensional +latent space. In this space, a linear pH system, that satisfies the pH +properties per construction, is parameterized by the weights of a neural +network. The mathematical requirements are met by defining the pH matrices +through Cholesky factorizations. The neural networks that define the coordinate +transformation and the pH system are identified in a joint optimization process +to match the dynamics observed in the data while defining a linear pH system in +the latent space. The learned, low-dimensional pH system can describe even +nonlinear systems and is rapidly computable due to its small size. The method +is exemplified by a parametric mass-spring-damper and a nonlinear pendulum +example, as well as the high-dimensional model of a disc brake with linear +thermoelastic behavior. + +
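+ The structural trick is worth spelling out: a linear port-Hamiltonian system
+dz/dt = (J - R) Q z + B u needs J skew-symmetric and R, Q positive
+semidefinite, and parameterizing R and Q through factors L L^T (and J through
+A - A^T) satisfies this by construction. The module below is a minimal sketch
+of that parameterization with assumed notation, not the paper's full
+autoencoder pipeline.
+
+import torch
+import torch.nn as nn
+
+class LatentPHSystem(nn.Module):
+    def __init__(self, dim: int, input_dim: int):
+        super().__init__()
+        self.A = nn.Parameter(0.1 * torch.randn(dim, dim))    # J = A - A^T (skew-symmetric)
+        self.LR = nn.Parameter(0.1 * torch.randn(dim, dim))   # R = LR LR^T (PSD)
+        self.LQ = nn.Parameter(0.1 * torch.randn(dim, dim))   # Q = LQ LQ^T (PSD)
+        self.B = nn.Parameter(0.1 * torch.randn(dim, input_dim))
+
+    def forward(self, z: torch.Tensor, u: torch.Tensor) -> torch.Tensor:
+        J = self.A - self.A.t()
+        R = self.LR @ self.LR.t()
+        Q = self.LQ @ self.LQ.t()
+        return z @ ((J - R) @ Q).t() + u @ self.B.t()   # batched dz/dt
+
+system = LatentPHSystem(dim=6, input_dim=2)
+dz = system(torch.randn(10, 6), torch.randn(10, 2))
+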
+
+ comment: 33 pages, 8 figures +
+
+
+
+
+ + ♻ ☆ AirPilot: A PPO-based DRL Auto-Tuned Nonlinear PID Drone Controller for + Robust Autonomous Flights + + +
+ Navigation precision, speed and stability are crucial for safe UAV flight
+maneuvers and effective flight mission executions in dynamic environments.
+Different flight missions may have varying objectives, such as minimizing
+energy consumption, achieving precise positioning, or maximizing speed. A
+controller that can adapt to different objectives on the fly is highly
+valuable. Proportional Integral Derivative (PID) controllers are among the most
+popular and widely used control algorithms for drone control systems, but
+their linear control algorithm fails to capture the nonlinear nature of the
+dynamic wind conditions and complex drone system. Manually tuning the PID gains
+for various missions can be time-consuming and requires significant expertise.
+This paper aims to revolutionize drone flight control by presenting
+AirPilot, a nonlinear Deep Reinforcement Learning (DRL)-enhanced PID drone
+controller using Proximal Policy Optimization. The AirPilot controller combines
+the simplicity and effectiveness of traditional PID control with the
+adaptability, learning capability, and optimization potential of DRL. This
+makes it better suited for modern drone applications where the environment is
+dynamic, and mission-specific performance demands are high. We employed a COEX
+Clover autonomous drone for training the DRL agent within the Gazebo simulator
+and subsequently implemented it in a real-world lab setting, which marks a
+significant milestone as one of the first attempts to apply a DRL-based flight
+controller on an actual drone. AirPilot is capable of reducing the navigation
+error by more than 82% and improving overshoot, speed and settling time
+significantly.
+
+
+ comment: 14 pages, 17 figures +
+
+
+
+
+ + ♻ ☆ Large Language Models for Code: Security Hardening and Adversarial + Testing CCS 2023 + + +
+ Large language models (large LMs) are increasingly trained on massive
+codebases and used to generate code. However, LMs lack awareness of security
+and are found to frequently produce unsafe code. This work studies the security
+of LMs along two important axes: (i) security hardening, which aims to enhance
+LMs' reliability in generating secure code, and (ii) adversarial testing, which
+seeks to evaluate LMs' security from an adversarial standpoint. We address both
+of these by formulating a new security task called controlled code generation.
+The task is parametric and takes as input a binary property to guide the LM to
+generate secure or unsafe code, while preserving the LM's capability of
+generating functionally correct code. We propose a novel learning-based
+approach called SVEN to solve this task. SVEN leverages property-specific
+continuous vectors to guide program generation towards the given property,
+without modifying the LM's weights. Our training procedure optimizes these
+continuous vectors by enforcing specialized loss terms on different regions of
+code, using a high-quality dataset carefully curated by us. Our extensive
+evaluation shows that SVEN is highly effective in achieving strong security
+control. For instance, a state-of-the-art CodeGen LM with 2.7B parameters
+generates secure code 59.1% of the time. When we employ SVEN to perform
+security hardening (or adversarial testing) on this LM, the ratio is
+significantly boosted to 92.3% (or degraded to 36.8%). Importantly, SVEN
+closely matches the original LMs in functional correctness.
+
+
+ comment: Accepted to ACM CCS 2023 +
+
+
+
+
+ + ♻ ☆ Spectral Clustering for Discrete Distributions + + +
+ The discrete distribution is often used to describe complex instances in +machine learning, such as images, sequences, and documents. Traditionally, +clustering of discrete distributions (D2C) has been approached using +Wasserstein barycenter methods. These methods operate under the assumption that +clusters can be well-represented by barycenters, which is seldom true in many +real-world applications. Additionally, these methods are not scalable for large +datasets due to the high computational cost of calculating Wasserstein +barycenters. In this work, we explore the feasibility of using spectral +clustering combined with distribution affinity measures (e.g., maximum mean +discrepancy and Wasserstein distance) to cluster discrete distributions. We +demonstrate that these methods can be more accurate and efficient than +barycenter methods. To further enhance scalability, we propose using linear +optimal transport to construct affinity matrices efficiently for large +datasets. We provide theoretical guarantees for the success of our methods in +clustering distributions. Experiments on both synthetic and real data show that +our methods outperform existing baselines. + +
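+ A hedged sketch of the pipeline: build an affinity matrix between
+distributions (here, sample sets) from an RBF-kernel MMD estimate and hand it
+to spectral clustering with a precomputed affinity. It illustrates the idea
+only; the paper's estimators, kernel choices, and the linear optimal-transport
+speedup are not reproduced.
+
+import numpy as np
+from sklearn.cluster import SpectralClustering
+from sklearn.metrics.pairwise import rbf_kernel
+
+def mmd2(X: np.ndarray, Y: np.ndarray, gamma: float = 1.0) -> float:
+    # Biased squared-MMD estimate with an RBF kernel.
+    return (rbf_kernel(X, X, gamma).mean() + rbf_kernel(Y, Y, gamma).mean()
+            - 2.0 * rbf_kernel(X, Y, gamma).mean())
+
+rng = np.random.default_rng(0)
+distributions = [rng.normal(shift, 1.0, size=(50, 2)) for shift in (0, 0, 5, 5)]
+
+n = len(distributions)
+affinity = np.zeros((n, n))
+for i in range(n):
+    for j in range(n):
+        affinity[i, j] = np.exp(-mmd2(distributions[i], distributions[j]))
+
+labels = SpectralClustering(n_clusters=2, affinity="precomputed").fit_predict(affinity)
+print(labels)   # two groups of two distributions each
+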
+
+
+
+
+ + ♻ ☆ Implicit Causal Representation Learning via Switchable Mechanisms + + +
+ Learning causal representations from observational and interventional data in
+the absence of known ground-truth graph structures necessitates implicit latent
+causal representation learning. Implicit learning of causal mechanisms
+typically involves two categories of interventional data: hard and soft
+interventions. In real-world scenarios, soft interventions are often more
+realistic than hard interventions, as the latter require fully controlled
+environments. Unlike hard interventions, which directly force changes in a
+causal variable, soft interventions exert influence indirectly by affecting the
+causal mechanism. However, the subtlety of soft interventions imposes several
+challenges for learning causal models. One challenge is that soft
+interventions' effects are ambiguous, since parental relations remain intact.
+In this paper, we tackle the challenges of learning causal models using soft
+interventions while retaining implicit modelling. We propose ICLR-SM, which
+models the effects of soft interventions by employing a causal mechanism switch
+variable designed to toggle between different causal mechanisms. In our
+experiments, we consistently observe improved learning of identifiable, causal
+representations, compared to baseline approaches.
+
+
+
+
+
+ + ♻ ☆ Confronting the Reproducibility Crisis: A Case Study of Challenges in + Cybersecurity AI + + +
+ In the rapidly evolving field of cybersecurity, ensuring the reproducibility +of AI-driven research is critical to maintaining the reliability and integrity +of security systems. This paper addresses the reproducibility crisis within the +domain of adversarial robustness -- a key area in AI-based cybersecurity that +focuses on defending deep neural networks against malicious perturbations. +Through a detailed case study, we attempt to validate results from prior work +on certified robustness using the VeriGauge toolkit, revealing significant +challenges due to software and hardware incompatibilities, version conflicts, +and obsolescence. Our findings underscore the urgent need for standardized +methodologies, containerization, and comprehensive documentation to ensure the +reproducibility of AI models deployed in critical cybersecurity applications. +By tackling these reproducibility challenges, we aim to contribute to the +broader discourse on securing AI systems against advanced persistent threats, +enhancing network and IoT security, and protecting critical infrastructure. +This work advocates for a concerted effort within the research community to +prioritize reproducibility, thereby strengthening the foundation upon which +future cybersecurity advancements are built. + +
+
+ comment: 8 pages, 0 figures, 2 tables, updated to incorporate feedback and + improvements +
+
+
+
+
+ + ♻ ☆ Large Language Model Aided QoS Prediction for Service Recommendation + + +
+ Large language models (LLMs) have seen rapid improvement in recent years,
+and have been used in a wider range of applications. After being trained on
+large text corpora, LLMs obtain the capability of extracting rich features from
+textual data. Such capability is potentially useful for the web service
+recommendation task, where the web users and services have intrinsic attributes
+that can be described using natural language sentences and are useful for
+recommendation. In this paper, we explore the possibility and practicality of
+using LLMs for web service recommendation. We propose the large language model
+aided QoS prediction (llmQoS) model, which uses LLMs to extract useful
+information from attributes of web users and services via descriptive
+sentences. This information is then used in combination with the QoS values of
+historical interactions of users and services, to predict QoS values for any
+given user-service pair. On the WSDream dataset, llmQoS is shown to overcome
+the data sparsity issue inherent to the QoS prediction problem, and outperforms
+comparable baseline models consistently.
+
+
+
+
+
+ + ♻ ☆ Space Group Informed Transformer for Crystalline Materials Generation + + +
+ We introduce CrystalFormer, a transformer-based autoregressive model
+specifically designed for space group-controlled generation of crystalline
+materials. The incorporation of space group symmetry significantly simplifies
+the crystal space, which is crucial for data- and compute-efficient generative
+modeling of crystalline materials. Leveraging the prominent discrete and
+sequential nature of the Wyckoff positions, CrystalFormer learns to generate
+crystals by directly predicting the species and locations of
+symmetry-inequivalent atoms in the unit cell. We demonstrate the advantages of
+CrystalFormer in standard tasks such as symmetric structure initialization and
+element substitution compared to conventional methods implemented in popular
+crystal structure prediction software. Moreover, we showcase the application of
+CrystalFormer to property-guided materials design in a plug-and-play manner.
+Our analysis shows that CrystalFormer ingests sensible solid-state chemistry
+knowledge and heuristics by compressing the material dataset, thus enabling
+systematic exploration of crystalline materials. The simplicity, generality,
+and flexibility of CrystalFormer position it as a promising architecture to be
+the foundational model of the entire crystalline materials space, heralding a
+new era in materials modeling and discovery.
+
+
+ comment: 26 pages, 11 figures +
+
+
+
+
+ + ♻ ☆ RC-Mixup: A Data Augmentation Strategy against Noisy Data for Regression + Tasks KDD 2024 + + +
+ We study the problem of robust data augmentation for regression tasks in the +presence of noisy data. Data augmentation is essential for generalizing deep +learning models, but most of the techniques like the popular Mixup are +primarily designed for classification tasks on image data. Recently, there are +also Mixup techniques that are specialized to regression tasks like C-Mixup. In +comparison to Mixup, which takes linear interpolations of pairs of samples, +C-Mixup is more selective in which samples to mix based on their label +distances for better regression performance. However, C-Mixup does not +distinguish noisy versus clean samples, which can be problematic when mixing +and lead to suboptimal model performance. At the same time, robust training has +been heavily studied where the goal is to train accurate models against noisy +data through multiple rounds of model training. We thus propose our data +augmentation strategy RC-Mixup, which tightly integrates C-Mixup with +multi-round robust training methods for a synergistic effect. In particular, +C-Mixup improves robust training in identifying clean data, while robust +training provides cleaner data to C-Mixup for it to perform better. A key +advantage of RC-Mixup is that it is data-centric where the robust model +training algorithm itself does not need to be modified, but can simply benefit +from data mixing. We show in our experiments that RC-Mixup significantly +outperforms C-Mixup and robust training baselines on noisy data benchmarks and +can be integrated with various robust training methods. + +
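+ For readers unfamiliar with C-Mixup, the step that RC-Mixup builds on looks
+roughly like the sketch below: each example is mixed with a partner sampled
+with probability decaying in label distance, using a Beta-drawn interpolation
+weight. The multi-round robust-training wrapper that RC-Mixup adds is omitted,
+and the bandwidth/alpha values are arbitrary.
+
+import numpy as np
+
+def c_mixup_batch(X: np.ndarray, y: np.ndarray,
+                  bandwidth: float = 1.0, alpha: float = 2.0):
+    n = len(y)
+    mixed_X, mixed_y = np.empty_like(X), np.empty_like(y)
+    for i in range(n):
+        probs = np.exp(-((y - y[i]) ** 2) / (2.0 * bandwidth ** 2))
+        probs /= probs.sum()
+        j = np.random.choice(n, p=probs)        # partner with a nearby label
+        lam = np.random.beta(alpha, alpha)
+        mixed_X[i] = lam * X[i] + (1.0 - lam) * X[j]
+        mixed_y[i] = lam * y[i] + (1.0 - lam) * y[j]
+    return mixed_X, mixed_y
+
+X, y = np.random.randn(32, 5), np.random.randn(32)
+X_mix, y_mix = c_mixup_batch(X, y)
+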
+
+ comment: Accepted to KDD 2024 +
+
+
+
+
+ + ♻ ☆ BiLO: Bilevel Local Operator Learning for PDE inverse problems + + +
+ We propose a new neural network based method for solving inverse problems for +partial differential equations (PDEs) by formulating the PDE inverse problem as +a bilevel optimization problem. At the upper level, we minimize the data loss +with respect to the PDE parameters. At the lower level, we train a neural +network to locally approximate the PDE solution operator in the neighborhood of +a given set of PDE parameters, which enables an accurate approximation of the +descent direction for the upper level optimization problem. The lower level +loss function includes the L2 norms of both the residual and its derivative +with respect to the PDE parameters. We apply gradient descent simultaneously on +both the upper and lower level optimization problems, leading to an effective +and fast algorithm. The method, which we refer to as BiLO (Bilevel Local +Operator learning), is also able to efficiently infer unknown functions in the +PDEs through the introduction of an auxiliary variable. Through extensive +experiments over multiple PDE systems, we demonstrate that our method enforces +strong PDE constraints, is robust to sparse and noisy data, and eliminates the +need to balance the residual and the data loss, which is inherent to the soft +PDE constraints in many existing methods. + +
+
+
+
+
+ + ♻ ☆ Multistatic-Radar RCS-Signature Recognition of Aerial Vehicles: A + Bayesian Fusion Approach + + +
+ Radar Automated Target Recognition (RATR) for Unmanned Aerial Vehicles (UAVs) +involves transmitting Electromagnetic Waves (EMWs) and performing target type +recognition on the received radar echo, crucial for defense and aerospace +applications. Previous studies highlighted the advantages of multistatic radar +configurations over monostatic ones in RATR. However, fusion methods in +multistatic radar configurations often suboptimally combine classification +vectors from individual radars probabilistically. To address this, we propose a +fully Bayesian RATR framework employing Optimal Bayesian Fusion (OBF) to +aggregate classification probability vectors from multiple radars. OBF, based +on expected 0-1 loss, updates a Recursive Bayesian Classification (RBC) +posterior distribution for target UAV type, conditioned on historical +observations across multiple time steps. We evaluate the approach using +simulated random walk trajectories for seven drones, correlating target aspect +angles to Radar Cross Section (RCS) measurements in an anechoic chamber. +Comparing against single radar Automated Target Recognition (ATR) systems and +suboptimal fusion methods, our empirical results demonstrate that the OBF +method integrated with RBC significantly enhances classification accuracy +compared to other fusion methods and single radar configurations. + +
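+ The recursive part of the pipeline can be sketched simply: per-radar
+classification probability vectors act as likelihoods that are multiplied into
+a running posterior over target types and renormalized at each time step. This
+is an assumed, simplified reading; the expected-0-1-loss OBF weighting from the
+paper is not reproduced.
+
+import numpy as np
+
+def recursive_bayes_fusion(per_step_radar_probs, num_classes: int) -> np.ndarray:
+    posterior = np.full(num_classes, 1.0 / num_classes)    # uniform prior over UAV types
+    for radar_probs in per_step_radar_probs:               # one list of vectors per time step
+        for p in radar_probs:
+            posterior = posterior * p
+        posterior /= posterior.sum()
+    return posterior
+
+step1 = [np.array([0.6, 0.3, 0.1]), np.array([0.5, 0.4, 0.1])]
+step2 = [np.array([0.7, 0.2, 0.1]), np.array([0.6, 0.3, 0.1])]
+print(recursive_bayes_fusion([step1, step2], num_classes=3))
+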
+
+ comment: Accepted to IEEE Transactions on Aerospace and Electronic Systems +
+
+
+
+
+ + ♻ ☆ Personalized Predictions of Glioblastoma Infiltration: Mathematical + Models, Physics-Informed Neural Networks and Multimodal Scans + + +
+ Predicting the infiltration of Glioblastoma (GBM) from medical MRI scans is
+crucial for understanding tumor growth dynamics and designing personalized
+radiotherapy treatment plans. Mathematical models of GBM growth can complement
+the data in the prediction of spatial distributions of tumor cells. However,
+this requires estimating patient-specific parameters of the model from clinical
+data, which is a challenging inverse problem due to limited temporal data and
+the limited time between imaging and diagnosis. This work proposes a method
+that uses Physics-Informed Neural Networks (PINNs) to estimate patient-specific
+parameters of a reaction-diffusion PDE model of GBM growth from a single 3D
+structural MRI snapshot. PINNs embed both the data and the PDE into a loss
+function, thus integrating theory and data. Key innovations include the
+identification and estimation of characteristic non-dimensional parameters, a
+pre-training step that utilizes the non-dimensional parameters, and a
+fine-tuning step to determine the patient-specific parameters. Additionally,
+the diffuse domain method is employed to handle the complex brain geometry
+within the PINN framework. Our method is validated both on synthetic and
+patient datasets, and shows promise for real-time parametric inference in the
+clinical setting for personalized GBM treatment.
+
+
+
+
+
+ + ♻ ☆ Activations Through Extensions: A Framework To Boost Performance Of + Neural Networks + + +
+ Activation functions are non-linearities in neural networks that allow them +to learn complex mapping between inputs and outputs. Typical choices for +activation functions are ReLU, Tanh, Sigmoid etc., where the choice generally +depends on the application domain. In this work, we propose a +framework/strategy that unifies several works on activation functions and +theoretically explains the performance benefits of these works. We also propose +novel techniques that originate from the framework and allow us to obtain +``extensions'' (i.e. special generalizations of a given neural network) of +neural networks through operations on activation functions. We theoretically +and empirically show that ``extensions'' of neural networks have performance +benefits compared to vanilla neural networks with insignificant space and time +complexity costs on standard test functions. We also show the benefits of +neural network ``extensions'' in the time-series domain on real-world datasets. + +
+
+
+
+
+ + ♻ ☆ Enhancing Accuracy in Generative Models via Knowledge Transfer + + +
+ This paper investigates the accuracy of generative models and the impact of +knowledge transfer on their generation precision. Specifically, we examine a +generative model for a target task, fine-tuned using a pre-trained model from a +source task. Building on the "Shared Embedding" concept, which bridges the +source and target tasks, we introduce a novel framework for transfer learning +under distribution metrics such as the Kullback-Leibler divergence. This +framework underscores the importance of leveraging inherent similarities +between diverse tasks despite their distinct data distributions. Our theory +suggests that the shared structures can augment the generation accuracy for a +target task, reliant on the capability of a source model to identify shared +structures and effective knowledge transfer from source to target learning. To +demonstrate the practical utility of this framework, we explore the theoretical +implications for two specific generative models: diffusion and normalizing +flows. The results show enhanced performance in both models over their +non-transfer counterparts, indicating advancements for diffusion models and +providing fresh insights into normalizing flows in transfer and non-transfer +settings. These results highlight the significant contribution of knowledge +transfer in boosting the generation capabilities of these models. + +
+
+
+
+
+ + ♻ ☆ Characterizing and Understanding HGNN Training on GPUs + + +
+ Owing to their remarkable representation capabilities for heterogeneous graph +data, Heterogeneous Graph Neural Networks (HGNNs) have been widely adopted in +many critical real-world domains such as recommendation systems and medical +analysis. Prior to their practical application, identifying the optimal HGNN +model parameters tailored to specific tasks through extensive training is a +time-consuming and costly process. To enhance the efficiency of HGNN training, +it is essential to characterize and analyze the execution semantics and +patterns within the training process to identify performance bottlenecks. In +this study, we conduct an in-depth quantification and analysis of two +mainstream HGNN training scenarios, including single-GPU and multi-GPU +distributed training. Based on the characterization results, we disclose the +performance bottlenecks and their underlying causes in different HGNN training +scenarios and provide optimization guidelines from both software and hardware +perspectives. + +
+
+ comment: 23 pages, 14 figures, submitted to ACM TACO +
+
+
+
+
+ + ♻ ☆ A Survey of Meta-Reinforcement Learning + + +
+ While deep reinforcement learning (RL) has fueled multiple high-profile +successes in machine learning, it is held back from more widespread adoption by +its often poor data efficiency and the limited generality of the policies it +produces. A promising approach for alleviating these limitations is to cast the +development of better RL algorithms as a machine learning problem itself in a +process called meta-RL. Meta-RL is most commonly studied in a problem setting +where, given a distribution of tasks, the goal is to learn a policy that is +capable of adapting to any new task from the task distribution with as little +data as possible. In this survey, we describe the meta-RL problem setting in +detail as well as its major variations. We discuss how, at a high level, +meta-RL research can be clustered based on the presence of a task distribution +and the learning budget available for each individual task. Using these +clusters, we then survey meta-RL algorithms and applications. We conclude by +presenting the open problems on the path to making meta-RL part of the standard +toolbox for a deep RL practitioner. + +
+
+
+
+
+ + ♻ ☆ Open-Source Molecular Processing Pipeline for Generating Molecules + + +
+ Generative models for molecules have shown considerable promise for use in +computational chemistry, but remain difficult to use for non-experts. For this +reason, we introduce open-source infrastructure for easily building generative +molecular models into the widely used DeepChem [Ramsundar et al., 2019] library +with the aim of creating a robust and reusable molecular generation pipeline. +In particular, we add high quality PyTorch [Paszke et al., 2019] +implementations of the Molecular Generative Adversarial Networks (MolGAN) [Cao +and Kipf, 2022] and Normalizing Flows [Papamakarios et al., 2021]. Our +implementations show strong performance comparable with past work [Kuznetsov +and Polykovskiy, 2021, Cao and Kipf, 2022]. + +
+
+ comment: Presented at the 2024 Molecular Machine Learning Conference (MoML + 2024) +
+
+
+
+
+
+
+
+ + Multimedia 2 + +
+
+
+ + ☆ Scaling up Multimodal Pre-training for Sign Language Understanding + + +
+ Sign language serves as the primary means of communication for the
+deaf-mute community. Different from spoken language, it commonly conveys
+information by the collaboration of manual features, i.e., hand gestures and
+body movements, and non-manual features, i.e., facial expressions and mouth
+cues. To facilitate communication between the deaf-mute and hearing people, a
+series of sign language understanding (SLU) tasks have been studied in recent
+years, including isolated/continuous sign language recognition (ISLR/CSLR),
+gloss-free sign language translation (GF-SLT) and sign language retrieval
+(SL-RT). Sign language recognition and translation aim to understand the
+semantic meaning conveyed by sign languages at the gloss level and the
+sentence level, respectively. In contrast, SL-RT focuses on retrieving sign
+videos or corresponding texts from a closed set under the query-by-example
+search paradigm. These tasks investigate sign language topics from diverse
+perspectives and raise challenges in learning effective representations of sign
+language videos. To advance the development of sign language understanding,
+exploring a generalized model that is applicable across various SLU tasks is a
+profound research direction.
+
+
+ comment: Sign language recognition; Sign language translation; Sign language + retrieval +
+
+
+
+
+ + ♻ ☆ HeadsetOff: Enabling Photorealistic Video Conferencing on Economical VR + Headsets + + +
+ Virtual Reality (VR) has become increasingly popular for remote +collaboration, but video conferencing poses challenges when the user's face is +covered by the headset. Existing solutions have limitations in terms of +accessibility. In this paper, we propose HeadsetOff, a novel system that +achieves photorealistic video conferencing on economical VR headsets by +leveraging voice-driven face reconstruction. HeadsetOff consists of three main +components: a multimodal predictor, a generator, and an adaptive controller. +The predictor effectively predicts user future behavior based on different +modalities. The generator employs voice, head motion, and eye blink to animate +the human face. The adaptive controller dynamically selects the appropriate +generator model based on the trade-off between video quality and delay. +Experimental results demonstrate the effectiveness of HeadsetOff in achieving +high-quality, low-latency video conferencing on economical VR headsets. + +
+
+ comment: Accepted by ACM Multimedia 2024 +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computation and Language 73 + +
+
+
+ + ☆ Can Large Language Models Understand Symbolic Graphics Programs? + + +
+ Assessing the capabilities of large language models (LLMs) is often
+challenging, in part, because it is hard to find tasks to which they have not
+been exposed during training. We take one step to address this challenge by
+turning to a new task: focusing on symbolic graphics programs, which are a
+popular representation for graphics content that procedurally generates visual
+data. LLMs have shown exciting promise towards program synthesis, but do they
+understand symbolic graphics programs? Unlike conventional programs, symbolic
+graphics programs can be translated to graphics content. Here, we characterize
+an LLM's understanding of symbolic programs in terms of its ability to answer
+questions related to the graphics content. This task is challenging as the
+questions are difficult to answer from the symbolic programs alone -- yet, they
+would be easy to answer from the corresponding graphics content as we verify
+through a human experiment. To understand symbolic programs, LLMs may need to
+possess the ability to imagine how the corresponding graphics content would
+look without directly accessing the rendered visual content. We use this task
+to evaluate LLMs by creating a large benchmark for the semantic understanding
+of symbolic graphics programs. This benchmark is built via program-graphics
+correspondence, hence requiring minimal human effort. We evaluate current LLMs
+on our benchmark to provide a preliminary assessment of their ability to
+reason about visual scenes from programs. We find that this task distinguishes
+existing LLMs, and models considered good at reasoning perform better. Lastly,
+we introduce Symbolic Instruction Tuning (SIT) to improve this ability.
+Specifically, we query GPT4-o with questions and images generated by symbolic
+programs. Such data are then used to finetune an LLM. We also find that SIT
+data can improve the general instruction-following ability of LLMs.
+
+
+ comment: Technical Report v1 (44 pages, 23 figures, project page: + https://sgp-bench.github.io/) +
+
+
+
+
+ + ☆ ScalingFilter: Assessing Data Quality through Inverse Utilization of + Scaling Laws + + +
+ High-quality data is crucial for the pre-training performance of large
+language models. Unfortunately, existing quality filtering methods rely on a
+known high-quality dataset as reference, which can introduce potential bias and
+compromise diversity. In this paper, we propose ScalingFilter, a novel approach
+that evaluates text quality based on the perplexity difference between two
+language models trained on the same data, thereby eliminating the influence of
+the reference dataset in the filtering process. A theoretical analysis shows
+that ScalingFilter is equivalent to an inverse utilization of scaling laws.
+Through training models with 1.3B parameters on the same data source processed
+by various quality filters, we find ScalingFilter can improve zero-shot
+performance of pre-trained models in downstream tasks. To assess the bias
+introduced by quality filtering, we introduce semantic diversity, a metric
+that utilizes text embedding models for semantic representations. Extensive
+experiments reveal that semantic diversity is a reliable indicator of dataset
+diversity, and ScalingFilter achieves an optimal balance between downstream
+performance and semantic diversity.
+
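+ To make the scoring mechanics concrete: quality is judged from the perplexity
+gap between a smaller and a larger language model on the same text. The paper
+trains its own model pair on identical data; the two off-the-shelf GPT-2
+checkpoints below are stand-ins used purely to show the computation, and the
+difference-based score is a simplification of the paper's metric.
+
+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+def perplexity(model, tokenizer, text: str) -> float:
+    enc = tokenizer(text, return_tensors="pt")
+    with torch.no_grad():
+        loss = model(**enc, labels=enc["input_ids"]).loss
+    return float(torch.exp(loss))
+
+tok = AutoTokenizer.from_pretrained("gpt2")
+small = AutoModelForCausalLM.from_pretrained("gpt2")
+large = AutoModelForCausalLM.from_pretrained("gpt2-medium")
+
+text = "The mitochondrion is the powerhouse of the cell."
+score = perplexity(small, tok, text) - perplexity(large, tok, text)
+print(score)   # larger gap ~ text whose quality a bigger model exploits
+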
+
+
+
+
+ + ☆ Benchmarking the Capabilities of Large Language Models in Transportation + System Engineering: Accuracy, Consistency, and Reasoning Behaviors + + +
+ In this paper, we explore the capabilities of state-of-the-art large language +models (LLMs) such as GPT-4, GPT-4o, Claude 3.5 Sonnet, Claude 3 Opus, Gemini +1.5 Pro, Llama 3, and Llama 3.1 in solving some selected undergraduate-level +transportation engineering problems. We introduce TransportBench, a benchmark +dataset that includes a sample of transportation engineering problems on a wide +range of subjects in the context of planning, design, management, and control +of transportation systems. This dataset is used by human experts to evaluate +the capabilities of various commercial and open-sourced LLMs, especially their +accuracy, consistency, and reasoning behaviors, in solving transportation +engineering problems. Our comprehensive analysis uncovers the unique strengths +and limitations of each LLM, e.g. our analysis shows the impressive accuracy +and some unexpected inconsistent behaviors of Claude 3.5 Sonnet in solving +TransportBench problems. Our study marks a thrilling first step toward +harnessing artificial general intelligence for complex transportation +challenges. + +
+
+
+
+
+ + ☆ The ShareLM Collection and Plugin: Contributing Human-Model Chats for + the Benefit of the Community + + +
+ Human-model conversations provide a window into users' real-world scenarios,
+behavior, and needs, and thus are a valuable resource for model development and
+research. While for-profit companies collect user data through the APIs of
+their models, using it internally to improve their own models, the open source
+and research community lags behind.
+ We introduce the ShareLM collection, a unified set of human conversations
+with large language models, and its accompanying plugin, a Web extension for
+voluntarily contributing user-model conversations. While few platforms share
+their chats, the ShareLM plugin adds this functionality, thus allowing users
+to share conversations from most platforms. The plugin allows the user to rate
+their conversations, both at the conversation and the response levels, and
+delete conversations they prefer to keep private before they ever leave the
+user's local storage. We release the plugin conversations as part of the
+ShareLM collection, and call for more community effort in the field of open
+human-model data.
+ The code, plugin, and data are available.
+
+
+
+
+
+ + ☆ mhGPT: A Lightweight Generative Pre-Trained Transformer for Mental + Health Text Analysis + + +
+ This paper introduces mhGPT, a lightweight generative pre-trained transformer +trained on mental health-related social media and PubMed articles. Fine-tuned +for specific mental health tasks, mhGPT was evaluated under limited hardware +constraints and compared with state-of-the-art models like MentaLLaMA and +Gemma. Despite having only 1.98 billion parameters and using just 5% of the +dataset, mhGPT outperformed larger models and matched the performance of models +trained on significantly more data. The key contributions include integrating +diverse mental health data, creating a custom tokenizer, and optimizing a +smaller architecture for low-resource settings. This research could advance +AI-driven mental health care, especially in areas with limited computing power. + +
+
+
+
+
+ + ☆ Covert Bias: The Severity of Social Views' Unalignment Towards Implicit + and Explicit Opinion + + +
+ While various approaches have recently been studied for bias identification,
+little is known about how implicit language that does not explicitly convey a
+viewpoint affects bias amplification in large language models. To examine the
+severity of bias toward a view, we evaluated the performance on two downstream
+tasks where implicit and explicit knowledge of social groups was used.
+First, we present a stress test evaluation by using a biased model in edge
+cases of excessive bias scenarios. Then, we evaluate how LLMs calibrate
+linguistically in response to both implicit and explicit opinions when they are
+aligned with conflicting viewpoints. Our findings reveal a discrepancy in LLM
+performance in identifying implicit and explicit opinions, with a general
+tendency of bias toward explicit opinions of opposing stances. Moreover, the
+bias-aligned models generate more cautious responses using uncertainty phrases
+compared to the unaligned (zero-shot) base models. The direct, incautious
+responses of the unaligned models suggest a need for further refinement of
+decisiveness by incorporating uncertainty markers to enhance their reliability,
+especially on socially nuanced topics with high subjectivity.
+
+
+ comment: This work is under-review +
+
+
+
+
+ + ☆ DeepSeek-Prover-V1.5: Harnessing Proof Assistant Feedback for + Reinforcement Learning and Monte-Carlo Tree Search + + +
+ We introduce DeepSeek-Prover-V1.5, an open-source language model designed for +theorem proving in Lean 4, which enhances DeepSeek-Prover-V1 by optimizing both +training and inference processes. Pre-trained on DeepSeekMath-Base with +specialization in formal mathematical languages, the model undergoes supervised +fine-tuning using an enhanced formal theorem proving dataset derived from +DeepSeek-Prover-V1. Further refinement is achieved through reinforcement +learning from proof assistant feedback (RLPAF). Beyond the single-pass +whole-proof generation approach of DeepSeek-Prover-V1, we propose RMaxTS, a +variant of Monte-Carlo tree search that employs an intrinsic-reward-driven +exploration strategy to generate diverse proof paths. DeepSeek-Prover-V1.5 +demonstrates significant improvements over DeepSeek-Prover-V1, achieving new +state-of-the-art results on the test set of the high school level miniF2F +benchmark ($63.5\%$) and the undergraduate level ProofNet benchmark ($25.3\%$). + +
+
+
+
+
+ + ☆ P/D-Serve: Serving Disaggregated Large Language Model at Scale + + +
+ Serving disaggregated large language models (LLMs) over tens of thousands of +xPU devices (GPUs or NPUs) with reliable performance faces multiple challenges. +1) Ignoring the diversity (various prefixes and tidal requests), treating all +the prompts in a mixed pool is inadequate. To facilitate the similarity per +scenario and minimize the inner mismatch on P/D (prefill and decoding) +processing, fine-grained organization is required, dynamically adjusting P/D +ratios for better performance. 2) Due to inaccurate estimation on workload +(queue status or maintained connections), the global scheduler easily incurs +unnecessary timeouts in prefill. 3) Block-fixed device-to-device (D2D) KVCache +transfer over cluster-level RDMA (remote direct memory access) fails to achieve +desired D2D utilization as expected. To overcome previous problems, this paper +proposes an end-to-end system P/D-Serve, complying with the paradigm of MLOps +(machine learning operations), which models end-to-end (E2E) P/D performance +and enables: 1) fine-grained P/D organization, mapping the service with RoCE +(RDMA over converged ethernet) as needed, to facilitate similar processing and +dynamic adjustments on P/D ratios; 2) on-demand forwarding upon rejections for +idle prefill, decoupling the scheduler from regular inaccurate reports and +local queues, to avoid timeouts in prefill; and 3) efficient KVCache transfer +via optimized D2D access. P/D-Serve is implemented upon Ascend and MindSpore, +has been deployed over tens of thousands of NPUs for more than eight months in +commercial use, and further achieves 60\%, 42\% and 46\% improvements on E2E +throughput, time-to-first-token (TTFT) SLO (service level objective) and D2D +transfer time. As the E2E system with optimizations, P/D-Serve achieves 6.7x +increase on throughput, compared with aggregated LLMs. + +
+
+
+
+
+ + ☆ KOALA: Enhancing Speculative Decoding for LLM via Multi-Layer Draft + Heads with Adversarial Learning + + +
+ Large Language Models (LLMs) exhibit high inference latency due to their +autoregressive decoding nature. While the draft head in speculative decoding +mitigates this issue, its full potential remains unexplored. In this paper, we +introduce KOALA (K-layer Optimized Adversarial Learning Architecture), an +orthogonal approach to the draft head. By transforming the conventional +single-layer draft head into a multi-layer architecture and incorporating +adversarial learning into the traditional supervised training, KOALA +significantly improves the accuracy of the draft head in predicting subsequent +tokens, thus more closely mirroring the functionality of LLMs. Although this +improvement comes at the cost of slightly increased drafting overhead, KOALA +substantially unlocks the draft head's potential, greatly enhancing speculative +decoding. We conducted comprehensive evaluations of KOALA, including both +autoregressive and non-autoregressive draft heads across various tasks, +demonstrating a latency speedup ratio improvement of 0.24x-0.41x, which is +10.57%-14.09% faster than the original draft heads. + +
+
+
+
+
+ + ☆ MIDAS: Multi-level Intent, Domain, And Slot Knowledge Distillation for + Multi-turn NLU + + +
+ Although Large Language Models (LLMs) can generate coherent and contextually
+relevant text, they often struggle to recognise the intent behind the human
+user's query. Natural Language Understanding (NLU) models, however, interpret
+the purpose and key information of the user's input to enable responsive
+interactions. Existing NLU models generally map individual utterances to a
+dual-level semantic frame, involving sentence-level intent and word-level slot
+labels. However, real-life conversations are primarily multi-turn, involving
+the interpretation of complex and extended dialogues. Researchers encounter
+challenges addressing all facets of multi-turn dialogue conversations using a
+unified single NLU model. This paper introduces a novel approach, MIDAS,
+leveraging multi-level intent, domain, and slot knowledge distillation for
+multi-turn NLU. To achieve this, we construct distinct teachers for varying
+levels of conversation knowledge, namely, sentence-level intent detection,
+word-level slot filling, and conversation-level domain classification. These
+teachers are then fine-tuned to acquire specific knowledge of their designated
+levels. A multi-teacher loss is proposed to facilitate the combination of these
+multi-level teachers, guiding a student model in multi-turn dialogue tasks. The
+experimental results demonstrate the efficacy of our model in improving overall
+multi-turn conversation understanding, showcasing the potential for
+advancements in NLU models through the incorporation of multi-level dialogue
+knowledge distillation techniques.
+
+
+
+
+
+ + ☆ AgentCourt: Simulating Court with Adversarial Evolvable Lawyer Agents + + +
+ In this paper, we present a simulation system called AgentCourt that +simulates the entire courtroom process. The judge, plaintiff's lawyer, defense +lawyer, and other participants are autonomous agents driven by large language +models (LLMs). Our core goal is to enable lawyer agents to learn how to argue a +case, as well as improving their overall legal skills, through courtroom +process simulation. To achieve this goal, we propose an adversarial +evolutionary approach for the lawyer-agent. Since AgentCourt can simulate the +occurrence and development of court hearings based on a knowledge base and LLM, +the lawyer agents can continuously learn and accumulate experience from real +court cases. The simulation experiments show that after two lawyer-agents have +engaged in a thousand adversarial legal cases in AgentCourt (which can take a +decade for real-world lawyers), compared to their pre-evolutionary state, the +evolved lawyer agents exhibit consistent improvement in their ability to handle +legal tasks. To enhance the credibility of our experimental results, we +enlisted a panel of professional lawyers to evaluate our simulations. The +evaluation indicates that the evolved lawyer agents exhibit notable +advancements in responsiveness, as well as expertise and logical rigor. This +work paves the way for advancing LLM-driven agent technology in legal +scenarios. Code is available at https://github.com/relic-yuexi/AgentCourt. + +
+
+
+
+
+ + ☆ Extracting Sentence Embeddings from Pretrained Transformer Models + + +
+ Background/introduction: Pre-trained transformer models shine in many natural
+language processing tasks and are therefore expected to carry a representation
+of the meaning of the input sentence or text. These sentence-level embeddings
+are also important in retrieval-augmented generation. But do commonly used
+plain averaging or prompt templates surface this meaning sufficiently?
+ Methods: Given the hidden representations of a 110M-parameter BERT from
+multiple layers and multiple tokens, we tried various ways to extract optimal
+sentence representations. We tested various token aggregation and
+representation post-processing techniques. We also tested multiple ways of
+using a general Wikitext dataset to complement BERT's sentence representations.
+All methods were tested on 8 Semantic Textual Similarity (STS), 6 short text
+clustering, and 12 classification tasks. We also evaluated our
+representation-shaping techniques on other static models, including random
+token representations.
+ Results: The proposed representation extraction methods improved performance
+on STS and clustering tasks for all models considered. Improvements are
+especially large for static token-based models; on STS tasks, even random
+embeddings almost reach the performance of BERT-derived representations.
+ Conclusions: Our work shows that, for multiple tasks, simple baselines with
+representation-shaping techniques reach or even outperform more complex
+BERT-based models, or can contribute to their performance.
+
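+ As a point of reference, the plain-averaging baseline mentioned above can be
+ computed roughly as follows (a sketch using masked mean pooling over BERT
+ hidden states; the layer choice is a free parameter, not a recommendation from
+ the paper):
+
+     import torch
+     from transformers import AutoTokenizer, AutoModel
+
+     tok = AutoTokenizer.from_pretrained("bert-base-uncased")
+     model = AutoModel.from_pretrained("bert-base-uncased")
+
+     def sentence_embeddings(texts, layer=-1):
+         batch = tok(texts, padding=True, truncation=True, return_tensors="pt")
+         with torch.no_grad():
+             out = model(**batch, output_hidden_states=True)
+         hidden = out.hidden_states[layer]                 # (batch, seq, dim)
+         mask = batch["attention_mask"].unsqueeze(-1)      # ignore padding tokens
+         return (hidden * mask).sum(1) / mask.sum(1)       # masked mean pooling
+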
+
+
+
+
+ + ☆ I-SHEEP: Self-Alignment of LLM from Scratch through an Iterative + Self-Enhancement Paradigm + + +
+ Large Language Models (LLMs) have achieved significant advancements; however,
+the common learning paradigm treats LLMs as passive information repositories,
+neglecting their potential for active learning and alignment. Some approaches
+train LLMs using their own generated synthetic data, exploring the possibility
+of active alignment. However, there is still a huge gap between these one-time
+alignment methods and the continuous automatic alignment of humans. In this
+paper, we introduce \textbf{I-SHEEP}, an \textbf{I}terative
+\textbf{S}elf-En\textbf{H}anc\textbf{E}m\textbf{E}nt \textbf{P}aradigm. This
+human-like paradigm enables LLMs to \textbf{continuously self-align from
+scratch with nothing}. Compared to the one-time alignment method Dromedary
+\cite{sun2023principledriven}, which refers to the first iteration in this
+paper, I-SHEEP can significantly enhance capacities on both Qwen and Llama
+models. I-SHEEP achieves a maximum relative improvement of 78.2\% in the Alpaca
+Eval, 24.0\% in the MT Bench, and an absolute increase of 8.88\% in the IFEval
+accuracy over subsequent iterations in the Qwen-1.5 72B model. Additionally,
+I-SHEEP surpasses the base model in various standard benchmark generation
+tasks, achieving an average improvement of 24.77\% in code generation tasks,
+12.04\% in TriviaQA, and 20.29\% in SQuAD. We also provide new insights based
+on the experimental results. Our code, datasets, and models are available at
+\textbf{https://anonymous.4open.science/r/I-SHEEP}.
+
+
+
+
+
+ + ☆ RAGChecker: A Fine-grained Framework for Diagnosing Retrieval-Augmented + Generation + + +
+ Although Retrieval-Augmented Generation (RAG) has shown promising capability
+in leveraging external knowledge, comprehensively evaluating RAG systems
+remains challenging due to the modular nature of RAG, the evaluation of
+long-form responses, and the reliability of measurements. In this paper, we
+propose a fine-grained evaluation framework, RAGChecker, that incorporates a
+suite of diagnostic metrics for both the retrieval and generation modules. Meta
+evaluation verifies that RAGChecker has significantly better correlations with
+human judgments than other evaluation metrics. Using RAGChecker, we evaluate 8
+RAG systems and conduct an in-depth analysis of their performance, revealing
+insightful patterns and trade-offs in the design choices of RAG architectures.
+The metrics of RAGChecker can guide researchers and practitioners in developing
+more effective RAG systems.
+
+
+ comment: Under Review +
+
+
+
+
+ + ☆ Text2BIM: Generating Building Models Using a Large Language Model-based + Multi-Agent Framework + + +
+ The conventional BIM authoring process typically requires designers to master +complex and tedious modeling commands in order to materialize their design +intentions within BIM authoring tools. This additional cognitive burden +complicates the design process and hinders the adoption of BIM and model-based +design in the AEC (Architecture, Engineering, and Construction) industry. To +facilitate the expression of design intentions more intuitively, we propose +Text2BIM, an LLM-based multi-agent framework that can generate 3D building +models from natural language instructions. This framework orchestrates multiple +LLM agents to collaborate and reason, transforming textual user input into +imperative code that invokes the BIM authoring tool's APIs, thereby generating +editable BIM models with internal layouts, external envelopes, and semantic +information directly in the software. Furthermore, a rule-based model checker +is introduced into the agentic workflow, utilizing predefined domain knowledge +to guide the LLM agents in resolving issues within the generated models and +iteratively improving model quality. Extensive experiments were conducted to +compare and analyze the performance of three different LLMs under the proposed +framework. The evaluation results demonstrate that our approach can effectively +generate high-quality, structurally rational building models that are aligned +with the abstract concepts specified by user input. Finally, an interactive +software prototype was developed to integrate the framework into the BIM +authoring software Vectorworks, showcasing the potential of modeling by +chatting. + +
+
+
+
+
+ + ☆ Enhancing Large Language Model-based Speech Recognition by + Contextualization for Rare and Ambiguous Words + + +
+ We develop a large language model (LLM) based automatic speech recognition +(ASR) system that can be contextualized by providing keywords as prior +information in text prompts. We adopt decoder-only architecture and use our +in-house LLM, PLaMo-100B, pre-trained from scratch using datasets dominated by +Japanese and English texts as the decoder. We adopt a pre-trained Whisper +encoder as an audio encoder, and the audio embeddings from the audio encoder +are projected to the text embedding space by an adapter layer and concatenated +with text embeddings converted from text prompts to form inputs to the decoder. +By providing keywords as prior information in the text prompts, we can +contextualize our LLM-based ASR system without modifying the model architecture +to transcribe ambiguous words in the input audio accurately. Experimental +results demonstrate that providing keywords to the decoder can significantly +improve the recognition performance of rare and ambiguous words. + +
+
+ comment: 13 pages, 1 figure, and 7 tables +
+
+
+
+
+ + ☆ Leveraging Web-Crawled Data for High-Quality Fine-Tuning + + +
+ Most large language models are fine-tuned using either expensive +human-annotated data or GPT-4 generated data which cannot guarantee performance +in certain domains. We argue that although the web-crawled data often has +formatting errors causing semantic inaccuracies, it can still serve as a +valuable source for high-quality supervised fine-tuning in specific domains +without relying on advanced models like GPT-4. To this end, we create a paired +training dataset automatically by aligning web-crawled data with a smaller set +of high-quality data. By training a language model on this dataset, we can +convert web data with irregular formats into high-quality ones. Our experiments +show that training with the model-transformed data yields better results, +surpassing training with only high-quality data by an average score of 9.4% in +Chinese math problems. Additionally, our 7B model outperforms several +open-source models larger than 32B and surpasses well-known closed-source +models such as GPT-3.5, highlighting the efficacy of our approach. + +
+
+
+
+
+ + ☆ FuseChat: Knowledge Fusion of Chat Models + + +
+ While training large language models (LLMs) from scratch can indeed lead to +models with distinct capabilities and strengths, it incurs substantial costs +and may lead to redundancy in competencies. Knowledge fusion aims to integrate +existing LLMs of diverse architectures and capabilities into a more potent LLM +through lightweight continual training, thereby reducing the need for costly +LLM development. In this work, we propose a new framework for the knowledge +fusion of chat LLMs through two main stages, resulting in FuseChat. Firstly, we +conduct pairwise knowledge fusion on source chat LLMs of varying structures and +scales to create multiple target LLMs with identical structure and size via +lightweight fine-tuning. During this process, a statistics-based token +alignment approach is introduced as the cornerstone for fusing LLMs with +different structures. Secondly, we merge these target LLMs within the parameter +space, where we propose a novel method for determining the merging coefficients +based on the magnitude of parameter updates before and after fine-tuning. We +implement and validate FuseChat using six prominent chat LLMs with diverse +architectures and scales, including OpenChat-3.5-7B, Starling-LM-7B-alpha, +NH2-SOLAR-10.7B, InternLM2-Chat-20B, Mixtral-8x7B-Instruct, and +Qwen-1.5-Chat-72B. Experimental results on two instruction-following +benchmarks, AlpacaEval 2.0 and MT-Bench, demonstrate the superiority of +FuseChat-7B over baselines of various sizes. Our model is even comparable to +the larger Mixtral-8x7B-Instruct and approaches GPT-3.5-Turbo-1106 on MT-Bench. +Our code, model weights, and data are public at +\url{https://github.com/fanqiwan/FuseAI}. + +
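+ One plausible reading of the magnitude-based merging step, sketched for
+ checkpoints that share the base model's architecture (the per-tensor weighting
+ below is an assumption for illustration; the paper's exact coefficient
+ computation may differ):
+
+     import torch
+
+     def merge_by_update_magnitude(base_state, finetuned_states):
+         merged = {}
+         for name, base in base_state.items():
+             if not torch.is_floating_point(base):
+                 merged[name] = base.clone()
+                 continue
+             deltas = [fs[name] - base for fs in finetuned_states]
+             mags = torch.stack([d.abs().sum() for d in deltas])
+             weights = mags / mags.sum().clamp_min(1e-12)
+             # Checkpoints that changed a parameter more during fine-tuning
+             # get a proportionally larger merging coefficient for it.
+             merged[name] = base + sum(w * d for w, d in zip(weights, deltas))
+         return merged
+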
+
+ comment: Work in progress +
+
+
+
+
+ + ☆ ArabLegalEval: A Multitask Benchmark for Assessing Arabic Legal + Knowledge in Large Language Models + + +
+ The rapid advancements in Large Language Models (LLMs) have led to +significant improvements in various natural language processing tasks. However, +the evaluation of LLMs' legal knowledge, particularly in non-English languages +such as Arabic, remains under-explored. To address this gap, we introduce +ArabLegalEval, a multitask benchmark dataset for assessing the Arabic legal +knowledge of LLMs. Inspired by the MMLU and LegalBench datasets, ArabLegalEval +consists of multiple tasks sourced from Saudi legal documents and synthesized +questions. In this work, we aim to analyze the capabilities required to solve +legal problems in Arabic and benchmark the performance of state-of-the-art +LLMs. We explore the impact of in-context learning and investigate various +evaluation methods. Additionally, we explore workflows for generating questions +with automatic validation to enhance the dataset's quality. We benchmark +multilingual and Arabic-centric LLMs, such as GPT-4 and Jais, respectively. We +also share our methodology for creating the dataset and validation, which can +be generalized to other domains. We hope to accelerate AI research in the +Arabic Legal domain by releasing the ArabLegalEval dataset and code: +https://github.com/Thiqah/ArabLegalEval + +
+
+
+
+
+ + ☆ Coupling without Communication and Drafter-Invariant Speculative + Decoding + + +
+ Suppose Alice has a distribution $P$ and Bob has a distribution $Q$. Alice
+wants to generate a sample $a\sim P$ and Bob a sample $b \sim Q$ such that $a =
+b$ with as high a probability as possible. It is well-known that, by
+sampling from an optimal coupling between the distributions, Alice and Bob can
+achieve $Pr[a = b] = 1 - D_{TV}(P,Q)$, where $D_{TV}(P,Q)$ is the total
+variation distance. What if Alice and Bob must solve this same problem without
+communicating at all? Perhaps surprisingly, with access to public randomness,
+they can still achieve $Pr[a = b] \geq \frac{1 - D_{TV}(P,Q)}{1 + D_{TV}(P,Q)}
+\geq 1-2D_{TV}(P,Q)$. In fact, this bound can be obtained using a simple
+protocol based on the Weighted MinHash algorithm. In this work, we explore
+communication-free coupling in greater depth. First, we show that an equally
+simple protocol based on Gumbel sampling matches the worst-case guarantees of
+the Weighted MinHash approach, but tends to perform better in practice.
+Conversely, we prove that both approaches are actually sharp: no
+communication-free protocol can achieve $Pr[a=b]>\frac{1 - D_{TV}(P,Q)}{1 +
+D_{TV}(P,Q)}$ in the worst case. Finally, we prove that, for distributions over
+$n$ items, there exists a scheme that uses just $O(\log(n/\epsilon))$ bits of
+communication to achieve $Pr[a = b] = 1 - D_{TV}(P,Q) - \epsilon$, i.e., to
+essentially match optimal coupling. Beyond our theoretical results, we
+demonstrate an application of communication-free coupling to speculative
+decoding, a recent method for accelerating autoregressive large language models
+[Leviathan, Kalman, Matias, ICML 2023]. We show that communication-free
+protocols yield a variant of speculative decoding that we call
+Drafter-Invariant Speculative Decoding, which has the desirable property that
+the output of the method is fixed given a fixed random seed, regardless of what
+drafter is used for speculation.
+
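+ A minimal sketch of the Gumbel-based protocol described above, using shared
+ public randomness (the function name and toy distributions are illustrative):
+
+     import numpy as np
+
+     def coupled_sample(p, q, seed):
+         # Both parties derive identical Gumbel noise from the shared seed.
+         g = np.random.default_rng(seed).gumbel(size=len(p))
+         a = int(np.argmax(np.log(p) + g))  # Alice's sample, distributed as P
+         b = int(np.argmax(np.log(q) + g))  # Bob's sample, distributed as Q
+         return a, b
+
+     p, q = np.array([0.5, 0.3, 0.2]), np.array([0.4, 0.4, 0.2])
+     agree = np.mean([a == b for a, b in
+                      (coupled_sample(p, q, s) for s in range(10000))])
+     # agree should be at least (1 - TV)/(1 + TV); here TV(P, Q) = 0.1.
+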
+
+ comment: 16 pages +
+
+
+
+
+ + ☆ Polaris: Open-ended Interactive Robotic Manipulation via Syn2Real Visual + Grounding and Large Language Models IROS 2024 + + +
+ This paper investigates the task of the open-ended interactive robotic +manipulation on table-top scenarios. While recent Large Language Models (LLMs) +enhance robots' comprehension of user instructions, their lack of visual +grounding constrains their ability to physically interact with the environment. +This is because the robot needs to locate the target object for manipulation +within the physical workspace. To this end, we introduce an interactive robotic +manipulation framework called Polaris, which integrates perception and +interaction by utilizing GPT-4 alongside grounded vision models. For precise +manipulation, it is essential that such grounded vision models produce detailed +object pose for the target object, rather than merely identifying pixels +belonging to them in the image. Consequently, we propose a novel +Synthetic-to-Real (Syn2Real) pose estimation pipeline. This pipeline utilizes +rendered synthetic data for training and is then transferred to real-world +manipulation tasks. The real-world performance demonstrates the efficacy of our +proposed pipeline and underscores its potential for extension to more general +categories. Moreover, real-robot experiments have showcased the impressive +performance of our framework in grasping and executing multiple manipulation +tasks. This indicates its potential to generalize to scenarios beyond the +tabletop. More information and video results are available here: +https://star-uu-wang.github.io/Polaris/ + +
+
+ comment: Accepted by IROS 2024. 8 pages, 5 figures. See + https://star-uu-wang.github.io/Polaris/ +
+
+
+
+
+ + ☆ Predicting Lung Cancer Patient Prognosis with Large Language Models + + +
+ Prognosis prediction is crucial for determining optimal treatment plans for +lung cancer patients. Traditionally, such predictions relied on models +developed from retrospective patient data. Recently, large language models +(LLMs) have gained attention for their ability to process and generate text +based on extensive learned knowledge. In this study, we evaluate the potential +of GPT-4o mini and GPT-3.5 in predicting the prognosis of lung cancer patients. +We collected two prognosis datasets, i.e., survival and post-operative +complication datasets, and designed multiple tasks to assess the models' +performance comprehensively. Logistic regression models were also developed as +baselines for comparison. The experimental results demonstrate that LLMs can +achieve competitive, and in some tasks superior, performance in lung cancer +prognosis prediction compared to data-driven logistic regression models despite +not using additional patient data. These findings suggest that LLMs can be +effective tools for prognosis prediction in lung cancer, particularly when +patient data is limited or unavailable. + +
+
+
+
+
+ + ☆ GERestaurant: A German Dataset of Annotated Restaurant Reviews for + Aspect-Based Sentiment Analysis + + +
+ We present GERestaurant, a novel dataset consisting of 3,078 German language +restaurant reviews manually annotated for Aspect-Based Sentiment Analysis +(ABSA). All reviews were collected from Tripadvisor, covering a diverse +selection of restaurants, including regional and international cuisine with +various culinary styles. The annotations encompass both implicit and explicit +aspects, including all aspect terms, their corresponding aspect categories, and +the sentiments expressed towards them. Furthermore, we provide baseline scores +for the four ABSA tasks Aspect Category Detection, Aspect Category Sentiment +Analysis, End-to-End ABSA and Target Aspect Sentiment Detection as a reference +point for future advances. The dataset fills a gap in German language resources +and facilitates exploration of ABSA in the restaurant domain. + +
+
+ comment: Accepted in KONVENS 2024. Camera Ready submission +
+
+
+
+
+ + ☆ MAG-SQL: Multi-Agent Generative Approach with Soft Schema Linking and + Iterative Sub-SQL Refinement for Text-to-SQL + + +
+ Recent In-Context Learning based methods have achieved remarkable success in
+the Text-to-SQL task. However, there is still a large gap between the
+performance of these models and human performance on datasets with complex
+database schema and difficult questions, such as BIRD. Moreover, existing work
+has neglected to supervise intermediate steps when solving questions
+iteratively with question decomposition methods, and the schema linking methods
+used in these works are very rudimentary. To address these issues, we propose
+MAG-SQL, a multi-agent generative approach with soft schema linking and
+iterative Sub-SQL refinement. In our framework, an entity-based method with
+table summaries is used to select the columns in the database, and a novel
+targets-conditions decomposition method is introduced to decompose complex
+questions. Additionally, we build an iterative generation module that includes
+a Sub-SQL Generator and Sub-SQL Refiner, introducing external oversight for
+each step of generation. Through a series of ablation studies, the
+effectiveness of each agent in our framework has been demonstrated. When
+evaluated on the BIRD benchmark with GPT-4, MAG-SQL achieves an execution
+accuracy of 61.08\%, compared to the baseline accuracy of 46.35\% for vanilla
+GPT-4 and the baseline accuracy of 57.56\% for MAC-SQL. Our approach also makes
+similar progress on Spider.
+
+
+ comment: 22 pages, 14 figures +
+
+
+
+
+ + ☆ DM2RM: Dual-Mode Multimodal Ranking for Target Objects and Receptacles + Based on Open-Vocabulary Instructions + + +
+ In this study, we aim to develop a domestic service robot (DSR) that, guided +by open-vocabulary instructions, can carry everyday objects to the specified +pieces of furniture. Few existing methods handle mobile manipulation tasks with +open-vocabulary instructions in the image retrieval setting, and most do not +identify both the target objects and the receptacles. We propose the Dual-Mode +Multimodal Ranking model (DM2RM), which enables images of both the target +objects and receptacles to be retrieved using a single model based on +multimodal foundation models. We introduce a switching mechanism that leverages +a mode token and phrase identification via a large language model to switch the +embedding space based on the prediction target. To evaluate the DM2RM, we +construct a novel dataset including real-world images collected from hundreds +of building-scale environments and crowd-sourced instructions with referring +expressions. The evaluation results show that the proposed DM2RM outperforms +previous approaches in terms of standard metrics in image retrieval settings. +Furthermore, we demonstrate the application of the DM2RM on a standardized +real-world DSR platform including fetch-and-carry actions, where it achieves a +task success rate of 82% despite the zero-shot transfer setting. Demonstration +videos, code, and more materials are available at +https://kkrr10.github.io/dm2rm/. + +
+
+
+
+
+ + ☆ Assessing Language Models' Worldview for Fiction Generation + + +
+ The use of Large Language Models (LLMs) has become ubiquitous, with abundant +applications in computational creativity. One such application is fictional +story generation. Fiction is a narrative that occurs in a story world that is +slightly different than ours. With LLMs becoming writing partners, we question +how suitable they are to generate fiction. This study investigates the ability +of LLMs to maintain a state of world essential to generate fiction. Through a +series of questions to nine LLMs, we find that only two models exhibit +consistent worldview, while the rest are self-conflicting. Subsequent analysis +of stories generated by four models revealed a strikingly uniform narrative +pattern. This uniformity across models further suggests a lack of `state' +necessary for fiction. We highlight the limitations of current LLMs in fiction +writing and advocate for future research to test and create story worlds for +LLMs to reside in. All code, dataset, and the generated responses can be found +in https://github.com/tanny411/llm-reliability-and-consistency-evaluation. + +
+
+ comment: Short paper +
+
+
+
+
+ + ☆ Fine-tuning Large Language Models with Human-inspired Learning + Strategies in Medical Question Answering + + +
+ Training Large Language Models (LLMs) incurs substantial data-related costs, +motivating the development of data-efficient training methods through optimised +data ordering and selection. Human-inspired learning strategies, such as +curriculum learning, offer possibilities for efficient training by organising +data according to common human learning practices. Despite evidence that +fine-tuning with curriculum learning improves the performance of LLMs for +natural language understanding tasks, its effectiveness is typically assessed +using a single model. In this work, we extend previous research by evaluating +both curriculum-based and non-curriculum-based learning strategies across +multiple LLMs, using human-defined and automated data labels for medical +question answering. Our results indicate a moderate impact of using +human-inspired learning strategies for fine-tuning LLMs, with maximum accuracy +gains of 1.77% per model and 1.81% per dataset. Crucially, we demonstrate that +the effectiveness of these strategies varies significantly across different +model-dataset combinations, emphasising that the benefits of a specific +human-inspired strategy for fine-tuning LLMs do not generalise. Additionally, +we find evidence that curriculum learning using LLM-defined question difficulty +outperforms human-defined difficulty, highlighting the potential of using +model-generated measures for optimal curriculum design. + +
+
+
+
+
+ + ☆ Instruct Large Language Models to Generate Scientific Literature Survey + Step by Step NLPCC 2024 + + +
+ Automatically generating scientific literature surveys is a valuable task
+that can significantly enhance research efficiency. However, the diverse and
+complex nature of information within a literature survey poses substantial
+challenges for generative models. In this paper, we design a series of prompts
+to systematically leverage large language models (LLMs), enabling the creation
+of comprehensive literature surveys through a step-by-step approach.
+Specifically, we design prompts to guide LLMs to sequentially generate the
+title, abstract, hierarchical headings, and the main content of the literature
+survey. We argue that this design enables the generation of the headings from a
+high-level perspective. During the content generation process, this design
+effectively harnesses relevant information while minimizing costs by
+restricting the length of both input and output content in LLM queries. Our
+implementation with Qwen-long achieved third place in the NLPCC 2024 Scientific
+Literature Survey Generation evaluation task, with an overall score only 0.03%
+lower than the second-place team. Additionally, our soft heading recall is
+95.84%, the second best among the submissions. Thanks to the efficient prompt
+design and the low cost of the Qwen-long API, our method reduces the expense
+for generating each literature survey to 0.1 RMB, enhancing the practical value
+of our method.
+
+
+ comment: NLPCC 2024 +
+
+
+
+
+ + ☆ Words Matter: Reducing Stigma in Online Conversations about Substance + Use with Large Language Models + + +
+ Stigma is a barrier to treatment for individuals struggling with substance +use disorders (SUD), which leads to significantly lower treatment engagement +rates. With only 7% of those affected receiving any form of help, societal +stigma not only discourages individuals with SUD from seeking help but isolates +them, hindering their recovery journey and perpetuating a cycle of shame and +self-doubt. This study investigates how stigma manifests on social media, +particularly Reddit, where anonymity can exacerbate discriminatory behaviors. +We analyzed over 1.2 million posts, identifying 3,207 that exhibited +stigmatizing language towards people who use substances (PWUS). Using Informed +and Stylized LLMs, we develop a model for de-stigmatization of these +expressions into empathetic language, resulting in 1,649 reformed phrase pairs. +Our paper contributes to the field by proposing a computational framework for +analyzing stigma and destigmatizing online content, and delving into the +linguistic features that propagate stigma towards PWUS. Our work not only +enhances understanding of stigma's manifestations online but also provides +practical tools for fostering a more supportive digital environment for those +affected by SUD. Code and data will be made publicly available upon acceptance. + +
+
+
+
+
+ + ☆ JPEG-LM: LLMs as Image Generators with Canonical Codec Representations + + +
+ Recent work in image and video generation has been adopting the +autoregressive LLM architecture due to its generality and potentially easy +integration into multi-modal systems. The crux of applying autoregressive +training in language generation to visual generation is discretization -- +representing continuous data like images and videos as discrete tokens. Common +methods of discretizing images and videos include modeling raw pixel values, +which are prohibitively lengthy, or vector quantization, which requires +convoluted pre-hoc training. In this work, we propose to directly model images +and videos as compressed files saved on computers via canonical codecs (e.g., +JPEG, AVC/H.264). Using the default Llama architecture without any +vision-specific modifications, we pretrain JPEG-LM from scratch to generate +images (and AVC-LM to generate videos as a proof of concept), by directly +outputting compressed file bytes in JPEG and AVC formats. Evaluation of image +generation shows that this simple and straightforward approach is more +effective than pixel-based modeling and sophisticated vector quantization +baselines (on which our method yields a 31% reduction in FID). Our analysis +shows that JPEG-LM has an especial advantage over vector quantization models in +generating long-tail visual elements. Overall, we show that using canonical +codec representations can help lower the barriers between language generation +and visual generation, facilitating future research on multi-modal +language/image/video LLMs. + +
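+ The discretization step described above amounts to treating raw JPEG bytes as
+ the token stream; a sketch of that step (image size and quality are
+ illustrative, and the actual model is a Llama-style LM trained on such byte
+ sequences rather than anything shown here):
+
+     import io
+     from PIL import Image
+
+     def jpeg_byte_tokens(path, size=(256, 256), quality=25):
+         buf = io.BytesIO()
+         Image.open(path).convert("RGB").resize(size).save(
+             buf, format="JPEG", quality=quality)
+         # One token per byte (values 0-255); an autoregressive LM then
+         # predicts this sequence and decodes samples back via a JPEG reader.
+         return list(buf.getvalue())
+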
+
+
+
+
+ + ☆ W-RAG: Weakly Supervised Dense Retrieval in RAG for Open-domain Question + Answering + + +
+ In knowledge-intensive tasks such as open-domain question answering (OpenQA), +Large Language Models (LLMs) often struggle to generate factual answers relying +solely on their internal (parametric) knowledge. To address this limitation, +Retrieval-Augmented Generation (RAG) systems enhance LLMs by retrieving +relevant information from external sources, thereby positioning the retriever +as a pivotal component. Although dense retrieval demonstrates state-of-the-art +performance, its training poses challenges due to the scarcity of ground-truth +evidence, largely attributed to the high costs of human annotation. In this +paper, we propose W-RAG by utilizing the ranking capabilities of LLMs to create +weakly labeled data for training dense retrievers. Specifically, we rerank the +top-$K$ passages retrieved via BM25 by assessing the probability that LLMs will +generate the correct answer based on the question and each passage. The +highest-ranking passages are then used as positive training examples for dense +retrieval. Our comprehensive experiments across four publicly available OpenQA +datasets demonstrate that our approach enhances both retrieval and OpenQA +performance compared to baseline models. + +
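+ A sketch of the weak-labelling step: score each BM25 passage by the
+ log-probability an LM assigns to the gold answer given the question and that
+ passage, and keep the top-scoring passage as a positive for dense-retriever
+ training (the model name and prompt format below are placeholders, not the
+ paper's setup):
+
+     import torch
+     from transformers import AutoTokenizer, AutoModelForCausalLM
+
+     tok = AutoTokenizer.from_pretrained("gpt2")  # stand-in for a stronger LLM
+     lm = AutoModelForCausalLM.from_pretrained("gpt2")
+
+     def answer_logprob(question, passage, answer):
+         prompt = f"Passage: {passage}\nQuestion: {question}\nAnswer:"
+         prompt_ids = tok(prompt, return_tensors="pt").input_ids
+         answer_ids = tok(" " + answer, return_tensors="pt").input_ids
+         ids = torch.cat([prompt_ids, answer_ids], dim=1)
+         with torch.no_grad():
+             logprobs = torch.log_softmax(lm(ids).logits[:, :-1], dim=-1)
+         targets = ids[:, 1:]
+         span = range(prompt_ids.shape[1] - 1, ids.shape[1] - 1)  # answer tokens
+         return sum(logprobs[0, i, targets[0, i]].item() for i in span)
+
+     def weak_positive(question, answer, bm25_passages):
+         return max(bm25_passages, key=lambda p: answer_logprob(question, p, answer))
+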
+
+
+
+
+ + ☆ Rater Cohesion and Quality from a Vicarious Perspective + + +
+ Human feedback is essential for building human-centered AI systems across +domains where disagreement is prevalent, such as AI safety, content moderation, +or sentiment analysis. Many disagreements, particularly in politically charged +settings, arise because raters have opposing values or beliefs. Vicarious +annotation is a method for breaking down disagreement by asking raters how they +think others would annotate the data. In this paper, we explore the use of +vicarious annotation with analytical methods for moderating rater disagreement. +We employ rater cohesion metrics to study the potential influence of political +affiliations and demographic backgrounds on raters' perceptions of offense. +Additionally, we utilize CrowdTruth's rater quality metrics, which consider the +demographics of the raters, to score the raters and their annotations. We study +how the rater quality metrics influence the in-group and cross-group rater +cohesion across the personal and vicarious levels. + +
+
+
+
+
+ + ☆ Zero-Shot Learning and Key Points Are All You Need for Automated + Fact-Checking + + +
+ Automated fact-checking is an important task because determining the accurate +status of a proposed claim within the vast amount of information available +online is a critical challenge. This challenge requires robust evaluation to +prevent the spread of false information. Modern large language models (LLMs) +have demonstrated high capability in performing a diverse range of Natural +Language Processing (NLP) tasks. By utilizing proper prompting strategies, +their versatility due to their understanding of large context sizes and +zero-shot learning ability enables them to simulate human problem-solving +intuition and move towards being an alternative to humans for solving problems. +In this work, we introduce a straightforward framework based on Zero-Shot +Learning and Key Points (ZSL-KeP) for automated fact-checking, which despite +its simplicity, performed well on the AVeriTeC shared task dataset by robustly +improving the baseline and achieving 10th place. + +
+
+
+
+
+ + ☆ Level Up Your Tutorials: VLMs for Game Tutorials Quality Assessment ECCV 2024 + + +
+ Designing effective game tutorials is crucial for a smooth learning curve for +new players, especially in games with many rules and complex core mechanics. +Evaluating the effectiveness of these tutorials usually requires multiple +iterations with testers who have no prior knowledge of the game. Recent +Vision-Language Models (VLMs) have demonstrated significant capabilities in +understanding and interpreting visual content. VLMs can analyze images, provide +detailed insights, and answer questions about their content. They can recognize +objects, actions, and contexts in visual data, making them valuable tools for +various applications, including automated game testing. In this work, we +propose an automated game-testing solution to evaluate the quality of game +tutorials. Our approach leverages VLMs to analyze frames from video game +tutorials, answer relevant questions to simulate human perception, and provide +feedback. This feedback is compared with expected results to identify confusing +or problematic scenes and highlight potential errors for developers. In +addition, we publish complete tutorial videos and annotated frames from +different game versions used in our tests. This solution reduces the need for +extensive manual testing, especially by speeding up and simplifying the initial +development stages of the tutorial to improve the final game experience. + +
+
+ comment: Accepted at ECCV 2024 CV2 Workshop +
+
+
+
+
+ + ☆ Towards Realistic Synthetic User-Generated Content: A Scaffolding + Approach to Generating Online Discussions + + +
+ The emergence of synthetic data represents a pivotal shift in modern machine +learning, offering a solution to satisfy the need for large volumes of data in +domains where real data is scarce, highly private, or difficult to obtain. We +investigate the feasibility of creating realistic, large-scale synthetic +datasets of user-generated content, noting that such content is increasingly +prevalent and a source of frequently sought information. Large language models +(LLMs) offer a starting point for generating synthetic social media discussion +threads, due to their ability to produce diverse responses that typify online +interactions. However, as we demonstrate, straightforward application of LLMs +yields limited success in capturing the complex structure of online +discussions, and standard prompting mechanisms lack sufficient control. We +therefore propose a multi-step generation process, predicated on the idea of +creating compact representations of discussion threads, referred to as +scaffolds. Our framework is generic yet adaptable to the unique characteristics +of specific social media platforms. We demonstrate its feasibility using data +from two distinct online discussion platforms. To address the fundamental +challenge of ensuring the representativeness and realism of synthetic data, we +propose a portfolio of evaluation measures to compare various instantiations of +our framework. + +
+
+
+
+
+ + ☆ Evaluating Text Classification Robustness to Part-of-Speech Adversarial + Examples + + +
+ As machine learning systems become more widely used, especially for
+safety-critical applications, there is a growing need to ensure that these
+systems behave as intended, even in the face of adversarial examples.
+Adversarial examples are inputs designed to trick the decision-making process,
+and are intended to be imperceptible to humans. However, for text-based
+classification systems, changes to the input, a string of text, are always
+perceptible. Therefore, text-based adversarial examples instead focus on trying
+to preserve semantics. Unfortunately, recent work has shown this goal is often
+not met. To improve the quality of text-based adversarial examples, we need to
+know what elements of the input text are worth focusing on. To address this, in
+this paper, we explore which parts of speech have the highest impact on
+text-based classifiers. Our experiments highlight a distinct bias in CNN
+algorithms against certain part-of-speech tokens within review datasets. This
+finding underscores a critical vulnerability in the linguistic processing
+capabilities of CNNs.
+
+
+
+
+
+ + ☆ Plan with Code: Comparing approaches for robust NL to DSL generation + + +
+ Planning in code is considered a more reliable approach for many
+orchestration tasks. This is because code is more tractable than steps
+generated via Natural Language and makes it easy to support more complex
+sequences by abstracting deterministic logic into functions. It also allows
+spotting issues with incorrect function names with the help of parsing checks
+that can be run on code. Progress in Code Generation methodologies, however,
+remains limited to general-purpose languages like C, C++, and Python. LLMs
+continue to face challenges with custom function names in Domain Specific
+Languages or DSLs, leading to higher hallucination rates and syntax errors.
+This is more common for custom function names, which are typically part of the
+plan. Moreover, keeping LLMs up-to-date with newer function names is an issue.
+This poses a challenge for scenarios like task planning over a large number of
+APIs, since the plan is represented as a DSL with custom API names. In this
+paper, we focus on workflow automation in the RPA (Robotic Process Automation)
+domain as a special case of task planning. We present optimizations for using
+Retrieval Augmented Generation (or RAG) with LLMs for DSL generation along with
+an ablation study comparing these strategies with a fine-tuned model. Our
+results show that the fine-tuned model scores best on the code similarity
+metric. However, with our optimizations, the RAG approach is able to match this
+quality for in-domain API names in the test set. Additionally, it offers a
+significant advantage for out-of-domain or unseen API names, outperforming the
+fine-tuned model on the similarity metric by 7 points.
+
+
+ comment: 9 pages, 1 figure, 5 tables. arXiv admin note: substantial text + overlap with arXiv:2407.02742 +
+
+
+
+
+ + ♻ ☆ Direct Large Language Model Alignment Through Self-Rewarding Contrastive + Prompt Distillation + + +
+ Aligning large language models (LLMs) with human expectations without +human-annotated preference data is an important problem. In this paper, we +propose a method to evaluate the response preference by using the output +probabilities of response pairs under contrastive prompt pairs, which could +achieve better performance on LLaMA2-7B and LLaMA2-13B compared to RLAIF. Based +on this, we propose an automatic alignment method, Direct Large Model Alignment +(DLMA). First, we use contrastive prompt pairs to automatically generate +preference data. Then, we continue to evaluate the generated preference data +using contrastive prompt pairs and calculate a self-rewarding score. Finally, +we use the DPO algorithm to effectively align LLMs by combining this +self-rewarding score. In the experimental stage, our DLMA method could surpass +the \texttt{RLHF} method without relying on human-annotated preference data. + +
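+ A sketch of the self-rewarding score implied above: compare the likelihood of
+ a response under a pair of contrastive prompts (logprob_fn is an assumed
+ helper returning the summed token log-probability of the response given the
+ prompt; it is not part of the paper's code):
+
+     def self_rewarding_score(logprob_fn, response, positive_prompt, negative_prompt):
+         # A larger gap means the response is more consistent with the desired
+         # behaviour than with the undesired one; pairs scored this way can then
+         # serve as preference data for DPO-style alignment.
+         return (logprob_fn(positive_prompt, response)
+                 - logprob_fn(negative_prompt, response))
+
+     # Usage idea: rank candidate responses for the same instruction, e.g.
+     # scores = {r: self_rewarding_score(logprob_fn, r, helpful_prompt, harmful_prompt)
+     #           for r in candidates}
+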
+
+ comment: 24 pages
+
+
+
+
+
+ + ♻ ☆ The AI Scientist: Towards Fully Automated Open-Ended Scientific + Discovery + + +
+ One of the grand challenges of artificial general intelligence is developing +agents capable of conducting scientific research and discovering new knowledge. +While frontier models have already been used as aides to human scientists, e.g. +for brainstorming ideas, writing code, or prediction tasks, they still conduct +only a small part of the scientific process. This paper presents the first +comprehensive framework for fully automatic scientific discovery, enabling +frontier large language models to perform research independently and +communicate their findings. We introduce The AI Scientist, which generates +novel research ideas, writes code, executes experiments, visualizes results, +describes its findings by writing a full scientific paper, and then runs a +simulated review process for evaluation. In principle, this process can be +repeated to iteratively develop ideas in an open-ended fashion, acting like the +human scientific community. We demonstrate its versatility by applying it to +three distinct subfields of machine learning: diffusion modeling, +transformer-based language modeling, and learning dynamics. Each idea is +implemented and developed into a full paper at a cost of less than $15 per +paper. To evaluate the generated papers, we design and validate an automated +reviewer, which we show achieves near-human performance in evaluating paper +scores. The AI Scientist can produce papers that exceed the acceptance +threshold at a top machine learning conference as judged by our automated +reviewer. This approach signifies the beginning of a new era in scientific +discovery in machine learning: bringing the transformative benefits of AI +agents to the entire research process of AI itself, and taking us closer to a +world where endless affordable creativity and innovation can be unleashed on +the world's most challenging problems. Our code is open-sourced at +https://github.com/SakanaAI/AI-Scientist + +
+
+
+
+
+ + ♻ ☆ Quantifying Memorization and Detecting Training Data of Pre-trained + Language Models using Japanese Newspaper + + +
+ Dominant pre-trained language models (PLMs) have demonstrated the potential
+risk of memorizing and outputting the training data. While this concern has
+been discussed mainly in English, it is also practically important to focus on
+domain-specific PLMs. In this study, we pre-trained domain-specific GPT-2
+models using a limited corpus of Japanese newspaper articles and evaluated
+their behavior. Experiments replicated the empirical finding that memorization
+of PLMs is related to duplication in the training data, model size, and prompt
+length in Japanese, just as in previous English studies. Furthermore, we
+attempted membership inference attacks, demonstrating that the training data
+can be detected even in Japanese, which is the same trend as in English. The
+study warns that domain-specific PLMs, sometimes trained with valuable private
+data, can ''copy and paste'' on a large scale.
+
+
+ comment: The 17th International Natural Language Generation Conference +
+
+
+
+
+ + ♻ ☆ An Event Structure-aware Generative Model for Biomedical Event + Extraction + + +
+ Biomedical Event Extraction (BEE) is a challenging task that involves +modeling complex relationships between fine-grained entities in biomedical +text. BEE has traditionally been formulated as a classification problem. With +the recent technological advancements in large language models (LLMs), +generation-based models that cast event extraction as a sequence generation +problem have attracted much attention from the NLP research communities. +However, current generative models often overlook the importance of +cross-instance information from complex event structures such as nested events +and overlapping events, which contribute quite significantly in the benchmark +datasets. In this paper, we propose an event structure-aware generative model +called GenBEE, which can capture complex event structures in biomedical text +for biomedical event extraction. In particular, GenBEE constructs event prompts +that distill knowledge from LLMs for incorporating both label semantics and +argument dependency relationships into the proposed model. In addition, GenBEE +also generates prefixes with event structural prompts to incorporate structural +features for improving the model's overall performance. We have evaluated the +proposed GenBEE model on three widely used biomedical event extraction +benchmark datasets, namely MLEE, GE11, and PHEE. Experimental results show that +GenBEE has achieved state-of-the-art performance on the MLEE and GE11 datasets, +and achieved competitive results when compared to the state-of-the-art +classification-based models on the PHEE dataset. + +
+
+ comment: 8 pages, 4 figures, 6 tables +
+
+
+
+
+ + ♻ ☆ Compensate Quantization Errors+: Quantized Models Are Inquisitive + Learners + + +
+ Large Language Models (LLMs) showcase remarkable performance and robust +deductive capabilities, yet their expansive size complicates deployment and +raises environmental concerns due to substantial resource consumption. The +recent development of a quantization technique known as Learnable +Singular-value Increment (LSI) has addressed some of these quantization +challenges. Leveraging insights from LSI and our extensive research, we have +developed innovative methods that enhance the performance of quantized LLMs, +particularly in low-bit settings. Our methods consistently deliver +state-of-the-art results across various quantization scenarios and offer deep +theoretical insights into the quantization process, elucidating the potential +of quantized models for widespread application. + +
+
+ comment: Efficient Quantization Methods for LLMs
+
+
+
+
+
+ + ♻ ☆ Large Language Models are Few-Shot Training Example Generators: A Case + Study in Fallacy Recognition + + +
+ Recognizing fallacies is crucial for ensuring the quality and validity of +arguments across various domains. However, computational fallacy recognition +faces challenges due to the diverse genres, domains, and types of fallacies +found in datasets. This leads to a highly multi-class, and even multi-label, +setup with substantial class imbalance. In this study, we aim to enhance +existing models for fallacy recognition by incorporating additional context and +by leveraging large language models to generate synthetic data, thus increasing +the representation of the infrequent classes. We experiment with GPT3.5 to +generate synthetic examples and we examine the impact of prompt settings for +this. Moreover, we explore zero-shot and few-shot scenarios to evaluate the +effectiveness of using the generated examples for training smaller models +within a unified fallacy recognition framework. Furthermore, we analyze the +overlap between the synthetic data and existing fallacy datasets. Finally, we +investigate the usefulness of providing supplementary context for detecting +fallacy types that need such context, e.g., diversion fallacies. Our evaluation +results demonstrate consistent improvements across fallacy types, datasets, and +generators. The code and the synthetic datasets are all publicly available. + +
+
+
+
+
+ + ♻ ☆ Unsupervised Robust Cross-Lingual Entity Alignment via Neighbor Triple + Matching with Entity and Relation Texts + + +
+ Cross-lingual entity alignment (EA) enables the integration of multiple +knowledge graphs (KGs) across different languages, providing users with +seamless access to diverse and comprehensive knowledge. Existing methods, +mostly supervised, face challenges in obtaining labeled entity pairs. To +address this, recent studies have shifted towards self-supervised and +unsupervised frameworks. Despite their effectiveness, these approaches have +limitations: (1) Relation passing: mainly focusing on the entity while +neglecting the semantic information of relations, (2) Isomorphic assumption: +assuming isomorphism between source and target graphs, which leads to noise and +reduced alignment accuracy, and (3) Noise vulnerability: susceptible to noise +in the textual features, especially when encountering inconsistent translations +or Out-Of-Vocabulary (OOV) problems. In this paper, we propose ERAlign, an +unsupervised and robust cross-lingual EA pipeline that jointly performs +Entity-level and Relation-level Alignment by neighbor triple matching strategy +using semantic textual features of relations and entities. Its refinement step +iteratively enhances results by fusing entity-level and relation-level +alignments based on neighbor triple matching. The additional verification step +examines the entities' neighbor triples as the linearized text. This +Align-then-Verify pipeline rigorously assesses alignment results, achieving +near-perfect alignment even in the presence of noisy textual features of +entities. Our extensive experiments demonstrate that the robustness and general +applicability of ERAlign improved the accuracy and effectiveness of EA tasks, +contributing significantly to knowledge-oriented applications. + +
+
+
+
+
+ + ♻ ☆ Characterizing Multimodal Long-form Summarization: A Case Study on + Financial Reports + + +
+ As large language models (LLMs) expand the power of natural language +processing to handle long inputs, rigorous and systematic analyses are +necessary to understand their abilities and behavior. A salient application is +summarization, due to its ubiquity and controversy (e.g., researchers have +declared the death of summarization). In this paper, we use financial report +summarization as a case study because financial reports are not only long but +also use numbers and tables extensively. We propose a computational framework +for characterizing multimodal long-form summarization and investigate the +behavior of Claude 2.0/2.1, GPT-4/3.5, and Cohere. We find that GPT-3.5 and +Cohere fail to perform this summarization task meaningfully. For Claude 2 and +GPT-4, we analyze the extractiveness of the summary and identify a position +bias in LLMs. This position bias disappears after shuffling the input for +Claude, which suggests that Claude seems to recognize important information. We +also conduct a comprehensive investigation on the use of numeric data in +LLM-generated summaries and offer a taxonomy of numeric hallucination. We +employ prompt engineering to improve GPT-4's use of numbers with limited +success. Overall, our analyses highlight the strong capability of Claude 2 in +handling long multimodal inputs compared to GPT-4. The generated summaries and +evaluation code are available at +https://github.com/ChicagoHAI/characterizing-multimodal-long-form-summarization. + +
+
+
+
+
+ + ♻ ☆ The Llama 3 Herd of Models + + +
+ Modern artificial intelligence (AI) systems are powered by foundation models. +This paper presents a new set of foundation models, called Llama 3. It is a +herd of language models that natively support multilinguality, coding, +reasoning, and tool usage. Our largest model is a dense Transformer with 405B +parameters and a context window of up to 128K tokens. This paper presents an +extensive empirical evaluation of Llama 3. We find that Llama 3 delivers +comparable quality to leading language models such as GPT-4 on a plethora of +tasks. We publicly release Llama 3, including pre-trained and post-trained +versions of the 405B parameter language model and our Llama Guard 3 model for +input and output safety. The paper also presents the results of experiments in +which we integrate image, video, and speech capabilities into Llama 3 via a +compositional approach. We observe this approach performs competitively with +the state-of-the-art on image, video, and speech recognition tasks. The +resulting models are not yet being broadly released as they are still under +development. + +
+
+
+
+
+ + ♻ ☆ Few Shot Class Incremental Learning using Vision-Language models + + +
+ Recent advancements in deep learning have demonstrated remarkable performance +comparable to human capabilities across various supervised computer vision +tasks. However, the prevalent assumption of having an extensive pool of +training data encompassing all classes prior to model training often diverges +from real-world scenarios, where limited data availability for novel classes is +the norm. The challenge emerges in seamlessly integrating new classes with few +samples into the training data, demanding the model to adeptly accommodate +these additions without compromising its performance on base classes. To +address this exigency, the research community has introduced several solutions +under the realm of few-shot class incremental learning (FSCIL). + In this study, we introduce an innovative FSCIL framework that utilizes +language regularizer and subspace regularizer. During base training, the +language regularizer helps incorporate semantic information extracted from a +Vision-Language model. The subspace regularizer helps in facilitating the +model's acquisition of nuanced connections between image and text semantics +inherent to base classes during incremental training. Our proposed framework +not only empowers the model to embrace novel classes with limited data, but +also ensures the preservation of performance on base classes. To substantiate +the efficacy of our approach, we conduct comprehensive experiments on three +distinct FSCIL benchmarks, where our framework attains state-of-the-art +performance. + +
+
+
+
+
+ + ♻ ☆ Diagnosis extraction from unstructured Dutch echocardiogram reports + using span- and document-level characteristic classification + + +
+ Clinical machine learning research and AI driven clinical decision support +models rely on clinically accurate labels. Manually extracting these labels +with the help of clinical specialists is often time-consuming and expensive. +This study tests the feasibility of automatic span- and document-level +diagnosis extraction from unstructured Dutch echocardiogram reports. We +included 115,692 unstructured echocardiogram reports from the UMCU a large +university hospital in the Netherlands. A randomly selected subset was manually +annotated for the occurrence and severity of eleven commonly described cardiac +characteristics. We developed and tested several automatic labelling techniques +at both span and document levels, using weighted and macro F1-score, precision, +and recall for performance evaluation. We compared the performance of span +labelling against document labelling methods, which included both direct +document classifiers and indirect document classifiers that rely on span +classification results. The SpanCategorizer and MedRoBERTa$.$nl models +outperformed all other span and document classifiers, respectively. The +weighted F1-score varied between characteristics, ranging from 0.60 to 0.93 in +SpanCategorizer and 0.96 to 0.98 in MedRoBERTa$.$nl. Direct document +classification was superior to indirect document classification using span +classifiers. SetFit achieved competitive document classification performance +using only 10% of the training data. Utilizing a reduced label set yielded +near-perfect document classification results. We recommend using our published +SpanCategorizer and MedRoBERTa$.$nl models for span- and document-level +diagnosis extraction from Dutch echocardiography reports. For settings with +limited training data, SetFit may be a promising alternative for document +classification. + +
+
+ comment: 28 pages, 5 figures +
+
+
+
+
+ + ♻ ☆ EUvsDisinfo: a Dataset for Multilingual Detection of Pro-Kremlin + Disinformation in News Articles CIKM 2024 + + +
+ This work introduces EUvsDisinfo, a multilingual dataset of trustworthy and +disinformation articles related to pro-Kremlin themes. It is sourced directly +from the debunk articles written by experts leading the EUvsDisinfo project. +Our dataset is the largest to-date resource in terms of the overall number of +articles and distinct languages. It also provides the largest topical and +temporal coverage. Using this dataset, we investigate the dissemination of +pro-Kremlin disinformation across different languages, uncovering +language-specific patterns targeting specific disinformation topics. We further +analyse the evolution of topic distribution over an eight-year period, noting a +significant surge in disinformation content before the full-scale invasion of +Ukraine in 2022. Lastly, we demonstrate the dataset's applicability in training +models to effectively distinguish between disinformation and trustworthy +content in multilingual settings. + +
+
+ comment: Published at CIKM 2024 +
+
+
+
+
+ + ♻ ☆ MS-HuBERT: Mitigating Pre-training and Inference Mismatch in Masked + Language Modelling methods for learning Speech Representations + + +
+ In recent years, self-supervised pre-training methods have gained significant
+traction in learning high-level information from raw speech. Among these
+methods, HuBERT has demonstrated SOTA performance in automatic speech
+recognition (ASR). However, HuBERT's performance lags behind data2vec due to
+disparities in pre-training strategies. In this paper, we propose (i) a Swap
+method to address the pre-training and inference mismatch observed in HuBERT
+and (ii) the incorporation of a Multicluster masked prediction loss for more
+effective utilization of the model's capacity. The resulting method, MS-HuBERT,
+is an end-to-end self-supervised pre-training method for learning robust speech
+representations. It beats vanilla HuBERT on the ASR Librispeech benchmark on
+average by a 5% margin when evaluated on different finetuning splits.
+Additionally, we demonstrate that the learned embeddings obtained during
+pre-training encode essential information for improving performance of
+content-based tasks such as ASR.
+
+
+
+ comment: 4 pages, submitted to interspeech2024 +
+
+
+
+
+ + ♻ ☆ Distilling Reasoning Ability from Large Language Models with Adaptive + Thinking + + +
+ Chain of thought finetuning (cot-finetuning) aims to endow small language
+models (SLMs) with reasoning ability so as to improve their performance on
+specific tasks by allowing them to imitate the reasoning procedure of large
+language models (LLMs) beyond simply predicting the answers. Most existing
+cot-finetuning methods adopt a pre-thinking mechanism, allowing the SLM to
+generate a rationale before providing an answer. This mechanism enables the SLM
+to analyze and think about complex questions, but it also makes answer
+correctness highly sensitive to minor errors in the rationale. Therefore, we
+propose a robust post-thinking mechanism that generates the answer before the
+rationale. Thanks to this answer-first setting, 1) the answer can escape from
+the adverse effects caused by minor errors in the rationale; 2) the rationale
+serves as an error amplifier to the answer, which makes the SLM focus on
+learning hard samples; 3) inference efficiency also benefits from this setting,
+since users can stop generation right after the answer is output. However,
+although the post-thinking mechanism brings many advantages and improves the
+overall performance of the SLM on specific tasks, it may lose the ability to
+think about questions and decompose complex questions into simple sub-questions
+compared to the pre-thinking mechanism. Therefore, a plug-and-play
+adaptive-thinking mechanism is proposed with the aid of soft prompt tuning to
+integrate the merits of the pre-thinking and post-thinking mechanisms, in which
+a perception module is introduced to adaptively prompt the SLM to answer or
+think first based on the perceived complexity of the questions. Extensive
+experiments are conducted across 12 reasoning tasks and 2 representative
+language models to demonstrate the effectiveness of the proposed mechanism.
+
+
+
+
+
+
+ + ♻ ☆ Large Language Model Sentinel: LLM Agent for Adversarial Purification + + +
+ Over the past two years, the use of large language models (LLMs) has advanced +rapidly. While these LLMs offer considerable convenience, they also raise +security concerns, as LLMs are vulnerable to adversarial attacks by some +well-designed textual perturbations. In this paper, we introduce a novel +defense technique named Large LAnguage MOdel Sentinel (LLAMOS), which is +designed to enhance the adversarial robustness of LLMs by purifying the +adversarial textual examples before feeding them into the target LLM. Our +method comprises two main components: a) Agent instruction, which can simulate +a new agent for adversarial defense, altering minimal characters to maintain +the original meaning of the sentence while defending against attacks; b) +Defense guidance, which provides strategies for modifying clean or adversarial +examples to ensure effective defense and accurate outputs from the target LLMs. +Remarkably, the defense agent demonstrates robust defensive capabilities even +without learning from adversarial examples. Additionally, we conduct an +intriguing adversarial experiment where we develop two agents, one for defense +and one for attack, and engage them in mutual confrontation. During the +adversarial interactions, neither agent completely beat the other. Extensive +experiments on both open-source and closed-source LLMs demonstrate that our +method effectively defends against adversarial attacks, thereby enhancing +adversarial robustness. + +
+
+
+
+
+ + ♻ ☆ A Semantic Space is Worth 256 Language Descriptions: Make Stronger + Segmentation Models with Descriptive Properties ECCV 2024 + + +
+ This paper introduces ProLab, a novel approach using property-level label +space for creating strong interpretable segmentation models. Instead of relying +solely on category-specific annotations, ProLab uses descriptive properties +grounded in common sense knowledge for supervising segmentation models. It is +based on two core designs. First, we employ Large Language Models (LLMs) and +carefully crafted prompts to generate descriptions of all involved categories +that carry meaningful common sense knowledge and follow a structured format. +Second, we introduce a description embedding model preserving semantic +correlation across descriptions and then cluster them into a set of descriptive +properties (e.g., 256) using K-Means. These properties are based on +interpretable common sense knowledge consistent with theories of human +recognition. We empirically show that our approach makes segmentation models +perform stronger on five classic benchmarks (e.g., ADE20K, COCO-Stuff, Pascal +Context, Cityscapes, and BDD). Our method also shows better scalability with +extended training steps than category-level supervision. Our interpretable +segmentation framework also emerges with the generalization ability to segment +out-of-domain or unknown categories using only in-domain descriptive +properties. Code is available at https://github.com/lambert-x/ProLab. + +
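As a rough illustration of the clustering step described above, the sketch below embeds LLM-generated category descriptions and groups them into descriptive properties with K-Means. The embedding model, the toy description list, and the cluster-count handling are illustrative assumptions; ProLab's own description embedding model and prompts are not reproduced here.

# Sketch: cluster LLM-generated category descriptions into descriptive properties.
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans

descriptions = [
    "has four legs and fur",             # e.g., generated for "dog"
    "is a flat paved surface for cars",  # e.g., generated for "road"
    "is a tall woody plant with leaves", # e.g., generated for "tree"
]
encoder = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = encoder.encode(descriptions, normalize_embeddings=True)

# The paper clusters into a few hundred properties (e.g., 256); cap by the
# number of samples so this toy example stays runnable.
kmeans = KMeans(n_clusters=min(256, len(descriptions)), n_init=10, random_state=0).fit(embeddings)
property_ids = kmeans.labels_               # property assignment per description
property_centers = kmeans.cluster_centers_  # property-level label space for supervision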
+
+ comment: Accepted to ECCV 2024. Code is available at + https://github.com/lambert-x/ProLab +
+
+
+
+
+ + ♻ ☆ Do Text-to-Vis Benchmarks Test Real Use of Visualisations? + + +
+ Large language models are able to generate code for visualisations in
+response to user requests. This is a useful application, and an appealing one
+for NLP research because plots of data provide grounding for language. However,
+there are relatively few benchmarks, and it is unknown whether those that exist
+are representative of what people do in practice. This paper aims to answer
+that question through an empirical study comparing benchmark datasets and code
+from public repositories. Our findings reveal a substantial gap in datasets,
+with evaluations not testing the same distribution of chart types, attributes,
+and number of actions. The only representative dataset requires modification to
+become an end-to-end and practical benchmark. This shows that new, more
+representative benchmarks are needed to support the development of systems that
+truly address users' visualisation needs. These observations will guide future
+data creation, highlighting which features hold genuine significance for users.
+
+
+
+ comment: ARR AE score of 4 +
+
+
+
+
+ + ♻ ☆ Follow-Up Questions Improve Documents Generated by Large Language Models + + +
+ This study investigates the impact of Large Language Models (LLMs) generating +follow-up questions in response to user requests for short (1-page) text +documents. Users interacted with a novel web-based AI system designed to ask +follow-up questions. Users requested documents they would like the AI to +produce. The AI then generated follow-up questions to clarify the user's needs +or offer additional insights before generating the requested documents. After +answering the questions, users were shown a document generated using both the +initial request and the questions and answers, and a document generated using +only the initial request. Users indicated which document they preferred and +gave feedback about their experience with the question-answering process. The +findings of this study show clear benefits to question-asking both in document +preference and in the qualitative user experience. This study further shows +that users found more value in questions which were thought-provoking, +open-ended, or offered unique insights into the user's request as opposed to +simple information-gathering questions. + +
+
+
+
+
+ + ♻ ☆ Polyphone Disambiguation in Mandarin Chinese with Semi-Supervised + Learning + + +
+ The majority of Chinese characters are monophonic, while a special group of
+characters, called polyphonic characters, have multiple pronunciations. As a
+prerequisite for performing speech-related generative tasks, the correct
+pronunciation must be identified among several candidates. This process is
+called Polyphone Disambiguation. Although the problem has been well explored
+with both knowledge-based and learning-based approaches, it remains challenging
+due to the lack of publicly available labeled datasets and the irregular nature
+of polyphones in Mandarin Chinese. In this paper, we propose a novel
+semi-supervised learning (SSL) framework for Mandarin Chinese polyphone
+disambiguation that can potentially leverage unlimited unlabeled text data. We
+explore the effect of various proxy labeling strategies including
+entropy-thresholding and lexicon-based labeling. Qualitative and quantitative
+experiments demonstrate that our method achieves state-of-the-art performance.
+In addition, we publish a novel dataset specifically for the polyphone
+disambiguation task to promote further research.
+
+
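A minimal sketch of the entropy-thresholding proxy-labeling strategy mentioned above: keep only the unlabeled examples whose predictive distribution over candidate pronunciations is sufficiently peaked. The threshold value and tensor shapes are illustrative, not the paper's settings.

import torch
import torch.nn.functional as F

def entropy_threshold_pseudo_labels(logits: torch.Tensor, max_entropy: float = 0.3):
    """Keep only confident predictions as proxy labels (entropy-thresholding sketch)."""
    probs = F.softmax(logits, dim=-1)
    entropy = -(probs * probs.clamp_min(1e-12).log()).sum(dim=-1)
    keep = entropy < max_entropy                 # confident examples only
    pseudo_labels = probs.argmax(dim=-1)
    return pseudo_labels[keep], keep

# Example: logits over candidate pronunciations for a batch of polyphonic characters.
logits = torch.randn(8, 4)
labels, mask = entropy_threshold_pseudo_labels(logits)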
+
+
+
+
+ + ♻ ☆ Judgement Citation Retrieval using Contextual Similarity + + +
+ Traditionally in the domain of legal research, the retrieval of pertinent
+citations from intricate case descriptions has demanded manual effort and
+keyword-based search applications that mandate expertise in understanding legal
+jargon. Legal case descriptions hold pivotal information for legal
+professionals and researchers, necessitating more efficient and automated
+approaches. We propose a methodology that combines natural language processing
+(NLP) and machine learning techniques to enhance the organization and
+utilization of legal case descriptions. This approach revolves around the
+creation of textual embeddings with the help of state-of-the-art embedding
+models. Our methodology addresses two primary objectives: unsupervised
+clustering and supervised citation retrieval, both designed to automate the
+citation extraction process. Although the proposed methodology can be used for
+any dataset, we employed the Supreme Court of the United States (SCOTUS)
+dataset, yielding remarkable results. Our methodology achieved an impressive
+accuracy rate of 90.9%. By automating labor-intensive processes, we pave the
+way for a more efficient, time-saving, and accessible landscape in legal
+research, benefiting legal professionals, academics, and researchers.
+
+
+
+ comment: 14 pages, 16 images +
+
+
+
+
+ + ♻ ☆ Do GPT Language Models Suffer From Split Personality Disorder? The + Advent Of Substrate-Free Psychometrics + + +
+ Previous research on emergence in large language models shows that these
+models display apparent human-like abilities and psychological latent traits.
+However, results are partly contradictory regarding the expression and
+magnitude of these latent traits, yet agree on the worrisome tendency to score
+high on the Dark Triad of narcissism, psychopathy, and Machiavellianism, which,
+together with a track record of derailments, demands more rigorous research on
+the safety of these models. We provided a state-of-the-art language model with
+the same personality questionnaire in nine languages, and performed a Bayesian
+analysis of a Gaussian Mixture Model, finding evidence for a deeper-rooted
+issue. Our results suggest both interlingual and intralingual instabilities,
+which indicate that current language models do not develop a consistent core
+personality. This can lead to unsafe behaviour of artificial intelligence
+systems that are based on these foundation models and are increasingly
+integrated into human life. We subsequently discuss the shortcomings of modern
+psychometrics, abstract it, and provide a framework for its species-neutral,
+substrate-free formulation.
+
+
+
+ comment: 37 pages, 7 figures, 3 tables, date v1: Mar 26 2023; replaced with + new version; reason: removed journal logo from older version of article that + is no longer valid +
+
+
+
+
+ + ♻ ☆ Multimodal Emotion Recognition using Audio-Video Transformer Fusion with + Cross Attention + + +
+ Understanding emotions is a fundamental aspect of human communication. +Integrating audio and video signals offers a more comprehensive understanding +of emotional states compared to traditional methods that rely on a single data +source, such as speech or facial expressions. Despite its potential, multimodal +emotion recognition faces significant challenges, particularly in +synchronization, feature extraction, and fusion of diverse data sources. To +address these issues, this paper introduces a novel transformer-based model +named Audio-Video Transformer Fusion with Cross Attention (AVT-CA). The AVT-CA +model employs a transformer fusion approach to effectively capture and +synchronize interlinked features from both audio and video inputs, thereby +resolving synchronization problems. Additionally, the Cross Attention mechanism +within AVT-CA selectively extracts and emphasizes critical features while +discarding irrelevant ones from both modalities, addressing feature extraction +and fusion challenges. Extensive experimental analysis conducted on the +CMU-MOSEI, RAVDESS and CREMA-D datasets demonstrates the efficacy of the +proposed model. The results underscore the importance of AVT-CA in developing +precise and reliable multimodal emotion recognition systems for practical +applications. + +
+
+ comment: 38 Pages, 9 Tables, 12 Figures +
+
+
+
+
+ + ♻ ☆ MathBridge: A Large Corpus Dataset for Translating Spoken Mathematical + Expressions into $LaTeX$ Formulas for Improved Readability + + +
+ Understanding sentences that contain mathematical expressions in text form +poses significant challenges. To address this, the importance of converting +these expressions into a compiled formula is highlighted. For instance, the +expression ``x equals minus b plus or minus the square root of b squared minus +four a c, all over two a'' from automatic speech recognition (ASR) is more +readily comprehensible when displayed as a compiled formula $x = \frac{-b \pm +\sqrt{b^2 - 4ac}}{2a}$. To develop a text-to-formula conversion system, we can +break down the process into text-to-LaTeX and LaTeX-to-formula conversions, +with the latter managed by various existing LaTeX engines. However, the former +approach has been notably hindered by the severe scarcity of text-to-LaTeX +paired data, which presents a significant challenge in this field. In this +context, we introduce MathBridge, the first extensive dataset for translating +mathematical spoken expressions into LaTeX, to establish a robust baseline for +future research on text-to-LaTeX translation. MathBridge comprises +approximately 23 million LaTeX formulas paired with the corresponding spoken +English expressions. Through comprehensive evaluations, including fine-tuning +and testing with data, we discovered that MathBridge significantly enhances the +capabilities of pretrained language models for text-to-LaTeX translation. +Specifically, for the T5-large model, the sacreBLEU score increased from 4.77 +to 46.8, demonstrating substantial enhancement. Our findings indicate the need +for a new metric, specifically for text-to-LaTeX conversion evaluations. + +
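A small inference sketch of the text-to-LaTeX step using a T5-style model from Hugging Face Transformers. The checkpoint name and task prefix are placeholders; in practice the model would first be fine-tuned on spoken-expression/LaTeX pairs such as those in MathBridge.

# Sketch: translate a spoken mathematical expression into LaTeX with a seq2seq model.
from transformers import T5ForConditionalGeneration, T5Tokenizer

model_name = "t5-large"  # placeholder; fine-tune on spoken-text/LaTeX pairs before real use
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

spoken = ("translate spoken math to LaTeX: x equals minus b plus or minus the square "
          "root of b squared minus four a c, all over two a")
inputs = tokenizer(spoken, return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=64)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))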
+
+
 comment: 9 pages, 6 figures
+
+
+
+
+
+ + ♻ ☆ Modeling Comparative Logical Relation with Contrastive Learning for Text + Generation NLPCC 2024 + + +
+ Data-to-Text Generation (D2T), a classic natural language generation problem, +aims at producing fluent descriptions for structured input data, such as a +table. Existing D2T works mainly focus on describing the superficial +associative relations among entities, while ignoring the deep comparative +logical relations, such as A is better than B in a certain aspect with a +corresponding opinion, which is quite common in our daily life. In this paper, +we introduce a new D2T task named comparative logical relation generation +(CLRG). Additionally, we propose a Comparative Logic (CoLo) based text +generation method, which generates texts following specific comparative logical +relations with contrastive learning. Specifically, we first construct various +positive and negative samples by fine-grained perturbations in entities, +aspects and opinions. Then, we perform contrastive learning in the encoder +layer to have a better understanding of the comparative logical relations, and +integrate it in the decoder layer to guide the model to correctly generate the +relations. Noting the data scarcity problem, we construct a Chinese Comparative +Logical Relation Dataset (CLRD), which is a high-quality human-annotated +dataset and challenging for text generation with descriptions of multiple +entities and annotations on their comparative logical relations. Extensive +experiments show that our method achieves impressive performance in both +automatic and human evaluations. + +
+
+ comment: NLPCC 2024 +
+
+
+
+
+ + ♻ ☆ MathScape: Evaluating MLLMs in multimodal Math Scenarios through a + Hierarchical Benchmark + + +
+ With the development of Multimodal Large Language Models (MLLMs), the
+evaluation of multimodal models in the context of mathematical problems has
+become a valuable research field. Multimodal visual-textual mathematical
+reasoning serves as a critical indicator for evaluating the comprehension and
+complex multi-step quantitative reasoning abilities of MLLMs. However, previous
+multimodal math benchmarks have not sufficiently integrated visual and textual
+information. To address this gap, we propose MathScape, a new benchmark that
+emphasizes the understanding and application of combined visual and textual
+information. MathScape is designed to evaluate photo-based math problem
+scenarios, assessing the theoretical understanding and application ability of
+MLLMs through a categorical hierarchical approach. We conduct a
+multi-dimensional evaluation on 11 advanced MLLMs, revealing that our benchmark
+is challenging even for the most sophisticated models. By analyzing the
+evaluation results, we identify the limitations of MLLMs, offering valuable
+insights for enhancing model performance.
+
+
+
+
+
+
+ + ♻ ☆ The Power of Combining Data and Knowledge: GPT-4o is an Effective + Interpreter of Machine Learning Models in Predicting Lymph Node Metastasis of + Lung Cancer + + +
+ Lymph node metastasis (LNM) is a crucial factor in determining the initial +treatment for patients with lung cancer, yet accurate preoperative diagnosis of +LNM remains challenging. Recently, large language models (LLMs) have garnered +significant attention due to their remarkable text generation capabilities. +Leveraging the extensive medical knowledge learned from vast corpora, LLMs can +estimate probabilities for clinical problems, though their performance has +historically been inferior to data-driven machine learning models. In this +paper, we propose a novel ensemble method that combines the medical knowledge +acquired by LLMs with the latent patterns identified by machine learning models +to enhance LNM prediction performance. Initially, we developed machine learning +models using patient data. We then designed a prompt template to integrate the +patient data with the predicted probability from the machine learning model. +Subsequently, we instructed GPT-4o, the most advanced LLM developed by OpenAI, +to estimate the likelihood of LNM based on patient data and then adjust the +estimate using the machine learning output. Finally, we collected three outputs +from the GPT-4o using the same prompt and ensembled these results as the final +prediction. Using the proposed method, our models achieved an AUC value of +0.778 and an AP value of 0.426 for LNM prediction, significantly improving +predictive performance compared to baseline machine learning models. The +experimental results indicate that GPT-4o can effectively leverage its medical +knowledge and the probabilities predicted by machine learning models to achieve +more accurate LNM predictions. These findings demonstrate that LLMs can perform +well in clinical risk prediction tasks, offering a new paradigm for integrating +medical knowledge and patient data in clinical predictions. + +
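An illustrative sketch of the ensembling idea described above: present the LLM with the patient data together with the machine learning model's probability, sample several responses, and average them. The prompt wording, response parsing, and averaging rule are assumptions for illustration, not the paper's exact prompt template.

# Sketch: combine an ML model's probability with an LLM's medical-knowledge estimate.
from openai import OpenAI

client = OpenAI()

def llm_adjusted_probability(patient_summary: str, ml_probability: float, n: int = 3) -> float:
    prompt = (
        f"Patient data: {patient_summary}\n"
        f"A machine learning model estimates the probability of lymph node metastasis "
        f"as {ml_probability:.2f}.\n"
        "Using your medical knowledge, adjust this estimate and reply with a single "
        "probability between 0 and 1."
    )
    estimates = []
    for _ in range(n):  # query several times and ensemble the answers
        reply = client.chat.completions.create(
            model="gpt-4o",
            messages=[{"role": "user", "content": prompt}],
            temperature=1.0,
        )
        # A robust parser would be needed for free-form replies; float() is a simplification.
        estimates.append(float(reply.choices[0].message.content.strip()))
    return sum(estimates) / len(estimates)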
+
+
+
+
+ + ♻ ☆ A Study on Large Language Models' Limitations in Multiple-Choice + Question Answering + + +
+ The widespread adoption of Large Language Models (LLMs) has become +commonplace, particularly with the emergence of open-source models. More +importantly, smaller models are well-suited for integration into consumer +devices and are frequently employed either as standalone solutions or as +subroutines in various AI tasks. Despite their ubiquitous use, there is no +systematic analysis of their specific capabilities and limitations. In this +study, we tackle one of the most widely used tasks - answering Multiple Choice +Question (MCQ). We analyze 26 small open-source models and find that 65% of the +models do not understand the task, only 4 models properly select an answer from +the given choices, and only 5 of these models are choice order independent. +These results are rather alarming given the extensive use of MCQ tests with +these models. We recommend exercising caution and testing task understanding +before using MCQ to evaluate LLMs in any field whatsoever. + +
+
+
+
+
+ + ♻ ☆ Eliminating Biased Length Reliance of Direct Preference Optimization via + Down-Sampled KL Divergence + + +
+ Direct Preference Optimization (DPO) has emerged as a prominent algorithm for
+the direct and robust alignment of Large Language Models (LLMs) with human
+preferences, offering a more straightforward alternative to the complex
+Reinforcement Learning from Human Feedback (RLHF). Despite its promising
+efficacy, DPO faces a notable drawback: "verbosity", a common over-optimization
+phenomenon also observed in RLHF. While previous studies mainly attributed
+verbosity to biased labels within the data, we propose that the issue also
+stems from an inherent algorithmic length reliance in DPO. Specifically, we
+suggest that the discrepancy in sequence-level Kullback-Leibler (KL)
+divergences between chosen and rejected sequences, used in DPO, results in
+overestimated or underestimated rewards due to varying token lengths.
+Empirically, we utilize datasets with different label lengths to demonstrate
+the presence of biased rewards. We then introduce an effective downsampling
+approach, named SamPO, to eliminate potential length reliance. Our experimental
+evaluations, conducted across three LLMs of varying scales and a diverse array
+of conditional and open-ended benchmarks, highlight the efficacy of SamPO in
+mitigating verbosity, achieving improvements of 5% to 12% over DPO through
+debiased rewards. Our code can be accessed at:
+https://github.com/LuJunru/SamPO/.
+
+
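A rough sketch of the down-sampling intuition: make the chosen and rejected responses contribute the same number of per-token log-ratio terms before forming the reward gap, so longer sequences no longer dominate. SamPO's exact sampling scheme may differ; this only conveys the length-debiasing idea.

import torch

def downsampled_logratio(per_token_logratio: torch.Tensor, mask: torch.Tensor, k: int) -> torch.Tensor:
    """Sum a random subset of k valid per-token log-ratios (policy minus reference)."""
    valid = mask.nonzero(as_tuple=True)[0]
    idx = valid[torch.randperm(valid.numel())[:k]]
    return per_token_logratio[idx].sum()

# Example: equalise token counts between a long chosen answer and a short rejected one.
chosen_lr, rejected_lr = torch.randn(120), torch.randn(40)
k = min(chosen_lr.numel(), rejected_lr.numel())
reward_gap = downsampled_logratio(chosen_lr, torch.ones(120, dtype=torch.bool), k) \
           - downsampled_logratio(rejected_lr, torch.ones(40, dtype=torch.bool), k)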
+
+ comment: We thank Shiyue Xu for pointing out the error in Equation 5 in the + previous draft: https://github.com/LuJunru/SamPO/issues/1 +
+
+
+
+
+ + ♻ ☆ Model Merging in LLMs, MLLMs, and Beyond: Methods, Theories, + Applications and Opportunities + + +
+ Model merging is an efficient empowerment technique in the machine learning +community that does not require the collection of raw training data and does +not require expensive computation. As model merging becomes increasingly +prevalent across various fields, it is crucial to understand the available +model merging techniques comprehensively. However, there is a significant gap +in the literature regarding a systematic and thorough review of these +techniques. This survey provides a comprehensive overview of model merging +methods and theories, their applications in various domains and settings, and +future research directions. Specifically, we first propose a new taxonomic +approach that exhaustively discusses existing model merging methods. Secondly, +we discuss the application of model merging techniques in large language +models, multimodal large language models, and 10+ machine learning subfields, +including continual learning, multi-task learning, few-shot learning, etc. +Finally, we highlight the remaining challenges of model merging and discuss +future research directions. A comprehensive list of papers about model merging +is available at +\url{https://github.com/EnnengYang/Awesome-Model-Merging-Methods-Theories-Applications}. + +
+
+
+
+
+ + ♻ ☆ A Unified Taxonomy-Guided Instruction Tuning Framework for Entity Set + Expansion and Taxonomy Expansion + + +
+ Entity set expansion, taxonomy expansion, and seed-guided taxonomy +construction are three representative tasks that can be applied to +automatically populate an existing taxonomy with emerging concepts. Previous +studies view them as three separate tasks. Therefore, their proposed techniques +usually work for one specific task only, lacking generalizability and a +holistic perspective. In this paper, we aim at a unified solution to the three +tasks. To be specific, we identify two common skills needed for entity set +expansion, taxonomy expansion, and seed-guided taxonomy construction: finding +"siblings" and finding "parents". We propose a taxonomy-guided instruction +tuning framework to teach a large language model to generate siblings and +parents for query entities, where the joint pre-training process facilitates +the mutual enhancement of the two skills. Extensive experiments on multiple +benchmark datasets demonstrate the efficacy of our proposed TaxoInstruct +framework, which outperforms task-specific baselines across all three tasks. + +
+
+
+
+
+ + ♻ ☆ DP-MemArc: Differential Privacy Transfer Learning for Memory Efficient + Language Models + + +
+ Large language models have repeatedly shown outstanding performance across
+diverse applications. However, deploying these models can inadvertently risk
+user privacy. The significant memory demands during training pose a major
+challenge in terms of resource consumption. This substantial size places a
+heavy load on memory resources, raising considerable practical concerns. In
+this paper, we introduce DP-MemArc, a novel training framework aimed at
+reducing the memory costs of large language models while emphasizing the
+protection of user data privacy. DP-MemArc incorporates side network or
+reversible network designs to support a variety of differentially private,
+memory-efficient fine-tuning schemes. Our approach not only achieves memory
+optimization but also ensures robust privacy protection, keeping user data
+secure and confidential. Extensive experiments have demonstrated that DP-MemArc
+effectively provides differentially private, memory-efficient fine-tuning
+across different task scenarios.
+
+
+
+ comment: 9 pages second version +
+
+
+
+
+ + ♻ ☆ RAGSys: Item-Cold-Start Recommender as RAG System + + +
+ Large Language Models (LLM) hold immense promise for real-world applications, +but their generic knowledge often falls short of domain-specific needs. +Fine-tuning, a common approach, can suffer from catastrophic forgetting and +hinder generalizability. In-Context Learning (ICL) offers an alternative, which +can leverage Retrieval-Augmented Generation (RAG) to provide LLMs with relevant +demonstrations for few-shot learning tasks. This paper explores the desired +qualities of a demonstration retrieval system for ICL. We argue that ICL +retrieval in this context resembles item-cold-start recommender systems, +prioritizing discovery and maximizing information gain over strict relevance. +We propose a novel evaluation method that measures the LLM's subsequent +performance on NLP tasks, eliminating the need for subjective diversity scores. +Our findings demonstrate the critical role of diversity and quality bias in +retrieved demonstrations for effective ICL, and highlight the potential of +recommender system techniques in this domain. + +
+
+
+
+
+ + ♻ ☆ GRAMMAR: Grounded and Modular Methodology for Assessment of + Closed-Domain Retrieval-Augmented Language Model + + +
+ Retrieval-Augmented Generation (RAG) systems are widely used across various
+industries for querying closed-domain and in-house knowledge bases. However,
+evaluating these systems presents significant challenges due to the private
+nature of closed-domain data and a scarcity of queries with verifiable ground
+truths. Moreover, there is a lack of analytical methods to diagnose problematic
+modules and identify types of failure, such as those caused by knowledge
+deficits or issues with robustness. To address these challenges, we introduce
+GRAMMAR (GRounded And Modular Methodology for Assessment of RAG), an evaluation
+framework comprising a grounded data generation process and an evaluation
+protocol that effectively pinpoints defective modules. Our validation
+experiments reveal that traditional reference-free evaluation methods often
+inaccurately assess false generations, tending toward optimism. In contrast,
+GRAMMAR provides a reliable approach for identifying vulnerable modules and
+supports hypothesis testing for textual form vulnerabilities. An open-source
+tool accompanying this framework is available in our GitHub repository
+\url{https://github.com/xinzhel/grammar}, allowing for easy reproduction of our
+results and enabling reliable and modular evaluation in closed-domain settings.
+
+
+
+ comment: Under Review +
+
+
+
+
+ + ♻ ☆ Trojan Activation Attack: Red-Teaming Large Language Models using + Activation Steering for Safety-Alignment CIKM'24 + + +
+ To ensure AI safety, instruction-tuned Large Language Models (LLMs) are +specifically trained to ensure alignment, which refers to making models behave +in accordance with human intentions. While these models have demonstrated +commendable results on various safety benchmarks, the vulnerability of their +safety alignment has not been extensively studied. This is particularly +troubling given the potential harm that LLMs can inflict. Existing attack +methods on LLMs often rely on poisoned training data or the injection of +malicious prompts. These approaches compromise the stealthiness and +generalizability of the attacks, making them susceptible to detection. +Additionally, these models often demand substantial computational resources for +implementation, making them less practical for real-world applications. In this +work, we study a different attack scenario, called Trojan Activation Attack +(TA^2), which injects trojan steering vectors into the activation layers of +LLMs. These malicious steering vectors can be triggered at inference time to +steer the models toward attacker-desired behaviors by manipulating their +activations. Our experiment results on four primary alignment tasks show that +TA^2 is highly effective and adds little or no overhead to attack efficiency. +Additionally, we discuss potential countermeasures against such activation +attacks. + +
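A minimal sketch of activation steering with a PyTorch forward hook: a steering vector is added to the hidden states of one transformer block at inference time. The layer index, scale, and the way the steering vector is obtained are illustrative assumptions, not the paper's attack recipe.

import torch

def register_steering_hook(block: torch.nn.Module, steering_vector: torch.Tensor, scale: float = 4.0):
    """Add a steering vector to a block's output hidden states during the forward pass."""
    def hook(module, inputs, output):
        hidden = output[0] if isinstance(output, tuple) else output
        hidden = hidden + scale * steering_vector.to(hidden.dtype).to(hidden.device)
        return (hidden, *output[1:]) if isinstance(output, tuple) else hidden
    return block.register_forward_hook(hook)

# Hypothetical usage with a decoder-only model whose blocks live in model.model.layers:
# handle = register_steering_hook(model.model.layers[14], steering_vector)
# ... run generation; the vector now nudges activations toward the target behaviour ...
# handle.remove()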
+
+ comment: ACM International Conference on Information and Knowledge Management + (CIKM'24) +
+
+
+
+
+ + ♻ ☆ Enhancing Data Privacy in Large Language Models through Private + Association Editing + + +
+ Large Language Models (LLMs) are powerful tools with extensive applications, +but their tendency to memorize private information raises significant concerns +as private data leakage can easily happen. In this paper, we introduce Private +Association Editing (PAE), a novel defense approach for private data leakage. +PAE is designed to effectively remove Personally Identifiable Information (PII) +without retraining the model. Our approach consists of a four-step procedure: +detecting memorized PII, applying PAE cards to mitigate memorization of private +data, verifying resilience to targeted data extraction (TDE) attacks, and +ensuring consistency in the post-edit LLMs. The versatility and efficiency of +PAE, which allows for batch modifications, significantly enhance data privacy +in LLMs. Experimental results demonstrate the effectiveness of PAE in +mitigating private data leakage. We believe PAE will serve as a critical tool +in the ongoing effort to protect data privacy in LLMs, encouraging the +development of safer models for real-world applications. + +
+
+
+
+
+
+
+
+ + Computer Vision and Pattern Recognition 143 + +
+
+
+ + ☆ Can Large Language Models Understand Symbolic Graphics Programs? + + +
+ Assessing the capabilities of large language models (LLMs) is often +challenging, in part, because it is hard to find tasks to which they have not +been exposed during training. We take one step to address this challenge by +turning to a new task: focusing on symbolic graphics programs, which are a +popular representation for graphics content that procedurally generates visual +data. LLMs have shown exciting promise towards program synthesis, but do they +understand symbolic graphics programs? Unlike conventional programs, symbolic +graphics programs can be translated to graphics content. Here, we characterize +an LLM's understanding of symbolic programs in terms of their ability to answer +questions related to the graphics content. This task is challenging as the +questions are difficult to answer from the symbolic programs alone -- yet, they +would be easy to answer from the corresponding graphics content as we verify +through a human experiment. To understand symbolic programs, LLMs may need to +possess the ability to imagine how the corresponding graphics content would +look without directly accessing the rendered visual content. We use this task +to evaluate LLMs by creating a large benchmark for the semantic understanding +of symbolic graphics programs. This benchmark is built via program-graphics +correspondence, hence requiring minimal human efforts. We evaluate current LLMs +on our benchmark to elucidate a preliminary assessment of their ability to +reason about visual scenes from programs. We find that this task distinguishes +existing LLMs and models considered good at reasoning perform better. Lastly, +we introduce Symbolic Instruction Tuning (SIT) to improve this ability. +Specifically, we query GPT4-o with questions and images generated by symbolic +programs. Such data are then used to finetune an LLM. We also find that SIT +data can improve the general instruction following ability of LLMs. + +
+
+ comment: Technical Report v1 (44 pages, 23 figures, project page: + https://sgp-bench.github.io/) +
+
+
+
+
+ + ☆ Understanding the Local Geometry of Generative Model Manifolds + + +
+ Deep generative models learn continuous representations of complex data +manifolds using a finite number of samples during training. For a pre-trained +generative model, the common way to evaluate the quality of the manifold +representation learned, is by computing global metrics like Fr\'echet Inception +Distance using a large number of generated and real samples. However, +generative model performance is not uniform across the learned manifold, e.g., +for \textit{foundation models} like Stable Diffusion generation performance can +vary significantly based on the conditioning or initial noise vector being +denoised. In this paper we study the relationship between the \textit{local +geometry of the learned manifold} and downstream generation. Based on the +theory of continuous piecewise-linear (CPWL) generators, we use three geometric +descriptors - scaling ($\psi$), rank ($\nu$), and complexity ($\delta$) - to +characterize a pre-trained generative model manifold locally. We provide +quantitative and qualitative evidence showing that for a given latent, the +local descriptors are correlated with generation aesthetics, artifacts, +uncertainty, and even memorization. Finally we demonstrate that training a +\textit{reward model} on the local geometry can allow controlling the +likelihood of a generated sample under the learned distribution. + +
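One way to make the local descriptors concrete under the CPWL view is to inspect the Jacobian of the generator at a latent: its singular values yield a local scaling term and an effective rank. The definitions below are an interpretation for illustration and may not match the paper's exact formulations.

import torch

def local_geometry(generator: torch.nn.Module, z: torch.Tensor, tol: float = 1e-3):
    """Local scaling and rank of a generator at an unbatched latent z (sketch)."""
    J = torch.autograd.functional.jacobian(
        lambda x: generator(x.unsqueeze(0)).flatten(), z
    )  # shape: (output_dim, latent_dim)
    s = torch.linalg.svdvals(J)
    scaling = torch.log(s.clamp_min(1e-12)).sum()  # log-volume change of the local affine map
    rank = int((s > tol * s.max()).sum())          # number of locally active latent directions
    return scaling.item(), rank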
+
+ comment: Pre-print. 11 pages main, 8 pages app., 28 figures +
+
+
+
+
+ + ☆ Towards Flexible Visual Relationship Segmentation + + +
+ Visual relationship understanding has been studied separately in human-object
+interaction (HOI) detection, scene graph generation (SGG), and referring
+relationships (RR) tasks. Given the complexity and interconnectedness of these
+tasks, it is crucial to have a flexible framework that can effectively address
+them in a cohesive manner. In this work, we propose FleVRS, a single model that
+seamlessly integrates the above three aspects in standard and promptable visual
+relationship segmentation, and further possesses the capability for
+open-vocabulary segmentation to adapt to novel scenarios. FleVRS leverages the
+synergy between text and image modalities to ground various types of
+relationships from images, and uses textual features from vision-language
+models to support visual conceptual understanding. Empirical validation across
+various datasets demonstrates that our framework outperforms existing models
+in standard, promptable, and open-vocabulary tasks, e.g., +1.9 $mAP$ on
+HICO-DET, +11.4 $Acc$ on VRD, +4.7 $mAP$ on unseen HICO-DET. Our FleVRS
+represents a significant step towards a more intuitive, comprehensive, and
+scalable understanding of visual relationships.
+
+
+
+
+
+
+ + ☆ SLCA++: Unleash the Power of Sequential Fine-tuning for Continual + Learning with Pre-training ICCV 23 + + +
+ In recent years, continual learning with pre-training (CLPT) has received +widespread interest, instead of its traditional focus of training from scratch. +The use of strong pre-trained models (PTMs) can greatly facilitate knowledge +transfer and alleviate catastrophic forgetting, but also suffers from +progressive overfitting of pre-trained knowledge into specific downstream +tasks. A majority of current efforts often keep the PTMs frozen and incorporate +task-specific prompts to instruct representation learning, coupled with a +prompt selection process for inference. However, due to the limited capacity of +prompt parameters, this strategy demonstrates only sub-optimal performance in +continual learning. In comparison, tuning all parameters of PTMs often provides +the greatest potential for representation learning, making sequential +fine-tuning (Seq FT) a fundamental baseline that has been overlooked in CLPT. +To this end, we present an in-depth analysis of the progressive overfitting +problem from the lens of Seq FT. Considering that the overly fast +representation learning and the biased classification layer constitute this +particular problem, we introduce the advanced Slow Learner with Classifier +Alignment (SLCA++) framework to unleash the power of Seq FT, serving as a +strong baseline approach for CLPT. Our approach involves a Slow Learner to +selectively reduce the learning rate of backbone parameters, and a Classifier +Alignment to align the disjoint classification layers in a post-hoc fashion. We +further enhance the efficacy of SL with a symmetric cross-entropy loss, as well +as employ a parameter-efficient strategy to implement Seq FT with SLCA++. +Across a variety of continual learning scenarios on image classification +benchmarks, our approach provides substantial improvements and outperforms +state-of-the-art methods by a large margin. Code: +https://github.com/GengDavid/SLCA. + +
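A minimal sketch of the Slow Learner idea: fine-tune all parameters sequentially, but give the pre-trained backbone a much smaller learning rate than the classification head. The 0.1x ratio and SGD settings are illustrative; the classifier-alignment step of SLCA++ is not shown.

import torch

def build_slow_learner_optimizer(backbone: torch.nn.Module,
                                 classifier: torch.nn.Module,
                                 base_lr: float = 1e-2) -> torch.optim.Optimizer:
    """Separate parameter groups: slow backbone, regular-speed classifier head."""
    return torch.optim.SGD(
        [
            {"params": backbone.parameters(), "lr": base_lr * 0.1},  # slow learner
            {"params": classifier.parameters(), "lr": base_lr},
        ],
        momentum=0.9,
    )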
+
+ comment: This paper is an extension of our ICCV 23 paper (arXiv:2303.05118) +
+
+
+
+
+ + ☆ HeightLane: BEV Heightmap guided 3D Lane Detection + + +
+ Accurate 3D lane detection from monocular images presents significant +challenges due to depth ambiguity and imperfect ground modeling. Previous +attempts to model the ground have often used a planar ground assumption with +limited degrees of freedom, making them unsuitable for complex road +environments with varying slopes. Our study introduces HeightLane, an +innovative method that predicts a height map from monocular images by creating +anchors based on a multi-slope assumption. This approach provides a detailed +and accurate representation of the ground. HeightLane employs the predicted +heightmap along with a deformable attention-based spatial feature transform +framework to efficiently convert 2D image features into 3D bird's eye view +(BEV) features, enhancing spatial understanding and lane structure recognition. +Additionally, the heightmap is used for the positional encoding of BEV +features, further improving their spatial accuracy. This explicit view +transformation bridges the gap between front-view perceptions and spatially +accurate BEV representations, significantly improving detection performance. To +address the lack of the necessary ground truth (GT) height map in the original +OpenLane dataset, we leverage the Waymo dataset and accumulate its LiDAR data +to generate a height map for the drivable area of each scene. The GT heightmaps +are used to train the heightmap extraction module from monocular images. +Extensive experiments on the OpenLane validation set show that HeightLane +achieves state-of-the-art performance in terms of F-score, highlighting its +potential in real-world applications. + +
+
+ comment: 10 pages, 6 figures, 5 tables +
+
+
+
+
+ + ☆ Snuffy: Efficient Whole Slide Image Classifier ECCV 2024 + + +
+ Whole Slide Image (WSI) classification with multiple instance learning (MIL)
+in digital pathology faces significant computational challenges. Current
+methods mostly rely on extensive self-supervised learning (SSL) for
+satisfactory performance, requiring long training periods and considerable
+computational resources. At the same time, using no pre-training degrades
+performance due to the domain shift from natural images to WSIs. We introduce
+the \textbf{\textit{Snuffy}} architecture, a novel MIL-pooling method based on
+sparse transformers that mitigates performance loss with limited pre-training
+and enables continual few-shot pre-training as a competitive option. Our
+sparsity pattern is tailored for pathology and is theoretically proven to be a
+universal approximator with the tightest probabilistic sharp bound on the
+number of layers for sparse transformers, to date. We demonstrate Snuffy's
+effectiveness on the CAMELYON16 and TCGA Lung cancer datasets, achieving
+superior WSI and patch-level accuracies. The code is available at
+\url{https://github.com/jafarinia/snuffy}.
+
+
+
+ comment: Accepted for ECCV 2024 +
+
+
+
+
+ + ☆ Computer Vision Model Compression Techniques for Embedded Systems: A + Survey + + +
+ Deep neural networks have consistently represented the state of the art in +most computer vision problems. In these scenarios, larger and more complex +models have demonstrated superior performance to smaller architectures, +especially when trained with plenty of representative data. With the recent +adoption of Vision Transformer (ViT) based architectures and advanced +Convolutional Neural Networks (CNNs), the total number of parameters of leading +backbone architectures increased from 62M parameters in 2012 with AlexNet to 7B +parameters in 2024 with AIM-7B. Consequently, deploying such deep architectures +faces challenges in environments with processing and runtime constraints, +particularly in embedded systems. This paper covers the main model compression +techniques applied for computer vision tasks, enabling modern models to be used +in embedded systems. We present the characteristics of compression subareas, +compare different approaches, and discuss how to choose the best technique and +expected variations when analyzing it on various embedded devices. We also +share codes to assist researchers and new practitioners in overcoming initial +implementation challenges for each subarea and present trends for Model +Compression. Case studies for compression models are available at +\href{https://github.com/venturusbr/cv-model-compression}{https://github.com/venturusbr/cv-model-compression}. + +
+
+
+
+
+ + ☆ Comparative Evaluation of 3D Reconstruction Methods for Object Pose + Estimation + + +
+ Object pose estimation is essential to many industrial applications involving
+robotic manipulation, navigation, and augmented reality. Current generalizable
+object pose estimators, i.e., approaches that do not need to be trained per
+object, rely on accurate 3D models. Predominantly, CAD models are used, which
+can be hard to obtain in practice. At the same time, it is often possible to
+acquire images of an object. Naturally, this leads to the question of whether
+3D models reconstructed from images are sufficient to facilitate accurate
+object pose estimation. We aim to answer this question by proposing a novel
+benchmark for measuring the impact of 3D reconstruction quality on pose
+estimation accuracy. Our benchmark provides calibrated images for object
+reconstruction registered with the test images of the YCB-V dataset for pose
+evaluation under the BOP benchmark format. Detailed experiments with multiple
+state-of-the-art 3D reconstruction and object pose estimation approaches show
+that the geometry produced by modern reconstruction methods is often sufficient
+for accurate pose estimation. Our experiments lead to interesting observations:
+(1) Standard metrics for measuring 3D reconstruction quality are not
+necessarily indicative of pose estimation accuracy, which shows the need for
+dedicated benchmarks such as ours. (2) Classical, non-learning-based approaches
+can perform on par with modern learning-based reconstruction techniques and can
+even offer a better reconstruction time-pose accuracy tradeoff. (3) There is
+still a sizable gap between performance with reconstructed and with CAD models.
+To foster research on closing this gap, our benchmark is publicly available at
+https://github.com/VarunBurde/reconstruction_pose_benchmark.
+
+
+
+
+
+
+ + ☆ Rethinking Medical Anomaly Detection in Brain MRI: An Image Quality + Assessment Perspective + + +
+ Reconstruction-based methods, particularly those leveraging autoencoders, +have been widely adopted to perform anomaly detection in brain MRI. While most +existing works try to improve detection accuracy by proposing new model +structures or algorithms, we tackle the problem through image quality +assessment, an underexplored perspective in the field. We propose a fusion +quality loss function that combines Structural Similarity Index Measure loss +with l1 loss, offering a more comprehensive evaluation of reconstruction +quality. Additionally, we introduce a data pre-processing strategy that +enhances the average intensity ratio (AIR) between normal and abnormal regions, +further improving the distinction of anomalies. By fusing the aforementioned +two methods, we devise the image quality assessment (IQA) approach. The +proposed IQA approach achieves significant improvements (>10%) in terms of Dice +coefficient (DICE) and Area Under the Precision-Recall Curve (AUPRC) on the +BraTS21 (T2, FLAIR) and MSULB datasets when compared with state-of-the-art +methods. These results highlight the importance of invoking the comprehensive +image quality assessment in medical anomaly detection and provide a new +perspective for future research in this field. + +
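A compact sketch of a fusion quality loss of the kind described above, combining an SSIM term with an l1 term for autoencoder reconstructions. It assumes the pytorch-msssim package (any SSIM implementation would do) and an illustrative weighting, not the paper's exact coefficients.

import torch
import torch.nn.functional as F
from pytorch_msssim import ssim  # assumed SSIM implementation; inputs are (N, C, H, W) in [0, 1]

def fusion_quality_loss(recon: torch.Tensor, target: torch.Tensor, alpha: float = 0.5) -> torch.Tensor:
    """Weighted combination of a structural (SSIM) term and an intensity (l1) term."""
    ssim_loss = 1.0 - ssim(recon, target, data_range=1.0)
    l1_loss = F.l1_loss(recon, target)
    return alpha * ssim_loss + (1.0 - alpha) * l1_loss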
+
+
+
+
+ + ☆ The Dawn of KAN in Image-to-Image (I2I) Translation: Integrating + Kolmogorov-Arnold Networks with GANs for Unpaired I2I Translation + + +
+ Image-to-Image translation in Generative Artificial Intelligence (Generative +AI) has been a central focus of research, with applications spanning +healthcare, remote sensing, physics, chemistry, photography, and more. Among +the numerous methodologies, Generative Adversarial Networks (GANs) with +contrastive learning have been particularly successful. This study aims to +demonstrate that the Kolmogorov-Arnold Network (KAN) can effectively replace +the Multi-layer Perceptron (MLP) method in generative AI, particularly in the +subdomain of image-to-image translation, to achieve better generative quality. +Our novel approach replaces the two-layer MLP with a two-layer KAN in the +existing Contrastive Unpaired Image-to-Image Translation (CUT) model, +developing the KAN-CUT model. This substitution favors the generation of more +informative features in low-dimensional vector representations, which +contrastive learning can utilize more effectively to produce high-quality +images in the target domain. Extensive experiments, detailed in the results +section, demonstrate the applicability of KAN in conjunction with contrastive +learning and GANs in Generative AI, particularly for image-to-image +translation. This work suggests that KAN could be a valuable component in the +broader generative AI domain. + +
+
+ comment: 10 pages, 6 Figures, 1 Table +
+
+
+
+
+ + ☆ Moving Healthcare AI-Support Systems for Visually Detectable Diseases + onto Constrained Devices + + +
+ Image classification usually requires connectivity and access to the cloud +which is often limited in many parts of the world, including hard to reach +rural areas. TinyML aims to solve this problem by hosting AI assistants on +constrained devices, eliminating connectivity issues by processing data within +the device itself, without internet or cloud access. This pilot study explores +the use of tinyML to provide healthcare support with low spec devices in low +connectivity environments, focusing on diagnosis of skin diseases and the +ethical use of AI assistants in a healthcare setting. To investigate this, +10,000 images of skin lesions were used to train a model for classifying +visually detectable diseases (VDDs). The model weights were then offloaded to a +Raspberry Pi with a webcam attached, to be used for the classification of skin +lesions without internet access. It was found that the developed prototype +achieved a test accuracy of 78% and a test loss of 1.08. + +
+
+ comment: 6 pages, 5 figures +
+
+
+
+
+ + ☆ Learned Multimodal Compression for Autonomous Driving SP 2024 + + +
+ Autonomous driving sensors generate an enormous amount of data. In this +paper, we explore learned multimodal compression for autonomous driving, +specifically targeted at 3D object detection. We focus on camera and LiDAR +modalities and explore several coding approaches. One approach involves joint +coding of fused modalities, while others involve coding one modality first, +followed by conditional coding of the other modality. We evaluate the +performance of these coding schemes on the nuScenes dataset. Our experimental +results indicate that joint coding of fused modalities yields better results +compared to the alternatives. + +
+
+ comment: 6 pages, 5 figures, IEEE MMSP 2024 +
+
+
+
+
+ + ☆ WaterSplatting: Fast Underwater 3D Scene Reconstruction Using Gaussian + Splatting + + +
+ The underwater 3D scene reconstruction is a challenging, yet interesting +problem with applications ranging from naval robots to VR experiences. The +problem was successfully tackled by fully volumetric NeRF-based methods which +can model both the geometry and the medium (water). Unfortunately, these +methods are slow to train and do not offer real-time rendering. More recently, +3D Gaussian Splatting (3DGS) method offered a fast alternative to NeRFs. +However, because it is an explicit method that renders only the geometry, it +cannot render the medium and is therefore unsuited for underwater +reconstruction. Therefore, we propose a novel approach that fuses volumetric +rendering with 3DGS to handle underwater data effectively. Our method employs +3DGS for explicit geometry representation and a separate volumetric field +(queried once per pixel) for capturing the scattering medium. This dual +representation further allows the restoration of the scenes by removing the +scattering medium. Our method outperforms state-of-the-art NeRF-based methods +in rendering quality on the underwater SeaThru-NeRF dataset. Furthermore, it +does so while offering real-time rendering performance, addressing the +efficiency limitations of existing methods. Web: +https://water-splatting.github.io + +
+
+ comment: Web: https://water-splatting.github.io +
+
+
+
+
+ + ☆ A Multi-task Adversarial Attack Against Face Authentication + + +
+ Deep-learning-based identity management systems, such as face authentication
+systems, are vulnerable to adversarial attacks. However, existing attacks are
+typically designed for single-task purposes, which means they are tailored to
+exploit vulnerabilities unique to the individual target rather than being
+adaptable for multiple users or systems. This limitation makes them unsuitable
+for certain attack scenarios, such as morphing, universal, transferable, and
+counter attacks. In this paper, we propose a multi-task adversarial attack
+algorithm called MTADV that is adaptable for multiple users or systems. By
+interpreting these scenarios as multi-task attacks, MTADV is applicable to both
+single- and multi-task attacks, and feasible in the white- and gray-box
+settings. Furthermore, MTADV is effective against various face datasets,
+including LFW, CelebA, and CelebA-HQ, and can work with different deep learning
+models, such as FaceNet, InsightFace, and CurricularFace. Importantly, MTADV
+retains its feasibility as a single-task attack targeting a single user/system.
+To the best of our knowledge, MTADV is the first adversarial attack method that
+can target all of the aforementioned scenarios in one algorithm.
+
+
+
+ comment: Accepted by ACM Transactions on Multimedia Computing, Communications, + and Applications +
+
+
+
+
+ + ☆ Towards Practical Human Motion Prediction with LiDAR Point Clouds + + +
+ Human motion prediction is crucial for human-centric multimedia understanding
+and interaction. Current methods typically rely on ground truth human poses as
+observed input, which is not practical for real-world scenarios where only raw
+visual sensor data is available. To implement these methods in practice, a
+preliminary pose estimation phase is essential. However, such two-stage
+approaches often lead to performance degradation due to the accumulation of
+errors. Moreover, reducing raw visual data to sparse keypoint representations
+significantly diminishes the density of information, resulting in the loss of
+fine-grained features. In this paper, we propose \textit{LiDAR-HMP}, the first
+single-LiDAR-based 3D human motion prediction approach, which receives the raw
+LiDAR point cloud as input and forecasts future 3D human poses directly.
+Building upon our novel structure-aware body feature descriptor, LiDAR-HMP
+adaptively maps the observed motion manifold to future poses and effectively
+models the spatial-temporal correlations of human motions for further
+refinement of prediction results. Extensive experiments show that our method
+achieves state-of-the-art performance on two public benchmarks and demonstrates
+remarkable robustness and efficacy in real-world deployments.
+
+
+
+
+
+
+ + ☆ Heavy Labels Out! Dataset Distillation with Label Space Lightening + + +
+ Dataset distillation or condensation aims to condense a large-scale training
+dataset into a much smaller synthetic one such that the training performance of
+distilled and original sets on neural networks is similar. Although the number
+of training samples can be reduced substantially, current state-of-the-art
+methods heavily rely on enormous soft labels to achieve satisfactory
+performance. As a result, the required storage can be comparable even to
+original datasets, especially for large-scale ones. To solve this problem,
+instead of storing these heavy labels, we propose a novel label-lightening
+framework termed HeLlO, which aims at effective image-to-label projectors, with
+which synthetic labels can be directly generated online from synthetic images.
+Specifically, to construct such projectors, we leverage prior knowledge in
+open-source foundation models, e.g., CLIP, and introduce a LoRA-like
+fine-tuning strategy to mitigate the gap between pre-trained and target
+distributions, so that original models for soft-label generation can be
+distilled into a group of low-rank matrices. Moreover, an effective image
+optimization method is proposed to further mitigate the potential error between
+the original and distilled label generators. Extensive experiments demonstrate
+that with only about 0.003% of the original storage required for a complete set
+of soft labels, we achieve comparable performance to current state-of-the-art
+dataset distillation methods on large-scale datasets. Our code will be
+available.
+
+&#13;
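+ To make the projector idea concrete, here is a minimal sketch of an online
+soft-label generator built from a frozen foundation encoder plus a trainable
+low-rank (LoRA-style) correction. It is an assumed reading of the abstract, not
+the released HeLlO code; the encoder, dimensions, and class names are
+placeholders.
+
+import torch
+import torch.nn as nn
+
+class LowRankLabelProjector(nn.Module):
+    # A frozen encoder (e.g. a CLIP image tower) maps synthetic images to
+    # features; soft labels are generated online from a frozen linear head plus
+    # a trainable low-rank correction, so no per-image soft labels are stored.
+    def __init__(self, encoder, feat_dim, num_classes, rank=16):
+        super().__init__()
+        self.encoder = encoder.eval()
+        for p in self.encoder.parameters():
+            p.requires_grad_(False)               # keep the foundation model frozen
+        self.base_head = nn.Linear(feat_dim, num_classes, bias=False)
+        self.lora_down = nn.Linear(feat_dim, rank, bias=False)   # the low-rank matrices are
+        self.lora_up = nn.Linear(rank, num_classes, bias=False)  # the only stored label generator
+        nn.init.zeros_(self.lora_up.weight)
+
+    def forward(self, images):
+        with torch.no_grad():
+            feats = self.encoder(images)
+        logits = self.base_head(feats) + self.lora_up(self.lora_down(feats))
+        return logits.softmax(dim=-1)             # online soft labels for distillation
+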
+
+
+
+
+ + ☆ Beyond Full Label: Single-Point Prompt for Infrared Small Target Label + Generation + + +
+ In this work, we make the first attempt to construct a learning-based
+single-point annotation paradigm for infrared small target label generation
+(IRSTLG). Our intuition is that label generation requires just one more point
+prompt than target detection: IRSTLG can be regarded as an infrared small
+target detection (IRSTD) task with a target location hint. Based on this
+insight, we introduce an energy double guided single-point prompt (EDGSP)
+framework, which adeptly transforms the target detection network into a refined
+label generation method. Specifically, the proposed EDGSP includes: 1) target
+energy initialization (TEI) to create a foundational outline for sufficient
+shape evolution of the pseudo label, 2) double prompt embedding (DPE) for rapid
+localization of regions of interest and reinforcement of individual differences
+to avoid label adhesion, and 3) bounding box-based matching (BBM) to eliminate
+false alarms. Experimental results show that pseudo labels generated by three
+baselines equipped with EDGSP achieve 100% object-level probability of
+detection (Pd) and 0% false-alarm rate (Fa) on the SIRST, NUDT-SIRST, and
+IRSTD-1k datasets, with a pixel-level intersection over union (IoU) improvement
+of 13.28% over state-of-the-art label generation methods. Additionally, the
+downstream detection task reveals that our centroid-annotated pseudo labels
+surpass full labels: even with coarse single-point annotations, the detector
+still achieves 99.5% of the performance obtained with full labeling.
+
+&#13;
+
+
+
+
+ + ☆ FancyVideo: Towards Dynamic and Consistent Video Generation via + Cross-frame Textual Guidance + + +
+ Synthesizing motion-rich and temporally consistent videos remains a challenge
+in artificial intelligence, especially when dealing with extended durations.
+Existing text-to-video (T2V) models commonly employ spatial cross-attention for
+text control, guiding the generation of all frames equivalently, without
+frame-specific textual guidance. Thus, the model's capacity to comprehend the
+temporal logic conveyed in prompts and generate videos with coherent motion is
+restricted. To tackle this limitation, we introduce FancyVideo, an innovative
+video generator that improves the existing text-control mechanism with the
+well-designed Cross-frame Textual Guidance Module (CTGM). Specifically, CTGM
+incorporates the Temporal Information Injector (TII), Temporal Affinity Refiner
+(TAR), and Temporal Feature Booster (TFB) at the beginning, middle, and end of
+cross-attention, respectively, to achieve frame-specific textual guidance.
+Firstly, TII injects frame-specific information from latent features into text
+conditions, thereby obtaining cross-frame textual conditions. Then, TAR refines
+the correlation matrix between cross-frame textual conditions and latent
+features along the time dimension. Lastly, TFB boosts the temporal consistency
+of latent features. Extensive experiments comprising both quantitative and
+qualitative evaluations demonstrate the effectiveness of FancyVideo. Our
+approach achieves state-of-the-art T2V generation results on the EvalCrafter
+benchmark and facilitates the synthesis of dynamic and consistent videos. Video
+results are available at https://fancyvideo.github.io/, and we will make our
+code and model weights publicly available.
+
+&#13;
+
+
+
+
+ + ☆ Not Every Image is Worth a Thousand Words: Quantifying Originality in + Stable Diffusion ICML 2024 + + +
+ This work addresses the challenge of quantifying originality in text-to-image
+(T2I) generative diffusion models, with a focus on copyright originality. We
+begin by evaluating T2I models' ability to innovate and generalize through
+controlled experiments, revealing that stable diffusion models can effectively
+recreate unseen elements with sufficiently diverse training data. Then, our key
+insight is that concepts and combinations of image elements the model is
+familiar with, and saw more during training, are more concisely represented in
+the model's latent space. We hence propose a method that leverages textual
+inversion to measure the originality of an image based on the number of tokens
+required for its reconstruction by the model. Our approach is inspired by legal
+definitions of originality and aims to assess whether a model can produce
+original content without relying on specific prompts or having the training
+data of the model. We demonstrate our method using both a pre-trained stable
+diffusion model and a synthetic dataset, showing a correlation between the
+number of tokens and image originality. This work contributes to the
+understanding of originality in generative models and has implications for
+copyright infringement cases.
+
+&#13;
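+ The token-counting idea can be sketched as a small search loop: an image that
+can be reconstructed with very few learned tokens is treated as less original.
+The snippet below is only an assumed illustration of that logic; the
+reconstruction_error callable (running textual inversion with k tokens and
+returning an error) is a hypothetical stand-in for the paper's procedure.
+
+def originality_score(image, reconstruction_error, max_tokens=16, tol=0.05):
+    # reconstruction_error(image, k): user-supplied callable that runs textual
+    # inversion with k learnable tokens and returns an error in [0, 1].
+    for k in range(1, max_tokens + 1):
+        if reconstruction_error(image, k) <= tol:   # representable with k tokens
+            return k / max_tokens                   # fewer tokens -> lower originality
+    return 1.0                                      # never reconstructed well: maximally original
+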
+
+ comment: GenLaw ICML 2024 +
+
+
+
+
+ + ☆ Your Turn: Real-World Turning Angle Estimation for Parkinson's Disease + Severity Assessment + + +
+ People with Parkinson's Disease (PD) often experience progressively worsening
+gait, including changes in how they turn around, as the disease progresses.
+Existing clinical rating tools are not capable of capturing hour-by-hour
+variations of PD symptoms, as they are confined to brief assessments within
+clinic settings. Measuring real-world gait turning angles continuously and
+passively is a component step towards using gait characteristics as sensitive
+indicators of disease progression in PD. This paper presents a deep
+learning-based approach to automatically quantify turning angles by extracting
+3D skeletons from videos and calculating the rotation of hip and knee joints.
+We utilise state-of-the-art human pose estimation models, Fastpose and Strided
+Transformer, on a total of 1386 turning video clips from 24 subjects (12 people
+with PD and 12 healthy control volunteers), trimmed from a PD dataset of
+unscripted free-living videos in a home-like setting (Turn-REMAP). We also
+curate a turning video dataset, Turn-H3.6M, from the public Human3.6M human
+pose benchmark with 3D ground truth, to further validate our method. Previous
+gait research has primarily taken place in clinics or laboratories evaluating
+scripted gait outcomes, but this work focuses on real-world settings where
+complexities exist, such as baggy clothing and poor lighting. Due to
+difficulties in obtaining accurate ground truth data in a free-living setting,
+we quantise the angle into the nearest $45^\circ$ bin based on the manual
+labelling of expert clinicians. Our method achieves a turning calculation
+accuracy of 41.6%, a Mean Absolute Error (MAE) of $34.7^\circ$, and a weighted
+precision (WPrec) of 68.3% for Turn-REMAP. This is the first work to explore
+the use of single monocular camera data to quantify turns by PD patients in a
+home setting.
+
+&#13;
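+ One simple way to realize the turning-angle computation from 3D hip keypoints,
+including the 45-degree quantisation used for evaluation, is sketched below.
+The heading definition is an assumption for illustration, not the exact
+pipeline.
+
+import numpy as np
+
+def turning_angle_deg(left_hip, right_hip):
+    # left_hip, right_hip: (T, 3) arrays of 3D joint positions over T frames.
+    # The pelvis heading is taken as the ground-plane direction of the
+    # left-to-right hip axis; the turn is the accumulated change of heading.
+    hip_axis = right_hip[:, [0, 2]] - left_hip[:, [0, 2]]       # (x, z) components
+    heading = np.arctan2(hip_axis[:, 1], hip_axis[:, 0])
+    steps = np.diff(np.unwrap(heading))                         # frame-to-frame heading change
+    total = np.degrees(np.sum(steps))
+    quantized = 45.0 * np.round(total / 45.0)                   # nearest 45-degree bin
+    return total, quantized
+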
+
+
+
+
+ + ☆ Towards flexible perception with visual memory + + +
+ Training a neural network is a monolithic endeavor, akin to carving knowledge +into stone: once the process is completed, editing the knowledge in a network +is nearly impossible, since all information is distributed across the network's +weights. We here explore a simple, compelling alternative by marrying the +representational power of deep neural networks with the flexibility of a +database. Decomposing the task of image classification into image similarity +(from a pre-trained embedding) and search (via fast nearest neighbor retrieval +from a knowledge database), we build a simple and flexible visual memory that +has the following key capabilities: (1.) The ability to flexibly add data +across scales: from individual samples all the way to entire classes and +billion-scale data; (2.) The ability to remove data through unlearning and +memory pruning; (3.) An interpretable decision-mechanism on which we can +intervene to control its behavior. Taken together, these capabilities +comprehensively demonstrate the benefits of an explicit visual memory. We hope +that it might contribute to a conversation on how knowledge should be +represented in deep vision models -- beyond carving it in ``stone'' weights. + +
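+ The decomposition described above amounts to embedding plus nearest-neighbor
+retrieval over an editable store. A generic sketch under assumed interfaces
+(not the paper's system) is given below; queries and stored entries are
+precomputed embeddings from any frozen encoder.
+
+import numpy as np
+
+class VisualMemory:
+    def __init__(self, k=5):
+        self.k = k
+        self.embeddings = []                     # one entry per stored sample
+        self.labels = []
+
+    def add(self, embedding, label):             # flexibly add a sample or class
+        self.embeddings.append(np.asarray(embedding))
+        self.labels.append(label)
+
+    def remove(self, label):                     # "unlearning" by deleting entries
+        keep = [i for i, l in enumerate(self.labels) if l != label]
+        self.embeddings = [self.embeddings[i] for i in keep]
+        self.labels = [self.labels[i] for i in keep]
+
+    def classify(self, query):                   # interpretable decision: inspect neighbors
+        dists = np.linalg.norm(np.stack(self.embeddings) - query, axis=1)
+        nearest = np.argsort(dists)[: self.k]
+        votes = [self.labels[i] for i in nearest]
+        return max(set(votes), key=votes.count), nearest
+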
+
+
+
+
+ + ☆ Unsupervised Variational Translator for Bridging Image Restoration and + High-Level Vision Tasks + + +
+ Recent research tries to extend image restoration capabilities from human
+perception to machine perception, thereby enhancing the performance of
+high-level vision tasks in degraded environments. These methods, primarily
+based on supervised learning, typically involve the retraining of restoration
+networks or high-level vision networks. However, collecting paired data in
+real-world scenarios and retraining large-scale models are challenging. To this
+end, we propose an unsupervised learning method called Variational Translator
+(VaT), which does not require retraining existing restoration and high-level
+vision networks. Instead, it establishes a lightweight network that serves as
+an intermediate bridge between them. By variational inference, VaT approximates
+the joint distribution of restoration output and high-level vision input,
+dividing the optimization objective into preserving content and maximizing
+marginal likelihood associated with high-level vision tasks. By cleverly
+leveraging self-training paradigms, VaT achieves the above optimization
+objective without requiring labels. As a result, the translated images maintain
+a close resemblance to their original content while also demonstrating
+exceptional performance on high-level vision tasks. Extensive experiments in
+dehazing and low-light enhancement for detection and classification show the
+superiority of our method over other state-of-the-art unsupervised
+counterparts, even significantly surpassing supervised methods in some complex
+real-world scenarios.
+
+&#13;
+
+
+
+
+ + ☆ Unlearnable Examples Detection via Iterative Filtering ICANN 2024 + + +
+ Deep neural networks are proven to be vulnerable to data poisoning attacks. +Recently, a specific type of data poisoning attack known as availability +attacks has led to the failure of data utilization for model learning by adding +imperceptible perturbations to images. Consequently, it is quite beneficial and +challenging to detect poisoned samples, also known as Unlearnable Examples +(UEs), from a mixed dataset. In response, we propose an Iterative Filtering +approach for UEs identification. This method leverages the distinction between +the inherent semantic mapping rules and shortcuts, without the need for any +additional information. We verify that when training a classifier on a mixed +dataset containing both UEs and clean data, the model tends to quickly adapt to +the UEs compared to the clean data. Due to the accuracy gaps between training +with clean/poisoned samples, we employ a model to misclassify clean samples +while correctly identifying the poisoned ones. The incorporation of additional +classes and iterative refinement enhances the model's ability to differentiate +between clean and poisoned samples. Extensive experiments demonstrate the +superiority of our method over state-of-the-art detection approaches across +various attacks, datasets, and poison ratios, significantly reducing the Half +Total Error Rate (HTER) compared to existing methods. + +
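+ A rough sketch of the filtering loop implied by the abstract follows: samples
+the model fits fastest are flagged as likely poisoned and set aside, and the
+process repeats on the remainder. This is an assumed procedure with a
+user-supplied train_one_epoch step, not the authors' code.
+
+import torch
+
+def iterative_filtering(model, images, labels, train_one_epoch,
+                        rounds=3, flag_ratio=0.2):
+    # images, labels: full mixed dataset as tensors.
+    # train_one_epoch(model, images, labels): caller-provided training step.
+    clean_idx = torch.arange(len(images))
+    flagged = []
+    for _ in range(rounds):
+        train_one_epoch(model, images[clean_idx], labels[clean_idx])
+        with torch.no_grad():
+            logits = model(images[clean_idx])
+            losses = torch.nn.functional.cross_entropy(
+                logits, labels[clean_idx], reduction="none"
+            )
+        n_flag = max(1, int(flag_ratio * len(clean_idx)))
+        order = torch.argsort(losses)              # fastest-learned (lowest loss) first
+        flagged.extend(clean_idx[order[:n_flag]].tolist())
+        clean_idx = clean_idx[order[n_flag:]]      # keep the rest for the next round
+    return sorted(flagged), clean_idx.tolist()
+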
+
+ comment: Accepted by ICANN 2024 +
+
+
+
+
+ + ☆ CorrAdaptor: Adaptive Local Context Learning for Correspondence Pruning ECAI + + +
+ In the fields of computer vision and robotics, accurate pixel-level +correspondences are essential for enabling advanced tasks such as +structure-from-motion and simultaneous localization and mapping. Recent +correspondence pruning methods usually focus on learning local consistency +through k-nearest neighbors, which makes it difficult to capture robust context +for each correspondence. We propose CorrAdaptor, a novel architecture that +introduces a dual-branch structure capable of adaptively adjusting local +contexts through both explicit and implicit local graph learning. Specifically, +the explicit branch uses KNN-based graphs tailored for initial neighborhood +identification, while the implicit branch leverages a learnable matrix to +softly assign neighbors and adaptively expand the local context scope, +significantly enhancing the model's robustness and adaptability to complex +image variations. Moreover, we design a motion injection module to integrate +motion consistency into the network to suppress the impact of outliers and +refine local context learning, resulting in substantial performance +improvements. The experimental results on extensive correspondence-based tasks +indicate that our CorrAdaptor achieves state-of-the-art performance both +qualitatively and quantitatively. The code and pre-trained models are available +at https://github.com/TaoWangzj/CorrAdaptor. + +
+
+ comment: 8 pages, 4 figures, accepted by ECAI +
+
+
+
+
+ + ☆ Category-Prompt Refined Feature Learning for Long-Tailed Multi-Label + Image Classification ACM MM 2024 + + +
+ Real-world data consistently exhibits a long-tailed distribution, often +spanning multiple categories. This complexity underscores the challenge of +content comprehension, particularly in scenarios requiring Long-Tailed +Multi-Label image Classification (LTMLC). In such contexts, imbalanced data +distribution and multi-object recognition pose significant hurdles. To address +this issue, we propose a novel and effective approach for LTMLC, termed +Category-Prompt Refined Feature Learning (CPRFL), utilizing semantic +correlations between different categories and decoupling category-specific +visual representations for each category. Specifically, CPRFL initializes +category-prompts from the pretrained CLIP's embeddings and decouples +category-specific visual representations through interaction with visual +features, thereby facilitating the establishment of semantic correlations +between the head and tail classes. To mitigate the visual-semantic domain bias, +we design a progressive Dual-Path Back-Propagation mechanism to refine the +prompts by progressively incorporating context-related visual information into +prompts. Simultaneously, the refinement process facilitates the progressive +purification of the category-specific visual representations under the guidance +of the refined prompts. Furthermore, taking into account the negative-positive +sample imbalance, we adopt the Asymmetric Loss as our optimization objective to +suppress negative samples across all classes and potentially enhance the +head-to-tail recognition performance. We validate the effectiveness of our +method on two LTMLC benchmarks and extensive experiments demonstrate the +superiority of our work over baselines. + The code is available at https://github.com/jiexuanyan/CPRFL. + +
+
+ comment: Accepted by ACM MM 2024 +
+
+
+
+
+ + ☆ Unsupervised Part Discovery via Dual Representation Alignment + + +
+ Object parts serve as crucial intermediate representations in various +downstream tasks, but part-level representation learning still has not received +as much attention as other vision tasks. Previous research has established that +Vision Transformer can learn instance-level attention without labels, +extracting high-quality instance-level representations for boosting downstream +tasks. In this paper, we achieve unsupervised part-specific attention learning +using a novel paradigm and further employ the part representations to improve +part discovery performance. Specifically, paired images are generated from the +same image with different geometric transformations, and multiple part +representations are extracted from these paired images using a novel module, +named PartFormer. These part representations from the paired images are then +exchanged to improve geometric transformation invariance. Subsequently, the +part representations are aligned with the feature map extracted by a feature +map encoder, achieving high similarity with the pixel representations of the +corresponding part regions and low similarity in irrelevant regions. Finally, +the geometric and semantic constraints are applied to the part representations +through the intermediate results in alignment for part-specific attention +learning, encouraging the PartFormer to focus locally and the part +representations to explicitly include the information of the corresponding +parts. Moreover, the aligned part representations can further serve as a series +of reliable detectors in the testing phase, predicting pixel masks for part +discovery. Extensive experiments are carried out on four widely used datasets, +and our results demonstrate that the proposed method achieves competitive +performance and robustness due to its part-specific attention. + +
+
+ comment: Accepted by TPAMI-2024 +
+
+
+
+
+ + ☆ Multimodal Causal Reasoning Benchmark: Challenging Vision Large Language + Models to Infer Causal Links Between Siamese Images + + +
+ Large Language Models (LLMs) have showcased exceptional ability in causal +reasoning from textual information. However, will these causalities remain +straightforward for Vision Large Language Models (VLLMs) when only visual hints +are provided? Motivated by this, we propose a novel Multimodal Causal Reasoning +benchmark, namely MuCR, to challenge VLLMs to infer semantic cause-and-effect +relationship when solely relying on visual cues such as action, appearance, +clothing, and environment. Specifically, we introduce a prompt-driven image +synthesis approach to create siamese images with embedded semantic causality +and visual cues, which can effectively evaluate VLLMs' causal reasoning +capabilities. Additionally, we develop tailored metrics from multiple +perspectives, including image-level match, phrase-level understanding, and +sentence-level explanation, to comprehensively assess VLLMs' comprehension +abilities. Our extensive experiments reveal that the current state-of-the-art +VLLMs are not as skilled at multimodal causal reasoning as we might have hoped. +Furthermore, we perform a comprehensive analysis to understand these models' +shortcomings from different views and suggest directions for future research. +We hope MuCR can serve as a valuable resource and foundational benchmark in +multimodal causal reasoning research. The project is available at: +https://github.com/Zhiyuan-Li-John/MuCR + +
+
+ comment: 20 pages +
+
+
+
+
+ + ☆ When Video Coding Meets Multimodal Large Language Models: A Unified + Paradigm for Video Coding + + +
+ Existing codecs are designed to eliminate intrinsic redundancies to create a +compact representation for compression. However, strong external priors from +Multimodal Large Language Models (MLLMs) have not been explicitly explored in +video compression. Herein, we introduce a unified paradigm for Cross-Modality +Video Coding (CMVC), which is a pioneering approach to explore multimodality +representation and video generative models in video coding. Specifically, on +the encoder side, we disentangle a video into spatial content and motion +components, which are subsequently transformed into distinct modalities to +achieve very compact representation by leveraging MLLMs. During decoding, +previously encoded components and video generation models are leveraged to +create multiple encoding-decoding modes that optimize video reconstruction +quality for specific decoding requirements, including Text-Text-to-Video (TT2V) +mode to ensure high-quality semantic information and Image-Text-to-Video (IT2V) +mode to achieve superb perceptual consistency. In addition, we propose an +efficient frame interpolation model for IT2V mode via Low-Rank Adaption (LoRA) +tuning to guarantee perceptual quality, which allows the generated motion cues +to behave smoothly. Experiments on benchmarks indicate that TT2V achieves +effective semantic reconstruction, while IT2V exhibits competitive perceptual +consistency. These results highlight potential directions for future research +in video coding. + +
+
+
+
+
+ + ☆ OC3D: Weakly Supervised Outdoor 3D Object Detection with Only Coarse + Click Annotation + + +
+ LiDAR-based outdoor 3D object detection has received widespread attention.
+However, training 3D detectors from the LiDAR point cloud typically relies on
+expensive bounding box annotations. This paper presents OC3D, an innovative
+weakly supervised method requiring only coarse clicks on the bird's eye view of
+the 3D point cloud. A key challenge here is the absence of complete geometric
+descriptions of the target objects from such simple click annotations. To
+address this problem, our proposed OC3D adopts a two-stage strategy. In the
+first stage, we initially design a novel dynamic and static classification
+strategy and then propose the Click2Box and Click2Mask modules to generate
+box-level and mask-level pseudo-labels for static and dynamic instances,
+respectively. In the second stage, we design a Mask2Box module, leveraging the
+learning capabilities of neural networks to update mask-level pseudo-labels,
+which contain less information, to box-level pseudo-labels. Experimental
+results on the widely used KITTI and nuScenes datasets demonstrate that our
+OC3D with only coarse clicks achieves state-of-the-art performance compared to
+weakly-supervised 3D detection methods. Combining OC3D with a missing click
+mining strategy, we propose an OC3D++ pipeline, which requires only 0.2%
+annotation cost in the KITTI dataset to achieve performance comparable to fully
+supervised methods.
+
+&#13;
+
+
+
+
+ + ☆ HAIR: Hypernetworks-based All-in-One Image Restoration + + +
+ Image restoration involves recovering a high-quality clean image from its
+degraded version, which is a fundamental task in computer vision. Recent
+progress in image restoration has demonstrated the effectiveness of learning
+models capable of addressing various degradations simultaneously, i.e., the
+All-in-One image restoration models. However, these existing methods typically
+use the same parameters for images with different degradation types, forcing
+the model to trade off between degradation types and thereby impairing overall
+performance. To solve this problem, we propose HAIR, a Hypernetworks-based
+plug-and-play method that dynamically generates parameters for the
+corresponding networks based on the content of the input image. HAIR consists
+of two main components: a Classifier (Cl) and a Hyper Selecting Net (HSN). More
+specifically, the Classifier is a simple image classification network used to
+generate a Global Information Vector (GIV) that contains the degradation
+information of the input image, and the HSN is a simple fully connected neural
+network that receives the GIV and outputs parameters for the corresponding
+modules. Extensive experiments show that incorporating HAIR into existing
+architectures can significantly improve the performance of different models on
+image restoration tasks at a low cost, even though HAIR only generates
+parameters and does not change the models' logical structure at all. By
+incorporating HAIR into the popular architecture Restormer, our method obtains
+superior or at least comparable performance to current state-of-the-art methods
+on a range of image restoration tasks. Code and pre-trained checkpoints are
+available at https://github.com/toummHus/HAIR.
+
+&#13;
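+ A minimal sketch of the Classifier-plus-HSN idea is shown below, assuming the
+GIV conditions a per-image convolution. Shapes and module names are
+illustrative assumptions, not the released HAIR code.
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+class HyperSelectingNet(nn.Module):
+    # A small classifier summarizes the degraded features into a Global
+    # Information Vector (GIV); a fully connected hyper-network then emits the
+    # weights of a 3x3 conv applied to the feature map, so parameters adapt
+    # per input image.
+    def __init__(self, feat_ch=64, giv_dim=32):
+        super().__init__()
+        self.classifier = nn.Sequential(
+            nn.AdaptiveAvgPool2d(1), nn.Flatten(),
+            nn.Linear(feat_ch, giv_dim), nn.ReLU()
+        )
+        self.hsn = nn.Linear(giv_dim, feat_ch * feat_ch * 3 * 3)  # emits conv weights
+
+    def forward(self, feats):                      # feats: (B, C, H, W)
+        giv = self.classifier(feats)
+        weights = self.hsn(giv).view(-1, feats.shape[1], feats.shape[1], 3, 3)
+        outs = [F.conv2d(f.unsqueeze(0), w, padding=1)
+                for f, w in zip(feats, weights)]   # per-sample generated convolution
+        return torch.cat(outs, dim=0)
+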
+
+ comment: 13 pages, 4 figures, 6 tables +
+
+
+
+
+ + ☆ ColorMamba: Towards High-quality NIR-to-RGB Spectral Translation with + Mamba + + +
+ Translating NIR to the visible spectrum is challenging due to cross-domain +complexities. Current models struggle to balance a broad receptive field with +computational efficiency, limiting practical use. Although the Selective +Structured State Space Model, especially the improved version, Mamba, excels in +generative tasks by capturing long-range dependencies with linear complexity, +its default approach of converting 2D images into 1D sequences neglects local +context. In this work, we propose a simple but effective backbone, dubbed +ColorMamba, which first introduces Mamba into spectral translation tasks. To +explore global long-range dependencies and local context for efficient spectral +translation, we introduce learnable padding tokens to enhance the distinction +of image boundaries and prevent potential confusion within the sequence model. +Furthermore, local convolutional enhancement and agent attention are designed +to improve the vanilla Mamba. Moreover, we exploit the HSV color to provide +multi-scale guidance in the reconstruction process for more accurate spectral +translation. Extensive experiments show that our ColorMamba achieves a 1.02 +improvement in terms of PSNR compared with the state-of-the-art method. Our +code is available at https://github.com/AlexYangxx/ColorMamba. + +
+
+ comment: Code is available at https://github.com/AlexYangxx/ColorMamba +
+
+
+
+
+ + ☆ Single-image coherent reconstruction of objects and humans CVPR + + +
+ Existing methods for reconstructing objects and humans from a monocular image +suffer from severe mesh collisions and performance limitations for interacting +occluding objects. This paper introduces a method to obtain a globally +consistent 3D reconstruction of interacting objects and people from a single +image. Our contributions include: 1) an optimization framework, featuring a +collision loss, tailored to handle human-object and human-human interactions, +ensuring spatially coherent scene reconstruction; and 2) a novel technique to +robustly estimate 6 degrees of freedom (DOF) poses, specifically for heavily +occluded objects, exploiting image inpainting. Notably, our proposed method +operates effectively on images from real-world scenarios, without necessitating +scene or object-level 3D supervision. Extensive qualitative and quantitative +evaluation against existing methods demonstrates a significant reduction in +collisions in the final reconstructions of scenes with multiple interacting +humans and objects and a more coherent scene reconstruction. + +
+
+ comment: Accepted at AI for 3D Generation, CVPR Workshop +
+
+
+
+
+ + ☆ Treat Stillness with Movement: Remote Sensing Change Detection via + Coarse-grained Temporal Foregrounds Mining + + +
+ Current works address the remote sensing change detection task using
+bi-temporal images. Although good performance can be achieved, few of them
+consider motion cues, which may also be vital. In this work, we revisit the
+widely adopted bi-temporal images-based framework and propose a novel
+Coarse-grained Temporal Mining Augmented (CTMA) framework. To be specific,
+given the bi-temporal images, we first transform them into a video using
+interpolation operations. Then, a set of temporal encoders is adopted to
+extract the motion features from the obtained video for coarse-grained changed
+region prediction. Subsequently, we design a novel Coarse-grained Foregrounds
+Augmented Spatial Encoder module to integrate both global and local
+information. We also introduce a motion augmented strategy that leverages
+motion cues as an additional output to aggregate with the spatial features for
+improved results. Meanwhile, we feed the input image pairs into the ResNet to
+get the different features and also the spatial blocks for fine-grained feature
+learning. More importantly, we propose a mask augmented strategy that utilizes
+coarse-grained changed regions, incorporating them into the decoder blocks to
+enhance the final changed prediction. Extensive experiments conducted on
+multiple benchmark datasets fully validate the effectiveness of our proposed
+framework for remote sensing image change detection. The source code of this
+paper will be released at
+https://github.com/Event-AHU/CTM_Remote_Sensing_Change_Detection
+
+&#13;
+
+ comment: In Peer Review +
+
+
+
+
+ + ☆ MambaMIM: Pre-training Mamba with State Space Token-interpolation + + +
+ Generative self-supervised learning demonstrates outstanding representation +learning capabilities in both Convolutional Neural Networks (CNNs) and Vision +Transformers (ViTs). However, there are currently no generative pre-training +methods related to selective state space models (Mamba) that can handle +long-range dependencies effectively. To address this challenge, we introduce a +generative self-supervised learning method for Mamba (MambaMIM) based on +Selective Structure State Space Sequence Token-interpolation (S6T), a +general-purpose pre-training method for arbitrary Mamba architectures. Our +method, MambaMIM, incorporates a bottom-up 3D hybrid masking strategy in the +encoder to maintain masking consistency across different architectures. +Additionally, S6T is employed to learn causal relationships between the masked +sequence in the state space. MambaMIM can be used on any single or hybrid Mamba +architectures to enhance the Mamba long-range representation capability. +Extensive downstream experiments reveal the feasibility and advancement of +using Mamba for pre-training medical image tasks. The code is available at: +https://github.com/FengheTan9/MambaMIM + +
+
+ comment: 10 pages, 7 figures +
+
+
+
+
+ + ☆ Navigating Data Scarcity using Foundation Models: A Benchmark of + Few-Shot and Zero-Shot Learning Approaches in Medical Imaging MICCAI 2024 + + +
+ Data scarcity is a major limiting factor for applying modern machine learning +techniques to clinical tasks. Although sufficient data exists for some +well-studied medical tasks, there remains a long tail of clinically relevant +tasks with poor data availability. Recently, numerous foundation models have +demonstrated high suitability for few-shot learning (FSL) and zero-shot +learning (ZSL), potentially making them more accessible to practitioners. +However, it remains unclear which foundation model performs best on FSL medical +image analysis tasks and what the optimal methods are for learning from limited +data. We conducted a comprehensive benchmark study of ZSL and FSL using 16 +pretrained foundation models on 19 diverse medical imaging datasets. Our +results indicate that BiomedCLIP, a model pretrained exclusively on medical +data, performs best on average for very small training set sizes, while very +large CLIP models pretrained on LAION-2B perform best with slightly more +training samples. However, simply fine-tuning a ResNet-18 pretrained on +ImageNet performs similarly with more than five training examples per class. +Our findings also highlight the need for further research on foundation models +specifically tailored for medical applications and the collection of more +datasets to train these models. + +
+
+ comment: Accepted as an oral presentation in MICCAI 2024 2nd International + Workshop on Foundation Models for General Medical AI +
+
+
+
+
+ + ☆ CamoTeacher: Dual-Rotation Consistency Learning for Semi-Supervised + Camouflaged Object Detection ECCV 2024 + + +
+ Existing camouflaged object detection (COD) methods depend heavily on
+large-scale pixel-level annotations. However, acquiring such annotations is
+laborious due to the inherent camouflage characteristics of the objects.
+Semi-supervised learning offers a promising solution to this challenge. Yet,
+its application in COD is hindered by significant pseudo-label noise, at both
+the pixel level and the instance level. We introduce CamoTeacher, a novel
+semi-supervised COD framework, utilizing Dual-Rotation Consistency Learning
+(DRCL) to effectively address these noise issues. Specifically, DRCL minimizes
+pseudo-label noise by leveraging the consistency of rotated views at the pixel
+level and instance level. First, it employs Pixel-wise Consistency Learning
+(PCL) to deal with pixel-level noise by reweighting the different parts within
+the pseudo-label. Second, Instance-wise Consistency Learning (ICL) is used to
+adjust weights for pseudo-labels, which handles instance-level noise. Extensive
+experiments on four COD benchmark datasets demonstrate that the proposed
+CamoTeacher not only achieves state-of-the-art performance compared with
+semi-supervised learning methods, but also rivals established fully-supervised
+learning methods. Our code will be available soon.
+
+&#13;
+
+ comment: Accepted to ECCV 2024 +
+
+
+
+
+ + ☆ PI-Att: Topology Attention for Segmentation Networks through Adaptive + Persistence Image Representation + + +
+ Segmenting multiple objects (e.g., organs) in medical images often requires +an understanding of their topology, which simultaneously quantifies the shape +of the objects and their positions relative to each other. This understanding +is important for segmentation networks to generalize better with limited +training data, which is common in medical image analysis. However, many popular +networks were trained to optimize only pixel-wise performance, ignoring the +topological correctness of the segmentation. In this paper, we introduce a new +topology-aware loss function, which we call PI-Att, that explicitly forces the +network to minimize the topological dissimilarity between the ground truth and +prediction maps. We quantify the topology of each map by the persistence image +representation, for the first time in the context of a segmentation network +loss. Besides, we propose a new mechanism to adaptively calculate the +persistence image at the end of each epoch based on the network's performance. +This adaptive calculation enables the network to learn topology outline in the +first epochs, and then topology details towards the end of training. The +effectiveness of the proposed PI-Att loss is demonstrated on two different +datasets for aorta and great vessel segmentation in computed tomography images. + +
+
+
+
+
+ + ☆ An Advanced Deep Learning Based Three-Stream Hybrid Model for Dynamic + Hand Gesture Recognition + + +
+ In the modern context, hand gesture recognition has emerged as a focal point.
+This is due to its wide range of applications, which include comprehending sign
+language, factories, hands-free devices, and guiding robots. Many researchers
+have attempted to develop more effective techniques for recognizing these hand
+gestures. However, there are challenges like dataset limitations, variations in
+hand forms, external environments, and inconsistent lighting conditions. To
+address these challenges, we propose a novel three-stream hybrid model that
+combines RGB pixel and skeleton-based features to recognize hand gestures. In
+the procedure, we preprocessed the dataset, including augmentation, to make the
+system invariant to rotation, translation, and scaling. We employed a
+three-stream hybrid model to extract the multi-feature fusion using the power
+of the deep learning module. In the first stream, we extracted the initial
+feature using a pre-trained ImageNet module and then enhanced this feature by
+using a multi-layer of the GRU and LSTM modules. In the second stream, we
+extracted the initial feature with a pre-trained ResNet module and enhanced it
+with various combinations of the GRU and LSTM modules. In the third stream, we
+extracted the hand pose key points using MediaPipe and then enhanced them using
+a stacked LSTM to produce the hierarchical feature. After that, we concatenated
+the three features to produce the final feature vector. Finally, we employed a
+classification module to produce the probabilistic map and generate the
+predicted output. We mainly produced a powerful feature vector by taking
+advantage of the pixel-based deep learning feature and the
+pose-estimation-based stacked deep learning feature, combining pre-trained
+models with a deep learning model trained from scratch for unequalled gesture
+detection capabilities.
+
+&#13;
+
+
+
+
+ + ☆ DIVE: Towards Descriptive and Diverse Visual Commonsense Generation EMNLP 2023 + + +
+ Towards human-level visual understanding, visual commonsense generation has
+been introduced to generate commonsense inferences beyond images. However,
+current research on visual commonsense generation has overlooked an important
+human cognitive ability: generating descriptive and diverse inferences. In this
+work, we propose a novel visual commonsense generation framework, called DIVE,
+which aims to improve the descriptiveness and diversity of generated
+inferences. DIVE involves two methods, generic inference filtering and
+contrastive retrieval learning, which address the limitations of existing
+visual commonsense resources and training objectives. Experimental results
+verify that DIVE outperforms state-of-the-art models for visual commonsense
+generation in terms of both descriptiveness and diversity, while showing a
+superior quality in generating unique and novel inferences. Notably, DIVE
+achieves human-level descriptiveness and diversity on Visual Commonsense
+Graphs. Furthermore, human evaluations confirm that DIVE aligns closely with
+human judgments on descriptiveness and diversity. Our code and dataset are
+available at https://github.com/Park-ing-lot/DIVE.
+
+&#13;
+
+ comment: 19 pages, 10 figures, EMNLP 2023 (main)
+&#13;
+
+
+
+
+ + ☆ Asteroid: Resource-Efficient Hybrid Pipeline Parallelism for + Collaborative DNN Training on Heterogeneous Edge Devices + + +
+ On-device Deep Neural Network (DNN) training has been recognized as crucial +for privacy-preserving machine learning at the edge. However, the intensive +training workload and limited onboard computing resources pose significant +challenges to the availability and efficiency of model training. While existing +works address these challenges through native resource management optimization, +we instead leverage our observation that edge environments usually comprise a +rich set of accompanying trusted edge devices with idle resources beyond a +single terminal. We propose Asteroid, a distributed edge training system that +breaks the resource walls across heterogeneous edge devices for efficient model +training acceleration. Asteroid adopts a hybrid pipeline parallelism to +orchestrate distributed training, along with a judicious parallelism planning +for maximizing throughput under certain resource constraints. Furthermore, a +fault-tolerant yet lightweight pipeline replay mechanism is developed to tame +the device-level dynamics for training robustness and performance stability. We +implement Asteroid on heterogeneous edge devices with both vision and language +models, demonstrating up to 12.2x faster training than conventional parallelism +methods and 2.1x faster than state-of-the-art hybrid parallelism methods +through evaluations. Furthermore, Asteroid can recover training pipeline 14x +faster than baseline methods while preserving comparable throughput despite +unexpected device exiting and failure. + +
+
+ comment: Accepted by The 30th Annual International Conference on Mobile + Computing and Networking (MobiCom'24) +
+
+
+
+
+ + ☆ Adaptive Learning of Consistency and Inconsistency Information for Fake + News Detection + + +
+ The rapid advancement of social media platforms has significantly reduced the
+cost of information dissemination, yet it has also led to a proliferation of
+fake news, posing a threat to societal trust and credibility. Most fake news
+detection research has focused on integrating text and image information to
+represent the consistency of multiple modes in news content, while paying less
+attention to inconsistent information. Besides, existing methods that leveraged
+inconsistent information often let one mode overshadow another, leading to
+ineffective use of inconsistent clues. To address these issues, we propose an
+adaptive multi-modal feature fusion network (MFF-Net). Inspired by human
+judgment processes for determining truth and falsity in news, MFF-Net focuses
+on inconsistent parts when news content is generally consistent and on
+consistent parts when it is generally inconsistent. Specifically, MFF-Net
+extracts semantic and global features from images and texts respectively, and
+learns consistency information between modes through a multiple feature fusion
+module. To deal with the problem of modal information being easily masked, we
+design a single modal feature filtering strategy to capture inconsistent
+information from corresponding modes separately. Finally, similarity scores are
+calculated based on global features with adaptive adjustments made to achieve
+weighted fusion of consistent and inconsistent features. Extensive experimental
+results demonstrate that MFF-Net outperforms state-of-the-art methods across
+three public news datasets derived from real social media platforms.
+
+&#13;
+
+
+
+
+ + ☆ MVInpainter: Learning Multi-View Consistent Inpainting to Bridge 2D and + 3D Editing + + +
+ Novel View Synthesis (NVS) and 3D generation have recently achieved prominent
+improvements. However, these works mainly focus on confined categories or
+synthetic 3D assets, which struggle to generalize to challenging in-the-wild
+scenes and cannot be employed directly with 2D synthesis. Moreover, these
+methods depend heavily on camera poses, limiting their real-world applications.
+To overcome these issues, we propose MVInpainter, re-formulating 3D editing as
+a multi-view 2D inpainting task. Specifically, MVInpainter partially inpaints
+multi-view images with the reference guidance rather than intractably
+generating an entirely novel view from scratch, which largely simplifies the
+difficulty of in-the-wild NVS and leverages unmasked clues instead of explicit
+pose conditions. To ensure cross-view consistency, MVInpainter is enhanced by
+video priors from motion components and appearance guidance from concatenated
+reference key&value attention. Furthermore, MVInpainter incorporates slot
+attention to aggregate high-level optical flow features from unmasked regions
+to control the camera movement with pose-free training and inference.
+Sufficient scene-level experiments on both object-centric and forward-facing
+datasets verify the effectiveness of MVInpainter, including diverse tasks, such
+as multi-view object removal, synthesis, insertion, and replacement. The
+project page is https://ewrfcas.github.io/MVInpainter/.
+
+&#13;
+
+ comment: Project page: https://ewrfcas.github.io/MVInpainter/ +
+
+
+
+
+ + ☆ Co-Fix3D: Enhancing 3D Object Detection with Collaborative Refinement + + +
+ In the realm of autonomous driving, accurately detecting occluded or distant
+objects, referred to as weak positive samples, presents significant challenges.
+These challenges predominantly arise during query initialization, where an
+over-reliance on heatmap confidence often results in a high rate of false
+positives, consequently masking weaker detections and impairing system
+performance. To alleviate this issue, we propose a novel approach, Co-Fix3D,
+which employs a collaborative hybrid multi-stage parallel query generation
+mechanism for BEV representations. Our method incorporates the Local-Global
+Feature Enhancement (LGE) module, which refines BEV features to more
+effectively highlight weak positive samples. It uniquely leverages the Discrete
+Wavelet Transform (DWT) for accurate noise reduction and feature refinement in
+localized areas, and incorporates an attention mechanism to more
+comprehensively optimize global BEV features. Moreover, our method increases
+the volume of BEV queries through multi-stage parallel processing of the LGE,
+significantly enhancing the probability of selecting weak positive samples.
+This enhancement not only improves training efficiency within the decoder
+framework but also boosts overall system performance. Notably, Co-Fix3D
+achieves superior results on the stringent nuScenes benchmark, outperforming
+all previous models with a 69.1% mAP and 72.9% NDS on the LiDAR-based
+benchmark, and 72.3% mAP and 74.1% NDS on the multi-modality benchmark, without
+relying on test-time augmentation or additional datasets. The source code will
+be made publicly available upon acceptance.
+
+&#13;
+
+
+
+
+ + ☆ Monte Carlo Path Tracing and Statistical Event Detection for Event + Camera Simulation + + +
+ This paper presents a novel event camera simulation system fully based on
+physically based Monte Carlo path tracing with adaptive path sampling. The
+adaptive sampling performed in the proposed method is based on a statistical
+technique: a hypothesis test of whether the difference of logarithmic
+luminances at two distant time periods is significantly larger than a
+predefined event threshold. To this end, our rendering system collects
+logarithmic luminances rather than raw luminance, in contrast to conventional
+rendering systems imitating conventional RGB cameras. Then, based on the
+central limit theorem, we reasonably assume that the distribution of the
+population mean of logarithmic luminance can be modeled as a normal
+distribution, allowing us to model the distribution of the difference of
+logarithmic luminance as a normal distribution. Then, using Student's t-test,
+we can test the hypothesis and determine whether to reject the null hypothesis
+of event non-occurrence. When we sample a sufficiently large number of path
+samples to satisfy the central limit theorem and obtain a clean set of events,
+our method achieves a significant speed-up compared to a simple approach of
+sampling paths uniformly at every pixel. To our knowledge, we are the first to
+simulate the behavior of event cameras in a physically accurate manner using an
+adaptive sampling technique in Monte Carlo path tracing, and we believe this
+study will contribute to the development of computer vision applications using
+event cameras.
+
+&#13;
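+ Per pixel, the described test can be sketched as follows. The pooled
+degrees-of-freedom and one-sided test are assumptions for illustration, not the
+paper's exact implementation.
+
+import numpy as np
+from scipy import stats
+
+def detect_event(log_lum_prev, log_lum_curr, threshold_c, alpha=0.01):
+    # log_lum_prev, log_lum_curr: Monte Carlo samples of log-luminance for one
+    # pixel at two time instants. Under the CLT the sample means are treated as
+    # normal; a one-sided t-test checks whether the absolute difference of mean
+    # log-luminance significantly exceeds the event threshold C.
+    n1, n2 = len(log_lum_prev), len(log_lum_curr)
+    mean_diff = np.mean(log_lum_curr) - np.mean(log_lum_prev)
+    se = np.sqrt(np.var(log_lum_prev, ddof=1) / n1 +
+                 np.var(log_lum_curr, ddof=1) / n2)
+    t_stat = (abs(mean_diff) - threshold_c) / se      # H0: |diff| <= C
+    dof = n1 + n2 - 2                                 # simple pooled approximation
+    fire = t_stat > stats.t.ppf(1.0 - alpha, dof)     # reject H0 -> emit an event
+    polarity = 1 if mean_diff > 0 else -1
+    return bool(fire), polarity
+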
+
+ comment: 10 pages, 7 figures, Presented at ICCP 2024 +
+
+
+
+
+ + ☆ IIU: Independent Inference Units for Knowledge-based Visual Question + Answering + + +
+ Knowledge-based visual question answering requires external knowledge beyond +visible content to answer the question correctly. One limitation of existing +methods is that they focus more on modeling the inter-modal and intra-modal +correlations, which entangles complex multimodal clues by implicit embeddings +and lacks interpretability and generalization ability. The key challenge to +solve the above problem is to separate the information and process it +separately at the functional level. By reusing each processing unit, the +generalization ability of the model to deal with different data can be +increased. In this paper, we propose Independent Inference Units (IIU) for +fine-grained multi-modal reasoning to decompose intra-modal information by the +functionally independent units. Specifically, IIU processes each +semantic-specific intra-modal clue by an independent inference unit, which also +collects complementary information by communication from different units. To +further reduce the impact of redundant information, we propose a memory update +module to maintain semantic-relevant memory along with the reasoning process +gradually. In comparison with existing non-pretrained multi-modal reasoning +models on standard datasets, our model achieves a new state-of-the-art, +enhancing performance by 3%, and surpassing basic pretrained multi-modal +models. The experimental results show that our IIU model is effective in +disentangling intra-modal clues as well as reasoning units to provide +explainable reasoning evidence. Our code is available at +https://github.com/Lilidamowang/IIU. + +
+
+
+
+
+
+ ☆ Exploring learning environments for label-efficient cancer diagnosis
+
+
+&#13;
+ Despite significant research efforts and advancements, cancer remains a
+leading cause of mortality. Early cancer prediction has become a crucial focus
+in cancer research to streamline patient care and improve treatment outcomes.
+Manual tumor detection by histopathologists can be time-consuming, prompting
+the need for computerized methods to expedite treatment planning. Traditional
+approaches to tumor detection rely on supervised learning, which necessitates a
+large amount of annotated data for model training. However, acquiring such
+extensive labeled data can be laborious and time-intensive. This research
+examines three learning environments, supervised learning (SL), semi-supervised
+learning (Semi-SL), and self-supervised learning (Self-SL), to predict kidney,
+lung, and breast cancer. Three pre-trained deep learning models (Residual
+Network-50, Visual Geometry Group-16, and EfficientNetB0) are evaluated based
+on these learning settings using seven carefully curated training sets. To
+create the first training set (TS1), SL is applied to all annotated image
+samples. Five training sets (TS2-TS6) with different ratios of labeled and
+unlabeled cancer images are used to evaluate Semi-SL. Unlabeled cancer images
+from the final training set (TS7) are utilized for Self-SL assessment. Among
+the different learning environments, outcomes from the Semi-SL setting show a
+strong degree of agreement with the outcomes achieved in the SL setting. The
+uniform pattern of observations from the pre-trained models across all three
+datasets validates the methodology and techniques of the research. Based on a
+modest number of labeled samples and minimal computing cost, our study suggests
+that the Semi-SL option can be a highly viable replacement for the SL option
+under label annotation constraint scenarios.
+
+&#13;
+
+ comment: Submitted to the journal +
+
+
+
+
+ + ☆ Analytical Uncertainty-Based Loss Weighting in Multi-Task Learning + + +
+ With the rise of neural networks in various domains, multi-task learning +(MTL) gained significant relevance. A key challenge in MTL is balancing +individual task losses during neural network training to improve performance +and efficiency through knowledge sharing across tasks. To address these +challenges, we propose a novel task-weighting method by building on the most +prevalent approach of Uncertainty Weighting and computing analytically optimal +uncertainty-based weights, normalized by a softmax function with tunable +temperature. Our approach yields comparable results to the combinatorially +prohibitive, brute-force approach of Scalarization while offering a more +cost-effective yet high-performing alternative. We conduct an extensive +benchmark on various datasets and architectures. Our method consistently +outperforms six other common weighting methods. Furthermore, we report +noteworthy experimental findings for the practical application of MTL. For +example, larger networks diminish the influence of weighting methods, and +tuning the weight decay has a low impact compared to the learning rate. + +
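+ As a loose illustration of softmax-normalized, uncertainty-style task weights
+with a tunable temperature: the sketch below uses an assumed proxy (inverse of
+the detached task loss) for the per-task uncertainty, which may differ from the
+paper's analytically optimal weights.
+
+import torch
+
+def uncertainty_softmax_weights(task_losses, temperature=1.0, eps=1e-8):
+    # task_losses: 1-D tensor of current per-task losses (detached before use).
+    # Larger losses are read as larger task uncertainty, so they receive smaller
+    # weights; the temperature controls how uniform the weighting is.
+    losses = task_losses.detach().clamp_min(eps)
+    inv_uncertainty = 1.0 / losses
+    return torch.softmax(inv_uncertainty / temperature, dim=0)
+
+# usage (illustrative): total loss as a weighted sum of the task losses
+# weights = uncertainty_softmax_weights(torch.stack(losses), temperature=2.0)
+# total = (weights * torch.stack(losses)).sum()
+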
+
+
+
+
+ + ☆ LLaVA-Surg: Towards Multimodal Surgical Assistant via Structured + Surgical Video Learning + + +
+ Multimodal large language models (LLMs) have achieved notable success across +various domains, while research in the medical field has largely focused on +unimodal images. Meanwhile, current general-domain multimodal models for videos +still lack the capabilities to understand and engage in conversations about +surgical videos. One major contributing factor is the absence of datasets in +the surgical field. In this paper, we create a new dataset, Surg-QA, consisting +of 102,000 surgical video-instruction pairs, the largest of its kind so far. To +build such a dataset, we propose a novel two-stage question-answer generation +pipeline with LLM to learn surgical knowledge in a structured manner from the +publicly available surgical lecture videos. The pipeline breaks down the +generation process into two stages to significantly reduce the task complexity, +allowing us to use a more affordable, locally deployed open-source LLM than the +premium paid LLM services. It also mitigates the risk of LLM hallucinations +during question-answer generation, thereby enhancing the overall quality of the +generated data. We further train LLaVA-Surg, a novel vision-language +conversational assistant capable of answering open-ended questions about +surgical videos, on this Surg-QA dataset, and conduct comprehensive evaluations +on zero-shot surgical video question-answering tasks. We show that LLaVA-Surg +significantly outperforms all previous general-domain models, demonstrating +exceptional multimodal conversational skills in answering open-ended questions +about surgical videos. We will release our code, model, and the +instruction-tuning dataset. + +
+
+
+
+
+ + ☆ Polaris: Open-ended Interactive Robotic Manipulation via Syn2Real Visual + Grounding and Large Language Models IROS 2024 + + +
+ This paper investigates the task of the open-ended interactive robotic +manipulation on table-top scenarios. While recent Large Language Models (LLMs) +enhance robots' comprehension of user instructions, their lack of visual +grounding constrains their ability to physically interact with the environment. +This is because the robot needs to locate the target object for manipulation +within the physical workspace. To this end, we introduce an interactive robotic +manipulation framework called Polaris, which integrates perception and +interaction by utilizing GPT-4 alongside grounded vision models. For precise +manipulation, it is essential that such grounded vision models produce detailed +object pose for the target object, rather than merely identifying pixels +belonging to them in the image. Consequently, we propose a novel +Synthetic-to-Real (Syn2Real) pose estimation pipeline. This pipeline utilizes +rendered synthetic data for training and is then transferred to real-world +manipulation tasks. The real-world performance demonstrates the efficacy of our +proposed pipeline and underscores its potential for extension to more general +categories. Moreover, real-robot experiments have showcased the impressive +performance of our framework in grasping and executing multiple manipulation +tasks. This indicates its potential to generalize to scenarios beyond the +tabletop. More information and video results are available here: +https://star-uu-wang.github.io/Polaris/ + +
+
+ comment: Accepted by IROS 2024. 8 pages, 5 figures. See + https://star-uu-wang.github.io/Polaris/ +
+
+
+
+
+ + ☆ FlashGS: Efficient 3D Gaussian Splatting for Large-scale and + High-resolution Rendering + + +
+ This work introduces FlashGS, an open-source CUDA Python library, designed to +facilitate the efficient differentiable rasterization of 3D Gaussian Splatting +through algorithmic and kernel-level optimizations. FlashGS is developed based +on the observations from a comprehensive analysis of the rendering process to +enhance computational efficiency and bring the technique to wide adoption. The +paper includes a suite of optimization strategies, encompassing redundancy +elimination, efficient pipelining, refined control and scheduling mechanisms, +and memory access optimizations, all of which are meticulously integrated to +amplify the performance of the rasterization process. An extensive evaluation +of FlashGS' performance has been conducted across a diverse spectrum of +synthetic and real-world large-scale scenes, encompassing a variety of image +resolutions. The empirical findings demonstrate that FlashGS consistently +achieves an average 4x acceleration over mobile consumer GPUs, coupled with +reduced memory consumption. These results underscore the superior performance +and resource optimization capabilities of FlashGS, positioning it as a +formidable tool in the domain of 3D rendering. + +
+
+
+
+
+ + ☆ Conditional Brownian Bridge Diffusion Model for VHR SAR to Optical Image + Translation + + +
+ Synthetic Aperture Radar (SAR) imaging technology provides the unique advantage of being able to collect data regardless of weather conditions and time. However, SAR images exhibit complex backscatter patterns and speckle noise, which necessitate expertise for interpretation. To deal with this challenge, research has been conducted on translating SAR images into optical-like representations to aid the interpretation of SAR data. Nevertheless, existing studies have predominantly utilized low-resolution satellite imagery datasets and have largely been based on Generative Adversarial Networks (GANs), which are known for their training instability and low fidelity. To overcome these limitations of low-resolution data usage and GAN-based approaches, this paper introduces a conditional image-to-image translation approach based on the Brownian Bridge Diffusion Model (BBDM). We conducted comprehensive experiments on the MSAW dataset, a collection of paired SAR and optical images at 0.5 m Very-High-Resolution (VHR). The experimental results indicate that our method surpasses both the Conditional Diffusion Model (CDM) and the GAN-based models in diverse perceptual quality metrics.
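+ A minimal sketch of the Brownian-bridge forward process that BBDM-style translation builds on, assuming a linear mixing schedule m_t = t/T and a simple variance term; the exact schedules and network parameterization follow the general BBDM formulation, not anything specific to this paper.

```python
import torch

def brownian_bridge_sample(x0, y, t, T, s=1.0):
    """Sample x_t on a bridge between the target image x0 (optical) and the
    conditioning endpoint y (SAR). Illustrative schedule, not the paper's exact one."""
    m_t = t / T                                  # mixing coefficient in [0, 1]
    var_t = 2.0 * s * m_t * (1.0 - m_t)          # zero at both endpoints, peaks mid-bridge
    noise = torch.randn_like(x0)
    return (1.0 - m_t) * x0 + m_t * y + (var_t ** 0.5) * noise

# toy usage: 3-channel 64x64 images, halfway along the bridge
x0, y = torch.rand(1, 3, 64, 64), torch.rand(1, 3, 64, 64)
xt = brownian_bridge_sample(x0, y, t=500, T=1000)
```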
+
+ comment: 5 pages, 2 figures, 1 table +
+
+
+
+
+ + ☆ Training Spatial-Frequency Visual Prompts and Probabilistic Clusters for + Accurate Black-Box Transfer Learning + + +
+ Despite the growing prevalence of black-box pre-trained models (PTMs) such as prediction API services, there remains a significant challenge in directly applying general models to real-world scenarios due to the data distribution gap. Considering scenarios with data deficiency and constrained computational resources, this paper proposes a novel parameter-efficient transfer learning framework for vision recognition models in the black-box setting. Our framework incorporates two novel training techniques. First, we align the input space (i.e., image) of PTMs to the target data distribution by generating visual prompts in the spatial and frequency domains. Along with the novel spatial-frequency hybrid visual prompter, we design a novel training technique based on probabilistic clusters, which can enhance class separation in the output space (i.e., prediction probabilities). In experiments, our model demonstrates superior performance in a few-shot transfer learning setting across extensive visual recognition datasets, surpassing state-of-the-art baselines. Additionally, we show that the proposed method efficiently reduces computational costs for the training and inference phases.
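+ As a rough illustration of the spatial-frequency hybrid prompting idea, the sketch below adds a learnable additive prompt in pixel space and a learnable multiplicative filter in the FFT domain; the module name, shapes, and combination rule are our assumptions rather than the paper's exact design.

```python
import torch
import torch.nn as nn

class SpatialFreqPrompter(nn.Module):
    """Hypothetical hybrid visual prompter: spatial-domain additive prompt
    plus a learnable frequency-domain filter (illustrative only)."""
    def __init__(self, img_size=224, channels=3):
        super().__init__()
        self.spatial_prompt = nn.Parameter(torch.zeros(1, channels, img_size, img_size))
        self.freq_filter = nn.Parameter(torch.ones(1, channels, img_size, img_size // 2 + 1))

    def forward(self, x):
        x = x + self.spatial_prompt                      # spatial-domain prompt
        spec = torch.fft.rfft2(x, norm="ortho")          # to frequency domain
        spec = spec * self.freq_filter                   # frequency-domain prompt
        return torch.fft.irfft2(spec, s=x.shape[-2:], norm="ortho")

prompter = SpatialFreqPrompter()
out = prompter(torch.rand(2, 3, 224, 224))               # prompted images fed to the black-box PTM
```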
+
+ comment: ACM Multimedia 2024 +
+
+
+
+
+ + ☆ MobileMEF: Fast and Efficient Method for Multi-Exposure Fusion + + +
+ Recent advances in camera design and imaging technology have enabled the capture of high-quality images using smartphones. However, due to the limited dynamic range of digital cameras, photographs captured in environments with highly imbalanced lighting are often of poor quality. To address this issue, most devices capture multi-exposure frames and then use a multi-exposure fusion method to merge those frames into a final fused image. Nevertheless, most traditional and current deep learning approaches are unsuitable for real-time applications on mobile devices due to their heavy computational and memory requirements. We propose a new method for multi-exposure fusion based on an encoder-decoder deep learning architecture with efficient building blocks tailored for mobile devices. This efficient design makes our model capable of processing 4K resolution images in less than 2 seconds on mid-range smartphones. Our method outperforms state-of-the-art techniques regarding full-reference quality measures and computational efficiency (runtime and memory usage), making it ideal for real-time applications on hardware-constrained devices. Our code is available at: https://github.com/LucasKirsten/MobileMEF.
+
+
+
+
+ + ☆ Surgical SAM 2: Real-time Segment Anything in Surgical Video by + Efficient Frame Pruning + + +
+ Surgical video segmentation is a critical task in computer-assisted surgery and is vital for enhancing surgical quality and patient outcomes. Recently, the Segment Anything Model 2 (SAM2) framework has shown superior advancements in image and video segmentation. However, SAM2 struggles with efficiency due to the high computational demands of processing high-resolution images and the complex, long-range temporal dynamics in surgical videos. To address these challenges, we introduce Surgical SAM 2 (SurgSAM-2), an advanced model that utilizes SAM2 with an Efficient Frame Pruning (EFP) mechanism to facilitate real-time surgical video segmentation. The EFP mechanism dynamically manages the memory bank by selectively retaining only the most informative frames, reducing memory usage and computational cost while maintaining high segmentation accuracy. Our extensive experiments demonstrate that SurgSAM-2 significantly improves both efficiency and segmentation accuracy compared to the vanilla SAM2. Remarkably, SurgSAM-2 achieves 3$\times$ the FPS of SAM2, while also delivering state-of-the-art performance after fine-tuning with lower-resolution data. These advancements establish SurgSAM-2 as a leading model for surgical video analysis, making real-time surgical video segmentation in resource-constrained environments a feasible reality.
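+ A minimal sketch of how a memory bank could be pruned to a fixed budget of informative frames; cosine similarity to the current frame stands in for the paper's actual EFP scoring rule, and the tensor shapes are assumptions.

```python
import torch
import torch.nn.functional as F

def prune_memory_bank(mem_feats, cur_feat, keep=4):
    """mem_feats: (N, C, H, W) stored memory features, cur_feat: (C, H, W).
    Retain the `keep` frames most similar to the current frame (illustrative criterion)."""
    mem = F.normalize(mem_feats.flatten(1), dim=1)        # (N, D)
    cur = F.normalize(cur_feat.flatten(), dim=0)          # (D,)
    scores = mem @ cur                                     # cosine similarity per memory frame
    idx = scores.topk(min(keep, mem_feats.shape[0])).indices
    return mem_feats[idx], idx
```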
+
+ comment: 16 pages, 2 figures +
+
+
+
+
+ + ☆ A Deep Features-Based Approach Using Modified ResNet50 and Gradient + Boosting for Visual Sentiments Classification + + +
+ The versatile nature of Visual Sentiment Analysis (VSA) is one reason for its rising profile. It is not easy to efficiently manage social media data containing visual information, since previous research has concentrated on Sentiment Analysis (SA) of single modalities, such as text. In addition, most visual sentiment studies fail to classify sentiment adequately because they mainly focus on simply merging modal attributes without investigating their intricate relationships. This motivated the development of a fusion of deep learning and machine learning algorithms. In this research, a deep feature-based method for multiclass classification has been used to extract deep features from a modified ResNet50. Furthermore, a gradient boosting algorithm has been used to classify photos containing emotional content. The approach is thoroughly evaluated on two benchmark datasets, CrowdFlower and GAPED. Finally, the proposed strategy is compared against cutting-edge deep learning and machine learning models. When compared to state-of-the-art approaches, the proposed method demonstrates exceptional performance on the datasets presented.
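+ A compact sketch of the deep-features-plus-gradient-boosting pipeline; a stock ImageNet ResNet50 stands in for the paper's modified backbone, and `train_images`/`train_labels`/`test_images` are placeholder arrays the reader would supply.

```python
import torch
import torchvision
from sklearn.ensemble import GradientBoostingClassifier

backbone = torchvision.models.resnet50(weights=torchvision.models.ResNet50_Weights.DEFAULT)
backbone.fc = torch.nn.Identity()            # expose the 2048-d pooled features
backbone.eval()

@torch.no_grad()
def extract(images):                         # images: (N, 3, 224, 224), ImageNet-normalized
    return backbone(images).cpu().numpy()

clf = GradientBoostingClassifier(n_estimators=200, learning_rate=0.1)
clf.fit(extract(train_images), train_labels) # placeholders: prepared tensors and labels
preds = clf.predict(extract(test_images))
```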
+
+ comment: 4 pages, 4 figures, 3 tables, IEEE International Conference on + Multimedia Information Processing and Retrieval (MIPR) 2024 +
+
+
+
+
+ + ☆ GOReloc: Graph-based Object-Level Relocalization for Visual SLAM + + +
+ This article introduces a novel method for object-level relocalization of +robotic systems. It determines the pose of a camera sensor by robustly +associating the object detections in the current frame with 3D objects in a +lightweight object-level map. Object graphs, considering semantic +uncertainties, are constructed for both the incoming camera frame and the +pre-built map. Objects are represented as graph nodes, and each node employs +unique semantic descriptors based on our devised graph kernels. We extract a +subgraph from the target map graph by identifying potential object associations +for each object detection, then refine these associations and pose estimations +using a RANSAC-inspired strategy. Experiments on various datasets demonstrate +that our method achieves more accurate data association and significantly +increases relocalization success rates compared to baseline methods. The +implementation of our method is released at +\url{https://github.com/yutongwangBIT/GOReloc}. + +
+
+ comment: 8 pages, accepted by IEEE RAL +
+
+
+
+
+ + ☆ DM2RM: Dual-Mode Multimodal Ranking for Target Objects and Receptacles + Based on Open-Vocabulary Instructions + + +
+ In this study, we aim to develop a domestic service robot (DSR) that, guided +by open-vocabulary instructions, can carry everyday objects to the specified +pieces of furniture. Few existing methods handle mobile manipulation tasks with +open-vocabulary instructions in the image retrieval setting, and most do not +identify both the target objects and the receptacles. We propose the Dual-Mode +Multimodal Ranking model (DM2RM), which enables images of both the target +objects and receptacles to be retrieved using a single model based on +multimodal foundation models. We introduce a switching mechanism that leverages +a mode token and phrase identification via a large language model to switch the +embedding space based on the prediction target. To evaluate the DM2RM, we +construct a novel dataset including real-world images collected from hundreds +of building-scale environments and crowd-sourced instructions with referring +expressions. The evaluation results show that the proposed DM2RM outperforms +previous approaches in terms of standard metrics in image retrieval settings. +Furthermore, we demonstrate the application of the DM2RM on a standardized +real-world DSR platform including fetch-and-carry actions, where it achieves a +task success rate of 82% despite the zero-shot transfer setting. Demonstration +videos, code, and more materials are available at +https://kkrr10.github.io/dm2rm/. + +
+
+
+
+
+ + ☆ Persistence Image from 3D Medical Image: Superpixel and Optimized + Gaussian Coefficient + + +
+ Topological data analysis (TDA) uncovers crucial properties of objects in medical imaging. Methods based on persistent homology have demonstrated their advantages in capturing topological features that traditional deep learning methods cannot detect, in both radiology and pathology. However, previous research primarily focused on 2D image analysis, neglecting the comprehensive 3D context. In this paper, we propose an innovative 3D TDA approach that incorporates the concept of superpixels to transform 3D medical image features into point cloud data. By utilizing an Optimized Gaussian Coefficient, the proposed 3D TDA method, for the first time, efficiently generates holistic Persistence Images for 3D volumetric data. Our 3D TDA method exhibits superior performance on the MedMNIST3D dataset when compared to other traditional methods, showcasing its potential effectiveness in modeling 3D persistent homology-based topological analysis for classification tasks. The source code is publicly available at https://github.com/hrlblab/TopologicalDataAnalysis3D.
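+ For readers unfamiliar with persistence images, the sketch below computes one from a point cloud with the ripser/persim libraries; the superpixel-based point-cloud construction and the optimized Gaussian coefficient are specific to the paper and not reproduced here.

```python
import numpy as np
from ripser import ripser
from persim import PersistenceImager

points = np.random.rand(400, 3)                 # stand-in for superpixel-derived 3D points
dgm_h1 = ripser(points, maxdim=1)["dgms"][1]    # H1 persistence diagram (birth/death pairs)

pimgr = PersistenceImager(pixel_size=0.05)      # Gaussian-weighted rasterization of the diagram
pimgr.fit(dgm_h1)
img = pimgr.transform(dgm_h1)                   # 2D array usable as a classifier input
```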
+
+
+
+
+ + ☆ Deep Joint Denoising and Detection for Enhanced Intracellular Particle + Analysis + + +
+ Reliable analysis of intracellular dynamic processes in time-lapse +fluorescence microscopy images requires complete and accurate tracking of all +small particles in all time frames of the image sequences. A fundamental first +step towards this goal is particle detection. Given the small size of the +particles, their detection is greatly affected by image noise. Recent studies +have shown that applying image denoising as a preprocessing step indeed +improves particle detection and their subsequent tracking. Deep learning based +particle detection methods have shown superior results compared to traditional +detection methods. However, they do not explicitly aim to remove noise from the +images to facilitate detection. Thus we hypothesize that their performance +could be further improved. In this paper, we propose a new deep neural network, +called DENODET (denoising-detection network), which performs image denoising +and particle detection simultaneously. We show that integrative denoising and +detection yields more accurate detection results. Our method achieves superior +results compared to state-of-the-art particle detection methods on the particle +tracking challenge dataset and our own real fluorescence microscopy image data. + +
+
+ comment: 11 pages, 4 figures, 4 tables +
+
+
+
+
+ + ☆ Quantum-inspired Interpretable Deep Learning Architecture for Text + Sentiment Analysis + + +
+ Text has become the predominant form of communication on social media, +embedding a wealth of emotional nuances. Consequently, the extraction of +emotional information from text is of paramount importance. Despite previous +research making some progress, existing text sentiment analysis models still +face challenges in integrating diverse semantic information and lack +interpretability. To address these issues, we propose a quantum-inspired deep +learning architecture that combines fundamental principles of quantum mechanics +(QM principles) with deep learning models for text sentiment analysis. +Specifically, we analyze the commonalities between text representation and QM +principles to design a quantum-inspired text representation method and further +develop a quantum-inspired text embedding layer. Additionally, we design a +feature extraction layer based on long short-term memory (LSTM) networks and +self-attention mechanisms (SAMs). Finally, we calculate the text density matrix +using the quantum complex numbers principle and apply 2D-convolution neural +networks (CNNs) for feature condensation and dimensionality reduction. Through +a series of visualization, comparative, and ablation experiments, we +demonstrate that our model not only shows significant advantages in accuracy +and efficiency compared to previous related models but also achieves a certain +level of interpretability by integrating QM principles. Our code is available +at QISA. + +
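+ The density-matrix construction that quantum-inspired text models rely on can be sketched as a mixture of projectors onto word states; real-valued embeddings and uniform mixing weights are assumed here, whereas the paper works with complex-valued representations.

```python
import torch
import torch.nn.functional as F

def text_density_matrix(word_vecs):
    """word_vecs: (L, d) embeddings for one sentence. Returns a (d, d) density
    matrix rho = mean_i |v_i><v_i| with unit trace, usable as a 2D-CNN input."""
    v = F.normalize(word_vecs, dim=1)                 # unit-norm word states
    projectors = torch.einsum("ld,le->lde", v, v)     # outer products |v_i><v_i|
    return projectors.mean(dim=0)

rho = text_density_matrix(torch.randn(12, 64))
print(torch.trace(rho))                               # ~1.0 by construction
```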
+
+
+
+
+ + ☆ MambaVT: Spatio-Temporal Contextual Modeling for robust RGB-T Tracking + + +
+ Existing RGB-T tracking algorithms have made remarkable progress by leveraging the global interaction capability and extensive pre-trained models of the Transformer architecture. Nonetheless, these methods mainly adopt image-pair appearance matching and face the challenge of the intrinsically high quadratic complexity of the attention mechanism, resulting in constrained exploitation of temporal information. Inspired by the recently emerged State Space Model Mamba, renowned for its impressive long-sequence modeling capabilities and linear computational complexity, this work innovatively proposes a pure Mamba-based framework (MambaVT) to fully exploit spatio-temporal contextual modeling for robust visible-thermal tracking. Specifically, we devise a long-range cross-frame integration component to globally adapt to target appearance variations, and introduce short-term historical trajectory prompts to predict subsequent target states based on local temporal location clues. Extensive experiments show the significant potential of vision Mamba for RGB-T tracking, with MambaVT achieving state-of-the-art performance on four mainstream benchmarks while requiring lower computational costs. We aim for this work to serve as a simple yet strong baseline, stimulating future research in this field. The code and pre-trained models will be made available.
+
+
+
+
+ + ☆ To Impute or Not: Recommendations for Multibiometric Fusion + + +
+ Combining match scores from different biometric systems via fusion is a +well-established approach to improving recognition accuracy. However, missing +scores can degrade performance as well as limit the possible fusion techniques +that can be applied. Imputation is a promising technique in multibiometric +systems for replacing missing data. In this paper, we evaluate various score +imputation approaches on three multimodal biometric score datasets, viz. NIST +BSSR1, BIOCOP2008, and MIT LL Trimodal, and investigate the factors which might +influence the effectiveness of imputation. Our studies reveal three key +observations: (1) Imputation is preferable over not imputing missing scores, +even when the fusion rule does not require complete score data. (2) Balancing +the classes in the training data is crucial to mitigate negative biases in the +imputation technique towards the under-represented class, even if it involves +dropping a substantial number of score vectors. (3) Multivariate imputation +approaches seem to be beneficial when scores between modalities are correlated, +while univariate approaches seem to benefit scenarios where scores between +modalities are less correlated. + +
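+ To make the univariate-versus-multivariate distinction concrete, here is a small scikit-learn sketch on a toy score matrix with missing entries; the datasets and fusion rules from the paper are not involved.

```python
import numpy as np
from sklearn.experimental import enable_iterative_imputer  # noqa: F401  (enables IterativeImputer)
from sklearn.impute import SimpleImputer, IterativeImputer

scores = np.array([[0.91, np.nan, 0.40],
                   [0.85, 0.78,   0.35],
                   [np.nan, 0.80, 0.42]])   # rows: comparisons, cols: matchers

univariate = SimpleImputer(strategy="mean").fit_transform(scores)                    # per-matcher mean
multivariate = IterativeImputer(max_iter=25, random_state=0).fit_transform(scores)  # exploits cross-matcher correlation
```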
+
+ comment: Proc. of IEEE International Workshop on Information Forensics and + Security (WIFS), (Nuremberg, Germany), December 2023 +
+
+
+
+
+ + ☆ Continuous Perception Benchmark + + +
+ Humans continuously perceive and process visual signals. However, current +video models typically either sample key frames sparsely or divide videos into +chunks and densely sample within each chunk. This approach stems from the fact +that most existing video benchmarks can be addressed by analyzing key frames or +aggregating information from separate chunks. We anticipate that the next +generation of vision models will emulate human perception by processing visual +input continuously and holistically. To facilitate the development of such +models, we propose the Continuous Perception Benchmark, a video question +answering task that cannot be solved by focusing solely on a few frames or by +captioning small chunks and then summarizing using language models. Extensive +experiments demonstrate that existing models, whether commercial or +open-source, struggle with these tasks, indicating the need for new technical +advancements in this direction. + +
+
+
+
+
+ + ☆ A Novel Generative Artificial Intelligence Method for Interference Study + on Multiplex Brightfield Immunohistochemistry Images + + +
+ Multiplex brightfield imaging offers the advantage of simultaneously analyzing multiple biomarkers on a single slide, as opposed to single-biomarker labeling on multiple consecutive slides. To accurately analyze multiple biomarkers localized at the same cellular compartment, two representative biomarker sets were selected as assay models - cMET-PDL1-EGFR and CD8-LAG3-PDL1, where all three biomarkers can co-localize on the cell membrane. One of the most crucial preliminary stages for analyzing such assays is identifying each unique chromogen on individual cells. This is a challenging problem due to the co-localization of membrane stains from all three biomarkers. It requires advanced color unmixing for creating the equivalent singleplex images from each triplex image for each biomarker.
+ In this project, we developed a cycle-Generative Adversarial Network (cycle-GAN) method for unmixing the triplex images generated from the above-mentioned assays. Three different models were designed to generate the singleplex image for each of the three stains: Tamra (purple), QM-Dabsyl (yellow), and Green. A notable novelty of our approach is that the inputs to the network are images in the optical density domain instead of the conventionally used RGB images. The use of the optical density domain helped in reducing the blurriness of the synthetic singleplex images, which was often observed when the network was trained on RGB images.
+ The cycle-GAN models were validated on 10,800 lung, gastric, and colon images for the cMET-PDL1-EGFR assay and 3,600 colon images for the CD8-LAG3-PDL1 assay. Visual as well as quantified assessments demonstrated that the proposed method is effective and efficient when compared with manual reviewing results and is readily applicable to various multiplex assays.
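+ The optical-density conversion mentioned above follows the standard Beer-Lambert relation OD = -log10(I / I0); a small helper such as the following (with I0 = 255 assumed for 8-bit images) would map RGB tiles into the domain the networks are trained in.

```python
import numpy as np

def rgb_to_od(rgb, i0=255.0, eps=1e-6):
    """Convert 8-bit RGB intensities to optical density."""
    rgb = np.clip(rgb.astype(np.float64), eps, i0)
    return -np.log10(rgb / i0)

def od_to_rgb(od, i0=255.0):
    """Invert the transform to view generated singleplex images."""
    return np.clip(i0 * np.power(10.0, -od), 0, 255).astype(np.uint8)
```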
+
+
+
+
+ + ☆ JPEG-LM: LLMs as Image Generators with Canonical Codec Representations + + +
+ Recent work in image and video generation has been adopting the +autoregressive LLM architecture due to its generality and potentially easy +integration into multi-modal systems. The crux of applying autoregressive +training in language generation to visual generation is discretization -- +representing continuous data like images and videos as discrete tokens. Common +methods of discretizing images and videos include modeling raw pixel values, +which are prohibitively lengthy, or vector quantization, which requires +convoluted pre-hoc training. In this work, we propose to directly model images +and videos as compressed files saved on computers via canonical codecs (e.g., +JPEG, AVC/H.264). Using the default Llama architecture without any +vision-specific modifications, we pretrain JPEG-LM from scratch to generate +images (and AVC-LM to generate videos as a proof of concept), by directly +outputting compressed file bytes in JPEG and AVC formats. Evaluation of image +generation shows that this simple and straightforward approach is more +effective than pixel-based modeling and sophisticated vector quantization +baselines (on which our method yields a 31% reduction in FID). Our analysis +shows that JPEG-LM has an especial advantage over vector quantization models in +generating long-tail visual elements. Overall, we show that using canonical +codec representations can help lower the barriers between language generation +and visual generation, facilitating future research on multi-modal +language/image/video LLMs. + +
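+ The core data representation is simple to reproduce: serialize an image with a standard codec and treat each file byte as a token. The sketch below uses Pillow; the JPEG quality setting and any byte-to-vocabulary mapping are assumptions, not the paper's exact configuration.

```python
import io
from PIL import Image

def image_to_jpeg_tokens(img: Image.Image, quality=25):
    """Encode an image as JPEG and return its file bytes as a 0-255 token sequence."""
    buf = io.BytesIO()
    img.convert("RGB").save(buf, format="JPEG", quality=quality)
    return list(buf.getvalue())

def jpeg_tokens_to_image(tokens):
    """Decode a generated byte sequence back into an image."""
    return Image.open(io.BytesIO(bytes(tokens)))

tokens = image_to_jpeg_tokens(Image.new("RGB", (64, 64), "gray"))
restored = jpeg_tokens_to_image(tokens)
```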
+
+
+
+
+ + ☆ Efficient Data-Sketches and Fine-Tuning for Early Detection of + Distributional Drift in Medical Imaging + + +
+ Distributional drift detection is important in medical applications as it helps ensure the accuracy and reliability of models by identifying changes in the underlying data distribution that could affect diagnostic or treatment decisions. However, current methods have limitations in detecting drift; for example, the inclusion of abnormal datasets can lead to unfair comparisons. This paper presents an accurate and sensitive approach to detect distributional drift in CT-scan medical images by leveraging data-sketching and fine-tuning techniques. We developed a robust baseline library model for real-time anomaly detection, allowing for efficient comparison of incoming images and identification of anomalies. Additionally, we fine-tuned a vision transformer pre-trained model to extract relevant features, using breast cancer images as an example, significantly enhancing model accuracy to 99.11\%. Combining data-sketches with fine-tuning, our feature extraction evaluation demonstrated that cosine similarity scores between similar datasets improved substantially, increasing from around 50\% to 100\%. Finally, the sensitivity evaluation shows that our solution is highly sensitive to even 1\% salt-and-pepper and speckle noise, but is not sensitive to lighting noise (e.g., lighting conditions have no impact on data drift). The proposed methods offer a scalable and reliable solution for maintaining the accuracy of diagnostic models in dynamic clinical environments.
+
+
+
+
+ + ☆ Beyond Uniform Query Distribution: Key-Driven Grouped Query Attention + + +
+ The Transformer architecture has revolutionized deep learning through its +Self-Attention mechanism, which effectively captures contextual information. +However, the memory footprint of Self-Attention presents significant challenges +for long-sequence tasks. Grouped Query Attention (GQA) addresses this issue by +grouping queries and mean-pooling the corresponding key-value heads - reducing +the number of overall parameters and memory requirements in a flexible manner +without adversely compromising model accuracy. In this work, we introduce +enhancements to GQA, focusing on two novel approaches that deviate from the +static nature of grouping: Key-Distributed GQA (KDGQA) and Dynamic +Key-Distributed GQA (DGQA), which leverage information from the norms of the +key heads to inform query allocation. Specifically, KDGQA looks at the ratios +of the norms of the key heads during each forward pass, while DGQA examines the +ratios of the norms as they evolve through training. Additionally, we present +Perturbed GQA (PGQA) as a case-study, which introduces variability in (static) +group formation via subtracting noise from the attention maps. Our experiments +with up-trained Vision Transformers, for Image Classification on datasets such +as CIFAR-10, CIFAR-100, Food101, and Tiny ImageNet, demonstrate the promise of +these variants in improving upon the original GQA through more informed and +adaptive grouping mechanisms: specifically ViT-L experiences accuracy gains of +up to 8% when utilizing DGQA in comparison to GQA and other variants. We +further analyze the impact of the number of Key-Value Heads on performance, +underscoring the importance of utilizing query-key affinities. + +
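+ A simplified reading of the key-driven allocation: average key norms per KV group are turned into ratios that decide how many query heads each group receives. This is our own condensation of the idea, not the authors' code.

```python
import torch

def key_driven_allocation(q, k):
    """q: (B, Hq, T, d) query heads, k: (B, G, T, d) grouped key heads.
    Returns how many query heads to assign to each of the G groups,
    proportional to the mean key norm of that group (illustrative)."""
    key_norms = k.norm(dim=-1).mean(dim=(0, 2))          # (G,) average key magnitude per group
    ratios = key_norms / key_norms.sum()
    alloc = torch.floor(ratios * q.shape[1]).long()
    alloc[-1] += q.shape[1] - alloc.sum()                # make the counts total Hq
    return alloc

alloc = key_driven_allocation(torch.randn(2, 12, 16, 64), torch.randn(2, 4, 16, 64))
```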
+
+ comment: 11 pages, 9 figures +
+
+
+
+
+ + ☆ SpectralEarth: Training Hyperspectral Foundation Models at Scale + + +
+ Foundation models have triggered a paradigm shift in computer vision and are +increasingly being adopted in remote sensing, particularly for multispectral +imagery. Yet, their potential in hyperspectral imaging (HSI) remains untapped +due to the absence of comprehensive and globally representative hyperspectral +datasets. To close this gap, we introduce SpectralEarth, a large-scale +multi-temporal dataset designed to pretrain hyperspectral foundation models +leveraging data from the Environmental Mapping and Analysis Program (EnMAP). +SpectralEarth comprises 538,974 image patches covering 415,153 unique locations +from more than 11,636 globally distributed EnMAP scenes spanning two years of +archive. Additionally, 17.5% of these locations include multiple timestamps, +enabling multi-temporal HSI analysis. Utilizing state-of-the-art +self-supervised learning (SSL) algorithms, we pretrain a series of foundation +models on SpectralEarth. We integrate a spectral adapter into classical vision +backbones to accommodate the unique characteristics of HSI. In tandem, we +construct four downstream datasets for land-cover and crop-type mapping, +providing benchmarks for model evaluation. Experimental results support the +versatility of our models, showcasing their generalizability across different +tasks and sensors. We also highlight computational efficiency during model +fine-tuning. The dataset, models, and source code will be made publicly +available. + +
+
+
+
+
+ + ☆ PQV-Mobile: A Combined Pruning and Quantization Toolkit to Optimize + Vision Transformers for Mobile Applications + + +
+ While Vision Transformers (ViTs) are extremely effective at computer vision tasks and are replacing convolutional neural networks as the new state-of-the-art, they are complex and memory-intensive models. In order to effectively run these models on resource-constrained mobile/edge systems, there is a need to not only compress these models but also to optimize them and convert them into deployment-friendly formats. To this end, this paper presents a combined pruning and quantization tool, called PQV-Mobile, to optimize vision transformers for mobile applications. The tool is able to support different types of structured pruning based on magnitude importance, Taylor importance, and Hessian importance. It also supports quantization from FP32 to FP16 and int8, targeting different mobile hardware backends. We demonstrate the capabilities of our tool and show important latency-memory-accuracy trade-offs for different amounts of pruning and int8 quantization with Facebook Data Efficient Image Transformer (DeiT) models. Our results show that, even after pruning a DeiT model by 9.375% and quantizing it from FP32 to int8, followed by optimization for mobile applications, we obtain a latency reduction of 7.18X with a small accuracy loss of 2.24%. The tool is open source.
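+ PQV-Mobile's own API is not shown in the abstract, so the sketch below approximates the same workflow with stock PyTorch utilities: magnitude-based structured pruning of linear layers followed by post-training dynamic int8 quantization; the timm DeiT checkpoint and the 9.375% ratio are taken as illustrative values.

```python
import torch
import torch.nn.utils.prune as prune
import timm

model = timm.create_model("deit_small_patch16_224", pretrained=True).eval()

# Structured L2-magnitude pruning of each linear layer's output channels.
for module in model.modules():
    if isinstance(module, torch.nn.Linear):
        prune.ln_structured(module, name="weight", amount=0.09375, n=2, dim=0)
        prune.remove(module, "weight")                  # bake the pruning mask into the weights

# Post-training dynamic quantization of the remaining linear layers to int8.
quantized = torch.ao.quantization.quantize_dynamic(model, {torch.nn.Linear}, dtype=torch.qint8)
```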
+
+
+
+
+ + ☆ Predictive uncertainty estimation in deep learning for lung carcinoma + classification in digital pathology under real dataset shifts + + +
+ Deep learning has shown tremendous progress in a wide range of digital +pathology and medical image classification tasks. Its integration into safe +clinical decision-making support requires robust and reliable models. However, +real-world data comes with diversities that often lie outside the intended +source distribution. Moreover, when test samples are dramatically different, +clinical decision-making is greatly affected. Quantifying predictive +uncertainty in models is crucial for well-calibrated predictions and +determining when (or not) to trust a model. Unfortunately, many works have +overlooked the importance of predictive uncertainty estimation. This paper +evaluates whether predictive uncertainty estimation adds robustness to deep +learning-based diagnostic decision-making systems. We investigate the effect of +various carcinoma distribution shift scenarios on predictive performance and +calibration. We first systematically investigate three popular methods for +improving predictive uncertainty: Monte Carlo dropout, deep ensemble, and +few-shot learning on lung adenocarcinoma classification as a primary disease in +whole slide images. Secondly, we compare the effectiveness of the methods in +terms of performance and calibration under clinically relevant distribution +shifts such as in-distribution shifts comprising primary disease sub-types and +other characterization analysis data; out-of-distribution shifts comprising +well-differentiated cases, different organ origin, and imaging modality shifts. +While studies on uncertainty estimation exist, to our best knowledge, no +rigorous large-scale benchmark compares predictive uncertainty estimation +including these dataset shifts for lung carcinoma classification. + +
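+ Of the three uncertainty techniques compared, Monte Carlo dropout is the simplest to sketch: dropout stays active at test time and the spread over repeated forward passes serves as a predictive-uncertainty proxy (a generic recipe, not the paper's exact configuration).

```python
import torch

@torch.no_grad()
def mc_dropout_predict(model, x, n_samples=20):
    """Average softmax outputs over stochastic forward passes; the per-class
    standard deviation is a simple uncertainty estimate."""
    model.eval()
    for m in model.modules():
        if isinstance(m, torch.nn.Dropout):
            m.train()                                  # keep dropout stochastic at test time
    probs = torch.stack([model(x).softmax(dim=-1) for _ in range(n_samples)])
    return probs.mean(dim=0), probs.std(dim=0)
```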
+
+ comment: 17 pages, 2 figures, 5 tables +
+
+
+
+
+ + ☆ Penny-Wise and Pound-Foolish in Deepfake Detection + + +
+ The diffusion of deepfake technologies has sparked serious concerns about their potential misuse across various domains, prompting the urgent need for robust detection methods. Despite advancements, many current approaches prioritize short-term gains at the expense of long-term effectiveness. This paper critiques the overly specialized approach of fine-tuning pre-trained models solely with a penny-wise objective on a single deepfake dataset, while disregarding the pound-wise balance for generalization and knowledge retention. To address this "Penny-Wise and Pound-Foolish" issue, we propose a novel learning framework (PoundNet) for generalization of deepfake detection on a pre-trained vision-language model. PoundNet incorporates a learnable prompt design and a balanced objective to preserve broad knowledge from upstream tasks (object classification) while enhancing generalization for downstream tasks (deepfake detection). We train PoundNet on a standard single deepfake dataset, following common practice in the literature. We then evaluate its performance across 10 public large-scale deepfake datasets with 5 main evaluation metrics, forming the largest benchmark test set for assessing the generalization ability of deepfake detection models, to our knowledge. The comprehensive benchmark evaluation demonstrates that the proposed PoundNet is significantly less "Penny-Wise and Pound-Foolish", achieving a remarkable improvement of 19% in deepfake detection performance compared to state-of-the-art methods, while maintaining a strong performance of 63% on object classification tasks, where other deepfake detection models tend to be ineffective. Code and data are open-sourced at https://github.com/iamwangyabin/PoundNet.
+
+
+
+
+ + ☆ Level Up Your Tutorials: VLMs for Game Tutorials Quality Assessment ECCV 2024 + + +
+ Designing effective game tutorials is crucial for a smooth learning curve for +new players, especially in games with many rules and complex core mechanics. +Evaluating the effectiveness of these tutorials usually requires multiple +iterations with testers who have no prior knowledge of the game. Recent +Vision-Language Models (VLMs) have demonstrated significant capabilities in +understanding and interpreting visual content. VLMs can analyze images, provide +detailed insights, and answer questions about their content. They can recognize +objects, actions, and contexts in visual data, making them valuable tools for +various applications, including automated game testing. In this work, we +propose an automated game-testing solution to evaluate the quality of game +tutorials. Our approach leverages VLMs to analyze frames from video game +tutorials, answer relevant questions to simulate human perception, and provide +feedback. This feedback is compared with expected results to identify confusing +or problematic scenes and highlight potential errors for developers. In +addition, we publish complete tutorial videos and annotated frames from +different game versions used in our tests. This solution reduces the need for +extensive manual testing, especially by speeding up and simplifying the initial +development stages of the tutorial to improve the final game experience. + +
+
+ comment: Accepted at ECCV 2024 CV2 Workshop +
+
+
+
+
+ + ☆ Pre-processing and Compression: Understanding Hidden Representation + Refinement Across Imaging Domains via Intrinsic Dimension + + +
+ In recent years, there has been interest in how geometric properties such as +intrinsic dimension (ID) of a neural network's hidden representations evolve +through its layers, and how such properties are predictive of important model +behavior such as generalization ability. However, evidence has begun to emerge +that such behavior can change significantly depending on the domain of the +network's training data, such as natural versus medical images. Here, we +further this inquiry by exploring how the ID of a network's learned +representations evolves through its layers, in essence, characterizing how the +network successively refines the information content of input data to be used +for predictions. Analyzing eleven natural and medical image datasets across six +network architectures, we find that the shape of this ID evolution curve +differs noticeably between natural and medical image models: medical image +models peak in representation ID earlier in the network, implying a difference +in the image features and their abstractness that are typically used for +downstream tasks in these domains. Additionally, we discover a strong +correlation of this peak representation ID with the ID of the data in its input +space, implying that the intrinsic information content of a model's learned +representations is guided by that of the data it was trained on. Overall, our +findings emphasize notable discrepancies in network behavior between natural +and non-natural imaging domains regarding hidden representation information +content, and provide further insights into how a network's learned features are +shaped by its training data. + +
+
+
+
+
+ + ☆ 5%>100%: Breaking Performance Shackles of Full Fine-Tuning on Visual + Recognition Tasks + + +
+ Pre-training & fine-tuning can enhance transfer efficiency and performance in visual tasks. Recent delta-tuning methods provide more options for visual classification tasks. Despite their success, existing visual delta-tuning methods fail to exceed the upper limit of full fine-tuning on challenging tasks like object detection and segmentation. To find a competitive alternative to full fine-tuning, we propose Multi-cognitive Visual Adapter (Mona) tuning, a novel adapter-based tuning method. First, we introduce multiple vision-friendly filters into the adapter to enhance its ability to process visual signals, while previous methods mainly rely on language-friendly linear filters. Second, we add a scaled normalization layer in the adapter to regulate the distribution of input features for the visual filters. To fully demonstrate the practicality and generality of Mona, we conduct experiments on multiple representative visual tasks, including instance segmentation on COCO, semantic segmentation on ADE20K, object detection on Pascal VOC, oriented object detection on DOTA/STAR, and image classification on three common datasets. Exciting results illustrate that Mona surpasses full fine-tuning on all these tasks and is the only delta-tuning method that outperforms full fine-tuning across the above various tasks. For example, Mona achieves a 1% performance gain on the COCO dataset compared to full fine-tuning. Comprehensive results suggest that Mona-tuning is more suitable for retaining and utilizing the capabilities of pre-trained models than full fine-tuning. We will make the code publicly available.
+
+ comment: arXiv admin note: substantial text overlap with arXiv:2311.15010 +
+
+
+
+
+ + ☆ CT4D: Consistent Text-to-4D Generation with Animatable Meshes + + +
+ Text-to-4D generation has recently been demonstrated viable by integrating a +2D image diffusion model with a video diffusion model. However, existing models +tend to produce results with inconsistent motions and geometric structures over +time. To this end, we present a novel framework, coined CT4D, which directly +operates on animatable meshes for generating consistent 4D content from +arbitrary user-supplied prompts. The primary challenges of our mesh-based +framework involve stably generating a mesh with details that align with the +text prompt while directly driving it and maintaining surface continuity. Our +CT4D framework incorporates a unique Generate-Refine-Animate (GRA) algorithm to +enhance the creation of text-aligned meshes. To improve surface continuity, we +divide a mesh into several smaller regions and implement a uniform driving +function within each area. Additionally, we constrain the animating stage with +a rigidity regulation to ensure cross-region continuity. Our experimental +results, both qualitative and quantitative, demonstrate that our CT4D framework +surpasses existing text-to-4D techniques in maintaining interframe consistency +and preserving global geometry. Furthermore, we showcase that this enhanced +representation inherently possesses the capability for combinational 4D +generation and texture editing. + +
+
+
+
+
+ + ♻ ☆ Nearest Neighbor Classification for Classical Image Upsampling + + +
+ Given a set of ordered pixel data in the form of an image, our goal is to perform upsampling on the data such that: (i) the resulting resolution is improved by some factor; (ii) the final result passes the human test, having added new, believable, and realistic information and detail to the image; and (iii) the time complexity for upscaling is relatively close to that of lossy upscaling implementations.
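+ The classical baseline referred to here is straightforward; a minimal NumPy version of nearest-neighbor upsampling (integer factors assumed) looks like this:

```python
import numpy as np

def nearest_neighbor_upsample(img, factor=2):
    """Each output pixel copies the value of its nearest source pixel."""
    h, w = img.shape[:2]
    rows = np.arange(h * factor) // factor
    cols = np.arange(w * factor) // factor
    return img[rows][:, cols]

big = nearest_neighbor_upsample(np.arange(9).reshape(3, 3), factor=3)
```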
+
+ comment: 6 pages +
+
+
+
+
+ + ♻ ☆ Scene-wise Adaptive Network for Dynamic Cold-start Scenes Optimization + in CTR Prediction + + +
+ In the realm of modern mobile E-commerce, providing users with nearby commercial service recommendations through location-based online services has become increasingly vital. While machine learning approaches have shown promise in multi-scene recommendation, existing methodologies often struggle to address cold-start problems in unprecedented scenes: the increasing diversity of commercial choices, along with the short online lifespan of scenes, gives rise to the complexity of effective recommendation in online and dynamic scenes. In this work, we propose the Scene-wise Adaptive Network (SwAN), a novel approach that emphasizes high-performance cold-start online recommendations for new scenes. Our approach introduces several crucial capabilities, including scene similarity learning, user-specific scene transition cognition, scene-specific information construction for the new scene, and enhancing the diverged logical information between scenes. We demonstrate SwAN's potential to optimize dynamic multi-scene recommendation problems by effectively handling cold-start recommendations online for any newly arrived scenes. More encouragingly, SwAN has been successfully deployed in Meituan's online catering recommendation service, which serves millions of customers per day, where it has achieved a 5.64% CTR index improvement relative to the baselines and a 5.19% increase in daily order volume proportion.
+
+ comment: 10 pages, 6 figures, accepted by Recsys 2024 +
+
+
+
+
+ + ♻ ☆ A Distributed Privacy Preserving Model for the Detection of Alzheimer's + Disease + + +
+ BACKGROUND: Segmentation of medical data, concerns about personal health information (PHI) breaches, and the direct and indirect costs of consolidating and managing such segmented data should motivate diagnostic machine learning (DML) researchers to identify privacy-preserving machine learning algorithms that can train on distributed or decentralized datasets of different modalities. Federated learning models provide such a decentralized machine learning framework, in which multiple investigators in possession of disparate datasets and working on different devices or servers can collaboratively train a global machine learning model without ever having to exchange local data, and can thus meet statutory PHI protections. To this end, a vertical federated learning model is devised and tested for efficacy in the detection of Alzheimer's Disease (AD).
+ METHODS: The second version of the Open Access Series of Imaging Studies -- with its panoply of demographic, imaging, and clinical assessment datasets -- was used to test a multimodal vertical federated learning (VFL) model for AD detection.
+ RESULTS: By training and validating this VFL model on the demographic, clinical, and MRI data in OASIS-2, an 82.9\% accuracy rate is achieved, consistent with previously reported results.
+ CONCLUSIONS: The VFL architecture proposed herein offers a novel distributed architecture, enabling collaborative learning across diverse sources of medical data while respecting statutory privacy constraints. By leveraging multiple modalities of data, the robustness and accuracy of AD detection can be enhanced. This model not only contributes to the advancement of federated learning techniques but also holds promise for overcoming the hurdles posed by data segmentation in medical research.
+
+ comment: 17 pages, 7 figures, 2 tables +
+
+
+
+
+ + ♻ ☆ EgoPoseFormer: A Simple Baseline for Stereo Egocentric 3D Human Pose + Estimation ECCV 2024 + + +
+ We present EgoPoseFormer, a simple yet effective transformer-based model for +stereo egocentric human pose estimation. The main challenge in egocentric pose +estimation is overcoming joint invisibility, which is caused by self-occlusion +or a limited field of view (FOV) of head-mounted cameras. Our approach +overcomes this challenge by incorporating a two-stage pose estimation paradigm: +in the first stage, our model leverages the global information to estimate each +joint's coarse location, then in the second stage, it employs a DETR style +transformer to refine the coarse locations by exploiting fine-grained stereo +visual features. In addition, we present a Deformable Stereo Attention +operation to enable our transformer to effectively process multi-view features, +which enables it to accurately localize each joint in the 3D world. We evaluate +our method on the stereo UnrealEgo dataset and show it significantly +outperforms previous approaches while being computationally efficient: it +improves MPJPE by 27.4mm (45% improvement) with only 7.9% model parameters and +13.1% FLOPs compared to the state-of-the-art. Surprisingly, with proper +training settings, we find that even our first-stage pose proposal network can +achieve superior performance compared to previous arts. We also show that our +method can be seamlessly extended to monocular settings, which achieves +state-of-the-art performance on the SceneEgo dataset, improving MPJPE by 25.5mm +(21% improvement) compared to the best existing method with only 60.7% model +parameters and 36.4% FLOPs. Code is available at: +https://github.com/ChenhongyiYang/egoposeformer . + +
+
+ comment: Accepted to ECCV 2024 +
+
+
+
+
+ + ♻ ☆ DyFFPAD: Dynamic Fusion of Convolutional and Handcrafted Features for + Fingerprint Presentation Attack Detection + + +
+ Automatic fingerprint recognition systems suffer from the threat of presentation attacks due to their wide range of deployment in areas including national borders and commercial applications. A presentation attack can be performed by creating a spoof of a user's fingerprint with or without their consent. This paper presents a dynamic ensemble of deep CNN and handcrafted features to detect presentation attacks in the known-material and unknown-material protocols of the Liveness Detection Competition. The proposed presentation attack detection model, in this way, utilizes the capabilities of both deep CNN and handcrafted feature techniques and exhibits better performance than either of them individually. We have validated our proposed method on benchmark databases from the Liveness Detection Competition in 2015, 2017, and 2019, yielding overall accuracies of 96.10\%, 96.49\%, and 94.99\%, respectively. The proposed method outperforms state-of-the-art methods in terms of classification accuracy.
+
+ comment: arXiv admin note: +
+
+
+
+
+ + ♻ ☆ DiffPMAE: Diffusion Masked Autoencoders for Point Cloud Reconstruction + + +
+ Point cloud streaming is increasingly popular, evolving into the norm for interactive service delivery and the future Metaverse. However, the substantial volume of data associated with point clouds presents numerous challenges, particularly in terms of high bandwidth consumption and large storage capacity. Despite various solutions proposed thus far, with a focus on point cloud compression, upsampling, and completion, these reconstruction-related methods continue to fall short in delivering high-fidelity point cloud output. As a solution, in DiffPMAE, we propose an effective point cloud reconstruction architecture. Inspired by self-supervised learning concepts, we combine Masked Auto-Encoding and a Diffusion Model mechanism to remotely reconstruct point cloud data. By the nature of this reconstruction process, DiffPMAE can be extended to many related downstream tasks, including point cloud compression, upsampling, and completion. Leveraging the ShapeNet-55 and ModelNet datasets with over 60,000 objects, we validate the performance of DiffPMAE, which exceeds many state-of-the-art methods in terms of auto-encoding and the downstream tasks considered.
+
+
+
+
+ + ♻ ☆ GIR: 3D Gaussian Inverse Rendering for Relightable Scene Factorization + + +
+ This paper presents a 3D Gaussian Inverse Rendering (GIR) method, employing 3D Gaussian representations to effectively factorize the scene into material properties, light, and geometry. The key contributions are three-fold. We compute the normal of each 3D Gaussian using the shortest eigenvector, with a directional masking scheme forcing accurate normal estimation without external supervision. We adopt an efficient voxel-based indirect illumination tracing scheme that stores direction-aware outgoing radiance in each 3D Gaussian to disentangle secondary illumination for approximating multi-bounce light transport. To further enhance the illumination disentanglement, we represent a high-resolution environment map with a learnable low-resolution map and a lightweight, fully convolutional network. Our method achieves state-of-the-art performance in both relighting and novel view synthesis tasks among the recently proposed inverse rendering methods while achieving real-time rendering. This substantiates our proposed method's efficacy and broad applicability, highlighting its potential as an influential tool in various real-time interactive graphics applications such as material editing and relighting. The code will be released at https://github.com/guduxiaolang/GIR.
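+ The normal-from-shortest-eigenvector step can be sketched directly from each Gaussian's 3x3 covariance; the camera-facing flip below is a simple stand-in for the paper's directional masking scheme.

```python
import torch

def gaussian_normals(covariances, view_dirs):
    """covariances: (N, 3, 3) per-Gaussian covariance, view_dirs: (N, 3) camera-to-Gaussian
    directions. The eigenvector of the smallest eigenvalue is taken as the normal."""
    eigvals, eigvecs = torch.linalg.eigh(covariances)   # eigenvalues in ascending order
    normals = eigvecs[..., :, 0]                        # shortest principal axis of each Gaussian
    flip = (normals * view_dirs).sum(dim=-1, keepdim=True) > 0
    return torch.where(flip, -normals, normals)         # orient normals toward the camera
```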
+
+ comment: technical report +
+
+
+
+
+ + ♻ ☆ Deep Learning Innovations for Underwater Waste Detection: An In-Depth + Analysis + + +
+ Addressing the issue of submerged underwater trash is crucial for safeguarding aquatic ecosystems and preserving marine life. While identifying debris present on the surface of water bodies is straightforward, assessing submerged underwater waste is a challenge due to image distortions caused by factors such as light refraction, absorption, suspended particles, color shifts, and occlusion. This paper conducts a comprehensive review of state-of-the-art architectures and of existing datasets to establish a baseline for submerged waste and trash detection. The primary goal remains to establish a benchmark for the object localization techniques to be leveraged by advanced underwater sensors and autonomous underwater vehicles. The ultimate objective is to explore the underwater environment, identify underwater debris, and remove it. The absence of benchmarks (datasets or algorithms) in much of the prior research emphasizes the need for a more robust algorithmic solution. Through this research, we aim to provide a comparative performance analysis of various underwater trash detection algorithms.
+
+
+
+
+ + ♻ ☆ Examining Common Paradigms in Multi-Task Learning + + +
+ While multi-task learning (MTL) has gained significant attention in recent years, its underlying mechanisms remain poorly understood. Recent methods did not yield consistent performance improvements over single-task learning (STL) baselines, underscoring the importance of gaining more profound insights about challenges specific to MTL. In our study, we investigate paradigms in MTL in the context of STL: First, the impact of the choice of optimizer has only been mildly investigated in MTL. We show the pivotal role of common STL tools such as the Adam optimizer in MTL empirically in various experiments. To further investigate Adam's effectiveness, we theoretically derive a partial loss-scale invariance under mild assumptions. Second, the notion of gradient conflicts has often been phrased as a specific problem in MTL. We delve into the role of gradient conflicts in MTL and compare it to STL. For angular gradient alignment, we find no evidence that this is a unique problem in MTL. We emphasize differences in gradient magnitude as the main distinguishing factor. Overall, we find surprising similarities between STL and MTL, suggesting that methods from both fields be considered in a broader context.
+
+ comment: Accepted for publication in German Conference for Pattern Recognition + (GCPR), 2024 +
+
+
+
+
+ + ♻ ☆ MagicFace: Training-free Universal-Style Human Image Customized + Synthesis + + +
+ Existing human image personalized generation methods often require tedious +training: either fine-tuning with a few images or retraining on large-scale +datasets. In such cases, these methods are prone to overfitting and encounter +difficulties when personalizing individuals of diverse styles. Moreover, these +training-based approaches also struggle with multi-concept human image +customizing. To this end, we propose MagicFace, the first method for +universal-style human image personalized synthesis that enables +single/multi-concept customization for humans of any style in a training-free +manner. MagicFace introduces a coarse-to-fine generation pipeline, involving +two sequential stages: semantic scene construction and concept feature +injection. This is achieved by our Reference-aware Self-Attention (RSA) and +Region-grouped Blend Attention (RBA) mechanisms. Specifically, in the first +stage, RSA enables the latent image to query features from reference concepts +simultaneously, extracting the coarse-grained overall semantic understanding to +facilitate the initial semantic layout establishment. In the second stage, we +employ an attention-based semantic segmentation method to pinpoint the +generated regions of all concepts in the latent image at each step. Following +this, RBA divides the pixels of the latent image into semantic groups, with +each group querying fine-grained features from its reference concept, which +ensures precise attribute alignment and feature injection. Throughout the +two-stage process, a weight mask strategy is employed to ensure the model +focuses more on the reference concepts. Extensive experiments demonstrate our +superiority in both human-centric subject-to-image synthesis and multi-concept +human image customization. Our approach also can be applied to texture +transformation, further enhancing its versatility and applicability. + +
+
+ comment: project page: https://codegoat24.github.io/MagicFace +
+
+
+
+
+ + ♻ ☆ PlainMamba: Improving Non-Hierarchical Mamba in Visual Recognition BMVC 2024 + + +
+ We present PlainMamba: a simple non-hierarchical state space model (SSM) +designed for general visual recognition. The recent Mamba model has shown how +SSMs can be highly competitive with other architectures on sequential data and +initial attempts have been made to apply it to images. In this paper, we +further adapt the selective scanning process of Mamba to the visual domain, +enhancing its ability to learn features from two-dimensional images by (i) a +continuous 2D scanning process that improves spatial continuity by ensuring +adjacency of tokens in the scanning sequence, and (ii) direction-aware updating +which enables the model to discern the spatial relations of tokens by encoding +directional information. Our architecture is designed to be easy to use and +easy to scale, formed by stacking identical PlainMamba blocks, resulting in a +model with constant width throughout all layers. The architecture is further +simplified by removing the need for special tokens. We evaluate PlainMamba on a +variety of visual recognition tasks, achieving performance gains over previous +non-hierarchical models and is competitive with hierarchical alternatives. For +tasks requiring high-resolution inputs, in particular, PlainMamba requires much +less computing while maintaining high performance. Code and models are +available at: https://github.com/ChenhongyiYang/PlainMamba . + +
+
+ comment: Accepted to BMVC 2024 +
+
+
+
+
+ + ♻ ☆ Fully Test-Time rPPG Estimation via Synthetic Signal-Guided Feature + Learning + + +
+ Many remote photoplethysmography (rPPG) estimation models have achieved +promising performance in the training domain but often fail to accurately +estimate physiological signals or heart rates (HR) in the target domains. +Domain generalization (DG) or domain adaptation (DA) techniques are therefore +adopted during the offline training stage to adapt the model to either +unobserved or observed target domains by utilizing all available source domain +data. However, in rPPG estimation problems, the adapted model usually +encounters challenges in estimating target data with significant domain +variation. In contrast, Test-Time Adaptation (TTA) enables the model to +adaptively estimate rPPG signals in various unseen domains by online adapting +to unlabeled target data without referring to any source data. In this paper, +we first establish a new TTA-rPPG benchmark that encompasses various domain +information and HR distributions to simulate the challenges encountered in +real-world rPPG estimation. Next, we propose a novel synthetic signal-guided +rPPG estimation framework to address the forgetting issue during the TTA stage +and to enhance the adaptation capability of the pre-trained rPPG model. To this +end, we develop a synthetic signal-guided feature learning method by +synthesizing pseudo rPPG signals as pseudo ground truths to guide a conditional +generator in generating latent rPPG features. In addition, we design an +effective spectral-based entropy minimization technique to encourage the rPPG +model to learn new target domain information. Both the generated rPPG features +and synthesized rPPG signals prevent the rPPG model from overfitting to target +data and forgetting previously acquired knowledge, while also broadly covering +various heart rate (HR) distributions. Our extensive experiments on the +TTA-rPPG benchmark show that the proposed method achieves superior performance. + +
+
+
+
+
+ + ♻ ☆ Robustness-Aware 3D Object Detection in Autonomous Driving: A Review and + Outlook + + +
+ In the realm of modern autonomous driving, the perception system is +indispensable for accurately assessing the state of the surrounding +environment, thereby enabling informed prediction and planning. The key step to +this system is related to 3D object detection that utilizes vehicle-mounted +sensors such as LiDAR and cameras to identify the size, the category, and the +location of nearby objects. Despite the surge in 3D object detection methods +aimed at enhancing detection precision and efficiency, there is a gap in the +literature that systematically examines their resilience against environmental +variations, noise, and weather changes. This study emphasizes the importance of +robustness, alongside accuracy and latency, in evaluating perception systems +under practical scenarios. Our work presents an extensive survey of +camera-only, LiDAR-only, and multi-modal 3D object detection algorithms, +thoroughly evaluating their trade-off between accuracy, latency, and +robustness, particularly on datasets like KITTI-C and nuScenes-C to ensure fair +comparisons. Among these, multi-modal 3D detection approaches exhibit superior +robustness, and a novel taxonomy is introduced to reorganize the literature for +enhanced clarity. This survey aims to offer a more practical perspective on the +current capabilities and the constraints of 3D object detection algorithms in +real-world applications, thus steering future research towards +robustness-centric advancements. + +
+
+
+
+
+ + ♻ ☆ The Llama 3 Herd of Models + + +
+ Modern artificial intelligence (AI) systems are powered by foundation models. +This paper presents a new set of foundation models, called Llama 3. It is a +herd of language models that natively support multilinguality, coding, +reasoning, and tool usage. Our largest model is a dense Transformer with 405B +parameters and a context window of up to 128K tokens. This paper presents an +extensive empirical evaluation of Llama 3. We find that Llama 3 delivers +comparable quality to leading language models such as GPT-4 on a plethora of +tasks. We publicly release Llama 3, including pre-trained and post-trained +versions of the 405B parameter language model and our Llama Guard 3 model for +input and output safety. The paper also presents the results of experiments in +which we integrate image, video, and speech capabilities into Llama 3 via a +compositional approach. We observe this approach performs competitively with +the state-of-the-art on image, video, and speech recognition tasks. The +resulting models are not yet being broadly released as they are still under +development. + +
+
+
+
+
+ + ♻ ☆ End-to-end Autonomous Driving: Challenges and Frontiers + + +
+ The autonomous driving community has witnessed a rapid growth in approaches
+that embrace an end-to-end algorithm framework, utilizing raw sensor input to
+generate vehicle motion plans, instead of concentrating on individual tasks
+such as detection and motion prediction. End-to-end systems, in comparison to
+modular pipelines, benefit from joint feature optimization for perception and
+planning. This field has flourished due to the availability of large-scale
+datasets, closed-loop evaluation, and the increasing need for autonomous
+driving algorithms to perform effectively in challenging scenarios. In this
+survey, we provide a comprehensive analysis of more than 270 papers, covering
+the motivation, roadmap, methodology, challenges, and future trends in
+end-to-end autonomous driving. We delve into several critical challenges,
+including multi-modality, interpretability, causal confusion, robustness, and
+world models, amongst others. Additionally, we discuss current advancements in
+foundation models and visual pre-training, as well as how to incorporate these
+techniques within the end-to-end driving framework. We maintain an active
+repository that contains up-to-date literature and open-source projects at
+https://github.com/OpenDriveLab/End-to-end-Autonomous-Driving.
+
+ comment: Accepted by IEEE TPAMI +
+
+
+
+
+ + ♻ ☆ Prompt-Based Segmentation at Multiple Resolutions and Lighting + Conditions using Segment Anything Model 2 + + +
+ This paper examines the effectiveness of the zero-shot, prompt-based Segment
+Anything Model (SAM), its updated version SAM 2, and a non-promptable,
+conventional convolutional network (CNN) in segmenting solar panels from RGB
+aerial imagery across lighting conditions, spatial resolutions, and prompting
+strategies. SAM 2 demonstrates improvements over SAM, particularly under
+sub-optimal lighting conditions when prompted by points. Both SAM variants,
+when prompted by user boxes, outperformed the CNN in all scenarios.
+Additionally, prompting with YOLOv9 boxes outperformed prompting with user
+points. In high-resolution imagery, under both optimal and sub-optimal lighting
+conditions, Eff-UNet outperformed both SAM models prompted by YOLOv9 boxes,
+positioning Eff-UNet as the appropriate model for automatic segmentation in
+high-resolution data. In low-resolution data, user box prompts were found to be
+crucial for achieving reasonable performance. This paper details the strengths
+and limitations of each model and outlines the robustness of user-prompted
+image segmentation models under inconsistent resolution and lighting conditions
+in remotely sensed data.
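For readers unfamiliar with box prompting, a hedged sketch of box-prompted inference with the original SAM follows (SAM 2 exposes a similar but not identical predictor API). The checkpoint path, image, and box coordinates are placeholders, not values from the paper.

import numpy as np
from segment_anything import SamPredictor, sam_model_registry

sam = sam_model_registry["vit_h"](checkpoint="sam_vit_h.pth")  # placeholder path
predictor = SamPredictor(sam)

image = np.zeros((1024, 1024, 3), dtype=np.uint8)  # stand-in RGB aerial tile
predictor.set_image(image)

# Box prompt in XYXY pixel coordinates, e.g. taken from a YOLOv9 detection.
box = np.array([100, 150, 400, 420])
masks, scores, _ = predictor.predict(box=box, multimask_output=False)
print(masks.shape, scores)  # (1, H, W) boolean mask and its predicted quality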
+
+
+
+
+ + ♻ ☆ Long-Tailed Classification Based on Coarse-Grained Leading Forest and + Multi-Center Loss + + +
+ Long-tailed (LT) classification is an unavoidable and challenging problem in
+the real world. Most existing long-tailed classification methods focus only on
+solving the class-wise imbalance while ignoring the attribute-wise imbalance.
+The deviation of a classification model is caused by both class-wise and
+attribute-wise imbalance. Because attributes are implicit in most datasets and
+the combination of attributes is complex, attribute-wise imbalance is more
+difficult to handle. For this purpose, we propose a novel long-tailed
+classification framework, aiming to build a multi-granularity classification
+model by means of invariant feature learning. This method first constructs a
+Coarse-grained Leading Forest (CLF) in an unsupervised manner to better
+characterize the distribution of attributes within a class. Depending on the
+distribution of attributes, one can customize suitable sampling strategies to
+construct different imbalanced datasets. We then introduce a multi-center loss
+(MCL) that aims to gradually eliminate confusing attributes during the feature
+learning process. The proposed framework is not necessarily coupled to a
+specific LT classification model structure and can be integrated with any
+existing LT method as an independent component. Extensive experiments show
+that our approach achieves state-of-the-art performance on both existing
+benchmarks, ImageNet-GLT and MSCOCO-GLT, and can improve the performance of
+existing LT methods. Our codes are available on GitHub:
+\url{https://github.com/jinyery/cognisance}
+
+ comment: This work applies the leading-tree structure together with a deep
+ learning architecture to address attribute-wise long-tailed distributions
+ within a class
+
+
+
+
+ + ♻ ☆ Few Shot Class Incremental Learning using Vision-Language models + + +
+ Recent advancements in deep learning have demonstrated remarkable performance
+comparable to human capabilities across various supervised computer vision
+tasks. However, the prevalent assumption of having an extensive pool of
+training data encompassing all classes prior to model training often diverges
+from real-world scenarios, where limited data availability for novel classes is
+the norm. The challenge lies in seamlessly integrating new classes with few
+samples into the training data, requiring the model to adeptly accommodate
+these additions without compromising its performance on base classes. To
+address this exigency, the research community has introduced several solutions
+under the realm of few-shot class incremental learning (FSCIL).
+ In this study, we introduce an innovative FSCIL framework that utilizes a
+language regularizer and a subspace regularizer. During base training, the
+language regularizer helps incorporate semantic information extracted from a
+Vision-Language model. The subspace regularizer facilitates the model's
+acquisition of nuanced connections between image and text semantics inherent
+to base classes during incremental training. Our proposed framework not only
+empowers the model to embrace novel classes with limited data, but also
+ensures the preservation of performance on base classes. To substantiate the
+efficacy of our approach, we conduct comprehensive experiments on three
+distinct FSCIL benchmarks, where our framework attains state-of-the-art
+performance.
+
+
+
+
+ + ♻ ☆ Capturing Human Motion from Monocular Images in World Space with + Weak-supervised Calibration + + +
+ Previous methods for 3D human motion recovery from monocular images often +fall short due to reliance on camera coordinates, leading to inaccuracies in +real-world applications where complex shooting conditions are prevalent. The +limited availability and diversity of focal length labels further exacerbate +misalignment issues in reconstructed 3D human bodies. To address these +challenges, we introduce W-HMR, a weak-supervised calibration method that +predicts "reasonable" focal lengths based on body distortion information, +eliminating the need for precise focal length labels. Our approach enhances 2D +supervision precision and recovery accuracy. Additionally, we present the +OrientCorrect module, which corrects body orientation for plausible +reconstructions in world space, avoiding the error accumulation associated with +inaccurate camera rotation predictions. Our contributions include a novel +weak-supervised camera calibration technique, an effective orientation +correction module, and a decoupling strategy that significantly improves the +generalizability and accuracy of human motion recovery in both camera and world +coordinates. The robustness of W-HMR is validated through extensive experiments +on various datasets, showcasing its superiority over existing methods. Codes +and demos have been released on the project page +https://yw0208.github.io/w-hmr/. + +
+
+ comment: Project Page: https://yw0208.github.io/w-hmr/ +
+
+
+
+
+ + ♻ ☆ Detector Collapse: Physical-World Backdooring Object Detection to + Catastrophic Overload or Blindness in Autonomous Driving IJCAI 2024 + + +
+ Object detection tasks, crucial in safety-critical systems like autonomous
+driving, focus on pinpointing object locations. These detectors are known to be
+susceptible to backdoor attacks. However, existing backdoor techniques have
+primarily been adapted from classification tasks, overlooking deeper
+vulnerabilities specific to object detection. This paper is dedicated to
+bridging this gap by introducing Detector Collapse (DC), a brand-new backdoor
+attack paradigm tailored for object detection. DC is designed to instantly
+incapacitate detectors (i.e., severely impairing the detector's performance and
+culminating in a denial-of-service). To this end, we develop two innovative
+attack schemes: Sponge for triggering widespread misidentifications and
+Blinding for rendering objects invisible. Remarkably, we introduce a novel
+poisoning strategy exploiting natural objects, enabling DC to act as a
+practical backdoor in real-world environments. Our experiments on different
+detectors across several benchmarks show a significant improvement
+($\sim$10\%-60\% absolute and $\sim$2-7$\times$ relative) in attack efficacy
+over state-of-the-art attacks.
+
+ comment: Accepted to IJCAI 2024 +
+
+
+
+
+ + ♻ ☆ Self-Supervised Video Desmoking for Laparoscopic Surgery + + +
+ Due to the difficulty of collecting real paired data, most existing desmoking +methods train the models by synthesizing smoke, generalizing poorly to real +surgical scenarios. Although a few works have explored single-image real-world +desmoking in unpaired learning manners, they still encounter challenges in +handling dense smoke. In this work, we address these issues together by +introducing the self-supervised surgery video desmoking (SelfSVD). On the one +hand, we observe that the frame captured before the activation of high-energy +devices is generally clear (named pre-smoke frame, PS frame), thus it can serve +as supervision for other smoky frames, making real-world self-supervised video +desmoking practically feasible. On the other hand, in order to enhance the +desmoking performance, we further feed the valuable information from PS frame +into models, where a masking strategy and a regularization term are presented +to avoid trivial solutions. In addition, we construct a real surgery video +dataset for desmoking, which covers a variety of smoky scenes. Extensive +experiments on the dataset show that our SelfSVD can remove smoke more +effectively and efficiently while recovering more photo-realistic details than +the state-of-the-art methods. The dataset, codes, and pre-trained models are +available at \url{https://github.com/ZcsrenlongZ/SelfSVD}. + +
+
+ comment: 27 pages +
+
+
+
+
+ + ♻ ☆ REFRAME: Reflective Surface Real-Time Rendering for Mobile Devices ECCV 2024 + + +
+ This work tackles the challenging task of achieving real-time novel view +synthesis for reflective surfaces across various scenes. Existing real-time +rendering methods, especially those based on meshes, often have subpar +performance in modeling surfaces with rich view-dependent appearances. Our key +idea lies in leveraging meshes for rendering acceleration while incorporating a +novel approach to parameterize view-dependent information. We decompose the +color into diffuse and specular, and model the specular color in the reflected +direction based on a neural environment map. Our experiments demonstrate that +our method achieves comparable reconstruction quality for highly reflective +surfaces compared to state-of-the-art offline methods, while also efficiently +enabling real-time rendering on edge devices such as smartphones. + +
+
+ comment: ECCV 2024 accepted. Project Page: https://xdimlab.github.io/REFRAME/ +
+
+
+
+
+ + ♻ ☆ Get Your Embedding Space in Order: Domain-Adaptive Regression for Forest + Monitoring + + +
+ Image-level regression is an important task in Earth observation, where +visual domain and label shifts are a core challenge hampering generalization. +However, cross-domain regression within remote sensing data remains +understudied due to the absence of suited datasets. We introduce a new dataset +with aerial and satellite imagery in five countries with three forest-related +regression tasks. To match real-world applicative interests, we compare methods +through a restrictive setup where no prior on the target domain is available +during training, and models are adapted with limited information during +testing. Building on the assumption that ordered relationships generalize +better, we propose manifold diffusion for regression as a strong baseline for +transduction in low-data regimes. Our comparison highlights the comparative +advantages of inductive and transductive methods in cross-domain regression. + +
+
+ comment: Updated with review comments +
+
+
+
+
+ + ♻ ☆ Dual-Camera Smooth Zoom on Mobile Phones + + +
+ When zooming between dual cameras on a mobile phone, noticeable jumps in
+geometric content and image color occur in the preview, inevitably affecting
+the user's zoom experience. In this work, we introduce a new task, i.e.,
+dual-camera smooth zoom (DCSZ), to achieve a smooth zoom preview. The frame
+interpolation (FI) technique is a potential solution but struggles with
+ground-truth collection. To address the issue, we suggest a data factory
+solution where continuous virtual cameras are assembled to generate DCSZ data
+by rendering reconstructed 3D models of the scene. In particular, we propose a
+novel dual-camera smooth zoom Gaussian Splatting (ZoomGS), where a
+camera-specific encoding is introduced to construct a specific 3D model for
+each virtual camera. With the proposed data factory, we construct a synthetic
+dataset for DCSZ, and we utilize it to fine-tune FI models. In addition, we
+collect real-world dual-zoom images without ground truth for evaluation.
+Extensive experiments are conducted with multiple FI methods. The results show
+that the fine-tuned FI models achieve a significant performance improvement
+over the original ones on the DCSZ task. The datasets, codes, and pre-trained
+models are available at https://github.com/ZcsrenlongZ/ZoomGS.
+
+ comment: 24 pages +
+
+
+
+
+ + ♻ ☆ Multi-Modality Co-Learning for Efficient Skeleton-based Action + Recognition + + +
+ Skeleton-based action recognition has garnered significant attention due to +the utilization of concise and resilient skeletons. Nevertheless, the absence +of detailed body information in skeletons restricts performance, while other +multimodal methods require substantial inference resources and are inefficient +when using multimodal data during both training and inference stages. To +address this and fully harness the complementary multimodal features, we +propose a novel multi-modality co-learning (MMCL) framework by leveraging the +multimodal large language models (LLMs) as auxiliary networks for efficient +skeleton-based action recognition, which engages in multi-modality co-learning +during the training stage and keeps efficiency by employing only concise +skeletons in inference. Our MMCL framework primarily consists of two modules. +First, the Feature Alignment Module (FAM) extracts rich RGB features from video +frames and aligns them with global skeleton features via contrastive learning. +Second, the Feature Refinement Module (FRM) uses RGB images with temporal +information and text instruction to generate instructive features based on the +powerful generalization of multimodal LLMs. These instructive text features +will further refine the classification scores and the refined scores will +enhance the model's robustness and generalization in a manner similar to soft +labels. Extensive experiments on NTU RGB+D, NTU RGB+D 120 and Northwestern-UCLA +benchmarks consistently verify the effectiveness of our MMCL, which outperforms +the existing skeleton-based action recognition methods. Meanwhile, experiments +on UTD-MHAD and SYSU-Action datasets demonstrate the commendable generalization +of our MMCL in zero-shot and domain-adaptive action recognition. Our code is +publicly available at: https://github.com/liujf69/MMCL-Action. + +
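A minimal sketch of contrastive RGB-skeleton alignment in the spirit of the Feature Alignment Module described above; the symmetric InfoNCE form, temperature, and encoder names are illustrative assumptions rather than the paper's exact configuration.

import torch
import torch.nn.functional as F

def info_nce(rgb_feat, skel_feat, temperature=0.07):
    # Symmetric InfoNCE over a batch of paired RGB / skeleton embeddings:
    # matching pairs are pulled together, mismatched pairs pushed apart.
    rgb = F.normalize(rgb_feat, dim=-1)
    skel = F.normalize(skel_feat, dim=-1)
    logits = rgb @ skel.t() / temperature              # (B, B) similarities
    targets = torch.arange(rgb.size(0), device=rgb.device)
    return 0.5 * (F.cross_entropy(logits, targets) +
                  F.cross_entropy(logits.t(), targets))

# usage with hypothetical encoders:
# loss = info_nce(rgb_encoder(frames), skeleton_encoder(joints))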
+
+
+
+
+ + ♻ ☆ Identifying Important Group of Pixels using Interactions CVPR 2024 + + +
+ To better understand the behavior of image classifiers, it is useful to +visualize the contribution of individual pixels to the model prediction. In +this study, we propose a method, MoXI ($\textbf{Mo}$del e$\textbf{X}$planation +by $\textbf{I}$nteractions), that efficiently and accurately identifies a group +of pixels with high prediction confidence. The proposed method employs +game-theoretic concepts, Shapley values and interactions, taking into account +the effects of individual pixels and the cooperative influence of pixels on +model confidence. Theoretical analysis and experiments demonstrate that our +method better identifies the pixels that are highly contributing to the model +outputs than widely-used visualization by Grad-CAM, Attention rollout, and +Shapley value. While prior studies have suffered from the exponential +computational cost in the computation of Shapley value and interactions, we +show that this can be reduced to quadratic cost for our task. The code is +available at https://github.com/KosukeSumiyasu/MoXI. + +
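A rough sketch of the game-theoretic quantities mentioned above: model confidence under patch masking and the pairwise interaction of two patches. The masking scheme (zeroing hidden patches) and the patch size are simplifying assumptions, and the paper's efficient greedy selection is not reproduced here.

import torch

@torch.no_grad()
def confidence(model, image, keep, label, patch=16):
    # Confidence for `label` when only the patch indices in `keep` are visible.
    masked = torch.zeros_like(image)               # hidden patches set to zero
    _, _, H, W = image.shape
    cols = W // patch
    for idx in keep:
        r, c = divmod(idx, cols)
        masked[..., r*patch:(r+1)*patch, c*patch:(c+1)*patch] = \
            image[..., r*patch:(r+1)*patch, c*patch:(c+1)*patch]
    return model(masked).softmax(-1)[0, label].item()

@torch.no_grad()
def pairwise_interaction(model, image, visible, i, j, label):
    # Shapley-style interaction of patches i and j given an already visible set.
    f = lambda extra: confidence(model, image, list(visible) + extra, label)
    return f([i, j]) - f([i]) - f([j]) + f([])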
+
+ comment: CVPR 2024 (update: minor typos, new references, Eqs. (12) and (13)) +
+
+
+
+
+ + ♻ ☆ A Spitting Image: Modular Superpixel Tokenization in Vision Transformers ECCV + + +
+ Vision Transformer (ViT) architectures traditionally employ a grid-based +approach to tokenization independent of the semantic content of an image. We +propose a modular superpixel tokenization strategy which decouples tokenization +and feature extraction; a shift from contemporary approaches where these are +treated as an undifferentiated whole. Using on-line content-aware tokenization +and scale- and shape-invariant positional embeddings, we perform experiments +and ablations that contrast our approach with patch-based tokenization and +randomized partitions as baselines. We show that our method significantly +improves the faithfulness of attributions, gives pixel-level granularity on +zero-shot unsupervised dense prediction tasks, while maintaining predictive +performance in classification tasks. Our approach provides a modular +tokenization framework commensurable with standard architectures, extending the +space of ViTs to a larger class of semantically-rich models. + +
+
+ comment: To appear in ECCV (MELEX) 2024 Workshop Proceedings +
+
+
+
+
+ + ♻ ☆ MLAAN: Scaling Supervised Local Learning with Multilaminar Leap + Augmented Auxiliary Network + + +
+ Deep neural networks (DNNs) typically employ an end-to-end (E2E) training +paradigm which presents several challenges, including high GPU memory +consumption, inefficiency, and difficulties in model parallelization during +training. Recent research has sought to address these issues, with one +promising approach being local learning. This method involves partitioning the +backbone network into gradient-isolated modules and manually designing +auxiliary networks to train these local modules. Existing methods often neglect +the interaction of information between local modules, leading to myopic issues +and a performance gap compared to E2E training. To address these limitations, +we propose the Multilaminar Leap Augmented Auxiliary Network (MLAAN). +Specifically, MLAAN comprises Multilaminar Local Modules (MLM) and Leap +Augmented Modules (LAM). MLM captures both local and global features through +independent and cascaded auxiliary networks, alleviating performance issues +caused by insufficient global features. However, overly simplistic auxiliary +networks can impede MLM's ability to capture global information. To address +this, we further design LAM, an enhanced auxiliary network that uses the +Exponential Moving Average (EMA) method to facilitate information exchange +between local modules, thereby mitigating the shortsightedness resulting from +inadequate interaction. The synergy between MLM and LAM has demonstrated +excellent performance. Our experiments on the CIFAR-10, STL-10, SVHN, and +ImageNet datasets show that MLAAN can be seamlessly integrated into existing +local learning frameworks, significantly enhancing their performance and even +surpassing end-to-end (E2E) training methods, while also reducing GPU memory +consumption. + +
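A minimal sketch of the EMA-based information exchange that the Leap Augmented Module relies on; which parameters are averaged and the decay value are assumptions based only on the abstract.

import copy
import torch
import torch.nn as nn

def make_ema_copy(module: nn.Module) -> nn.Module:
    # Frozen copy that will track the online auxiliary network.
    ema = copy.deepcopy(module)
    for p in ema.parameters():
        p.requires_grad_(False)
    return ema

@torch.no_grad()
def ema_update(ema: nn.Module, online: nn.Module, decay: float = 0.999):
    # Blend the online module's weights into its EMA counterpart so that later
    # local modules can read a smoothed summary of earlier ones.
    for p_ema, p in zip(ema.parameters(), online.parameters()):
        p_ema.mul_(decay).add_(p, alpha=1.0 - decay)

# usage: aux_ema = make_ema_copy(aux_net); ...; ema_update(aux_ema, aux_net)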
+
+
+
+
+ + ♻ ☆ Label Dropout: Improved Deep Learning Echocardiography Segmentation + Using Multiple Datasets With Domain Shift and Partial Labelling MICCAI 2024 + + +
+ Echocardiography (echo) is the first imaging modality used when assessing +cardiac function. The measurement of functional biomarkers from echo relies +upon the segmentation of cardiac structures and deep learning models have been +proposed to automate the segmentation process. However, in order to translate +these tools to widespread clinical use it is important that the segmentation +models are robust to a wide variety of images (e.g. acquired from different +scanners, by operators with different levels of expertise etc.). To achieve +this level of robustness it is necessary that the models are trained with +multiple diverse datasets. A significant challenge faced when training with +multiple diverse datasets is the variation in label presence, i.e. the combined +data are often partially-labelled. Adaptations of the cross entropy loss +function have been proposed to deal with partially labelled data. In this paper +we show that training naively with such a loss function and multiple diverse +datasets can lead to a form of shortcut learning, where the model associates +label presence with domain characteristics, leading to a drop in performance. +To address this problem, we propose a novel label dropout scheme to break the +link between domain characteristics and the presence or absence of labels. We +demonstrate that label dropout improves echo segmentation Dice score by 62% and +25% on two cardiac structures when training using multiple diverse partially +labelled datasets. + +
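A hedged sketch of the label dropout idea: for a partially labelled sample, randomly hide some of the classes that are annotated so the network cannot associate label availability with domain characteristics. The dropout probability and the ignore-index mechanism are illustrative assumptions, not the paper's released implementation.

import torch
import torch.nn.functional as F

def label_dropout(target, present, p_drop=0.5, ignore_index=255):
    # target: (H, W) integer label map for one sample.
    # present: (C,) bool tensor marking which classes this dataset annotates.
    # Pixels of a dropped class are set to ignore_index so they give no
    # gradient, mimicking a dataset in which that label happens to be missing.
    target = target.clone()
    for c in present.nonzero(as_tuple=True)[0].tolist():
        if torch.rand(1).item() < p_drop:
            target[target == c] = ignore_index
    return target

# usage inside the training loop (per sample):
# loss = F.cross_entropy(logits, label_dropout(target, present), ignore_index=255)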
+
+ comment: 10 pages, 5 figures, ASMUS 2024, Held in Conjunction with MICCAI 2024 +
+
+
+
+
+ + ♻ ☆ DeepInteraction++: Multi-Modality Interaction for Autonomous Driving NeurIPS 2022 + + +
+ Existing top-performance autonomous driving systems typically rely on the
+multi-modal fusion strategy for reliable scene understanding. This design is,
+however, fundamentally restricted because it overlooks modality-specific
+strengths, ultimately hampering model performance. To address this limitation,
+in this work, we introduce a novel modality interaction strategy that allows
+individual per-modality representations to be learned and maintained
+throughout, enabling their unique characteristics to be exploited during the
+whole perception pipeline. To demonstrate the effectiveness of the proposed
+strategy, we design DeepInteraction++, a multi-modal interaction framework
+characterized by a multi-modal representational interaction encoder and a
+multi-modal predictive interaction decoder. Specifically, the encoder is
+implemented as a dual-stream Transformer with specialized attention operation
+for information exchange and integration between separate modality-specific
+representations. Our multi-modal representational learning incorporates both
+object-centric, precise sampling-based feature alignment and global dense
+information spreading, essential for the more challenging planning task. The
+decoder is designed to iteratively refine the predictions by alternately
+aggregating information from separate representations in a unified
+modality-agnostic manner, realizing multi-modal predictive interaction.
+Extensive experiments demonstrate the superior performance of the proposed
+framework on both 3D object detection and end-to-end autonomous driving tasks.
+Our code is available at https://github.com/fudan-zvg/DeepInteraction.
+
+ comment: Journal extension of NeurIPS 2022. arXiv admin note: text overlap + with arXiv:2208.11112 +
+
+
+
+
+ + ♻ ☆ BAPLe: Backdoor Attacks on Medical Foundational Models using Prompt + Learning MICCAI 2024 + + +
+ Medical foundation models are gaining prominence in the medical community for +their ability to derive general representations from extensive collections of +medical image-text pairs. Recent research indicates that these models are +susceptible to backdoor attacks, which allow them to classify clean images +accurately but fail when specific triggers are introduced. However, traditional +backdoor attacks necessitate a considerable amount of additional data to +maliciously pre-train a model. This requirement is often impractical in medical +imaging applications due to the usual scarcity of data. Inspired by the latest +developments in learnable prompts, this work introduces a method to embed a +backdoor into the medical foundation model during the prompt learning phase. By +incorporating learnable prompts within the text encoder and introducing +imperceptible learnable noise trigger to the input images, we exploit the full +capabilities of the medical foundation models (Med-FM). Our method, BAPLe, +requires only a minimal subset of data to adjust the noise trigger and the text +prompts for downstream tasks, enabling the creation of an effective backdoor +attack. Through extensive experiments with four medical foundation models, each +pre-trained on different modalities and evaluated across six downstream +datasets, we demonstrate the efficacy of our approach. BAPLe achieves a high +backdoor success rate across all models and datasets, outperforming the +baseline backdoor attack methods. Our work highlights the vulnerability of +Med-FMs towards backdoor attacks and strives to promote the safe adoption of +Med-FMs before their deployment in real-world applications. Code is available +at https://asif-hanif.github.io/baple/. + +
+
+ comment: MICCAI 2024 +
+
+
+
+
+ + ♻ ☆ A Semantic Space is Worth 256 Language Descriptions: Make Stronger + Segmentation Models with Descriptive Properties ECCV 2024 + + +
+ This paper introduces ProLab, a novel approach using property-level label +space for creating strong interpretable segmentation models. Instead of relying +solely on category-specific annotations, ProLab uses descriptive properties +grounded in common sense knowledge for supervising segmentation models. It is +based on two core designs. First, we employ Large Language Models (LLMs) and +carefully crafted prompts to generate descriptions of all involved categories +that carry meaningful common sense knowledge and follow a structured format. +Second, we introduce a description embedding model preserving semantic +correlation across descriptions and then cluster them into a set of descriptive +properties (e.g., 256) using K-Means. These properties are based on +interpretable common sense knowledge consistent with theories of human +recognition. We empirically show that our approach makes segmentation models +perform stronger on five classic benchmarks (e.g., ADE20K, COCO-Stuff, Pascal +Context, Cityscapes, and BDD). Our method also shows better scalability with +extended training steps than category-level supervision. Our interpretable +segmentation framework also emerges with the generalization ability to segment +out-of-domain or unknown categories using only in-domain descriptive +properties. Code is available at https://github.com/lambert-x/ProLab. + +
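A small sketch of the second design step: embed the LLM-generated category descriptions and cluster them into a fixed number of descriptive properties with K-Means. The sentence-transformers encoder and the toy descriptions are assumptions; ProLab uses its own description embedding model and K = 256 over many more descriptions.

import numpy as np
from sklearn.cluster import KMeans
from sentence_transformers import SentenceTransformer

descriptions = [
    "has four legs and short fur",              # toy stand-ins; in practice
    "is a man-made vehicle moving on wheels",   # these come from prompted LLMs
    "grows leaves and has a woody trunk",
]

encoder = SentenceTransformer("all-MiniLM-L6-v2")
emb = encoder.encode(descriptions, normalize_embeddings=True)   # (N, D)

k = min(256, len(descriptions))       # 256 properties in the paper's example
kmeans = KMeans(n_clusters=k, n_init=10, random_state=0).fit(emb)
property_id = kmeans.labels_           # each description -> a property index
print(property_id)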
+
+ comment: Accepted to ECCV 2024. Code is available at + https://github.com/lambert-x/ProLab +
+
+
+
+
+ + ♻ ☆ Masked Generative Extractor for Synergistic Representation and 3D + Generation of Point Clouds + + +
+ Representation and generative learning, as reconstruction-based methods, have
+demonstrated their potential for mutual reinforcement across various domains.
+In the field of point cloud processing, although existing studies have adopted
+training strategies from generative models to enhance representational
+capabilities, these methods are limited by their inability to genuinely
+generate 3D shapes. To explore the benefits of deeply integrating 3D
+representation learning and generative learning, we propose an innovative
+framework called \textit{Point-MGE}. Specifically, this framework first
+utilizes a vector quantized variational autoencoder to reconstruct a neural
+field representation of 3D shapes, thereby learning discrete semantic features
+of point patches. Subsequently, we design a sliding masking ratio to smooth
+the transition from representation learning to generative learning. Moreover,
+our method demonstrates strong generalization capability in learning
+high-capacity models, achieving new state-of-the-art performance across
+multiple downstream tasks. In shape classification, Point-MGE achieved an
+accuracy of 94.2% (+1.0%) on the ModelNet40 dataset and 92.9% (+5.5%) on the
+ScanObjectNN dataset. Experimental results also confirmed that Point-MGE can
+generate high-quality 3D shapes in both unconditional and conditional settings.
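A minimal sketch of what a sliding masking-ratio schedule could look like: the window from which the per-iteration mask ratio is sampled slides from a moderate, representation-learning regime toward the near-full masking used for generation. The exact schedule is an assumption; the abstract only states that the ratio slides.

import random

def sliding_mask_ratio(step, total_steps, start=(0.6, 0.7), end=(0.95, 1.0)):
    # Linearly slide the sampling window for the mask ratio over training.
    t = min(step / max(total_steps, 1), 1.0)
    lo = start[0] + t * (end[0] - start[0])
    hi = start[1] + t * (end[1] - start[1])
    return random.uniform(lo, hi)

# usage: ratio = sliding_mask_ratio(it, max_iters); mask = sample_patch_mask(ratio)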
+
+
+
+
+ + ♻ ☆ Robust Zero-Shot Crowd Counting and Localization With Adaptive + Resolution SAM ECCV 2024 + + +
+ The existing crowd counting models require extensive training data, which is
+time-consuming to annotate. To tackle this issue, we propose a simple yet
+effective crowd counting method by utilizing the Segment-Everything-Everywhere
+Model (SEEM), an adaptation of the Segment Anything Model (SAM), to generate
+pseudo-labels for training crowd counting models. However, our initial
+investigation reveals that SEEM's performance in dense crowd scenes is limited,
+primarily due to the omission of many persons in high-density areas. To
+overcome this limitation, we propose an adaptive resolution SEEM to handle the
+scale variations, occlusions, and overlapping of people within crowd scenes.
+Alongside this, we introduce a robust localization method, based on Gaussian
+Mixture Models, for predicting the head positions in the predicted people
+masks. Given the mask and point pseudo-labels, we propose a robust loss
+function, which is designed to exclude uncertain regions based on SEEM's
+predictions, thereby enhancing the training process of the counting networks.
+Finally, we propose an iterative method for generating pseudo-labels. This
+method aims at improving the quality of the segmentation masks by identifying
+more tiny persons in high-density regions, which are often missed in the first
+pseudo-labeling stage. Overall, our proposed method achieves the best
+unsupervised performance in crowd counting, while also achieving results
+comparable to some supervised methods. This makes it a highly effective and
+versatile tool for crowd counting, especially in situations where labeled data
+is not available.
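A hedged sketch of the Gaussian Mixture localization step: fit a GMM to the foreground pixel coordinates of a predicted people mask and read the component means off as head/person positions. The component count would in practice come from a count estimate; here it is a plain argument, and the covariance type is an assumption.

import numpy as np
from sklearn.mixture import GaussianMixture

def localize_from_mask(mask: np.ndarray, n_people: int) -> np.ndarray:
    # mask: (H, W) boolean people mask; returns (n_people, 2) [row, col] centers.
    coords = np.argwhere(mask)                      # foreground pixel coordinates
    gmm = GaussianMixture(n_components=n_people, covariance_type="diag",
                          random_state=0).fit(coords)
    return gmm.means_

# usage: centers = localize_from_mask(seem_mask, n_people=estimated_count)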
+
+ comment: Accepted to ECCV 2024 +
+
+
+
+
+ + ♻ ☆ Implicit and Explicit Language Guidance for Diffusion-based Visual + Perception + + +
+ Text-to-image diffusion models have shown powerful ability on conditional
+image synthesis. With large-scale vision-language pre-training, diffusion
+models are able to generate high-quality images with rich texture and
+reasonable structure under different text prompts. However, it is an open
+problem to adapt the pre-trained diffusion model for visual perception. In this
+paper, we propose an implicit and explicit language guidance framework for
+diffusion-based perception, named IEDP. Our IEDP comprises an implicit language
+guidance branch and an explicit language guidance branch. The implicit branch
+employs a frozen CLIP image encoder to directly generate implicit text
+embeddings that are fed to the diffusion model, without using explicit text
+prompts. The explicit branch utilizes the ground-truth labels of corresponding
+images as text prompts to condition feature extraction of the diffusion model.
+During training, we jointly train the diffusion model by sharing the model
+weights of these two branches. As a result, the implicit and explicit branches
+can jointly guide feature learning. During inference, we only employ the
+implicit branch for final prediction, which does not require any ground-truth
+labels. Experiments are performed on two typical perception tasks, including
+semantic segmentation and depth estimation. Our IEDP achieves promising
+performance on both tasks. For semantic segmentation, our IEDP has an
+mIoU$^\text{ss}$ score of 55.9% on the ADE20K validation set, which outperforms
+the baseline method VPD by 2.2%. For depth estimation, our IEDP outperforms the
+baseline method VPD with a relative gain of 11.0%.
+
+ comment: Accepted by IEEE TMM +
+
+
+
+
+ + ♻ ☆ TriSAM: Tri-Plane SAM for zero-shot cortical blood vessel segmentation + in VEM images + + +
+ While imaging techniques at macro and mesoscales have garnered substantial +attention and resources, microscale Volume Electron Microscopy (vEM) imaging, +capable of revealing intricate vascular details, has lacked the necessary +benchmarking infrastructure. In this paper, we address a significant gap in +this field of neuroimaging by introducing the first-in-class public benchmark, +BvEM, designed specifically for cortical blood vessel segmentation in vEM +images. Our BvEM benchmark is based on vEM image volumes from three mammals: +adult mouse, macaque, and human. We standardized the resolution, addressed +imaging variations, and meticulously annotated blood vessels through +semi-automatic, manual, and quality control processes, ensuring high-quality 3D +segmentation. Furthermore, we developed a zero-shot cortical blood vessel +segmentation method named TriSAM, which leverages the powerful segmentation +model SAM for 3D segmentation. To extend SAM from 2D to 3D volume segmentation, +TriSAM employs a multi-seed tracking framework, leveraging the reliability of +certain image planes for tracking while using others to identify potential +turning points. This approach effectively achieves long-term 3D blood vessel +segmentation without model training or fine-tuning. Experimental results show +that TriSAM achieved superior performances on the BvEM benchmark across three +species. Our dataset, code, and model are available online at +\url{https://jia-wan.github.io/bvem}. + +
+
+ comment: BvEM-Mouse can be visualized at: https://tinyurl.com/yc2s38x9 +
+
+
+
+
+ + ♻ ☆ RNNs, CNNs and Transformers in Human Action Recognition: A Survey and a + Hybrid Model + + +
+ Human Action Recognition (HAR) encompasses the task of monitoring human +activities across various domains, including but not limited to medical, +educational, entertainment, visual surveillance, video retrieval, and the +identification of anomalous activities. Over the past decade, the field of HAR +has witnessed substantial progress by leveraging Convolutional Neural Networks +(CNNs) to effectively extract and comprehend intricate information, thereby +enhancing the overall performance of HAR systems. Recently, the domain of +computer vision has witnessed the emergence of Vision Transformers (ViTs) as a +potent solution. The efficacy of transformer architecture has been validated +beyond the confines of image analysis, extending their applicability to diverse +video-related tasks. Notably, within this landscape, the research community has +shown keen interest in HAR, acknowledging its manifold utility and widespread +adoption across various domains. This article aims to present an encompassing +survey that focuses on CNNs and the evolution of Recurrent Neural Networks +(RNNs) to ViTs given their importance in the domain of HAR. By conducting a +thorough examination of existing literature and exploring emerging trends, this +study undertakes a critical analysis and synthesis of the accumulated knowledge +in this field. Additionally, it investigates the ongoing efforts to develop +hybrid approaches. Following this direction, this article presents a novel +hybrid model that seeks to integrate the inherent strengths of CNNs and ViTs. + +
+
+
+
+
+ + ♻ ☆ AMAES: Augmented Masked Autoencoder Pretraining on Public Brain MRI Data + for 3D-Native Segmentation MICCAI 2024 + + +
+ This study investigates the impact of self-supervised pretraining of 3D
+semantic segmentation models on a large-scale, domain-specific dataset. We
+introduce BRAINS-45K, a dataset of 44,756 brain MRI volumes from public
+sources, the largest public dataset available, and revisit a number of design
+choices for pretraining modern segmentation architectures by simplifying and
+optimizing state-of-the-art methods, and combining them with a novel
+augmentation strategy. The resulting AMAES framework is based on
+masked-image-modeling and intensity-based augmentation reversal and balances
+memory usage, runtime, and finetuning performance. Using the popular U-Net and
+the recent MedNeXt architecture as backbones, we evaluate the effect of
+pretraining on three challenging downstream tasks, covering single-sequence,
+low-resource settings, and out-of-domain generalization. The results highlight
+that pretraining on the proposed dataset with AMAES significantly improves
+segmentation performance in the majority of evaluated cases, and that it is
+beneficial to pretrain the model with augmentations, despite pretraining on a
+large-scale dataset. Code and model checkpoints for reproducing results, as
+well as the BRAINS-45K dataset are available at
+\url{https://github.com/asbjrnmunk/amaes}.
+
+ comment: Accepted at ADSMI @ MICCAI 2024 +
+
+
+
+
+ + ♻ ☆ Research on the Spatial Data Intelligent Large Model + + +
+ This report focuses on spatial data intelligent large models, delving into +the principles, methods, and cutting-edge applications of these models. It +provides an in-depth discussion on the definition, development history, current +status, and trends of spatial data intelligent large models, as well as the +challenges they face. The report systematically elucidates the key technologies +of spatial data intelligent large models and their applications in urban +environments, aerospace remote sensing, geography, transportation, and other +scenarios. Additionally, it summarizes the latest application cases of spatial +data intelligent large models in themes such as urban development, multimodal +systems, remote sensing, smart transportation, and resource environments. +Finally, the report concludes with an overview and outlook on the development +prospects of spatial data intelligent large models. + +
+
+ comment: V1 and V2 are in Chinese language, other versions are in English +
+
+
+
+
+ + ♻ ☆ Interactive Character Control with Auto-Regressive Motion Diffusion + Models + + +
+ Real-time character control is an essential component for interactive
+experiences, with a broad range of applications, including physics simulations,
+video games, and virtual reality. The success of diffusion models for image
+synthesis has led to the use of these models for motion synthesis. However, the
+majority of these motion diffusion models are primarily designed for offline
+applications, where space-time models are used to synthesize an entire sequence
+of frames simultaneously with a pre-specified length. To enable real-time
+motion synthesis with a diffusion model that allows time-varying controls, we
+propose A-MDM (Auto-regressive Motion Diffusion Model). Our conditional
+diffusion model takes an initial pose as input, and auto-regressively generates
+successive motion frames conditioned on the previous frame. Despite its
+streamlined network architecture, which uses simple MLPs, our framework is
+capable of generating diverse, long-horizon, and high-fidelity motion
+sequences. Furthermore, we introduce a suite of techniques for incorporating
+interactive controls into A-MDM, such as task-oriented sampling, in-painting,
+and hierarchical reinforcement learning. These techniques enable a pre-trained
+A-MDM to be efficiently adapted for a variety of new downstream tasks. We
+conduct a comprehensive suite of experiments to demonstrate the effectiveness
+of A-MDM, and compare its performance against state-of-the-art auto-regressive
+methods.
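A structural sketch of auto-regressive rollout with a per-frame conditional diffusion model: each new pose is produced by a short reverse-diffusion loop conditioned on the previous pose. The denoiser interface and the toy noise schedule are simplifying assumptions, not the paper's sampler.

import torch

@torch.no_grad()
def rollout(denoiser, init_pose, n_frames, n_steps=50):
    # denoiser(x_t, t, prev_pose) -> predicted clean pose x0 (hypothetical interface)
    poses = [init_pose]
    for _ in range(n_frames):
        prev = poses[-1]
        x = torch.randn_like(prev)                         # start each frame from noise
        for t in reversed(range(n_steps)):
            x0_hat = denoiser(x, torch.tensor([t]), prev)  # condition on previous frame
            alpha = 1.0 - t / n_steps                      # toy schedule for illustration
            x = alpha * x0_hat + (1.0 - alpha) * torch.randn_like(x)
        poses.append(x)
    return torch.stack(poses, dim=0)                       # (n_frames + 1, ...)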
+
+
+
+
+ + ♻ ☆ Multimodal Emotion Recognition using Audio-Video Transformer Fusion with + Cross Attention + + +
+ Understanding emotions is a fundamental aspect of human communication. +Integrating audio and video signals offers a more comprehensive understanding +of emotional states compared to traditional methods that rely on a single data +source, such as speech or facial expressions. Despite its potential, multimodal +emotion recognition faces significant challenges, particularly in +synchronization, feature extraction, and fusion of diverse data sources. To +address these issues, this paper introduces a novel transformer-based model +named Audio-Video Transformer Fusion with Cross Attention (AVT-CA). The AVT-CA +model employs a transformer fusion approach to effectively capture and +synchronize interlinked features from both audio and video inputs, thereby +resolving synchronization problems. Additionally, the Cross Attention mechanism +within AVT-CA selectively extracts and emphasizes critical features while +discarding irrelevant ones from both modalities, addressing feature extraction +and fusion challenges. Extensive experimental analysis conducted on the +CMU-MOSEI, RAVDESS and CREMA-D datasets demonstrates the efficacy of the +proposed model. The results underscore the importance of AVT-CA in developing +precise and reliable multimodal emotion recognition systems for practical +applications. + +
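A minimal sketch of cross attention between audio and video token sequences using torch.nn.MultiheadAttention; the embedding size, head count, and mean-pooled fusion head are illustrative assumptions rather than the AVT-CA configuration.

import torch
import torch.nn as nn

class CrossModalAttention(nn.Module):
    def __init__(self, dim=256, heads=4):
        super().__init__()
        self.a2v = nn.MultiheadAttention(dim, heads, batch_first=True)
        self.v2a = nn.MultiheadAttention(dim, heads, batch_first=True)

    def forward(self, audio, video):
        # Each modality queries the other, emphasizing the features it finds
        # relevant and implicitly aligning the two streams in time.
        a, _ = self.a2v(query=audio, key=video, value=video)
        v, _ = self.v2a(query=video, key=audio, value=audio)
        return torch.cat([a.mean(dim=1), v.mean(dim=1)], dim=-1)  # pooled fusion

fused = CrossModalAttention()(torch.randn(2, 50, 256), torch.randn(2, 30, 256))
print(fused.shape)  # torch.Size([2, 512])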
+
+ comment: 38 Pages, 9 Tables, 12 Figures +
+
+
+
+
+ + ♻ ☆ Active Generation for Image Classification ECCV 2024 + + +
+ Recently, the growing capabilities of deep generative models have underscored
+their potential in enhancing image classification accuracy. However, existing
+methods often demand the generation of a disproportionately large number of
+images compared to the original dataset, while yielding only marginal
+improvements in accuracy. This computationally expensive and time-consuming
+process hampers the practicality of such approaches. In this paper, we propose
+to address the efficiency of image generation by focusing on the specific needs
+and characteristics of the model. With a central tenet of active learning, our
+method, named ActGen, takes a training-aware approach to image generation. It
+aims to create images akin to the challenging or misclassified samples
+encountered by the current model and incorporates these generated images into
+the training set to augment model performance. ActGen introduces an attentive
+image guidance technique, using real images as guides during the denoising
+process of a diffusion model. The model's attention on the class prompt is
+leveraged to ensure the preservation of a similar foreground object while
+diversifying the background. Furthermore, we introduce a gradient-based
+generation guidance method, which employs two losses to generate more
+challenging samples and prevent the generated images from being too similar to
+previously generated ones. Experimental results on the CIFAR and ImageNet
+datasets demonstrate that our method achieves better performance with a
+significantly reduced number of generated images. Code is available at
+https://github.com/hunto/ActGen.
+
+ comment: ECCV 2024 +
+
+
+
+
+ + ♻ ☆ Enhancing chest X-ray datasets with privacy-preserving large language + models and multi-type annotations: a data-driven approach for improved + classification + + +
+ In chest X-ray (CXR) image analysis, rule-based systems are usually employed +to extract labels from reports for dataset releases. However, there is still +room for improvement in label quality. These labelers typically output only +presence labels, sometimes with binary uncertainty indicators, which limits +their usefulness. Supervised deep learning models have also been developed for +report labeling but lack adaptability, similar to rule-based systems. In this +work, we present MAPLEZ (Medical report Annotations with Privacy-preserving +Large language model using Expeditious Zero shot answers), a novel approach +leveraging a locally executable Large Language Model (LLM) to extract and +enhance findings labels on CXR reports. MAPLEZ extracts not only binary labels +indicating the presence or absence of a finding but also the location, +severity, and radiologists' uncertainty about the finding. Over eight +abnormalities from five test sets, we show that our method can extract these +annotations with an increase of 3.6 percentage points (pp) in macro F1 score +for categorical presence annotations and more than 20 pp increase in F1 score +for the location annotations over competing labelers. Additionally, using the +combination of improved annotations and multi-type annotations in +classification supervision, we demonstrate substantial advancements in model +quality, with an increase of 1.1 pp in AUROC over models trained with +annotations from the best alternative approach. We share code and annotations. + +
+
+ comment: Code and data: + https://github.com/rsummers11/CADLab/tree/master/MAPLEZ_LLM_report_labeler/ +
+
+
+
+
+ + ♻ ☆ A Non-negative VAE:the Generalized Gamma Belief Network + + +
+ The gamma belief network (GBN), often regarded as a deep topic model, has +demonstrated its potential for uncovering multi-layer interpretable latent +representations in text data. Its notable capability to acquire interpretable +latent factors is partially attributed to sparse and non-negative +gamma-distributed latent variables. However, the existing GBN and its +variations are constrained by the linear generative model, thereby limiting +their expressiveness and applicability. To address this limitation, we +introduce the generalized gamma belief network (Generalized GBN) in this paper, +which extends the original linear generative model to a more expressive +non-linear generative model. Since the parameters of the Generalized GBN no +longer possess an analytic conditional posterior, we further propose an +upward-downward Weibull inference network to approximate the posterior +distribution of the latent variables. The parameters of both the generative +model and the inference network are jointly trained within the variational +inference framework. Finally, we conduct comprehensive experiments on both +expressivity and disentangled representation learning tasks to evaluate the +performance of the Generalized GBN against state-of-the-art Gaussian +variational autoencoders serving as baselines. + +
+
+
+
+
+ + ♻ ☆ MathScape: Evaluating MLLMs in multimodal Math Scenarios through a + Hierarchical Benchmark + + +
+ With the development of Multimodal Large Language Models (MLLMs), the
+evaluation of multimodal models in the context of mathematical problems has
+become a valuable research field. Multimodal visual-textual mathematical
+reasoning serves as a critical indicator for evaluating the comprehension and
+complex multi-step quantitative reasoning abilities of MLLMs. However, previous
+multimodal math benchmarks have not sufficiently integrated visual and textual
+information. To address this gap, we propose MathScape, a new benchmark that
+emphasizes the understanding and application of combined visual and textual
+information. MathScape is designed to evaluate photo-based math problem
+scenarios, assessing the theoretical understanding and application ability of
+MLLMs through a categorical hierarchical approach. We conduct a
+multi-dimensional evaluation on 11 advanced MLLMs, revealing that our benchmark
+is challenging even for the most sophisticated models. By analyzing the
+evaluation results, we identify the limitations of MLLMs, offering valuable
+insights for enhancing model performance.
+
+
+
+
+ + ♻ ☆ MetaSeg: MetaFormer-based Global Contexts-aware Network for Efficient + Semantic Segmentation WACV 2024 + + +
+ Beyond the Transformer, it is important to explore how to exploit the
+capacity of the MetaFormer, an architecture that is fundamental to the
+performance improvements of the Transformer. Previous studies have exploited it
+only for the backbone network. Unlike previous studies, we explore the capacity
+of the MetaFormer architecture more extensively in the semantic segmentation
+task. We propose a powerful semantic segmentation network, MetaSeg, which
+leverages the MetaFormer architecture from the backbone to the decoder. Our
+MetaSeg shows that the MetaFormer architecture plays a significant role in
+capturing the useful contexts for the decoder as well as for the backbone. In
+addition, recent segmentation methods have shown that using a CNN-based
+backbone for extracting the spatial information and a decoder for extracting
+the global information is more effective than using a transformer-based
+backbone with a CNN-based decoder. This motivates us to adopt the CNN-based
+backbone using the MetaFormer block and design our MetaFormer-based decoder,
+which consists of a novel self-attention module to capture the global contexts.
+To consider both the global contexts extraction and the computational
+efficiency of the self-attention for semantic segmentation, we propose a
+Channel Reduction Attention (CRA) module that reduces the channel dimension of
+the query and key to a single dimension. In this way, our proposed MetaSeg
+outperforms the previous state-of-the-art methods with more efficient
+computational costs on popular semantic segmentation and medical image
+segmentation benchmarks, including ADE20K, Cityscapes, COCO-stuff, and Synapse.
+The code is available at https://github.com/hyunwoo137/MetaSeg.
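A hedged sketch of the Channel Reduction Attention idea: the query and key are projected down to a single channel before the attention map is formed, shrinking the cost of the similarity computation while the value keeps the full width. Layer shapes and the scaling factor are assumptions based only on the abstract.

import torch
import torch.nn as nn

class ChannelReductionAttention(nn.Module):
    def __init__(self, dim):
        super().__init__()
        self.q = nn.Linear(dim, 1)      # query reduced to one channel
        self.k = nn.Linear(dim, 1)      # key reduced to one channel
        self.v = nn.Linear(dim, dim)    # value keeps the full channel dimension

    def forward(self, x):               # x: (B, N, C) tokens
        q, k, v = self.q(x), self.k(x), self.v(x)
        attn = torch.softmax(q @ k.transpose(1, 2) / x.size(-1) ** 0.5, dim=-1)
        return attn @ v                  # (B, N, C) globally mixed tokens

out = ChannelReductionAttention(64)(torch.randn(2, 196, 64))
print(out.shape)  # torch.Size([2, 196, 64])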
+
+ comment: Accepted by WACV 2024 +
+
+
+
+
+ + ♻ ☆ SAM-FNet: SAM-Guided Fusion Network for Laryngo-Pharyngeal Tumor + Detection + + +
+ Laryngo-pharyngeal cancer (LPC) is a highly fatal malignant disease affecting +the head and neck region. Previous studies on endoscopic tumor detection, +particularly those leveraging dual-branch network architectures, have shown +significant advancements in tumor detection. These studies highlight the +potential of dual-branch networks in improving diagnostic accuracy by +effectively integrating global and local (lesion) feature extraction. However, +they are still limited in their capabilities to accurately locate the lesion +region and capture the discriminative feature information between the global +and local branches. To address these issues, we propose a novel SAM-guided +fusion network (SAM-FNet), a dual-branch network for laryngo-pharyngeal tumor +detection. By leveraging the powerful object segmentation capabilities of the +Segment Anything Model (SAM), we introduce the SAM into the SAM-FNet to +accurately segment the lesion region. Furthermore, we propose a GAN-like +feature optimization (GFO) module to capture the discriminative features +between the global and local branches, enhancing the fusion feature +complementarity. Additionally, we collect two LPC datasets from the First +Affiliated Hospital (FAHSYSU) and the Sixth Affiliated Hospital (SAHSYSU) of +Sun Yat-sen University. The FAHSYSU dataset is used as the internal dataset for +training the model, while the SAHSYSU dataset is used as the external dataset +for evaluating the model's performance. Extensive experiments on both datasets +of FAHSYSU and SAHSYSU demonstrate that the SAM-FNet can achieve competitive +results, outperforming the state-of-the-art counterparts. The source code of +SAM-FNet is available at the URL of https://github.com/VVJia/SAM-FNet. + +
+
+
+
+
+ + ♻ ☆ SM4Depth: Seamless Monocular Metric Depth Estimation across Multiple + Cameras and Scenes by One Model + + +
+ In the last year, universal monocular metric depth estimation (universal
+MMDE) has gained considerable attention, serving as the foundation model for
+various multimedia tasks, such as video and image editing. Nonetheless, current
+approaches face challenges in maintaining consistent accuracy across diverse
+scenes without scene-specific parameters and pre-training, hindering the
+practicality of MMDE. Furthermore, these methods rely on extensive datasets
+comprising millions, if not tens of millions, of data for training, leading to
+significant time and hardware expenses. This paper presents SM$^4$Depth, a
+model that seamlessly works for both indoor and outdoor scenes, without needing
+extensive training data and GPU clusters. Firstly, to obtain consistent depth
+across diverse scenes, we propose a novel metric scale modeling, i.e.,
+variation-based unnormalized depth bins. It reduces the ambiguity of the
+conventional metric bins and enables better adaptation to large depth gaps of
+scenes during training. Secondly, we propose a "divide and conquer" solution to
+reduce reliance on massive training data. Instead of estimating directly from
+the vast solution space, the metric bins are estimated from multiple solution
+sub-spaces to reduce complexity. Additionally, we introduce an uncut depth
+dataset, BUPT Depth, to evaluate the depth accuracy and consistency across
+various indoor and outdoor scenes. Trained on a consumer-grade GPU using just
+150K RGB-D pairs, SM$^4$Depth achieves outstanding performance on most
+never-before-seen datasets, especially maintaining consistent accuracy across
+indoor and outdoor scenes. The code can be found at
+https://github.com/mRobotit/SM4Depth.
+
+ comment: Accepted by ACM MultiMedia 24, Project Page: + xuefeng-cvr.github.io/SM4Depth +
+
+
+
+
+ + ♻ ☆ Unsupervised Industrial Anomaly Detection via Pattern Generative and + Contrastive Networks + + +
+ It is hard to collect enough flaw images for training a deep learning network
+in industrial production. Therefore, existing industrial anomaly detection
+methods prefer to use CNN-based unsupervised detection and localization
+networks for this task. However, these methods often fail when variations
+appear in new signals, since traditional end-to-end networks struggle to fit
+nonlinear models in high-dimensional space. Moreover, they essentially build a
+memory library by clustering the features of normal images, which makes them
+not robust to texture changes. To this end, we propose a Vision Transformer
+based (ViT-based) unsupervised anomaly detection network. It utilizes
+hierarchical task learning and human experience to enhance its
+interpretability. Our network consists of a pattern generation network and a
+comparison network. The pattern generation network uses two ViT-based encoder
+modules to extract the features of two consecutive image patches, then uses a
+ViT-based decoder module to learn the human-designed style of these features
+and predict the third image patch. After this, we use a Siamese network to
+compute the similarity between the generated image patch and the original
+image patch. Finally, we refine the anomaly localization with a bi-directional
+inference strategy. Comparison experiments on the public MVTec dataset show
+that our method achieves 99.8% AUC, surpassing previous state-of-the-art
+methods. In addition, we give a qualitative illustration on our own leather and
+cloth datasets. The accurate segmentation results strongly demonstrate the
+accuracy of our method in anomaly detection.
+
+
+
+
+ + ♻ ☆ DIffSteISR: Harnessing Diffusion Prior for Superior Real-world Stereo + Image Super-Resolution + + +
+ We introduce DiffSteISR, a pioneering framework for reconstructing real-world +stereo images. DiffSteISR utilizes the powerful prior knowledge embedded in +pre-trained text-to-image model to efficiently recover the lost texture details +in low-resolution stereo images. Specifically, DiffSteISR implements a +time-aware stereo cross attention with temperature adapter (TASCATA) to guide +the diffusion process, ensuring that the generated left and right views exhibit +high texture consistency thereby reducing disparity error between the +super-resolved images and the ground truth (GT) images. Additionally, a stereo +omni attention control network (SOA ControlNet) is proposed to enhance the +consistency of super-resolved images with GT images in the pixel, perceptual, +and distribution space. Finally, DiffSteISR incorporates a stereo semantic +extractor (SSE) to capture unique viewpoint soft semantic information and +shared hard tag semantic information, thereby effectively improving the +semantic accuracy and consistency of the generated left and right images. +Extensive experimental results demonstrate that DiffSteISR accurately +reconstructs natural and precise textures from low-resolution stereo images +while maintaining a high consistency of semantic and texture between the left +and right views. + +
+
+
+
+
+ + ♻ ☆ ControlNeXt: Powerful and Efficient Control for Image and Video + Generation + + +
+ Diffusion models have demonstrated remarkable and robust abilities in both
+image and video generation. To achieve greater control over generated
+results, researchers introduce additional architectures, such as ControlNet,
+Adapters and ReferenceNet, to integrate conditioning controls. However,
+current controllable generation methods often require substantial additional
+computational resources, especially for video generation, and face
+challenges in training or exhibit weak control. In this paper, we propose
+ControlNeXt: a powerful and efficient method for controllable image and
+video generation. We first design a more straightforward and efficient
+architecture, replacing heavy additional branches with a concise design that
+adds minimal cost relative to the base model. Such a concise structure also
+allows our method to seamlessly integrate with other LoRA weights, enabling
+style alteration without the need for additional training. As for training,
+we reduce the number of learnable parameters by up to 90% compared to the
+alternatives. Furthermore, we propose another method called Cross
+Normalization (CN) as a replacement for Zero-Convolution to achieve fast and
+stable training convergence. We have conducted various experiments with
+different base models across images and videos, demonstrating the robustness
+of our method.
+
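+
+ One reading of the cross-normalization idea (hedged, since the abstract
+does not spell out the operator) is that the control branch's features are
+re-normalized with the statistics of the base branch before being fused. The
+sketch below is only an illustrative PyTorch interpretation, not the
+authors' implementation; all names are hypothetical.
+
+```python
+import torch
+
+def cross_normalize(base_feat: torch.Tensor, cond_feat: torch.Tensor,
+                    eps: float = 1e-5) -> torch.Tensor:
+    """Align the condition features' statistics with the base branch."""
+    dims = (0, 2, 3)  # both tensors assumed (batch, channels, H, W)
+    base_mean = base_feat.mean(dim=dims, keepdim=True)
+    base_std = base_feat.std(dim=dims, keepdim=True)
+    cond_mean = cond_feat.mean(dim=dims, keepdim=True)
+    cond_std = cond_feat.std(dim=dims, keepdim=True)
+    # Whiten the condition features, then re-scale with base statistics.
+    normalized = (cond_feat - cond_mean) / (cond_std + eps)
+    return normalized * base_std + base_mean
+
+# Hypothetical usage: inject the normalized control signal into the backbone.
+base = torch.randn(2, 64, 32, 32)
+cond = torch.randn(2, 64, 32, 32)
+fused = base + cross_normalize(base, cond)
+```
+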
+
+ comment: controllable generation +
+
+
+
+
+ + ♻ ☆ Hi-ResNet: Edge Detail Enhancement for High-Resolution Remote Sensing + Segmentation + + +
+ High-resolution remote sensing (HRS) semantic segmentation extracts key
+objects from high-resolution coverage areas. However, objects of the same
+category within HRS images generally show significant differences in scale
+and shape across diverse geographical environments, making it difficult to
+fit the data distribution. Additionally, a complex background environment
+causes objects of different categories to appear similar, which leads a
+substantial number of objects to be misclassified as background. These
+issues make existing learning algorithms sub-optimal. In this work, we solve
+the above-mentioned problems by proposing a High-resolution remote sensing
+network (Hi-ResNet) with an efficient network structure, which sequentially
+consists of a funnel module, a multi-branch module with stacks of
+information aggregation (IA) blocks, and a feature refinement module,
+together with a Class-agnostic Edge Aware (CEA) loss. Specifically, we
+propose a funnel module that downsamples the initial input image, reducing
+the computational cost, and extracts high-resolution semantic information.
+Secondly, we downsample the processed feature maps into multi-resolution
+branches incrementally to capture image features at different scales, and
+apply IA blocks, which capture key latent information by leveraging
+attention mechanisms, for effective feature aggregation, distinguishing
+image features of the same class with varying scales and shapes. Finally,
+our feature refinement module integrates the CEA loss function, which
+disambiguates inter-class objects with similar shapes and increases the data
+distribution distance for correct predictions. With effective pre-training
+strategies, we demonstrate the superiority of Hi-ResNet over
+state-of-the-art methods on three HRS segmentation benchmarks.
+
+
+
+
+
+ + ♻ ☆ Model Merging in LLMs, MLLMs, and Beyond: Methods, Theories, + Applications and Opportunities + + +
+ Model merging is an efficient empowerment technique in the machine learning +community that does not require the collection of raw training data and does +not require expensive computation. As model merging becomes increasingly +prevalent across various fields, it is crucial to understand the available +model merging techniques comprehensively. However, there is a significant gap +in the literature regarding a systematic and thorough review of these +techniques. This survey provides a comprehensive overview of model merging +methods and theories, their applications in various domains and settings, and +future research directions. Specifically, we first propose a new taxonomic +approach that exhaustively discusses existing model merging methods. Secondly, +we discuss the application of model merging techniques in large language +models, multimodal large language models, and 10+ machine learning subfields, +including continual learning, multi-task learning, few-shot learning, etc. +Finally, we highlight the remaining challenges of model merging and discuss +future research directions. A comprehensive list of papers about model merging +is available at +\url{https://github.com/EnnengYang/Awesome-Model-Merging-Methods-Theories-Applications}. + +
+
+
+
+
+ + ♻ ☆ VCD-Texture: Variance Alignment based 3D-2D Co-Denoising for Text-Guided + Texturing ECCV 2024 + + +
+ Recent research on texture synthesis for 3D shapes benefits greatly from
+the rapid development of 2D text-to-image diffusion models, including
+inpainting-based and optimization-based approaches. However, these methods,
+which primarily render 3D objects into 2D images and texture each image
+separately, ignore the modal gap between the 2D diffusion model and 3D
+objects. In this paper, we revisit texture synthesis and propose a Variance
+alignment based 3D-2D Collaborative Denoising framework, dubbed VCD-Texture,
+to address these issues. Formally, we first unify both 2D and 3D latent
+feature learning in diffusion self-attention modules with re-projected 3D
+attention receptive fields. Subsequently, the denoised multi-view 2D latent
+features are aggregated into 3D space and then rasterized back to formulate
+more consistent 2D predictions. However, the rasterization process suffers
+from an intractable variance bias, which is theoretically addressed by the
+proposed variance alignment, achieving high-fidelity texture synthesis.
+Moreover, we present an inpainting refinement to further improve the details
+in conflicting regions. Notably, there is no publicly available benchmark to
+evaluate texture synthesis, which hinders its development. Thus we construct
+a new evaluation set built upon three open-source 3D datasets and propose to
+use four metrics to thoroughly validate the texturing performance.
+Comprehensive experiments demonstrate that VCD-Texture achieves superior
+performance over its counterparts.
+
+
+ comment: ECCV 2024 +
+
+
+
+
+ + ♻ ☆ Dual-View Pyramid Pooling in Deep Neural Networks for Improved Medical + Image Classification and Confidence Calibration + + +
+ Spatial pooling (SP) and cross-channel pooling (CCP) operators have been
+applied to aggregate spatial features and pixel-wise features from feature
+maps in deep neural networks (DNNs), respectively. Their main goal is to
+reduce computation and memory overhead without visibly weakening the
+performance of DNNs. However, SP often loses subtle feature representations,
+while CCP is prone to ignoring salient feature representations, which may
+lead to both confidence miscalibration and suboptimal medical classification
+results. To address these problems, we propose a novel dual-view framework,
+the first to systematically investigate the relative roles of SP and CCP by
+analyzing the difference between spatial features and pixel-wise features.
+Based on this framework, we propose a new pooling method, termed dual-view
+pyramid pooling (DVPP), to aggregate multi-scale dual-view features. DVPP
+aims to boost both medical image classification and confidence calibration
+performance by fully leveraging the merits of SP and CCP operators from a
+dual-axis perspective. Additionally, we discuss how to realize DVPP with
+five parameter-free implementations. Extensive experiments on six 2D/3D
+medical image classification tasks show that our DVPP surpasses
+state-of-the-art pooling methods in terms of medical image classification
+results and confidence calibration across different DNNs.
+
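+
+ For readers unfamiliar with the two operators, the following toy snippet
+illustrates the generic difference between spatial pooling and cross-channel
+pooling on a feature map; it is only the textbook definition of the two
+views, not the authors' DVPP implementation.
+
+```python
+import torch
+
+feat = torch.randn(8, 256, 14, 14)  # (batch, channels, height, width)
+
+# Spatial pooling (SP): aggregate over the spatial axes,
+# keeping one value per channel -> (batch, channels).
+sp = feat.mean(dim=(2, 3))
+
+# Cross-channel pooling (CCP): aggregate over the channel axis,
+# keeping one value per spatial location -> (batch, height, width).
+ccp = feat.mean(dim=1)
+
+print(sp.shape, ccp.shape)  # torch.Size([8, 256]) torch.Size([8, 14, 14])
+```
+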
+
+ comment: 30 +
+
+
+
+
+ + ♻ ☆ Lane Graph as Path: Continuity-preserving Path-wise Modeling for Online + Lane Graph Construction ECCV 2024 + + +
+ Online lane graph construction is a promising but challenging task in +autonomous driving. Previous methods usually model the lane graph at the pixel +or piece level, and recover the lane graph by pixel-wise or piece-wise +connection, which breaks down the continuity of the lane and results in +suboptimal performance. Human drivers focus on and drive along the continuous +and complete paths instead of considering lane pieces. Autonomous vehicles also +require path-specific guidance from lane graph for trajectory planning. We +argue that the path, which indicates the traffic flow, is the primitive of the +lane graph. Motivated by this, we propose to model the lane graph in a novel +path-wise manner, which well preserves the continuity of the lane and encodes +traffic information for planning. We present a path-based online lane graph +construction method, termed LaneGAP, which end-to-end learns the path and +recovers the lane graph via a Path2Graph algorithm. We qualitatively and +quantitatively demonstrate the superior accuracy and efficiency of LaneGAP over +conventional pixel-based and piece-based methods on the challenging nuScenes +and Argoverse2 datasets under controllable and fair conditions. Compared to the +recent state-of-the-art piece-wise method TopoNet on the OpenLane-V2 dataset, +LaneGAP still outperforms by 1.6 mIoU, further validating the effectiveness of +path-wise modeling. Abundant visualizations in the supplementary material show +LaneGAP can cope with diverse traffic conditions. Code is released at +\url{https://github.com/hustvl/LaneGAP}. + +
+
+ comment: Accepted to ECCV 2024 +
+
+
+
+
+ + ♻ ☆ Uncertainty Quantification using Variational Inference for Biomedical + Image Segmentation + + +
+ Deep learning, driven by convolutional neural networks, has been highly
+successful in a range of medical imaging problems such as image
+classification, image segmentation, and image synthesis. However, for
+validation and interpretability, we need not only the predictions made by
+the model but also a measure of how confident it is while making those
+predictions. This is important for acceptance in safety-critical
+applications. In this work, we use an encoder-decoder architecture based on
+variational inference techniques for segmenting brain tumour images. We
+evaluate our work on the publicly available BRATS dataset using Dice
+Similarity Coefficient (DSC) and Intersection Over Union (IOU) as the
+evaluation metrics. Our model is able to segment brain tumours while taking
+into account both aleatoric uncertainty and epistemic uncertainty in a
+principled Bayesian manner.
+
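+
+ As a rough illustration of how the two kinds of uncertainty can be
+separated with repeated stochastic forward passes, consider the generic
+Monte Carlo recipe below; it is a common approximation, not the authors'
+specific encoder-decoder, and the model is assumed to keep its stochastic
+layers active at inference time.
+
+```python
+import torch
+
+def mc_uncertainty(model, image, n_samples=20):
+    """Decompose predictive uncertainty from stochastic forward passes.
+
+    Assumes model(image) returns per-pixel class probabilities of shape
+    (batch, classes, H, W) and that stochastic layers (dropout or sampled
+    variational weights) remain active, e.g. via model.train().
+    """
+    model.train()  # keep stochastic layers active
+    with torch.no_grad():
+        probs = torch.stack([model(image) for _ in range(n_samples)])
+
+    mean_probs = probs.mean(dim=0)
+    # Predictive entropy: total uncertainty.
+    total = -(mean_probs * mean_probs.clamp_min(1e-8).log()).sum(dim=1)
+    # Mean per-sample entropy: aleatoric part.
+    aleatoric = -(probs * probs.clamp_min(1e-8).log()).sum(dim=2).mean(dim=0)
+    # Mutual information: epistemic part.
+    epistemic = total - aleatoric
+    return mean_probs, aleatoric, epistemic
+```
+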
+
+
+
+
+ + ♻ ☆ Generative Adversarial Networks for Weakly Supervised Generation and + Evaluation of Brain Tumor Segmentations on MR Images + + +
+ Segmentation of regions of interest (ROIs) for identifying abnormalities is a +leading problem in medical imaging. Using machine learning for this problem +generally requires manually annotated ground-truth segmentations, demanding +extensive time and resources from radiologists. This work presents a weakly +supervised approach that utilizes binary image-level labels, which are much +simpler to acquire, to effectively segment anomalies in 2D magnetic resonance +images without ground truth annotations. We train a generative adversarial +network (GAN) that converts cancerous images to healthy variants, which are +used along with localization seeds as priors to generate improved weakly +supervised segmentations. The non-cancerous variants can also be used to +evaluate the segmentations in a weakly supervised fashion, which allows for the +most effective segmentations to be identified and then applied to downstream +clinical classification tasks. On the Multimodal Brain Tumor Segmentation +(BraTS) 2020 dataset, our proposed method generates and identifies +segmentations that achieve test Dice coefficients of 83.91%. Using these +segmentations for pathology classification results with a test AUC of 93.32% +which is comparable to the test AUC of 95.80% achieved when using true +segmentations. + +
+
+
+
+
+ + ♻ ☆ Touch-GS: Visual-Tactile Supervised 3D Gaussian Splatting + + +
+ In this work, we propose a novel method to supervise 3D Gaussian Splatting +(3DGS) scenes using optical tactile sensors. Optical tactile sensors have +become widespread in their use in robotics for manipulation and object +representation; however, raw optical tactile sensor data is unsuitable to +directly supervise a 3DGS scene. Our representation leverages a Gaussian +Process Implicit Surface to implicitly represent the object, combining many +touches into a unified representation with uncertainty. We merge this model +with a monocular depth estimation network, which is aligned in a two stage +process, coarsely aligning with a depth camera and then finely adjusting to +match our touch data. For every training image, our method produces a +corresponding fused depth and uncertainty map. Utilizing this additional +information, we propose a new loss function, variance weighted depth supervised +loss, for training the 3DGS scene model. We leverage the DenseTact optical +tactile sensor and RealSense RGB-D camera to show that combining touch and +vision in this manner leads to quantitatively and qualitatively better results +than vision or touch alone in a few-view scene syntheses on opaque as well as +on reflective and transparent objects. Please see our project page at +http://armlabstanford.github.io/touch-gs + +
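+
+ A minimal sketch of what a variance-weighted depth supervision term can
+look like, given the fused depth and uncertainty maps described above; the
+exact weighting used by the authors may differ, and all names here are
+illustrative.
+
+```python
+import torch
+
+def variance_weighted_depth_loss(pred_depth, fused_depth, fused_var,
+                                 eps=1e-6):
+    """Penalize depth error less where the fused depth is uncertain."""
+    weights = 1.0 / (fused_var + eps)      # low variance -> high weight
+    sq_err = (pred_depth - fused_depth) ** 2
+    return (weights * sq_err).mean()
+
+# Hypothetical usage with per-pixel maps of shape (H, W):
+pred = torch.rand(480, 640)
+fused = torch.rand(480, 640)
+var = torch.rand(480, 640) * 0.1
+loss = variance_weighted_depth_loss(pred, fused, var)
+```
+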
+
+ comment: 8 pages, 7 figures +
+
+
+
+
+ + ♻ ☆ Out-of-Distribution Detection via Deep Multi-Comprehension Ensemble ICML 2024 + + +
+ Recent research underscores the pivotal role of the Out-of-Distribution (OOD) +feature representation field scale in determining the efficacy of models in OOD +detection. Consequently, the adoption of model ensembles has emerged as a +prominent strategy to augment this feature representation field, capitalizing +on anticipated model diversity. + However, our introduction of novel qualitative and quantitative model +ensemble evaluation methods, specifically Loss Basin/Barrier Visualization and +the Self-Coupling Index, reveals a critical drawback in existing ensemble +methods. We find that these methods incorporate weights that are +affine-transformable, exhibiting limited variability and thus failing to +achieve the desired diversity in feature representation. + To address this limitation, we elevate the dimensions of traditional model +ensembles, incorporating various factors such as different weight +initializations, data holdout, etc., into distinct supervision tasks. This +innovative approach, termed Multi-Comprehension (MC) Ensemble, leverages +diverse training tasks to generate distinct comprehensions of the data and +labels, thereby extending the feature representation field. + Our experimental results demonstrate the superior performance of the MC +Ensemble strategy in OOD detection compared to both the naive Deep Ensemble +method and a standalone model of comparable size. This underscores the +effectiveness of our proposed approach in enhancing the model's capability to +detect instances outside its training distribution. + +
+
+ comment: ICML 2024 +
+
+
+
+
+ + ♻ ☆ MxT: Mamba x Transformer for Image Inpainting + + +
+ Image inpainting, or image completion, is a crucial task in computer vision +that aims to restore missing or damaged regions of images with semantically +coherent content. This technique requires a precise balance of local texture +replication and global contextual understanding to ensure the restored image +integrates seamlessly with its surroundings. Traditional methods using +Convolutional Neural Networks (CNNs) are effective at capturing local patterns +but often struggle with broader contextual relationships due to the limited +receptive fields. Recent advancements have incorporated transformers, +leveraging their ability to understand global interactions. However, these +methods face computational inefficiencies and struggle to maintain fine-grained +details. To overcome these challenges, we introduce MxT composed of the +proposed Hybrid Module (HM), which combines Mamba with the transformer in a +synergistic manner. Mamba is adept at efficiently processing long sequences +with linear computational costs, making it an ideal complement to the +transformer for handling long-scale data interactions. Our HM facilitates +dual-level interaction learning at both pixel and patch levels, greatly +enhancing the model to reconstruct images with high quality and contextual +accuracy. We evaluate MxT on the widely-used CelebA-HQ and Places2-standard +datasets, where it consistently outperformed existing state-of-the-art methods. +The code will be released: {\url{https://github.com/ChrisChen1023/MxT}}. + +
+
+
+
+
+ + ♻ ☆ Hyperparameters in Continual Learning: A Reality Check + + +
+ In this paper, we argue that the conventional evaluation protocol in
+continual learning (CL) research deviates from the fundamental principle in
+machine learning evaluation. The primary objective of a CL algorithm is to
+balance the trade-off between plasticity (learning new knowledge from new
+tasks) and stability (retaining knowledge from previous tasks). To evaluate
+it, a CL scenario is constructed by using a benchmark dataset, where a
+neural network model is continually trained on the training data of each
+task, and the best hyperparameters for a CL algorithm are selected based on
+validation data. The final evaluation involves assessing the model trained
+with these hyperparameters on the test data from the same scenario. This
+evaluation protocol primarily aims to assess how well a CL algorithm
+performs on unseen data within that specific scenario. However, to
+accurately evaluate the CL algorithm, the focus should be on assessing the
+generalizability of each algorithm's CL capacity to handle unseen scenarios.
+To achieve this evaluation goal, we propose a revised evaluation protocol.
+Our protocol consists of two phases: hyperparameter tuning and evaluation.
+Both phases share the same scenario configuration (e.g., the number of
+tasks), but the scenarios for each phase are generated from different
+datasets. During the hyperparameter tuning phase, the best hyperparameters
+are identified, which are then used to train the model using the CL
+algorithm in the evaluation phase. Finally, the result from this phase is
+reported as the final evaluation. We apply the proposed evaluation protocol
+to class-incremental learning algorithms, both with and without a pretrained
+model. Through extensive experiments involving approximately 5000 trials, we
+demonstrate that most state-of-the-art algorithms fail to exhibit the
+reported performance, revealing a lack of generalizability.
+
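+
+ In pseudo-code terms, the revised protocol reduces to tuning on scenarios
+built from one dataset and reporting results on scenarios built from
+another. The sketch below only captures that control flow; the callables it
+receives (scenario construction, training, validation, testing) are supplied
+by the experimenter and are not part of the paper.
+
+```python
+def revised_cl_evaluation(build_scenario, train_continually, validate, test,
+                          tuning_dataset, eval_dataset,
+                          hyperparameter_grid, num_tasks):
+    """Two-phase protocol: tune on one dataset, evaluate on another."""
+    # Phase 1: hyperparameter tuning on scenarios from the tuning dataset.
+    tuning_scenario = build_scenario(tuning_dataset, num_tasks)
+    best_hp, best_score = None, float("-inf")
+    for hp in hyperparameter_grid:
+        model = train_continually(tuning_scenario, hp)
+        score = validate(model, tuning_scenario)
+        if score > best_score:
+            best_hp, best_score = hp, score
+
+    # Phase 2: evaluation on scenarios generated from a different dataset,
+    # sharing the same configuration (e.g., number of tasks).
+    eval_scenario = build_scenario(eval_dataset, num_tasks)
+    final_model = train_continually(eval_scenario, best_hp)
+    return test(final_model, eval_scenario)  # reported as the final result
+```
+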
+
+ comment: Preprint +
+
+
+
+
+ + ♻ ☆ FedRobo: Federated Learning Driven Autonomous Inter Robots Communication + For Optimal Chemical Sprays + + +
+ Federated Learning enables robots to learn from each other's experiences +without relying on centralized data collection. Each robot independently +maintains a model of crop conditions and chemical spray effectiveness, which is +periodically shared with other robots in the fleet. A communication protocol is +designed to optimize chemical spray applications by facilitating the exchange +of information about crop conditions, weather, and other critical factors. The +federated learning algorithm leverages this shared data to continuously refine +the chemical spray strategy, reducing waste and improving crop yields. This +approach has the potential to revolutionize the agriculture industry by +offering a scalable and efficient solution for crop protection. However, +significant challenges remain, including the development of a secure and robust +communication protocol, the design of a federated learning algorithm that +effectively integrates data from multiple sources, and ensuring the safety and +reliability of autonomous robots. The proposed cluster-based federated learning +approach also effectively reduces the computational load on the global server +and minimizes communication overhead among clients. + +
+
+ comment: This research article is going to be submitted to a best-fit + conference. We are looking for a conference +
+
+
+
+
+ + ♻ ☆ Personalizing Federated Instrument Segmentation with Visual Trait Priors + in Robotic Surgery + + +
+ Personalized federated learning (PFL) for surgical instrument segmentation +(SIS) is a promising approach. It enables multiple clinical sites to +collaboratively train a series of models in privacy, with each model tailored +to the individual distribution of each site. Existing PFL methods rarely +consider the personalization of multi-headed self-attention, and do not account +for appearance diversity and instrument shape similarity, both inherent in +surgical scenes. We thus propose PFedSIS, a novel PFL method with visual trait +priors for SIS, incorporating global-personalized disentanglement (GPD), +appearance-regulation personalized enhancement (APE), and shape-similarity +global enhancement (SGE), to boost SIS performance in each site. GPD represents +the first attempt at head-wise assignment for multi-headed self-attention +personalization. To preserve the unique appearance representation of each site +and gradually leverage the inter-site difference, APE introduces appearance +regulation and provides customized layer-wise aggregation solutions via +hypernetworks for each site's personalized parameters. The mutual shape +information of instruments is maintained and shared via SGE, which enhances the +cross-style shape consistency on the image level and computes the +shape-similarity contribution of each site on the prediction level for updating +the global parameters. PFedSIS outperforms state-of-the-art methods with +1.51% +Dice, +2.11% IoU, -2.79 ASSD, -15.55 HD95 performance gains. The corresponding +code and models will be released at https://github.com/wzjialang/PFedSIS. + +
+
+ comment: 9 pages, 3 figures, under review +
+
+
+
+
+ + ♻ ☆ SATO: Stable Text-to-Motion Framework + + +
+ Is the Text to Motion model robust? Recent advancements in Text to Motion +models primarily stem from more accurate predictions of specific actions. +However, the text modality typically relies solely on pre-trained Contrastive +Language-Image Pretraining (CLIP) models. Our research has uncovered a +significant issue with the text-to-motion model: its predictions often exhibit +inconsistent outputs, resulting in vastly different or even incorrect poses +when presented with semantically similar or identical text inputs. In this +paper, we undertake an analysis to elucidate the underlying causes of this +instability, establishing a clear link between the unpredictability of model +outputs and the erratic attention patterns of the text encoder module. +Consequently, we introduce a formal framework aimed at addressing this issue, +which we term the Stable Text-to-Motion Framework (SATO). SATO consists of +three modules, each dedicated to stable attention, stable prediction, and +maintaining a balance between accuracy and robustness trade-off. We present a +methodology for constructing an SATO that satisfies the stability of attention +and prediction. To verify the stability of the model, we introduced a new +textual synonym perturbation dataset based on HumanML3D and KIT-ML. Results +show that SATO is significantly more stable against synonyms and other slight +perturbations while keeping its high accuracy performance. + +
+
+
+
+
+ + ♻ ☆ Brain Tumor Segmentation (BraTS) Challenge 2024: Meningioma Radiotherapy + Planning Automated Segmentation + + +
+ The 2024 Brain Tumor Segmentation Meningioma Radiotherapy (BraTS-MEN-RT) +challenge aims to advance automated segmentation algorithms using the largest +known multi-institutional dataset of radiotherapy planning brain MRIs with +expert-annotated target labels for patients with intact or postoperative +meningioma that underwent either conventional external beam radiotherapy or +stereotactic radiosurgery. Each case includes a defaced 3D post-contrast +T1-weighted radiotherapy planning MRI in its native acquisition space, +accompanied by a single-label "target volume" representing the gross tumor +volume (GTV) and any at-risk postoperative site. Target volume annotations +adhere to established radiotherapy planning protocols, ensuring consistency +across cases and institutions. For preoperative meningiomas, the target volume +encompasses the entire GTV and associated nodular dural tail, while for +postoperative cases, it includes at-risk resection cavity margins as determined +by the treating institution. Case annotations were reviewed and approved by +expert neuroradiologists and radiation oncologists. Participating teams will +develop, containerize, and evaluate automated segmentation models using this +comprehensive dataset. Model performance will be assessed using an adapted +lesion-wise Dice Similarity Coefficient and the 95% Hausdorff distance. The +top-performing teams will be recognized at the Medical Image Computing and +Computer Assisted Intervention Conference in October 2024. BraTS-MEN-RT is +expected to significantly advance automated radiotherapy planning by enabling +precise tumor segmentation and facilitating tailored treatment, ultimately +improving patient outcomes. + +
+
+ comment: 14 pages, 9 figures, 1 table +
+
+
+
+
+ + ♻ ☆ UniFed: A Universal Federation of a Mixture of Highly Heterogeneous + Medical Image Classification Tasks MICCAI 2024 + + +
+ A fundamental challenge in federated learning lies in mixing heterogeneous +datasets and classification tasks while minimizing the high communication cost +caused by clients as well as the exchange of weight updates with the server +over a fixed number of rounds. This results in divergent model convergence +rates and performance, which may hinder their deployment in precision medicine. +In real-world scenarios, client data is collected from different hospitals with +extremely varying components (e.g., imaging modality, organ type, etc). +Previous studies often overlooked the convoluted heterogeneity during the +training stage where the target learning tasks vary across clients as well as +the dataset type and their distributions. To address such limitations, we +unprecedentedly introduce UniFed, a universal federated learning paradigm that +aims to classify any disease from any imaging modality. UniFed also handles the +issue of varying convergence times in the client-specific optimization based on +the complexity of their learning tasks. Specifically, by dynamically adjusting +both local and global models, UniFed considers the varying task complexities of +clients and the server, enhancing its adaptability to real-world scenarios, +thereby mitigating issues related to overtraining and excessive communication. +Furthermore, our framework incorporates a sequential model transfer mechanism +that takes into account the diverse tasks among hospitals and a dynamic +task-complexity based ordering. We demonstrate the superiority of our framework +in terms of accuracy, communication cost, and convergence time over relevant +benchmarks in diagnosing retina, histopathology, and liver tumour diseases +under federated learning. Our UniFed code is available at +https://github.com/basiralab/UniFed. + +
+
+ comment: MLMI@MICCAI 2024 +
+
+
+
+
+ + ♻ ☆ GLObal Building heights for Urban Studies (UT-GLOBUS) for city- and + street- scale urban simulations: Development and first applications + + +
+ We introduce University of Texas - Global Building heights for Urban Studies +(UT-GLOBUS), a dataset providing building heights and urban canopy parameters +(UCPs) for more than 1200 cities or locales worldwide. UT-GLOBUS combines +open-source spaceborne altimetry (ICESat-2 and GEDI) and coarse-resolution +urban canopy elevation data with a machine-learning model to estimate +building-level information. Validation using LiDAR data from six US cities +showed UT-GLOBUS-derived building heights had a root mean squared error (RMSE) +of 9.1 meters. Validation of mean building heights within 1-km^2 grid cells, +including data from Hamburg and Sydney, resulted in an RMSE of 7.8 meters. +Testing the UCPs in the urban Weather Research and Forecasting (WRF-Urban) +model resulted in a significant improvement (55% in RMSE) in intra-urban air +temperature representation compared to the existing table-based local climate +zone approach in Houston, TX. Additionally, we demonstrated the dataset's +utility for simulating heat mitigation strategies and building energy +consumption using WRF-Urban, with test cases in Chicago, IL, and Austin, TX. +Street-scale mean radiant temperature simulations using the Solar and LongWave +Environmental Irradiance Geometry (SOLWEIG) model, incorporating UT-GLOBUS and +LiDAR-derived building heights, confirmed the dataset's effectiveness in +modeling human thermal comfort in Baltimore, MD (daytime RMSE = 2.85 C). Thus, +UT-GLOBUS can be used for modeling urban hazards with significant socioeconomic +and biometeorological risks, enabling finer scale urban climate simulations and +overcoming previous limitations due to the lack of building information. + +
+
+ comment: 20 pages, 10 figures +
+
+
+
+
+
+
+
+ + Information Retrieval 15 + +
+
+
+ + ☆ DaRec: A Disentangled Alignment Framework for Large Language Model and + Recommender System + + +
+ Benefiting from their strong reasoning capabilities, large language models
+(LLMs) have demonstrated remarkable performance in recommender systems.
+Various efforts have been made to distill knowledge from LLMs to enhance
+collaborative models, employing techniques like contrastive learning for
+representation alignment. In this work, we prove, based on an
+information-theoretic analysis, that directly aligning the representations
+of LLMs and collaborative models is sub-optimal for enhancing downstream
+recommendation task performance. Consequently, the challenge of effectively
+aligning semantic representations between collaborative models and LLMs
+remains unresolved. Inspired by this viewpoint, we propose a novel
+plug-and-play alignment framework for LLMs and collaborative models.
+Specifically, we first disentangle the latent representations of both LLMs
+and collaborative models into specific and shared components via projection
+layers and representation regularization. Subsequently, we perform both
+global and local structure alignment on the shared representations to
+facilitate knowledge transfer. Additionally, we theoretically prove that the
+specific and shared representations contain more pertinent and less
+irrelevant information, which can enhance the effectiveness of downstream
+recommendation tasks. Extensive experimental results on benchmark datasets
+demonstrate that our method is superior to existing state-of-the-art
+algorithms.
+
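+
+ A bare-bones sketch of the disentanglement step, splitting each side's
+embedding into shared and specific parts with projection heads; the layer
+sizes and regularizers below are illustrative stand-ins, not the paper's
+exact architecture or losses.
+
+```python
+import torch
+import torch.nn as nn
+
+class DisentangledProjector(nn.Module):
+    """Split an input embedding into a shared and a specific component."""
+
+    def __init__(self, in_dim: int, out_dim: int):
+        super().__init__()
+        self.shared_head = nn.Linear(in_dim, out_dim)
+        self.specific_head = nn.Linear(in_dim, out_dim)
+
+    def forward(self, x):
+        return self.shared_head(x), self.specific_head(x)
+
+llm_proj = DisentangledProjector(in_dim=4096, out_dim=128)
+cf_proj = DisentangledProjector(in_dim=64, out_dim=128)
+
+llm_emb = torch.randn(32, 4096)  # item embeddings from the LLM side
+cf_emb = torch.randn(32, 64)     # item embeddings from the collaborative model
+
+llm_shared, llm_specific = llm_proj(llm_emb)
+cf_shared, cf_specific = cf_proj(cf_emb)
+
+# Illustrative regularizers: pull shared parts together across the two sides,
+# push shared and specific parts of the same side apart.
+align_loss = (llm_shared - cf_shared).pow(2).mean()
+sep_loss = torch.cosine_similarity(llm_shared, llm_specific, dim=-1).abs().mean()
+loss = align_loss + 0.1 * sep_loss
+```
+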
+
+
+
+
+ + ☆ Modeling Domain and Feedback Transitions for Cross-Domain Sequential + Recommendation + + +
+ Nowadays, many recommender systems encompass various domains to cater to +users' diverse needs, leading to user behaviors transitioning across different +domains. In fact, user behaviors across different domains reveal changes in +preference toward recommended items. For instance, a shift from negative +feedback to positive feedback indicates improved user satisfaction. However, +existing cross-domain sequential recommendation methods typically model user +interests by focusing solely on information about domain transitions, often +overlooking the valuable insights provided by users' feedback transitions. In +this paper, we propose $\text{Transition}^2$, a novel method to model +transitions across both domains and types of user feedback. Specifically, +$\text{Transition}^2$ introduces a transition-aware graph encoder based on user +history, assigning different weights to edges according to the feedback type. +This enables the graph encoder to extract historical embeddings that capture +the transition information between different domains and feedback types. +Subsequently, we encode the user history using a cross-transition multi-head +self-attention, incorporating various masks to distinguish different types of +transitions. Finally, we integrate these modules to make predictions across +different domains. Experimental results on two public datasets demonstrate the +effectiveness of $\text{Transition}^2$. + +
+
+
+
+
+
+ ☆ LLM4DSR: Leveraging Large Language Model for Denoising Sequential
+ Recommendation
+
+
+ Sequential recommendation systems fundamentally rely on users' historical
+interaction sequences, which are often contaminated by noisy interactions.
+Identifying these noisy interactions accurately without additional
+information is particularly difficult due to the lack of explicit
+supervisory signals to denote noise. Large Language Models (LLMs), equipped
+with extensive open knowledge and semantic reasoning abilities, present a
+promising avenue to bridge this information gap. However, employing LLMs for
+denoising in sequential recommendation introduces notable challenges: 1)
+Direct application of pretrained LLMs may not be competent for the denoising
+task, frequently generating nonsensical responses; 2) Even after
+fine-tuning, the reliability of LLM outputs remains questionable, especially
+given the complexity of the task and the inherent hallucination issue of
+LLMs.
+ To tackle these challenges, we propose LLM4DSR, a tailored approach for
+denoising sequential recommendation using LLMs. We constructed a
+self-supervised fine-tuning task to activate LLMs' capabilities to identify
+noisy items and suggest replacements. Furthermore, we developed an
+uncertainty estimation module that ensures only high-confidence responses
+are utilized for sequence corrections. Remarkably, LLM4DSR is
+model-agnostic, allowing the corrected sequences to be flexibly applied
+across various recommendation models. Extensive experiments validate the
+superiority of LLM4DSR over existing methods across three datasets and three
+recommendation backbones.
+
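+
+ A simplified sketch of the uncertainty-gated correction idea: an
+LLM-suggested replacement is applied only when its confidence clears a
+threshold. The scoring callable here is a stand-in for a fine-tuned LLM plus
+uncertainty estimator, which the abstract does not specify in detail.
+
+```python
+def correct_sequence(sequence, flag_and_suggest, confidence_threshold=0.8):
+    """Apply LLM-suggested replacements only for high-confidence flags.
+
+    `sequence` is a list of item ids; `flag_and_suggest(seq, pos)` is any
+    callable returning (is_noisy, replacement_item, confidence) for the
+    item at position `pos`.
+    """
+    corrected = list(sequence)
+    for pos in range(len(sequence)):
+        is_noisy, replacement, confidence = flag_and_suggest(sequence, pos)
+        if is_noisy and confidence >= confidence_threshold:
+            corrected[pos] = replacement
+    return corrected  # usable by any downstream recommendation model
+```
+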
+
+
+
+
+ + ☆ From Clicks to Carbon: The Environmental Toll of Recommender Systems + + +
+ As global warming soars, evaluating the environmental impact of research is +more critical now than ever before. However, we find that few to no recommender +systems research papers document their impact on the environment. Consequently, +in this paper, we conduct a comprehensive analysis of the environmental impact +of recommender system research by reproducing a characteristic recommender +systems experimental pipeline. We focus on estimating the carbon footprint of +recommender systems research papers, highlighting the evolution of the +environmental impact of recommender systems research experiments over time. We +thoroughly evaluated all 79 full papers from the ACM RecSys conference in the +years 2013 and 2023 to analyze representative experimental pipelines for papers +utilizing traditional, so-called good old-fashioned AI algorithms and deep +learning algorithms, respectively. We reproduced these representative +experimental pipelines, measured electricity consumption using a hardware +energy meter, and converted the measured energy consumption into CO2 +equivalents to estimate the environmental impact. Our results show that a +recommender systems research paper utilizing deep learning algorithms emits +approximately 42 times more CO2 equivalents than a paper utilizing traditional +algorithms. Furthermore, on average, such a paper produces 3,297 kilograms of +CO2 equivalents, which is more than one person produces by flying from New York +City to Melbourne or the amount one tree sequesters in 300 years. + +
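+
+ The conversion behind such estimates is plain arithmetic: measured energy
+in kWh multiplied by a grid carbon-intensity factor, summed over the runs in
+a paper's pipeline. The numbers below are illustrative placeholders, not the
+values measured in the paper.
+
+```python
+# Illustrative conversion from measured energy to CO2 equivalents.
+energy_kwh_per_run = 120.0   # hypothetical energy-meter reading for one run
+grid_intensity = 0.4         # hypothetical kg CO2e per kWh for the local grid
+runs_per_paper = 50          # hypothetical number of experimental runs
+
+co2e_per_run_kg = energy_kwh_per_run * grid_intensity
+co2e_per_paper_kg = co2e_per_run_kg * runs_per_paper
+print(f"{co2e_per_paper_kg:.0f} kg CO2e per paper (illustrative)")
+```
+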
+
+ comment: Accepted for presentation at the 18th ACM Conference on Recommender + Systems in the Reproducibility Track +
+
+
+
+
+ + ☆ KGV: Integrating Large Language Models with Knowledge Graphs for Cyber + Threat Intelligence Credibility Assessment + + +
+ Cyber threat intelligence is a critical tool that many organizations and +individuals use to protect themselves from sophisticated, organized, +persistent, and weaponized cyber attacks. However, few studies have focused on +the quality assessment of threat intelligence provided by intelligence +platforms, and this work still requires manual analysis by cybersecurity +experts. In this paper, we propose a knowledge graph-based verifier, a novel +Cyber Threat Intelligence (CTI) quality assessment framework that combines +knowledge graphs and Large Language Models (LLMs). Our approach introduces LLMs +to automatically extract OSCTI key claims to be verified and utilizes a +knowledge graph consisting of paragraphs for fact-checking. This method differs +from the traditional way of constructing complex knowledge graphs with entities +as nodes. By constructing knowledge graphs with paragraphs as nodes and +semantic similarity as edges, it effectively enhances the semantic +understanding ability of the model and simplifies labeling requirements. +Additionally, to fill the gap in the research field, we created and made public +the first dataset for threat intelligence assessment from heterogeneous +sources. To the best of our knowledge, this work is the first to create a +dataset on threat intelligence reliability verification, providing a reference +for future research. Experimental results show that KGV (Knowledge Graph +Verifier) significantly improves the performance of LLMs in intelligence +quality assessment. Compared with traditional methods, we reduce a large amount +of data annotation while the model still exhibits strong reasoning +capabilities. Finally, our method can achieve XXX accuracy in network threat +assessment. + +
+
+
+
+
+ + ☆ Extracting Sentence Embeddings from Pretrained Transformer Models + + +
+ Background/introduction: Pre-trained transformer models shine in many
+natural language processing tasks and therefore are expected to bear the
+representation of the input sentence or text meaning. These sentence-level
+embeddings are also important in retrieval-augmented generation. But do
+commonly used plain averaging or prompt templates surface this meaning well
+enough?
+ Methods: Given the hidden representations from multiple layers and multiple
+tokens of a 110M-parameter BERT, we tried various ways to extract optimal
+sentence representations. We tested various token aggregation and
+representation post-processing techniques. We also tested multiple ways of
+using a general Wikitext dataset to complement BERT's sentence
+representations. All methods were tested on 8 Semantic Textual Similarity
+(STS), 6 short text clustering, and 12 classification tasks. We also
+evaluated our representation-shaping techniques on other static models,
+including random token representations.
+ Results: The proposed representation extraction methods improved the
+performance on STS and clustering tasks for all models considered. The
+improvements are especially large for static token-based models; in
+particular, random embeddings on STS tasks almost reach the performance of
+BERT-derived representations.
+ Conclusions: Our work shows that, across multiple tasks, simple baselines
+with representation-shaping techniques reach or even outperform more complex
+BERT-based models, or are able to contribute to their performance.
+
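+
+ For context, the "plain averaging" baseline the paper starts from can be
+computed with the Hugging Face transformers API roughly as below: a masked
+mean over token embeddings from one hidden layer. The layer index and model
+name are only examples, not the configuration studied in the paper.
+
+```python
+import torch
+from transformers import AutoModel, AutoTokenizer
+
+tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
+model = AutoModel.from_pretrained("bert-base-uncased").eval()
+
+sentences = ["A quick example sentence.", "Another short sentence."]
+inputs = tokenizer(sentences, padding=True, truncation=True,
+                   return_tensors="pt")
+
+with torch.no_grad():
+    outputs = model(**inputs, output_hidden_states=True)
+
+layer = 9                                  # example intermediate layer
+tokens = outputs.hidden_states[layer]      # (batch, seq_len, hidden)
+mask = inputs["attention_mask"].unsqueeze(-1).float()
+
+# Masked mean over tokens -> one embedding per sentence.
+sentence_emb = (tokens * mask).sum(dim=1) / mask.sum(dim=1).clamp_min(1e-9)
+print(sentence_emb.shape)                  # torch.Size([2, 768])
+```
+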
+
+
+
+
+ + ☆ Mamba Retriever: Utilizing Mamba for Effective and Efficient Dense + Retrieval + + +
+ In the information retrieval (IR) area, dense retrieval (DR) models use deep +learning techniques to encode queries and passages into embedding space to +compute their semantic relations. It is important for DR models to balance both +efficiency and effectiveness. Pre-trained language models (PLMs), especially +Transformer-based PLMs, have been proven to be effective encoders of DR models. +However, the self-attention component in Transformer-based PLM results in a +computational complexity that grows quadratically with sequence length, and +thus exhibits a slow inference speed for long-text retrieval. Some recently +proposed non-Transformer PLMs, especially the Mamba architecture PLMs, have +demonstrated not only comparable effectiveness to Transformer-based PLMs on +generative language tasks but also better efficiency due to linear time scaling +in sequence length. This paper implements the Mamba Retriever to explore +whether Mamba can serve as an effective and efficient encoder of DR model for +IR tasks. We fine-tune the Mamba Retriever on the classic short-text MS MARCO +passage ranking dataset and the long-text LoCoV0 dataset. Experimental results +show that (1) on the MS MARCO passage ranking dataset and BEIR, the Mamba +Retriever achieves comparable or better effectiveness compared to +Transformer-based retrieval models, and the effectiveness grows with the size +of the Mamba model; (2) on the long-text LoCoV0 dataset, the Mamba Retriever +can extend to longer text length than its pre-trained length after fine-tuning +on retrieval task, and it has comparable or better effectiveness compared to +other long-text retrieval models; (3) the Mamba Retriever has superior +inference speed for long-text retrieval. In conclusion, Mamba Retriever is both +effective and efficient, making it a practical model, especially for long-text +retrieval. + +
+
+
+
+
+ + ☆ An Efficient Continuous Control Perspective for + Reinforcement-Learning-based Sequential Recommendation + + +
+ Sequential recommendation, where user preference is dynamically inferred
+from sequential historical behaviors, is a critical task in recommender
+systems (RSs). To further optimize long-term user engagement, offline
+reinforcement-learning-based RSs have become a mainstream technique as they
+provide an additional advantage in avoiding global explorations that may
+harm online users' experiences. However, previous studies mainly focus on
+discrete action and policy spaces, which may struggle to handle a rapidly
+growing item space efficiently.
+ To mitigate this issue, in this paper, we aim to design an algorithmic
+framework applicable to continuous policies. To facilitate the control in
+the low-dimensional but dense user preference space, we propose an
+\underline{\textbf{E}}fficient \underline{\textbf{Co}}ntinuous
+\underline{\textbf{C}}ontrol framework (ECoC). Based on a statistically
+tested assumption, we first propose a novel unified action representation
+abstracted from normalized user and item spaces. Then, we develop the
+corresponding policy evaluation and policy improvement procedures. During
+this process, strategic exploration and directional control in terms of
+unified actions are carefully designed and crucial to final recommendation
+decisions. Moreover, benefiting from unified actions, the conservatism
+regularization terms for policies and value functions are combined and
+remain fully compatible with the continuous framework. The resulting dual
+regularization ensures the successful offline training of RL-based
+recommendation policies. Finally, we conduct extensive experiments to
+validate the effectiveness of our framework. The results show that compared
+to the discrete baselines, our ECoC is trained far more efficiently.
+Meanwhile, the final policies outperform baselines in both capturing the
+offline data and gaining long-term rewards.
+
+
+
+
+
+ + ☆ AIE: Auction Information Enhanced Framework for CTR Prediction in Online + Advertising + + +
+ Click-Through Rate (CTR) prediction is a fundamental technique for online +advertising recommendation and the complex online competitive auction process +also brings many difficulties to CTR optimization. Recent studies have shown +that introducing posterior auction information contributes to the performance +of CTR prediction. However, existing work doesn't fully capitalize on the +benefits of auction information and overlooks the data bias brought by the +auction, leading to biased and suboptimal results. To address these +limitations, we propose Auction Information Enhanced Framework (AIE) for CTR +prediction in online advertising, which delves into the problem of insufficient +utilization of auction signals and first reveals the auction bias. +Specifically, AIE introduces two pluggable modules, namely Adaptive +Market-price Auxiliary Module (AM2) and Bid Calibration Module (BCM), which +work collaboratively to excavate the posterior auction signals better and +enhance the performance of CTR prediction. Furthermore, the two proposed +modules are lightweight, model-agnostic, and friendly to inference latency. +Extensive experiments are conducted on a public dataset and an industrial +dataset to demonstrate the effectiveness and compatibility of AIE. Besides, a +one-month online A/B test in a large-scale advertising platform shows that AIE +improves the base model by 5.76% and 2.44% in terms of eCPM and CTR, +respectively. + +
+
+
+
+
+ + ☆ The Nah Bandit: Modeling User Non-compliance in Recommendation Systems + + +
+ Recommendation systems now pervade the digital world, ranging from +advertising to entertainment. However, it remains challenging to implement +effective recommendation systems in the physical world, such as in mobility or +health. This work focuses on a key challenge: in the physical world, it is +often easy for the user to opt out of taking any recommendation if they are not +to her liking, and to fall back to her baseline behavior. It is thus crucial in +cyber-physical recommendation systems to operate with an interaction model that +is aware of such user behavior, lest the user abandon the recommendations +altogether. This paper thus introduces the Nah Bandit, a tongue-in-cheek +reference to describe a Bandit problem where users can say `nah' to the +recommendation and opt for their preferred option instead. As such, this +problem lies in between a typical bandit setup and supervised learning. We +model the user non-compliance by parameterizing an anchoring effect of +recommendations on users. We then propose the Expert with Clustering (EWC) +algorithm, a hierarchical approach that incorporates feedback from both +recommended and non-recommended options to accelerate user preference learning. +In a recommendation scenario with $N$ users, $T$ rounds per user, and $K$ +clusters, EWC achieves a regret bound of $O(N\sqrt{T\log K} + NT)$, achieving +superior theoretical performance in the short term compared to LinUCB +algorithm. Experimental results also highlight that EWC outperforms both +supervised learning and traditional contextual bandit approaches. This +advancement reveals that effective use of non-compliance feedback can +accelerate preference learning and improve recommendation accuracy. This work +lays the foundation for future research in Nah Bandit, providing a robust +framework for more effective recommendation systems. + +
+
+ comment: 12 pages, 8 figures, under review +
+
+
+
+
+ + ☆ W-RAG: Weakly Supervised Dense Retrieval in RAG for Open-domain Question + Answering + + +
+ In knowledge-intensive tasks such as open-domain question answering (OpenQA), +Large Language Models (LLMs) often struggle to generate factual answers relying +solely on their internal (parametric) knowledge. To address this limitation, +Retrieval-Augmented Generation (RAG) systems enhance LLMs by retrieving +relevant information from external sources, thereby positioning the retriever +as a pivotal component. Although dense retrieval demonstrates state-of-the-art +performance, its training poses challenges due to the scarcity of ground-truth +evidence, largely attributed to the high costs of human annotation. In this +paper, we propose W-RAG by utilizing the ranking capabilities of LLMs to create +weakly labeled data for training dense retrievers. Specifically, we rerank the +top-$K$ passages retrieved via BM25 by assessing the probability that LLMs will +generate the correct answer based on the question and each passage. The +highest-ranking passages are then used as positive training examples for dense +retrieval. Our comprehensive experiments across four publicly available OpenQA +datasets demonstrate that our approach enhances both retrieval and OpenQA +performance compared to baseline models. + +
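+
+ The weak-labeling step can be pictured as scoring each BM25 passage by the
+likelihood the LLM assigns to the gold answer given the question and that
+passage, then keeping the top-scoring passages as positives. The sketch
+below uses a small causal LM and a placeholder prompt format purely for
+illustration; it is not the paper's exact scoring setup.
+
+```python
+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+tokenizer = AutoTokenizer.from_pretrained("gpt2")           # placeholder LLM
+model = AutoModelForCausalLM.from_pretrained("gpt2").eval()
+
+def answer_log_likelihood(question, passage, answer):
+    """Average log-probability of the answer tokens given question+passage."""
+    prompt = f"Passage: {passage}\nQuestion: {question}\nAnswer:"
+    prompt_ids = tokenizer(prompt, return_tensors="pt").input_ids
+    answer_ids = tokenizer(" " + answer, return_tensors="pt").input_ids
+    input_ids = torch.cat([prompt_ids, answer_ids], dim=1)
+
+    with torch.no_grad():
+        logits = model(input_ids).logits
+
+    # Log-probability of each token, predicted from the preceding position.
+    log_probs = torch.log_softmax(logits[:, :-1], dim=-1)
+    targets = input_ids[:, 1:]
+    token_lp = log_probs.gather(-1, targets.unsqueeze(-1)).squeeze(-1)
+    answer_len = answer_ids.size(1)
+    return token_lp[:, -answer_len:].mean().item()
+
+# Passages with the highest scores become positive training examples
+# for the dense retriever.
+```
+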
+
+
+
+
+ + ☆ Towards Realistic Synthetic User-Generated Content: A Scaffolding + Approach to Generating Online Discussions + + +
+ The emergence of synthetic data represents a pivotal shift in modern machine +learning, offering a solution to satisfy the need for large volumes of data in +domains where real data is scarce, highly private, or difficult to obtain. We +investigate the feasibility of creating realistic, large-scale synthetic +datasets of user-generated content, noting that such content is increasingly +prevalent and a source of frequently sought information. Large language models +(LLMs) offer a starting point for generating synthetic social media discussion +threads, due to their ability to produce diverse responses that typify online +interactions. However, as we demonstrate, straightforward application of LLMs +yields limited success in capturing the complex structure of online +discussions, and standard prompting mechanisms lack sufficient control. We +therefore propose a multi-step generation process, predicated on the idea of +creating compact representations of discussion threads, referred to as +scaffolds. Our framework is generic yet adaptable to the unique characteristics +of specific social media platforms. We demonstrate its feasibility using data +from two distinct online discussion platforms. To address the fundamental +challenge of ensuring the representativeness and realism of synthetic data, we +propose a portfolio of evaluation measures to compare various instantiations of +our framework. + +
+
+
+
+
+ + ♻ ☆ Scene-wise Adaptive Network for Dynamic Cold-start Scenes Optimization + in CTR Prediction + + +
+ In the realm of modern mobile E-commerce, providing users with nearby +commercial service recommendations through location-based online services has +become increasingly vital. While machine learning approaches have shown promise +in multi-scene recommendation, existing methodologies often struggle to address +cold-start problems in unprecedented scenes: the increasing diversity of +commercial choices, along with the short online lifespan of scenes, give rise +to the complexity of effective recommendations in online and dynamic scenes. In +this work, we propose Scene-wise Adaptive Network (SwAN), a novel approach that +emphasizes high-performance cold-start online recommendations for new scenes. +Our approach introduces several crucial capabilities, including scene +similarity learning, user-specific scene transition cognition, scene-specific +information construction for the new scene, and enhancing the diverged logical +information between scenes. We demonstrate SwAN's potential to optimize dynamic +multi-scene recommendation problems by effectively online handling cold-start +recommendations for any newly arrived scenes. More encouragingly, SwAN has been +successfully deployed in Meituan's online catering recommendation service, +which serves millions of customers per day, and SwAN has achieved a 5.64% CTR +index improvement relative to the baselines and a 5.19% increase in daily order +volume proportion. + +
+
+ comment: 10 pages, 6 figures, accepted by Recsys 2024 +
+
+
+
+
+ + ♻ ☆ Judgement Citation Retrieval using Contextual Similarity + + +
+ Traditionally in the domain of legal research, the retrieval of pertinent +citations from intricate case descriptions has demanded manual effort and +keyword-based search applications that mandate expertise in understanding legal +jargon. Legal case descriptions hold pivotal information for legal +professionals and researchers, necessitating more efficient and automated +approaches. We propose a methodology that combines natural language processing +(NLP) and machine learning techniques to enhance the organization and +utilization of legal case descriptions. This approach revolves around the +creation of textual embeddings with the help of state-of-art embedding models. +Our methodology addresses two primary objectives: unsupervised clustering and +supervised citation retrieval, both designed to automate the citation +extraction process. Although the proposed methodology can be used for any +dataset, we employed the Supreme Court of The United States (SCOTUS) dataset, +yielding remarkable results. Our methodology achieved an impressive accuracy +rate of 90.9%. By automating labor-intensive processes, we pave the way for a +more efficient, time-saving, and accessible landscape in legal research, +benefiting legal professionals, academics, and researchers. + +
+
+ comment: 14 pages, 16 images +
+
+
+
+
+ + ♻ ☆ RAGSys: Item-Cold-Start Recommender as RAG System + + +
+ Large Language Models (LLM) hold immense promise for real-world applications, +but their generic knowledge often falls short of domain-specific needs. +Fine-tuning, a common approach, can suffer from catastrophic forgetting and +hinder generalizability. In-Context Learning (ICL) offers an alternative, which +can leverage Retrieval-Augmented Generation (RAG) to provide LLMs with relevant +demonstrations for few-shot learning tasks. This paper explores the desired +qualities of a demonstration retrieval system for ICL. We argue that ICL +retrieval in this context resembles item-cold-start recommender systems, +prioritizing discovery and maximizing information gain over strict relevance. +We propose a novel evaluation method that measures the LLM's subsequent +performance on NLP tasks, eliminating the need for subjective diversity scores. +Our findings demonstrate the critical role of diversity and quality bias in +retrieved demonstrations for effective ICL, and highlight the potential of +recommender system techniques in this domain. + +
+
+
+
+
+
+
+
+ + Machine Learning 142 + +
+
+
+ + ☆ Can Large Language Models Understand Symbolic Graphics Programs? + + +
+ Assessing the capabilities of large language models (LLMs) is often +challenging, in part, because it is hard to find tasks to which they have not +been exposed during training. We take one step to address this challenge by +turning to a new task: focusing on symbolic graphics programs, which are a +popular representation for graphics content that procedurally generates visual +data. LLMs have shown exciting promise towards program synthesis, but do they +understand symbolic graphics programs? Unlike conventional programs, symbolic +graphics programs can be translated to graphics content. Here, we characterize +an LLM's understanding of symbolic programs in terms of their ability to answer +questions related to the graphics content. This task is challenging as the +questions are difficult to answer from the symbolic programs alone -- yet, they +would be easy to answer from the corresponding graphics content as we verify +through a human experiment. To understand symbolic programs, LLMs may need to +possess the ability to imagine how the corresponding graphics content would +look without directly accessing the rendered visual content. We use this task +to evaluate LLMs by creating a large benchmark for the semantic understanding +of symbolic graphics programs. This benchmark is built via program-graphics +correspondence, hence requiring minimal human efforts. We evaluate current LLMs +on our benchmark to elucidate a preliminary assessment of their ability to +reason about visual scenes from programs. We find that this task distinguishes +existing LLMs and models considered good at reasoning perform better. Lastly, +we introduce Symbolic Instruction Tuning (SIT) to improve this ability. +Specifically, we query GPT4-o with questions and images generated by symbolic +programs. Such data are then used to finetune an LLM. We also find that SIT +data can improve the general instruction following ability of LLMs. + +
+
+ comment: Technical Report v1 (44 pages, 23 figures, project page: + https://sgp-bench.github.io/) +
+
+
+
+
+ + ☆ Understanding the Local Geometry of Generative Model Manifolds + + +
+ Deep generative models learn continuous representations of complex data +manifolds using a finite number of samples during training. For a pre-trained +generative model, the common way to evaluate the quality of the manifold +representation learned, is by computing global metrics like Fr\'echet Inception +Distance using a large number of generated and real samples. However, +generative model performance is not uniform across the learned manifold, e.g., +for \textit{foundation models} like Stable Diffusion generation performance can +vary significantly based on the conditioning or initial noise vector being +denoised. In this paper we study the relationship between the \textit{local +geometry of the learned manifold} and downstream generation. Based on the +theory of continuous piecewise-linear (CPWL) generators, we use three geometric +descriptors - scaling ($\psi$), rank ($\nu$), and complexity ($\delta$) - to +characterize a pre-trained generative model manifold locally. We provide +quantitative and qualitative evidence showing that for a given latent, the +local descriptors are correlated with generation aesthetics, artifacts, +uncertainty, and even memorization. Finally we demonstrate that training a +\textit{reward model} on the local geometry can allow controlling the +likelihood of a generated sample under the learned distribution. + +
+
+ comment: Pre-print. 11 pages main, 8 pages app., 28 figures +
+
+
+
+
+ + ☆ Benchmarking the Capabilities of Large Language Models in Transportation + System Engineering: Accuracy, Consistency, and Reasoning Behaviors + + +
+ In this paper, we explore the capabilities of state-of-the-art large language +models (LLMs) such as GPT-4, GPT-4o, Claude 3.5 Sonnet, Claude 3 Opus, Gemini +1.5 Pro, Llama 3, and Llama 3.1 in solving some selected undergraduate-level +transportation engineering problems. We introduce TransportBench, a benchmark +dataset that includes a sample of transportation engineering problems on a wide +range of subjects in the context of planning, design, management, and control +of transportation systems. This dataset is used by human experts to evaluate +the capabilities of various commercial and open-sourced LLMs, especially their +accuracy, consistency, and reasoning behaviors, in solving transportation +engineering problems. Our comprehensive analysis uncovers the unique strengths +and limitations of each LLM, e.g. our analysis shows the impressive accuracy +and some unexpected inconsistent behaviors of Claude 3.5 Sonnet in solving +TransportBench problems. Our study marks a thrilling first step toward +harnessing artificial general intelligence for complex transportation +challenges. + +
+
+
+
+
+ + ☆ HELP: Hierarchical Embeddings-based Log Parsing + + +
+ Logs are a first-hand source of information for software maintenance and +failure diagnosis. Log parsing, which converts semi-structured log messages +into structured templates, is a prerequisite for automated log analysis tasks +such as anomaly detection, troubleshooting, and root cause analysis. However, +existing log parsers fail in real-world systems for three main reasons. First, +traditional heuristics-based parsers require handcrafted features and domain +knowledge, which are difficult to generalize at scale. Second, existing large +language model-based parsers rely on periodic offline processing, limiting +their effectiveness in real-time use cases. Third, existing online parsing +algorithms are susceptible to log drift, where slight log changes create false +positives that drown out real anomalies. To address these challenges, we +propose HELP, a Hierarchical Embeddings-based Log Parser. HELP is the first +online semantic-based parser to leverage LLMs for performant and cost-effective +log parsing. We achieve this through a novel hierarchical embeddings module, +which fine-tunes a text embedding model to cluster logs before parsing, +reducing querying costs by multiple orders of magnitude. To combat log drift, +we also develop an iterative rebalancing module, which periodically updates +existing log groupings. We evaluate HELP extensively on 14 public large-scale +datasets, showing that HELP achieves significantly higher F1-weighted grouping +and parsing accuracy than current state-of-the-art online log parsers. We also +implement HELP into Iudex's production observability platform, confirming +HELP's practicality in a production environment. Our results show that HELP is +effective and efficient for high-throughput real-world log parsing. + +
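+ A hedged sketch of the general idea behind embedding-based grouping before
+parsing (not HELP's actual components): embed incoming log lines, assign each
+to the nearest existing cluster, and query the LLM parser only once per new
+cluster, so repeated log templates never trigger additional LLM calls. The
+embedding model, threshold, and `llm_parse_template` callable are illustrative
+stand-ins.
+<pre>
+import numpy as np
+from sentence_transformers import SentenceTransformer
+
+embedder = SentenceTransformer("all-MiniLM-L6-v2")
+
+def group_and_parse(log_lines, llm_parse_template, similarity_threshold=0.85):
+    embeddings = embedder.encode(log_lines, normalize_embeddings=True)
+    centroids, cluster_templates, parsed = [], [], {}
+    for line, emb in zip(log_lines, embeddings):
+        if centroids:
+            sims = np.stack(centroids) @ emb           # cosine similarity (unit vectors)
+            best = int(sims.argmax())
+            if sims[best] >= similarity_threshold:     # reuse this cluster's template
+                parsed[line] = cluster_templates[best]
+                continue
+        centroids.append(emb)                          # new cluster: a single LLM call
+        cluster_templates.append(llm_parse_template(line))
+        parsed[line] = cluster_templates[-1]
+    return parsed
+</pre>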
+
+
+
+
+ + ☆ SLCA++: Unleash the Power of Sequential Fine-tuning for Continual + Learning with Pre-training ICCV 23 + + +
+ In recent years, continual learning with pre-training (CLPT) has received +widespread interest, instead of its traditional focus of training from scratch. +The use of strong pre-trained models (PTMs) can greatly facilitate knowledge +transfer and alleviate catastrophic forgetting, but also suffers from +progressive overfitting of pre-trained knowledge into specific downstream +tasks. A majority of current efforts often keep the PTMs frozen and incorporate +task-specific prompts to instruct representation learning, coupled with a +prompt selection process for inference. However, due to the limited capacity of +prompt parameters, this strategy demonstrates only sub-optimal performance in +continual learning. In comparison, tuning all parameters of PTMs often provides +the greatest potential for representation learning, making sequential +fine-tuning (Seq FT) a fundamental baseline that has been overlooked in CLPT. +To this end, we present an in-depth analysis of the progressive overfitting +problem from the lens of Seq FT. Considering that the overly fast +representation learning and the biased classification layer constitute this +particular problem, we introduce the advanced Slow Learner with Classifier +Alignment (SLCA++) framework to unleash the power of Seq FT, serving as a +strong baseline approach for CLPT. Our approach involves a Slow Learner to +selectively reduce the learning rate of backbone parameters, and a Classifier +Alignment to align the disjoint classification layers in a post-hoc fashion. We +further enhance the efficacy of SL with a symmetric cross-entropy loss, as well +as employ a parameter-efficient strategy to implement Seq FT with SLCA++. +Across a variety of continual learning scenarios on image classification +benchmarks, our approach provides substantial improvements and outperforms +state-of-the-art methods by a large margin. Code: +https://github.com/GengDavid/SLCA. + +
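+ A minimal sketch of the Slow Learner half of this recipe, assuming a generic
+torchvision backbone with a linear head: the backbone is updated with a much
+smaller learning rate than the classifier. The learning rates are illustrative,
+and the post-hoc Classifier Alignment step is not shown.
+<pre>
+import torch
+import torchvision
+
+model = torchvision.models.resnet18(weights="IMAGENET1K_V1")
+backbone_params = [p for n, p in model.named_parameters() if not n.startswith("fc")]
+head_params = [p for n, p in model.named_parameters() if n.startswith("fc")]
+
+optimizer = torch.optim.SGD(
+    [
+        {"params": backbone_params, "lr": 1e-4},  # slow learner: reduced backbone LR
+        {"params": head_params, "lr": 1e-2},      # classifier trained at the usual rate
+    ],
+    momentum=0.9,
+)
+</pre>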
+
+ comment: This paper is an extension of our ICCV 23 paper (arXiv:2303.05118) +
+
+
+
+
+ + ☆ Aliasing and Label-Independent Decomposition of Risk: Beyond the + bias-variance trade-off + + +
+ A central problem in data science is to use potentially noisy samples of an +unknown function to predict function values for unseen inputs. In classical +statistics, the predictive error is understood as a trade-off between the bias +and the variance that balances model simplicity with its ability to fit complex +functions. However, over-parameterized models exhibit counter-intuitive +behaviors, such as "double descent" in which models of increasing complexity +exhibit decreasing generalization error. We introduce an alternative paradigm +called the generalized aliasing decomposition. We explain the asymptotically +small error of complex models as a systematic "de-aliasing" that occurs in the +over-parameterized regime. In the limit of large models, the contribution due +to aliasing vanishes, leaving an expression for the asymptotic total error we +call the invertibility failure of very large models on few training points. +Because the generalized aliasing decomposition can be explicitly calculated +from the relationship between model class and samples without seeing any data +labels, it can answer questions related to experimental design and model +selection before collecting data or performing experiments. We demonstrate this +approach using several examples, including classical regression problems and a +cluster expansion model used in materials science. + +
+
+
+
+
+ + ☆ Absence of Closed-Form Descriptions for Gradient Flow in Two-Layer + Narrow Networks + + +
+ In the field of machine learning, comprehending the intricate training +dynamics of neural networks poses a significant challenge. This paper explores +the training dynamics of neural networks, particularly whether these dynamics +can be expressed in a general closed-form solution. We demonstrate that the +dynamics of the gradient flow in two-layer narrow networks is not an integrable +system. Integrable systems are characterized by trajectories confined to +submanifolds defined by level sets of first integrals (invariants), +facilitating predictable and reducible dynamics. In contrast, non-integrable +systems exhibit complex behaviors that are difficult to predict. To establish +the non-integrability, we employ differential Galois theory, which focuses on +the solvability of linear differential equations. We demonstrate that under +mild conditions, the identity component of the differential Galois group of the +variational equations of the gradient flow is non-solvable. This result +confirms the system's non-integrability and implies that the training dynamics +cannot be represented by Liouvillian functions, precluding a closed-form +solution for describing these dynamics. Our findings highlight the necessity of +employing numerical methods to tackle optimization problems within neural +networks. The results contribute to a deeper understanding of neural network +training dynamics and their implications for machine learning optimization +strategies. + +
+
+
+
+
+ + ☆ Accurate and efficient structure elucidation from routine + one-dimensional NMR spectra using multitask machine learning + + +
+ Rapid determination of molecular structures can greatly accelerate workflows +across many chemical disciplines. However, elucidating structure using only +one-dimensional (1D) NMR spectra, the most readily accessible data, remains an +extremely challenging problem because of the combinatorial explosion of the +number of possible molecules as the number of constituent atoms is increased. +Here, we introduce a multitask machine learning framework that predicts the +molecular structure (formula and connectivity) of an unknown compound solely +based on its 1D 1H and/or 13C NMR spectra. First, we show how a transformer +architecture can be constructed to efficiently solve the task, traditionally +performed by chemists, of assembling large numbers of molecular fragments into +molecular structures. Integrating this capability with a convolutional neural +network (CNN), we build an end-to-end model for predicting structure from +spectra that is fast and accurate. We demonstrate the effectiveness of this +framework on molecules with up to 19 heavy (non-hydrogen) atoms, a size for +which there are trillions of possible structures. Without relying on any prior +chemical knowledge such as the molecular formula, we show that our approach +predicts the exact molecule 69.6% of the time within the first 15 predictions, +reducing the search space by up to 11 orders of magnitude. + +
+
+
+
+
+ + ☆ Autonomous Behavior Planning For Humanoid Loco-manipulation Through + Grounded Language Model IROS 2024 + + +
+ Enabling humanoid robots to autonomously perform loco-manipulation in
+unstructured environments is crucial and highly challenging for achieving
+embodied intelligence. This involves robots being able to plan their actions
+and behaviors in long-horizon tasks while using multi-modality to perceive
+deviations between task execution and high-level planning. Recently, large
+language models (LLMs) have demonstrated powerful planning and reasoning
+capabilities for comprehension and processing of semantic information through
+robot control tasks, as well as the usability of analytical judgment and
+decision-making for multi-modal inputs. To leverage the power of LLMs towards
+humanoid loco-manipulation, we propose a novel language-model based framework
+that enables robots to autonomously plan behaviors and low-level execution
+under given textual instructions, while observing and correcting failures that
+may occur during task execution. To systematically evaluate this framework in
+grounding LLMs, we created the robot 'action' and 'sensing' behavior library
+for task planning, and conducted mobile manipulation tasks and experiments in
+both simulated and real environments using the CENTAURO robot, and verified the
+effectiveness and application of this approach in robotic tasks with autonomous
+behavioral planning.
+
+
+ comment: Paper accepted by IROS 2024 +
+
+
+
+
+ + ☆ BAM! Just Like That: Simple and Efficient Parameter Upcycling for + Mixture of Experts + + +
+ The Mixture of Experts (MoE) framework has become a popular architecture for
+large language models due to its superior performance over dense models.
+However, training MoEs from scratch in a large-scale regime is prohibitively
+expensive. Existing methods mitigate this by pre-training multiple dense expert
+models independently and using them to initialize an MoE. This is done by using
+experts' feed-forward network (FFN) to initialize the MoE's experts while
+merging other parameters. However, this method limits the reuse of dense model
+parameters to only the FFN layers, thereby constraining the advantages when
+"upcycling" these models into MoEs. We propose BAM (Branch-Attend-Mix), a
+simple yet effective method that addresses this shortcoming. BAM makes full use
+of specialized dense models by not only using their FFN to initialize the MoE
+layers but also leveraging experts' attention parameters fully by initializing
+them into a soft variant of Mixture of Attention (MoA) layers. We explore two
+methods for upcycling attention parameters: 1) initializing separate attention
+experts from dense models including all attention parameters for the best model
+performance; and 2) sharing key and value parameters across all experts to
+facilitate better inference efficiency. To further improve efficiency, we
+adopt a parallel attention transformer architecture for MoEs, which allows the
+attention experts and FFN experts to be computed concurrently. Our experiments
+on seed models ranging from 590 million to 2 billion parameters demonstrate
+that BAM surpasses baselines in both perplexity and downstream task
+performance, within the same computational and data constraints.
+
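+ A hedged sketch of the generic FFN-upcycling step that BAM builds on: each
+expert of a small, softly routed MoE layer starts as a copy of a specialized
+dense model's FFN. Class names, sizes, and the dense (non-top-k) routing are
+illustrative simplifications; BAM's attention-expert (MoA) upcycling and
+parallel-attention layout are not shown.
+<pre>
+import copy
+import torch
+import torch.nn as nn
+
+class DenseFFN(nn.Module):
+    def __init__(self, d_model=512, d_ff=2048):
+        super().__init__()
+        self.up = nn.Linear(d_model, d_ff)
+        self.down = nn.Linear(d_ff, d_model)
+        self.act = nn.GELU()
+    def forward(self, x):
+        return self.down(self.act(self.up(x)))
+
+class UpcycledMoE(nn.Module):
+    """Soft-routed mixture whose experts are copies of specialized dense FFNs."""
+    def __init__(self, dense_ffns, d_model=512):
+        super().__init__()
+        self.experts = nn.ModuleList(copy.deepcopy(f) for f in dense_ffns)
+        self.router = nn.Linear(d_model, len(dense_ffns))
+    def forward(self, x):
+        weights = self.router(x).softmax(dim=-1)                   # [..., n_experts]
+        outs = torch.stack([e(x) for e in self.experts], dim=-1)   # [..., d_model, n_experts]
+        return (outs * weights.unsqueeze(-2)).sum(dim=-1)
+
+moe = UpcycledMoE([DenseFFN(), DenseFFN(), DenseFFN()])
+</pre>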
+
+
+
+
+ + ☆ Is Knowledge Power? On the (Im)possibility of Learning from Strategic + Interaction + + +
+ When learning in strategic environments, a key question is whether agents can +overcome uncertainty about their preferences to achieve outcomes they could +have achieved absent any uncertainty. Can they do this solely through +interactions with each other? We focus this question on the ability of agents +to attain the value of their Stackelberg optimal strategy and study the impact +of information asymmetry. We study repeated interactions in fully strategic +environments where players' actions are decided based on learning algorithms +that take into account their observed histories and knowledge of the game. We +study the pure Nash equilibria (PNE) of a meta-game where players choose these +algorithms as their actions. We demonstrate that if one player has perfect +knowledge about the game, then any initial informational gap persists. That is, +while there is always a PNE in which the informed agent achieves her +Stackelberg value, there is a game where no PNE of the meta-game allows the +partially informed player to achieve her Stackelberg value. On the other hand, +if both players start with some uncertainty about the game, the quality of +information alone does not determine which agent can achieve her Stackelberg +value. In this case, the concept of information asymmetry becomes nuanced and +depends on the game's structure. Overall, our findings suggest that repeated +strategic interactions alone cannot facilitate learning effectively enough to +earn an uninformed player her Stackelberg value. + +
+
+
+
+
+ + ☆ InVAErt networks for amortized inference and identifiability analysis of + lumped parameter hemodynamic models + + +
+ Estimation of cardiovascular model parameters from electronic health records +(EHR) poses a significant challenge primarily due to lack of identifiability. +Structural non-identifiability arises when a manifold in the space of +parameters is mapped to a common output, while practical non-identifiability +can result due to limited data, model misspecification, or noise corruption. To +address the resulting ill-posed inverse problem, optimization-based or Bayesian +inference approaches typically use regularization, thereby limiting the +possibility of discovering multiple solutions. In this study, we use inVAErt +networks, a neural network-based, data-driven framework for enhanced digital +twin analysis of stiff dynamical systems. We demonstrate the flexibility and +effectiveness of inVAErt networks in the context of physiological inversion of +a six-compartment lumped parameter hemodynamic model from synthetic data to +real data with missing components. + +
+
+
+
+
+ + ☆ GSVD-NMF: Recovering Missing Features in Non-negative Matrix + Factorization + + +
+ Non-negative matrix factorization (NMF) is an important tool in signal +processing and widely used to separate mixed sources into their components. +However, NMF is NP-hard and thus may fail to discover the ideal factorization; +moreover, the number of components may not be known in advance and thus +features may be missed or incompletely separated. To recover missing components +from under-complete NMF, we introduce GSVD-NMF, which proposes new components +based on the generalized singular value decomposition (GSVD) between +preliminary NMF results and the SVD of the original matrix. Simulation and +experimental results demonstrate that GSVD-NMF often recovers missing features +from under-complete NMF and helps NMF achieve better local optima. + +
+
+
+
+
+ + ☆ Snuffy: Efficient Whole Slide Image Classifier ECCV 2024 + + +
+ Whole Slide Image (WSI) classification with multiple instance learning (MIL)
+in digital pathology faces significant computational challenges. Current
+methods mostly rely on extensive self-supervised learning (SSL) for
+satisfactory performance, requiring long training periods and considerable
+computational resources. At the same time, forgoing domain-specific
+pre-training hurts performance due to the domain shift from natural images to
+WSIs. We introduce the \textbf{\textit{Snuffy}} architecture, a novel
+MIL-pooling method based on sparse transformers that mitigates performance loss
+with limited pre-training and enables continual few-shot pre-training as a
+competitive option. Our sparsity pattern is tailored for pathology and is
+theoretically proven to be a universal approximator with the tightest
+probabilistic sharp bound on the number of layers for sparse transformers, to
+date. We demonstrate Snuffy's effectiveness on CAMELYON16 and TCGA Lung cancer
+datasets, achieving superior WSI and patch-level accuracies. The code is
+available on \url{https://github.com/jafarinia/snuffy}.
+
+
+ comment: Accepted for ECCV 2024 +
+
+
+
+
+ + ☆ Derivative-Free Guidance in Continuous and Discrete Diffusion Models + with Soft Value-Based Decoding + + +
+ Diffusion models excel at capturing the natural design spaces of images, +molecules, DNA, RNA, and protein sequences. However, rather than merely +generating designs that are natural, we often aim to optimize downstream reward +functions while preserving the naturalness of these design spaces. Existing +methods for achieving this goal often require ``differentiable'' proxy models +(\textit{e.g.}, classifier guidance or DPS) or involve computationally +expensive fine-tuning of diffusion models (\textit{e.g.}, classifier-free +guidance, RL-based fine-tuning). In our work, we propose a new method to +address these challenges. Our algorithm is an iterative sampling method that +integrates soft value functions, which looks ahead to how intermediate noisy +states lead to high rewards in the future, into the standard inference +procedure of pre-trained diffusion models. Notably, our approach avoids +fine-tuning generative models and eliminates the need to construct +differentiable models. This enables us to (1) directly utilize +non-differentiable features/reward feedback, commonly used in many scientific +domains, and (2) apply our method to recent discrete diffusion models in a +principled way. Finally, we demonstrate the effectiveness of our algorithm +across several domains, including image generation, molecule generation, and +DNA/RNA sequence generation. The code is available at +\href{https://github.com/masa-ue/SVDD}{https://github.com/masa-ue/SVDD}. + +
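+ A minimal sketch of value-guided resampling at each denoising step, assuming
+generic `denoise_step(x_t, t)` and `value_fn(x)` callables (both hypothetical).
+It illustrates the idea of folding a soft value function into the inference
+loop of a frozen diffusion model without fine-tuning or differentiable rewards;
+it is not the authors' SVDD implementation.
+<pre>
+import torch
+
+def soft_value_guided_sampling(x_T, denoise_step, value_fn, timesteps,
+                               n_candidates=8, alpha=1.0):
+    x_t = x_T
+    for t in timesteps:
+        # propose several next states from the frozen pre-trained diffusion model
+        candidates = torch.stack([denoise_step(x_t, t) for _ in range(n_candidates)])
+        # score each candidate with a (possibly non-differentiable) soft value function
+        values = torch.tensor([float(value_fn(c)) for c in candidates])
+        weights = torch.softmax(values / alpha, dim=0)
+        # keep one candidate in proportion to its exponentiated value
+        x_t = candidates[torch.multinomial(weights, num_samples=1).item()]
+    return x_t
+</pre>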
+
+ comment: The code is available at https://github.com/masa-ue/SVDD +
+
+
+
+
+ + ☆ A Conflicts-free, Speed-lossless KAN-based Reinforcement Learning + Decision System for Interactive Driving in Roundabouts + + +
+ Safety and efficiency are crucial for autonomous driving in roundabouts, +especially in the context of mixed traffic where autonomous vehicles (AVs) and +human-driven vehicles coexist. This paper introduces a learning-based algorithm +tailored to foster safe and efficient driving behaviors across varying levels +of traffic flows in roundabouts. The proposed algorithm employs a deep +Q-learning network to effectively learn safe and efficient driving strategies +in complex multi-vehicle roundabouts. Additionally, a KAN (Kolmogorov-Arnold +network) enhances the AVs' ability to learn their surroundings robustly and +precisely. An action inspector is integrated to replace dangerous actions to +avoid collisions when the AV interacts with the environment, and a route +planner is proposed to enhance the driving efficiency and safety of the AVs. +Moreover, a model predictive control is adopted to ensure stability and +precision of the driving actions. The results show that our proposed system +consistently achieves safe and efficient driving whilst maintaining a stable +training process, as evidenced by the smooth convergence of the reward function +and the low variance in the training curves across various traffic flows. +Compared to state-of-the-art benchmarks, the proposed algorithm achieves a +lower number of collisions and reduced travel time to destination. + +
+
+ comment: 15 pages, 12 figures, submitted to an IEEE journal +
+
+
+
+
+ + ☆ The Z-Gromov-Wasserstein Distance + + +
+ The Gromov-Wasserstein (GW) distance is a powerful tool for comparing metric +measure spaces which has found broad applications in data science and machine +learning. Driven by the need to analyze datasets whose objects have +increasingly complex structure (such as node and edge-attributed graphs), +several variants of GW distance have been introduced in the recent literature. +With a view toward establishing a general framework for the theory of GW-like +distances, this paper considers a vast generalization of the notion of a metric +measure space: for an arbitrary metric space $Z$, we define a $Z$-network to be +a measure space endowed with a kernel valued in $Z$. We introduce a method for +comparing $Z$-networks by defining a generalization of GW distance, which we +refer to as $Z$-Gromov-Wasserstein ($Z$-GW) distance. This construction +subsumes many previously known metrics and offers a unified approach to +understanding their shared properties. The paper demonstrates that the $Z$-GW +distance defines a metric on the space of $Z$-networks which retains desirable +properties of $Z$, such as separability, completeness, and geodesicity. Many of +these properties were unknown for existing variants of GW distance that fall +under our framework. Our focus is on foundational theory, but our results also +include computable lower bounds and approximations of the distance which will +be useful for practical applications. + +
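+ For orientation, a schematic reading of the construction (ours, not the
+paper's exact notation): the classical GW distance compares metric measure
+spaces $(X, d_X, \mu_X)$ and $(Y, d_Y, \mu_Y)$ over couplings $\pi$ of their
+measures, and the $Z$-valued variant replaces the real-valued kernels and the
+absolute difference by $Z$-valued kernels $\omega_X, \omega_Y$ compared through
+the metric $d_Z$:
+$$\mathrm{GW}_{Z,p}(\mathcal{X},\mathcal{Y}) = \inf_{\pi \in \Pi(\mu_X,\mu_Y)}
+\Big( \iint d_Z\big(\omega_X(x,x'),\, \omega_Y(y,y')\big)^p \,
+d\pi(x,y)\, d\pi(x',y') \Big)^{1/p}.$$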
+
+
+
+
+ + ☆ Explaining an Agent's Future Beliefs through Temporally Decomposing + Future Reward Estimators ECAI 2024 + + +
+ Future reward estimation is a core component of reinforcement learning +agents; i.e., Q-value and state-value functions, predicting an agent's sum of +future rewards. Their scalar output, however, obfuscates when or what +individual future rewards an agent may expect to receive. We address this by +modifying an agent's future reward estimator to predict their next N expected +rewards, referred to as Temporal Reward Decomposition (TRD). This unlocks novel +explanations of agent behaviour. Through TRD we can: estimate when an agent may +expect to receive a reward, the value of the reward and the agent's confidence +in receiving it; measure an input feature's temporal importance to the agent's +action decisions; and predict the influence of different actions on future +rewards. Furthermore, we show that DQN agents trained on Atari environments can +be efficiently retrained to incorporate TRD with minimal impact on performance. + +
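+ A hedged sketch of a Temporal Reward Decomposition style output head: instead
+of one Q-value per action, the network predicts the next N expected rewards per
+action, whose discounted sum recovers a scalar Q-like estimate. Layer sizes and
+the discounting detail are illustrative, not the paper's exact architecture.
+<pre>
+import torch
+import torch.nn as nn
+
+class TRDHead(nn.Module):
+    def __init__(self, feature_dim, n_actions, n_future=5, gamma=0.99):
+        super().__init__()
+        self.n_actions, self.n_future, self.gamma = n_actions, n_future, gamma
+        self.out = nn.Linear(feature_dim, n_actions * n_future)
+
+    def forward(self, features):
+        # per-action vector of the next N expected rewards
+        rewards = self.out(features).view(-1, self.n_actions, self.n_future)
+        discounts = self.gamma ** torch.arange(
+            self.n_future, device=features.device, dtype=torch.float32)
+        q_values = (rewards * discounts).sum(dim=-1)   # collapse back to scalar Q-values
+        return rewards, q_values
+</pre>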
+
+ comment: 7 pages + 3 pages of supplementary material. Published at ECAI 2024 +
+
+
+
+
+ + ☆ Enhancing Sharpness-Aware Minimization by Learning Perturbation Radius ECML + + +
+ Sharpness-aware minimization (SAM) aims to improve model generalization by
+searching for flat minima in the loss landscape. The SAM update consists of one
+step for computing the perturbation and another for computing the update
+gradient. Across these two steps, the choice of the perturbation radius is
+crucial to the performance of SAM, but finding an appropriate perturbation
+radius is challenging. In this paper, we propose a bilevel optimization
+framework called LEarning the perTurbation radiuS (LETS) to learn the
+perturbation radius for sharpness-aware minimization algorithms. Specifically,
+in the proposed LETS method, the upper-level problem aims at seeking a good
+perturbation radius by minimizing the squared generalization gap between the
+training and validation losses, while the lower-level problem is the SAM
+optimization problem. Moreover, the LETS method can be combined with any
+variant of SAM. Experimental results on various architectures and benchmark
+datasets in computer vision and natural language processing demonstrate the
+effectiveness of the proposed LETS method in improving the performance of SAM.
+
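+ For reference, a minimal sketch of a single SAM update with an explicit
+perturbation radius `rho`, the quantity that LETS learns; the outer
+(upper-level) loop that adapts `rho` from the squared train/validation gap is
+omitted. It assumes a standard PyTorch model, loss, and optimizer, and is not
+the authors' code.
+<pre>
+import torch
+
+def sam_step(model, loss_fn, x, y, optimizer, rho=0.05, eps=1e-12):
+    # 1) ascent step: perturb weights towards higher loss within radius rho
+    loss_fn(model(x), y).backward()
+    grads = [p.grad for p in model.parameters() if p.grad is not None]
+    grad_norm = torch.norm(torch.stack([g.norm() for g in grads]))
+    perturbations = []
+    with torch.no_grad():
+        for p in model.parameters():
+            if p.grad is None:
+                continue
+            e = rho * p.grad / (grad_norm + eps)
+            p.add_(e)
+            perturbations.append((p, e))
+    optimizer.zero_grad()
+    # 2) descent step: gradient at the perturbed point, then undo the perturbation
+    loss_fn(model(x), y).backward()
+    with torch.no_grad():
+        for p, e in perturbations:
+            p.sub_(e)
+    optimizer.step()
+    optimizer.zero_grad()
+</pre>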
+
+ comment: Accepted by ECML PKDD 2024 +
+
+
+
+
+ + ☆ RED-CT: A Systems Design Methodology for Using LLM-labeled Data to Train + and Deploy Edge Classifiers for Computational Social Science + + +
+ Large language models (LLMs) have enhanced our ability to rapidly analyze and +classify unstructured natural language data. However, concerns regarding cost, +network limitations, and security constraints have posed challenges for their +integration into work processes. In this study, we adopt a systems design +approach to employing LLMs as imperfect data annotators for downstream +supervised learning tasks, introducing novel system intervention measures aimed +at improving classification performance. Our methodology outperforms +LLM-generated labels in seven of eight tests, demonstrating an effective +strategy for incorporating LLMs into the design and deployment of specialized, +supervised learning models present in many industry use cases. + +
+
+
+
+
+ + ☆ Moving Healthcare AI-Support Systems for Visually Detectable Diseases + onto Constrained Devices + + +
+ Image classification usually requires connectivity and access to the cloud +which is often limited in many parts of the world, including hard to reach +rural areas. TinyML aims to solve this problem by hosting AI assistants on +constrained devices, eliminating connectivity issues by processing data within +the device itself, without internet or cloud access. This pilot study explores +the use of tinyML to provide healthcare support with low spec devices in low +connectivity environments, focusing on diagnosis of skin diseases and the +ethical use of AI assistants in a healthcare setting. To investigate this, +10,000 images of skin lesions were used to train a model for classifying +visually detectable diseases (VDDs). The model weights were then offloaded to a +Raspberry Pi with a webcam attached, to be used for the classification of skin +lesions without internet access. It was found that the developed prototype +achieved a test accuracy of 78% and a test loss of 1.08. + +
+
+ comment: 6 pages, 5 figures +
+
+
+
+
+ + ☆ Federated Fairness Analytics: Quantifying Fairness in Federated Learning + + +
+ Federated Learning (FL) is a privacy-enhancing technology for distributed ML. +By training models locally and aggregating updates - a federation learns +together, while bypassing centralised data collection. FL is increasingly +popular in healthcare, finance and personal computing. However, it inherits +fairness challenges from classical ML and introduces new ones, resulting from +differences in data quality, client participation, communication constraints, +aggregation methods and underlying hardware. Fairness remains an unresolved +issue in FL and the community has identified an absence of succinct definitions +and metrics to quantify fairness; to address this, we propose Federated +Fairness Analytics - a methodology for measuring fairness. Our definition of +fairness comprises four notions with novel, corresponding metrics. They are +symptomatically defined and leverage techniques originating from XAI, +cooperative game-theory and networking engineering. We tested a range of +experimental settings, varying the FL approach, ML task and data settings. The +results show that statistical heterogeneity and client participation affect +fairness and fairness conscious approaches such as Ditto and q-FedAvg +marginally improve fairness-performance trade-offs. Using our techniques, FL +practitioners can uncover previously unobtainable insights into their system's +fairness, at differing levels of granularity in order to address fairness +challenges in FL. We have open-sourced our work at: +https://github.com/oscardilley/federated-fairness. + +
+
+
+
+
+ + ☆ Does Reasoning Emerge? Examining the Probabilities of Causation in Large + Language Models + + +
+ Recent advances in AI have been significantly driven by the capabilities of +large language models (LLMs) to solve complex problems in ways that resemble +human thinking. However, there is an ongoing debate about the extent to which +LLMs are capable of actual reasoning. Central to this debate are two key +probabilistic concepts that are essential for connecting causes to their +effects: the probability of necessity (PN) and the probability of sufficiency +(PS). This paper introduces a framework that is both theoretical and practical, +aimed at assessing how effectively LLMs are able to replicate real-world +reasoning mechanisms using these probabilistic measures. By viewing LLMs as +abstract machines that process information through a natural language +interface, we examine the conditions under which it is possible to compute +suitable approximations of PN and PS. Our research marks an important step +towards gaining a deeper understanding of when LLMs are capable of reasoning, +as illustrated by a series of math examples. + +
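+ For reference, the two quantities are Pearl's standard probabilities of
+causation for a binary cause $X \in \{x, x'\}$ and effect $Y \in \{y, y'\}$:
+$$\mathrm{PN} = P\big(Y_{x'} = y' \mid X = x,\, Y = y\big), \qquad
+\mathrm{PS} = P\big(Y_{x} = y \mid X = x',\, Y = y'\big),$$
+ i.e., PN asks whether the effect would have been absent had the cause been
+absent (given that both in fact occurred), while PS asks whether the cause
+would have produced the effect had it been present (given that both were
+absent).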
+
+
+
+
+ + ☆ Stochastic Semi-Gradient Descent for Learning Mean Field Games with + Population-Aware Function Approximation + + +
+ Mean field games (MFGs) model the interactions within a large-population +multi-agent system using the population distribution. Traditional learning +methods for MFGs are based on fixed-point iteration (FPI), which calculates +best responses and induced population distribution separately and sequentially. +However, FPI-type methods suffer from inefficiency and instability, due to +oscillations caused by the forward-backward procedure. This paper considers an +online learning method for MFGs, where an agent updates its policy and +population estimates simultaneously and fully asynchronously, resulting in a +simple stochastic gradient descent (SGD) type method called SemiSGD. Not only +does SemiSGD exhibit numerical stability and efficiency, but it also provides a +novel perspective by treating the value function and population distribution as +a unified parameter. We theoretically show that SemiSGD directs this unified +parameter along a descent direction to the mean field equilibrium. Motivated by +this perspective, we develop a linear function approximation (LFA) for both the +value function and the population distribution, resulting in the first +population-aware LFA for MFGs on continuous state-action space. Finite-time +convergence and approximation error analysis are provided for SemiSGD equipped +with population-aware LFA. + +
+
+
+
+
+ + ☆ Data-driven identification of latent port-Hamiltonian systems + + +
+ Conventional physics-based modeling techniques involve high effort, e.g., +time and expert knowledge, while data-driven methods often lack +interpretability, structure, and sometimes reliability. To mitigate this, we +present a data-driven system identification framework that derives models in +the port-Hamiltonian (pH) formulation. This formulation is suitable for +multi-physical systems while guaranteeing the useful system theoretical +properties of passivity and stability. Our framework combines linear and +nonlinear reduction with structured, physics-motivated system identification. +In this process, high-dimensional state data obtained from possibly nonlinear +systems serves as input for an autoencoder, which then performs two tasks: (i) +nonlinearly transforming and (ii) reducing this data onto a low-dimensional +latent space. In this space, a linear pH system, that satisfies the pH +properties per construction, is parameterized by the weights of a neural +network. The mathematical requirements are met by defining the pH matrices +through Cholesky factorizations. The neural networks that define the coordinate +transformation and the pH system are identified in a joint optimization process +to match the dynamics observed in the data while defining a linear pH system in +the latent space. The learned, low-dimensional pH system can describe even +nonlinear systems and is rapidly computable due to its small size. The method +is exemplified by a parametric mass-spring-damper and a nonlinear pendulum +example, as well as the high-dimensional model of a disc brake with linear +thermoelastic behavior. + +
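+ A hedged sketch of the structural parameterization described above: a linear
+latent port-Hamiltonian system dx/dt = (J - R) Q x + B u in which J is
+skew-symmetric and R, Q are positive semi-definite by construction, via
+Cholesky-style factors. Names and sizes are illustrative; in the actual
+framework these matrices act on the latent space of a jointly trained
+autoencoder.
+<pre>
+import torch
+import torch.nn as nn
+
+class LatentPHSystem(nn.Module):
+    def __init__(self, n, m=1):
+        super().__init__()
+        self.A_raw = nn.Parameter(0.1 * torch.randn(n, n))  # skew-symmetrized into J
+        self.L_R = nn.Parameter(0.1 * torch.randn(n, n))    # R = L_R L_R^T is PSD
+        self.L_Q = nn.Parameter(0.1 * torch.randn(n, n))    # Q = L_Q L_Q^T is PSD
+        self.B = nn.Parameter(0.1 * torch.randn(n, m))
+
+    def matrices(self):
+        J = self.A_raw - self.A_raw.T        # skew-symmetric by construction
+        R = self.L_R @ self.L_R.T            # symmetric positive semi-definite
+        Q = self.L_Q @ self.L_Q.T
+        return J, R, Q
+
+    def forward(self, x, u):
+        # linear pH dynamics in the latent space: dx/dt = (J - R) Q x + B u
+        J, R, Q = self.matrices()
+        return x @ (Q @ (J - R).T) + u @ self.B.T
+</pre>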
+
+ comment: 33 pages, 8 figures +
+
+
+
+
+ + ☆ Not Every Image is Worth a Thousand Words: Quantifying Originality in + Stable Diffusion ICML 2024 + + +
+ This work addresses the challenge of quantifying originality in text-to-image
+(T2I) generative diffusion models, with a focus on copyright originality. We
+begin by evaluating T2I models' ability to innovate and generalize through
+controlled experiments, revealing that stable diffusion models can effectively
+recreate unseen elements with sufficiently diverse training data. Our key
+insight is that concepts and combinations of image elements that the model is
+familiar with, and saw more often during training, are represented more
+concisely in the model's latent space. We hence propose a method that leverages
+textual inversion to measure the originality of an image based on the number of
+tokens required for its reconstruction by the model. Our approach is inspired
+by legal definitions of originality and aims to assess whether a model can
+produce original content without relying on specific prompts or access to the
+model's training data. We demonstrate our method using both a pre-trained
+stable diffusion model and a synthetic dataset, showing a correlation between
+the number of tokens and image originality. This work contributes to the
+understanding of originality in generative models and has implications for
+copyright infringement cases.
+
+
+ comment: GenLaw ICML 2024 +
+
+
+
+
+ + ☆ Machine learning empowered Modulation detection for OFDM-based signals + + +
+ We propose a blind ML-based modulation detection for OFDM-based technologies. +Unlike previous works that assume an ideal environment with precise knowledge +of subcarrier count and cyclic prefix location, we consider blind modulation +detection while accounting for realistic environmental parameters and +imperfections. Our approach employs a ResNet network to simultaneously detect +the modulation type and accurately locate the cyclic prefix. Specifically, +after eliminating the environmental impact from the signal and accurately +extracting the OFDM symbols, we convert these symbols into scatter plots. Due +to their unique shapes, these scatter plots are then classified using ResNet. +As a result, our proposed modulation classification method can be applied to +any OFDM-based technology without prior knowledge of the transmitted signal. We +evaluate its performance across various modulation schemes and subcarrier +numbers. Simulation results show that our method achieves a modulation +detection accuracy exceeding $80\%$ at an SNR of $10$ dB and $95\%$ at an SNR +of $25$ dB. + +
+
+
+
+
+ + ☆ Towards flexible perception with visual memory + + +
+ Training a neural network is a monolithic endeavor, akin to carving knowledge +into stone: once the process is completed, editing the knowledge in a network +is nearly impossible, since all information is distributed across the network's +weights. We here explore a simple, compelling alternative by marrying the +representational power of deep neural networks with the flexibility of a +database. Decomposing the task of image classification into image similarity +(from a pre-trained embedding) and search (via fast nearest neighbor retrieval +from a knowledge database), we build a simple and flexible visual memory that +has the following key capabilities: (1.) The ability to flexibly add data +across scales: from individual samples all the way to entire classes and +billion-scale data; (2.) The ability to remove data through unlearning and +memory pruning; (3.) An interpretable decision-mechanism on which we can +intervene to control its behavior. Taken together, these capabilities +comprehensively demonstrate the benefits of an explicit visual memory. We hope +that it might contribute to a conversation on how knowledge should be +represented in deep vision models -- beyond carving it in ``stone'' weights. + +
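+ A minimal sketch of the retrieval-based classification idea, assuming the
+embeddings come from some frozen pre-trained encoder (not shown): knowledge is
+added or removed by editing the database, and predictions are an interpretable
+nearest-neighbour vote. A brute-force index, rebuilt per query, stands in for
+the fast billion-scale search the paper targets; all names are illustrative.
+<pre>
+import numpy as np
+from sklearn.neighbors import NearestNeighbors
+
+class VisualMemory:
+    def __init__(self, k=5):
+        self.k = k
+        self.embeddings, self.labels = [], []
+
+    def add(self, embedding, label):            # add knowledge: append a row
+        self.embeddings.append(embedding)
+        self.labels.append(label)
+
+    def remove(self, indices):                  # "unlearning": delete rows
+        keep = [i for i in range(len(self.labels)) if i not in set(indices)]
+        self.embeddings = [self.embeddings[i] for i in keep]
+        self.labels = [self.labels[i] for i in keep]
+
+    def predict(self, embedding):
+        nn_index = NearestNeighbors(n_neighbors=min(self.k, len(self.labels)))
+        nn_index.fit(np.stack(self.embeddings))
+        _, idx = nn_index.kneighbors(embedding[None, :])
+        votes = [self.labels[i] for i in idx[0]]
+        return max(set(votes), key=votes.count)  # interpretable majority vote
+</pre>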
+
+
+
+
+ + ☆ DeepSeek-Prover-V1.5: Harnessing Proof Assistant Feedback for + Reinforcement Learning and Monte-Carlo Tree Search + + +
+ We introduce DeepSeek-Prover-V1.5, an open-source language model designed for +theorem proving in Lean 4, which enhances DeepSeek-Prover-V1 by optimizing both +training and inference processes. Pre-trained on DeepSeekMath-Base with +specialization in formal mathematical languages, the model undergoes supervised +fine-tuning using an enhanced formal theorem proving dataset derived from +DeepSeek-Prover-V1. Further refinement is achieved through reinforcement +learning from proof assistant feedback (RLPAF). Beyond the single-pass +whole-proof generation approach of DeepSeek-Prover-V1, we propose RMaxTS, a +variant of Monte-Carlo tree search that employs an intrinsic-reward-driven +exploration strategy to generate diverse proof paths. DeepSeek-Prover-V1.5 +demonstrates significant improvements over DeepSeek-Prover-V1, achieving new +state-of-the-art results on the test set of the high school level miniF2F +benchmark ($63.5\%$) and the undergraduate level ProofNet benchmark ($25.3\%$). + +
+
+
+
+
+ + ☆ P/D-Serve: Serving Disaggregated Large Language Model at Scale + + +
+ Serving disaggregated large language models (LLMs) over tens of thousands of +xPU devices (GPUs or NPUs) with reliable performance faces multiple challenges. +1) Ignoring the diversity (various prefixes and tidal requests), treating all +the prompts in a mixed pool is inadequate. To facilitate the similarity per +scenario and minimize the inner mismatch on P/D (prefill and decoding) +processing, fine-grained organization is required, dynamically adjusting P/D +ratios for better performance. 2) Due to inaccurate estimation on workload +(queue status or maintained connections), the global scheduler easily incurs +unnecessary timeouts in prefill. 3) Block-fixed device-to-device (D2D) KVCache +transfer over cluster-level RDMA (remote direct memory access) fails to achieve +desired D2D utilization as expected. To overcome previous problems, this paper +proposes an end-to-end system P/D-Serve, complying with the paradigm of MLOps +(machine learning operations), which models end-to-end (E2E) P/D performance +and enables: 1) fine-grained P/D organization, mapping the service with RoCE +(RDMA over converged ethernet) as needed, to facilitate similar processing and +dynamic adjustments on P/D ratios; 2) on-demand forwarding upon rejections for +idle prefill, decoupling the scheduler from regular inaccurate reports and +local queues, to avoid timeouts in prefill; and 3) efficient KVCache transfer +via optimized D2D access. P/D-Serve is implemented upon Ascend and MindSpore, +has been deployed over tens of thousands of NPUs for more than eight months in +commercial use, and further achieves 60\%, 42\% and 46\% improvements on E2E +throughput, time-to-first-token (TTFT) SLO (service level objective) and D2D +transfer time. As the E2E system with optimizations, P/D-Serve achieves 6.7x +increase on throughput, compared with aggregated LLMs. + +
+
+
+
+
+ + ☆ Impact of Comprehensive Data Preprocessing on Predictive Modelling of + COVID-19 Mortality + + +
+ Accurate predictive models are crucial for analysing COVID-19 mortality +trends. This study evaluates the impact of a custom data preprocessing pipeline +on ten machine learning models predicting COVID-19 mortality using data from +Our World in Data (OWID). Our pipeline differs from a standard preprocessing +pipeline through four key steps. Firstly, it transforms weekly reported totals +into daily updates, correcting reporting biases and providing more accurate +estimates. Secondly, it uses localised outlier detection and processing to +preserve data variance and enhance accuracy. Thirdly, it utilises computational +dependencies among columns to ensure data consistency. Finally, it incorporates +an iterative feature selection process to optimise the feature set and improve +model performance. Results show a significant improvement with the custom +pipeline: the MLP Regressor achieved a test RMSE of 66.556 and a test R-squared +of 0.991, surpassing the DecisionTree Regressor from the standard pipeline, +which had a test RMSE of 222.858 and a test R-squared of 0.817. These findings +highlight the importance of tailored preprocessing techniques in enhancing +predictive modelling accuracy for COVID-19 mortality. Although specific to this +study, these methodologies offer valuable insights into diverse datasets and +domains, improving predictive performance across various contexts. + +
+
+ comment: 8 pages, 5 figures, 2 tables +
+
+
+
+
+ + ☆ Normalized AOPC: Fixing Misleading Faithfulness Metrics for Feature + Attribution Explainability + + +
+ Deep neural network predictions are notoriously difficult to interpret. +Feature attribution methods aim to explain these predictions by identifying the +contribution of each input feature. Faithfulness, often evaluated using the +area over the perturbation curve (AOPC), reflects feature attributions' +accuracy in describing the internal mechanisms of deep neural networks. +However, many studies rely on AOPC to compare faithfulness across different +models, which we show can lead to false conclusions about models' faithfulness. +Specifically, we find that AOPC is sensitive to variations in the model, +resulting in unreliable cross-model comparisons. Moreover, AOPC scores are +difficult to interpret in isolation without knowing the model-specific lower +and upper limits. To address these issues, we propose a normalization approach, +Normalized AOPC (NAOPC), enabling consistent cross-model evaluations and more +meaningful interpretation of individual scores. Our experiments demonstrate +that this normalization can radically change AOPC results, questioning the +conclusions of earlier studies and offering a more robust framework for +assessing feature attribution faithfulness. + +
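+ One natural reading of the proposed normalization, in schematic form (the
+paper defines the exact estimators of the model-specific lower and upper AOPC
+limits): rescale each raw AOPC score by its own model's bounds so that scores
+become comparable across models.
+<pre>
+def normalized_aopc(aopc, aopc_lower, aopc_upper):
+    """Min-max rescaling of a raw AOPC score by model-specific bounds."""
+    return (aopc - aopc_lower) / (aopc_upper - aopc_lower)
+</pre>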
+
+
+
+
+ + ☆ EXPLAIN, AGREE, LEARN: Scaling Learning for Neural Probabilistic Logic + + +
+ Neural probabilistic logic systems follow the neuro-symbolic (NeSy) paradigm +by combining the perceptive and learning capabilities of neural networks with +the robustness of probabilistic logic. Learning corresponds to likelihood +optimization of the neural networks. However, to obtain the likelihood exactly, +expensive probabilistic logic inference is required. To scale learning to more +complex systems, we therefore propose to instead optimize a sampling based +objective. We prove that the objective has a bounded error with respect to the +likelihood, which vanishes when increasing the sample count. Furthermore, the +error vanishes faster by exploiting a new concept of sample diversity. We then +develop the EXPLAIN, AGREE, LEARN (EXAL) method that uses this objective. +EXPLAIN samples explanations for the data. AGREE reweighs each explanation in +concordance with the neural component. LEARN uses the reweighed explanations as +a signal for learning. In contrast to previous NeSy methods, EXAL can scale to +larger problem sizes while retaining theoretical guarantees on the error. +Experimentally, our theoretical claims are verified and EXAL outperforms recent +NeSy methods when scaling up the MNIST addition and Warcraft pathfinding +problems. + +
+
+
+
+
+ + ☆ The Unreasonable Effectiveness of Solving Inverse Problems with Neural + Networks + + +
+ Finding model parameters from data is an essential task in science and
+engineering, from weather and climate forecasts to plasma control. Previous
+works have employed neural networks to greatly accelerate finding solutions to
+inverse problems. Of particular interest are end-to-end models which utilize
+differentiable simulations in order to backpropagate feedback from the
+simulated process to the network weights and enable roll-out of multiple time
+steps. So far, it has been assumed that, while model inference is faster than
+classical optimization, this comes at the cost of a decrease in solution
+accuracy. We show that this is generally not true. In fact, neural networks
+trained to learn solutions to inverse problems can find better solutions than
+classical optimizers even on their training set. To demonstrate this, we
+perform both a theoretical analysis and an extensive empirical evaluation
+on challenging problems involving local minima, chaos, and zero-gradient
+regions. Our findings suggest an alternative use for neural networks: rather
+than generalizing to new data for fast inference, they can also be used to find
+better solutions on known data.
+
+
+ comment: Source code to follow soon: https://ge.in.tum.de +
+
+
+
+
+ + ☆ Learned denoising with simulated and experimental low-dose CT data + + +
+ Like in many other research fields, recent developments in computational +imaging have focused on developing machine learning (ML) approaches to tackle +its main challenges. To improve the performance of computational imaging +algorithms, machine learning methods are used for image processing tasks such +as noise reduction. Generally, these ML methods heavily rely on the +availability of high-quality data on which they are trained. This work explores +the application of ML methods, specifically convolutional neural networks +(CNNs), in the context of noise reduction for computed tomography (CT) imaging. +We utilize a large 2D computed tomography dataset for machine learning to carry +out for the first time a comprehensive study on the differences between the +observed performances of algorithms trained on simulated noisy data and on +real-world experimental noisy data. The study compares the performance of two +common CNN architectures, U-Net and MSD-Net, that are trained and evaluated on +both simulated and experimental noisy data. The results show that while +sinogram denoising performed better with simulated noisy data if evaluated in +the sinogram domain, the performance did not carry over to the reconstruction +domain where training on experimental noisy data shows a higher performance in +denoising experimental noisy data. Training the algorithms in an end-to-end +fashion from sinogram to reconstruction significantly improved model +performance, emphasizing the importance of matching raw measurement data to +high-quality CT reconstructions. The study furthermore suggests the need for +more sophisticated noise simulation approaches to bridge the gap between +simulated and real-world data in CT image denoising applications and gives +insights into the challenges and opportunities in leveraging simulated data for +machine learning in computational imaging. + +
+
+
+
+
+ + ☆ Hearing Your Blood Sugar: Non-Invasive Glucose Measurement Through + Simple Vocal Signals, Transforming any Speech into a Sensor with Machine + Learning + + +
+ Effective diabetes management relies heavily on the continuous monitoring of +blood glucose levels, traditionally achieved through invasive and uncomfortable +methods. While various non-invasive techniques have been explored, such as +optical, microwave, and electrochemical approaches, none have effectively +supplanted these invasive technologies due to issues related to complexity, +accuracy, and cost. In this study, we present a transformative and +straightforward method that utilizes voice analysis to predict blood glucose +levels. Our research investigates the relationship between fluctuations in +blood glucose and vocal characteristics, highlighting the influence of blood +vessel dynamics during voice production. By applying advanced machine learning +algorithms, we analyzed vocal signal variations and established a significant +correlation with blood glucose levels. We developed a predictive model using +artificial intelligence, based on voice recordings and corresponding glucose +measurements from participants, utilizing logistic regression and Ridge +regularization. Our findings indicate that voice analysis may serve as a viable +non-invasive alternative for glucose monitoring. This innovative approach not +only has the potential to streamline and reduce the costs associated with +diabetes management but also aims to enhance the quality of life for +individuals living with diabetes by providing a painless and user-friendly +method for monitoring blood sugar levels. + +
+
+ comment: 5 figure and 5 tables. This manuscript is a pre-print to be submitted + to a journal or/and a conference. arXiv admin note: substantial text overlap + with arXiv:2402.13812 +
+
+
+
+
+ + ☆ Adaptation of uncertainty-penalized Bayesian information criterion for + parametric partial differential equation discovery + + +
+ Data-driven discovery of partial differential equations (PDEs) has emerged as +a promising approach for deriving governing physics when domain knowledge about +observed data is limited. Despite recent progress, the identification of +governing equations and their parametric dependencies using conventional +information criteria remains challenging in noisy situations, as the criteria +tend to select overly complex PDEs. In this paper, we introduce an extension of +the uncertainty-penalized Bayesian information criterion (UBIC), which is +adapted to solve parametric PDE discovery problems efficiently without +requiring computationally expensive PDE simulations. This extended UBIC uses +quantified PDE uncertainty over different temporal or spatial points to prevent +overfitting in model selection. The UBIC is computed with data transformation +based on power spectral densities to discover the governing parametric PDE that +truly captures qualitative features in frequency space with a few significant +terms and their parametric dependencies (i.e., the varying PDE coefficients), +evaluated with confidence intervals. Numerical experiments on canonical PDEs +demonstrate that our extended UBIC can identify the true number of terms and +their varying coefficients accurately, even in the presence of noise. The code +is available at +\url{https://github.com/Pongpisit-Thanasutives/parametric-discovery}. + +
+
+ comment: 17 pages, 10 figures +
+
+
+
+
+ + ☆ An Efficient Replay for Class-Incremental Learning with Pre-trained + Models + + +
+ In general class-incremental learning, researchers typically use sample sets
+as a tool to avoid catastrophic forgetting during continuous learning. At the
+same time, researchers have also noted the differences between
+class-incremental learning and Oracle training and have attempted to make
+corrections. In recent years, researchers have begun to develop
+class-incremental learning algorithms utilizing pre-trained models, achieving
+significant results. This paper observes that in class-incremental learning,
+the steady state among the weights guided by each class center is disrupted,
+which is significantly correlated with catastrophic forgetting. Based on this,
+we propose a new method to overcome forgetting. In some cases, by retaining
+only a single sample unit of each class in memory for replay and applying
+simple gradient constraints, very good results can be achieved. Experimental
+results indicate that under the condition of pre-trained models, our method can
+achieve competitive performance with very low computational cost and by simply
+using the cross-entropy loss.
+
+
+
+
+
+ + ☆ Independent Policy Mirror Descent for Markov Potential Games: Scaling to + Large Number of Players + + +
+ Markov Potential Games (MPGs) form an important sub-class of Markov games, +which are a common framework to model multi-agent reinforcement learning +problems. In particular, MPGs include as a special case the identical-interest +setting where all the agents share the same reward function. Scaling the +performance of Nash equilibrium learning algorithms to a large number of agents +is crucial for multi-agent systems. To address this important challenge, we +focus on the independent learning setting where agents can only have access to +their local information to update their own policy. In prior work on MPGs, the +iteration complexity for obtaining $\epsilon$-Nash regret scales linearly with +the number of agents $N$. In this work, we investigate the iteration complexity +of an independent policy mirror descent (PMD) algorithm for MPGs. We show that +PMD with KL regularization, also known as natural policy gradient, enjoys a +better $\sqrt{N}$ dependence on the number of agents, improving over PMD with +Euclidean regularization and prior work. Furthermore, the iteration complexity +is also independent of the sizes of the agents' action spaces. + +
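+ For context, a schematic form of the KL-regularized PMD (natural policy
+gradient) update referred to above, run independently by each agent $i$ on its
+own observations (our notation, not necessarily the paper's):
+$$\pi_i^{t+1}(a \mid s) \;\propto\; \pi_i^{t}(a \mid s)\,
+\exp\!\big(\eta\, Q_i^{\pi^t}(s, a)\big),$$
+ where $\eta$ is the step size and $Q_i^{\pi^t}$ is agent $i$'s action-value
+estimate under the current joint policy.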
+
+ comment: 16 pages, CDC 2024 +
+
+
+
+
+ + ☆ A Survey on Integrated Sensing, Communication, and Computation + + +
+ The forthcoming generation of wireless technology, 6G, promises a +revolutionary leap beyond traditional data-centric services. It aims to usher +in an era of ubiquitous intelligent services, where everything is +interconnected and intelligent. This vision requires the seamless integration +of three fundamental modules: Sensing for information acquisition, +communication for information sharing, and computation for information +processing and decision-making. These modules are intricately linked, +especially in complex tasks such as edge learning and inference. However, the +performance of these modules is interdependent, creating a resource competition +for time, energy, and bandwidth. Existing techniques like integrated +communication and computation (ICC), integrated sensing and computation (ISC), +and integrated sensing and communication (ISAC) have made partial strides in +addressing this challenge, but they fall short of meeting the extreme +performance requirements. To overcome these limitations, it is essential to +develop new techniques that comprehensively integrate sensing, communication, +and computation. This integrated approach, known as Integrated Sensing, +Communication, and Computation (ISCC), offers a systematic perspective for +enhancing task performance. This paper begins with a comprehensive survey of +historic and related techniques such as ICC, ISC, and ISAC, highlighting their +strengths and limitations. It then explores the state-of-the-art signal designs +for ISCC, along with network resource management strategies specifically +tailored for ISCC. Furthermore, this paper discusses the exciting research +opportunities that lie ahead for implementing ISCC in future advanced networks. +By embracing ISCC, we can unlock the full potential of intelligent +connectivity, paving the way for groundbreaking applications and services. + +
+
+
+
+
+ + ☆ Extracting Sentence Embeddings from Pretrained Transformer Models + + +
+ Background/introduction: Pre-trained transformer models shine in many natural
+language processing tasks and therefore are expected to bear the representation
+of the input sentence or text meaning. These sentence-level embeddings are also
+important in retrieval-augmented generation. But do commonly used plain
+averaging or prompt templates surface it enough?
+ Methods: Given the 110M-parameter BERT's hidden representations from multiple
+layers and multiple tokens, we tried various ways to extract optimal sentence
+representations. We tested various token aggregation and representation
+post-processing techniques. We also tested multiple ways of using a general
+Wikitext dataset to complement BERT's sentence representations. All methods were
+tested on 8 Semantic Textual Similarity (STS), 6 short text clustering, and 12
+classification tasks. We also evaluated our representation-shaping techniques
+on other static models, including random token representations.
+ Results: The proposed representation extraction methods improved the
+performance on STS and clustering tasks for all models considered. The
+improvements are especially large for static token-based models; notably,
+random embeddings on STS tasks almost reach the performance of BERT-derived
+representations.
+ Conclusions: Our work shows that, for multiple tasks, simple baselines with
+representation-shaping techniques reach or even outperform more complex
+BERT-based models, or can contribute to their performance.
+
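+ A hedged sketch of one simple baseline in this space: mean-pooling token
+representations from a chosen BERT hidden layer into a sentence embedding. The
+model name and layer choice are illustrative, and the paper's additional
+aggregation and post-processing techniques are not shown.
+<pre>
+import torch
+from transformers import AutoModel, AutoTokenizer
+
+tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
+model = AutoModel.from_pretrained("bert-base-uncased", output_hidden_states=True)
+
+def sentence_embedding(text, layer=-1):
+    inputs = tokenizer(text, return_tensors="pt")
+    with torch.no_grad():
+        hidden_states = model(**inputs).hidden_states   # embedding layer + 12 layers
+    tokens = hidden_states[layer][0]                     # [seq_len, hidden_dim]
+    mask = inputs["attention_mask"][0].unsqueeze(-1)     # ignore padding tokens
+    return (tokens * mask).sum(dim=0) / mask.sum()
+</pre>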
+
+
+
+
+ + ☆ Universality of Real Minimal Complexity Reservoir + + +
+ Reservoir Computing (RC) models, a subclass of recurrent neural networks, are +distinguished by their fixed, non-trainable input layer and dynamically coupled +reservoir, with only the static readout layer being trained. This design +circumvents the issues associated with backpropagating error signals through +time, thereby enhancing both stability and training efficiency. RC models have +been successfully applied across a broad range of application domains. +Crucially, they have been demonstrated to be universal approximators of +time-invariant dynamic filters with fading memory, under various settings of +approximation norms and input driving sources. + Simple Cycle Reservoirs (SCR) represent a specialized class of RC models with +a highly constrained reservoir architecture, characterized by uniform ring +connectivity and binary input-to-reservoir weights with an aperiodic sign +pattern. For linear reservoirs, given the reservoir size, the reservoir +construction has only one degree of freedom -- the reservoir cycle weight. Such +architectures are particularly amenable to hardware implementations without +significant performance degradation in many practical tasks. In this study we +endow these observations with solid theoretical foundations by proving that +SCRs operating in real domain are universal approximators of time-invariant +dynamic filters with fading memory. Our results supplement recent research +showing that SCRs in the complex domain can approximate, to arbitrary +precision, any unrestricted linear reservoir with a non-linear readout. We +furthermore introduce a novel method to drastically reduce the number of SCR +units, making such highly constrained architectures natural candidates for +low-complexity hardware implementations. Our findings are supported by +empirical studies on real-world time series datasets. + +
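The highly constrained construction described above is easy to prototype; the numpy sketch below builds a ring (cycle) reservoir with a single cycle weight, binary +/- input weights with an aperiodic sign pattern, and a static linear readout fitted by ridge regression. The reservoir size, sign-generation rule, and ridge penalty are illustrative assumptions, not the paper's exact construction.

import numpy as np

N, r = 100, 0.9                          # reservoir size and cycle weight
W = np.roll(np.eye(N), 1, axis=0) * r    # ring (cycle) reservoir matrix
# Aperiodic sign pattern derived from an irrational rotation (an assumption).
signs = np.sign(np.sin(np.pi * np.sqrt(2) * np.arange(1, N + 1)))
w_in = 0.5 * signs                       # binary-valued input weights

def run_reservoir(u):
    x, states = np.zeros(N), []
    for u_t in u:
        x = np.tanh(W @ x + w_in * u_t)
        states.append(x.copy())
    return np.array(states)

# Train only the readout (ridge regression) for one-step-ahead prediction.
u = np.sin(0.2 * np.arange(500))
X = run_reservoir(u[:-1])
y = u[1:]
W_out = np.linalg.solve(X.T @ X + 1e-6 * np.eye(N), X.T @ y)
print("train MSE:", np.mean((X @ W_out - y) ** 2))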
+
+ comment: 19 pages, 5 figures +
+
+
+
+
+ + ☆ BINDy -- Bayesian identification of nonlinear dynamics with + reversible-jump Markov-chain Monte-Carlo + + +
+ Model parsimony is an important \emph{cognitive bias} in data-driven +modelling that aids interpretability and helps to prevent over-fitting. Sparse +identification of nonlinear dynamics (SINDy) methods are able to learn sparse +representations of complex dynamics directly from data, given a basis of +library functions. In this work, a novel Bayesian treatment of dictionary +learning system identification, as an alternative to SINDy, is envisaged. The +proposed method -- Bayesian identification of nonlinear dynamics (BINDy) -- is +distinct from previous approaches in that it targets the full joint posterior +distribution over both the terms in the library and their parameterisation in +the model. This formulation confers the advantage that an arbitrary prior may +be placed over the model structure to produce models that are sparse in the +model space rather than in parameter space. Because this posterior is defined +over parameter vectors that can change in dimension, the inference cannot be +performed by standard techniques. Instead, a Gibbs sampler based on +reversible-jump Markov-chain Monte-Carlo is proposed. BINDy is shown to compare +favourably to ensemble SINDy in three benchmark case-studies. In particular, it +is seen that the proposed method is better able to assign high probability to +correct model terms. + +
+
+
+
+
+ + ☆ Maximally Permissive Reward Machines ECAI + + +
+ Reward machines allow the definition of rewards for temporally extended tasks +and behaviors. Specifying "informative" reward machines can be challenging. One +way to address this is to generate reward machines from a high-level abstract +description of the learning environment, using techniques such as AI planning. +However, previous planning-based approaches generate a reward machine based on +a single (sequential or partial-order) plan, and do not allow maximum +flexibility to the learning agent. In this paper we propose a new approach to +synthesising reward machines which is based on the set of partial order plans +for a goal. We prove that learning using such "maximally permissive" reward +machines results in higher rewards than learning using RMs based on a single +plan. We present experimental results which support our theoretical claims by +showing that our approach obtains higher rewards than the single-plan approach +in practice. + +
+
+ comment: Paper accepted for publication at the European Conference on + Artificial Intelligence (ECAI) 2024 +
+
+
+
+
+ + ☆ Navigating Data Scarcity using Foundation Models: A Benchmark of + Few-Shot and Zero-Shot Learning Approaches in Medical Imaging MICCAI 2024 + + +
+ Data scarcity is a major limiting factor for applying modern machine learning +techniques to clinical tasks. Although sufficient data exists for some +well-studied medical tasks, there remains a long tail of clinically relevant +tasks with poor data availability. Recently, numerous foundation models have +demonstrated high suitability for few-shot learning (FSL) and zero-shot +learning (ZSL), potentially making them more accessible to practitioners. +However, it remains unclear which foundation model performs best on FSL medical +image analysis tasks and what the optimal methods are for learning from limited +data. We conducted a comprehensive benchmark study of ZSL and FSL using 16 +pretrained foundation models on 19 diverse medical imaging datasets. Our +results indicate that BiomedCLIP, a model pretrained exclusively on medical +data, performs best on average for very small training set sizes, while very +large CLIP models pretrained on LAION-2B perform best with slightly more +training samples. However, simply fine-tuning a ResNet-18 pretrained on +ImageNet performs similarly with more than five training examples per class. +Our findings also highlight the need for further research on foundation models +specifically tailored for medical applications and the collection of more +datasets to train these models. + +
+
+ comment: Accepted as an oral presentation in MICCAI 2024 2nd International + Workshop on Foundation Models for General Medical AI +
+
+
+
+
+ + ☆ DATTA: Towards Diversity Adaptive Test-Time Adaptation in Dynamic Wild + World + + +
+ Test-time adaptation (TTA) effectively addresses distribution shifts between training and testing data by adjusting models on test samples, which is crucial for improving model inference in real-world applications. However, traditional TTA methods typically follow a fixed pattern when addressing dynamic data patterns (low-diversity or high-diversity patterns), often leading to performance degradation and consequently a decline in Quality of Experience (QoE). The primary issues we observed are that (i) different scenarios require different normalization methods (e.g., Instance Normalization is optimal in mixed domains but not in static domains), and (ii) model fine-tuning can potentially harm the model and waste time. Hence, it is crucial to design strategies for effectively measuring and managing distribution diversity to minimize its negative impact on model performance. Based on these observations, this paper proposes a new general method, named Diversity Adaptive Test-Time Adaptation (DATTA), aimed at improving QoE. DATTA dynamically selects the best batch normalization methods and fine-tuning strategies by leveraging the Diversity Score to differentiate between high- and low-diversity batches. It features three key components: Diversity Discrimination (DD) to assess batch diversity, Diversity Adaptive Batch Normalization (DABN) to tailor normalization methods based on DD insights, and Diversity Adaptive Fine-Tuning (DAFT) to selectively fine-tune the model. Experimental results show that our method achieves up to a 21% increase in accuracy compared to state-of-the-art methodologies, indicating that our method maintains good model performance while demonstrating its robustness. Our code will be released soon.
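The diversity-driven switching idea can be sketched very roughly as follows (an illustration of the concept, not DATTA's actual implementation): score a test batch's diversity from the spread of per-sample feature statistics and fall back to instance-level normalization only for high-diversity batches; the score, threshold, and statistics used here are assumptions.

import torch

def diversity_score(feats):
    # feats: (B, C, H, W). The spread of per-sample channel means across the
    # batch is used as a crude proxy for domain diversity within the batch.
    per_sample = feats.mean(dim=(2, 3))          # (B, C)
    return per_sample.std(dim=0).mean().item()

def adaptive_normalize(feats, source_mean, source_var, threshold=0.5, eps=1e-5):
    if diversity_score(feats) > threshold:
        # High-diversity batch: fall back to instance-level statistics.
        mean = feats.mean(dim=(2, 3), keepdim=True)
        var = feats.var(dim=(2, 3), keepdim=True)
    else:
        # Low-diversity batch: reuse the source (training-time) statistics.
        mean = source_mean.view(1, -1, 1, 1)
        var = source_var.view(1, -1, 1, 1)
    return (feats - mean) / torch.sqrt(var + eps)

feats = torch.randn(8, 64, 16, 16)
out = adaptive_normalize(feats, torch.zeros(64), torch.ones(64))
print(out.shape)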
+
+ comment: 16 pages, 2 figures +
+
+
+
+
+ + ☆ COTODE: COntinuous Trajectory neural Ordinary Differential Equations for + modelling event sequences + + +
+ Observation of the underlying actors that generate event sequences reveals that they often evolve continuously. Most modern methods, however, tend to model such processes through at most piecewise-continuous trajectories. To address this, we adopt a way of viewing events not as standalone phenomena but instead as observations of a Gaussian Process, which in turn governs the actor's dynamics. We propose integrating these obtained dynamics, resulting in a continuous-trajectory modification of the widely successful Neural ODE model. Through Gaussian Process theory, we were able to evaluate the uncertainty in an actor's representation, which arises from not observing the actor between events. This estimate led us to develop a novel, theoretically backed negative feedback mechanism. Empirical studies indicate that our model with Gaussian process interpolation and negative feedback achieves state-of-the-art performance, with improvements of up to 20% in AUROC over similar architectures.
+
+
+
+
+ + ☆ An Efficient Continuous Control Perspective for + Reinforcement-Learning-based Sequential Recommendation + + +
+ Sequential recommendation, where user preference is dynamically inferred from sequential historical behaviors, is a critical task in recommender systems (RSs). To further optimize long-term user engagement, offline reinforcement-learning-based RSs have become a mainstream technique as they provide an additional advantage in avoiding global explorations that may harm online users' experiences. However, previous studies mainly focus on discrete action and policy spaces, which may have difficulty handling rapidly growing item sets efficiently. To mitigate this issue, in this paper, we aim to design an algorithmic framework applicable to continuous policies. To facilitate control in the low-dimensional but dense user preference space, we propose an \underline{\textbf{E}}fficient \underline{\textbf{Co}}ntinuous \underline{\textbf{C}}ontrol framework (ECoC). Based on a statistically tested assumption, we first propose a novel unified action representation abstracted from normalized user and item spaces. Then, we develop the corresponding policy evaluation and policy improvement procedures. During this process, strategic exploration and directional control in terms of unified actions are carefully designed and crucial to the final recommendation decisions. Moreover, benefiting from unified actions, conservatism regularization for policies and value functions is combined and is fully compatible with the continuous framework. The resulting dual regularization ensures the successful offline training of RL-based recommendation policies. Finally, we conduct extensive experiments to validate the effectiveness of our framework. The results show that, compared to the discrete baselines, our ECoC is trained far more efficiently. Meanwhile, the final policies outperform baselines in both capturing the offline data and gaining long-term rewards.
+
+
+
+
+ + ☆ The Clever Hans Effect in Unsupervised Learning + + +
+ Unsupervised learning has become an essential building block of AI systems. +The representations it produces, e.g. in foundation models, are critical to a +wide variety of downstream applications. It is therefore important to carefully +examine unsupervised models to ensure not only that they produce accurate +predictions, but also that these predictions are not "right for the wrong +reasons", the so-called Clever Hans (CH) effect. Using specially developed +Explainable AI techniques, we show for the first time that CH effects are +widespread in unsupervised learning. Our empirical findings are enriched by +theoretical insights, which interestingly point to inductive biases in the +unsupervised learning machine as a primary source of CH effects. Overall, our +work sheds light on unexplored risks associated with practical applications of +unsupervised learning and suggests ways to make unsupervised learning more +robust. + +
+
+ comment: 12 pages + supplement +
+
+
+
+
+ + ☆ Adaptive User Journeys in Pharma E-Commerce with Reinforcement Learning: + Insights from SwipeRx KDD 2024 + + +
+ This paper introduces a reinforcement learning (RL) platform that enhances +end-to-end user journeys in healthcare digital tools through personalization. +We explore a case study with SwipeRx, the most popular all-in-one app for +pharmacists in Southeast Asia, demonstrating how the platform can be used to +personalize and adapt user experiences. Our RL framework is tested through a +series of experiments with product recommendations tailored to each pharmacy +based on real-time information on their purchasing history and in-app +engagement, showing a significant increase in basket size. By integrating +adaptive interventions into existing mobile health solutions and enriching user +journeys, our platform offers a scalable solution to improve pharmaceutical +supply chain management, health worker capacity building, and clinical decision +and patient care, ultimately contributing to better healthcare outcomes. + +
+
+ comment: Presented at the Third Workshop on End-to-End Customer Journey + Optimization at KDD 2024 (KDD CJ Workshop '24), August 26, Barcelona, Spain +
+
+
+
+
+ + ☆ Causal Discovery from Time-Series Data with Short-Term Invariance-Based + Convolutional Neural Networks + + +
+ Causal discovery from time-series data aims to capture both intra-slice +(contemporaneous) and inter-slice (time-lagged) causality between variables +within the temporal chain, which is crucial for various scientific disciplines. +Compared to causal discovery from non-time-series data, causal discovery from +time-series data necessitates more serialized samples with a larger amount of +observed time steps. To address the challenges, we propose a novel +gradient-based causal discovery approach STIC, which focuses on +\textbf{S}hort-\textbf{T}erm \textbf{I}nvariance using \textbf{C}onvolutional +neural networks to uncover the causal relationships from time-series data. +Specifically, STIC leverages both the short-term time and mechanism invariance +of causality within each window observation, which possesses the property of +independence, to enhance sample efficiency. Furthermore, we construct two +causal convolution kernels, which correspond to the short-term time and +mechanism invariance respectively, to estimate the window causal graph. To +demonstrate the necessity of convolutional neural networks for causal discovery +from time-series data, we theoretically derive the equivalence between +convolution and the underlying generative principle of time-series data under +the assumption that the additive noise model is identifiable. Experimental +evaluations conducted on both synthetic and FMRI benchmark datasets demonstrate +that our STIC outperforms baselines significantly and achieves the +state-of-the-art performance, particularly when the datasets contain a limited +number of observed time steps. Code is available at +\url{https://github.com/HITshenrj/STIC}. + +
+
+
+
+
+ + ☆ Accelerating High-Fidelity Waveform Generation via Adversarial Flow + Matching Optimization + + +
+ This paper introduces PeriodWave-Turbo, a high-fidelity and highly efficient waveform generation model based on adversarial flow matching optimization. Recently, conditional flow matching (CFM) generative models have been successfully adopted for waveform generation tasks, leveraging a single vector field estimation objective for training. Although these models can generate high-fidelity waveform signals, they require significantly more ODE steps compared to GAN-based models, which only need a single generation step. Additionally, the generated samples often lack high-frequency information due to noisy vector field estimation, which fails to ensure high-frequency reproduction. To address this limitation, we enhance pre-trained CFM-based generative models by incorporating a fixed-step generator modification. We utilize reconstruction losses and adversarial feedback to accelerate high-fidelity waveform generation. With adversarial flow matching optimization, only 1,000 fine-tuning steps are required to achieve state-of-the-art performance across various objective metrics. Moreover, we significantly reduce the number of inference steps from 16 to 2 or 4. Additionally, by scaling up the backbone of PeriodWave from 29M to 70M parameters for improved generalization, PeriodWave-Turbo achieves unprecedented performance, with a perceptual evaluation of speech quality (PESQ) score of 4.454 on the LibriTTS dataset. Audio samples, source code and checkpoints will be available at https://github.com/sh-lee-prml/PeriodWave.
+
+ comment: 9 pages, 9 tables, 1 figure, +
+
+
+
+
+ + ☆ Asteroid: Resource-Efficient Hybrid Pipeline Parallelism for + Collaborative DNN Training on Heterogeneous Edge Devices + + +
+ On-device Deep Neural Network (DNN) training has been recognized as crucial +for privacy-preserving machine learning at the edge. However, the intensive +training workload and limited onboard computing resources pose significant +challenges to the availability and efficiency of model training. While existing +works address these challenges through native resource management optimization, +we instead leverage our observation that edge environments usually comprise a +rich set of accompanying trusted edge devices with idle resources beyond a +single terminal. We propose Asteroid, a distributed edge training system that +breaks the resource walls across heterogeneous edge devices for efficient model +training acceleration. Asteroid adopts a hybrid pipeline parallelism to +orchestrate distributed training, along with a judicious parallelism planning +for maximizing throughput under certain resource constraints. Furthermore, a +fault-tolerant yet lightweight pipeline replay mechanism is developed to tame +the device-level dynamics for training robustness and performance stability. We +implement Asteroid on heterogeneous edge devices with both vision and language +models, demonstrating up to 12.2x faster training than conventional parallelism +methods and 2.1x faster than state-of-the-art hybrid parallelism methods +through evaluations. Furthermore, Asteroid can recover training pipeline 14x +faster than baseline methods while preserving comparable throughput despite +unexpected device exiting and failure. + +
+
+ comment: Accepted by The 30th Annual International Conference on Mobile + Computing and Networking (MobiCom'24) +
+
+
+
+
+ + ☆ Hessian QM9: A quantum chemistry database of molecular Hessians in + implicit solvents + + +
+ A significant challenge in computational chemistry is developing +approximations that accelerate \emph{ab initio} methods while preserving +accuracy. Machine learning interatomic potentials (MLIPs) have emerged as a +promising solution for constructing atomistic potentials that can be +transferred across different molecular and crystalline systems. Most MLIPs are +trained only on energies and forces in vacuum, while an improved description of +the potential energy surface could be achieved by including the curvature of +the potential energy surface. We present Hessian QM9, the first database of +equilibrium configurations and numerical Hessian matrices, consisting of 41,645 +molecules from the QM9 dataset at the $\omega$B97x/6-31G* level. Molecular +Hessians were calculated in vacuum, as well as water, tetrahydrofuran, and +toluene using an implicit solvation model. To demonstrate the utility of this +dataset, we show that incorporating second derivatives of the potential energy +surface into the loss function of a MLIP significantly improves the prediction +of vibrational frequencies in all solvent environments, thus making this +dataset extremely useful for studying organic molecules in realistic solvent +environments for experimental characterization. + +
+
+ comment: 7 pages, 2 figures +
+
+
+
+
+ + ☆ Inversion-DeepONet: A Novel DeepONet-Based Network with Encoder-Decoder + for Full Waveform Inversion + + +
+ Full waveform inversion (FWI) plays a crucial role in the field of geophysics. There has been extensive research on applying deep learning (DL) methods to FWI. The success of DL-FWI relies significantly on the quantity and diversity of the datasets. Nevertheless, existing FWI datasets, like OpenFWI, where sources have fixed locations or identical frequencies, provide limited information and do not represent complex real-world scenes. For instance, low frequencies help in resolving larger-scale structures, while high frequencies allow finer subsurface features to be resolved. We consider that simultaneously using sources with different frequencies, instead of performing inversion with low-frequency data and then gradually introducing higher-frequency data, is well motivated and has potential advantages. Hence, we develop three enhanced datasets based on OpenFWI where each source has varying locations, frequencies, or both. Moreover, we propose a novel deep operator network (DeepONet) architecture, Inversion-DeepONet, for FWI. We utilize a convolutional neural network (CNN) to extract features from seismic data in the branch net. Source parameters, such as locations and frequencies, are fed to the trunk net. Then another CNN is employed as the decoder of the DeepONet to reconstruct the velocity models more effectively. Through experiments, we confirm the superior accuracy and generalization ability of our network compared with existing data-driven FWI methods.
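To make the branch/trunk layout described above concrete, here is a compact PyTorch sketch under stated assumptions: a CNN branch net encodes seismic data, an MLP trunk net encodes source location and frequency, and a CNN decoder maps their combination to a velocity map. Layer sizes and the product-style combination are illustrative, not the authors' exact Inversion-DeepONet architecture.

import torch
import torch.nn as nn

class InversionDeepONetSketch(nn.Module):
    def __init__(self, latent=128, n_src_params=2):
        super().__init__()
        self.branch = nn.Sequential(             # encodes seismic shot gathers
            nn.Conv2d(1, 16, 3, stride=2, padding=1), nn.ReLU(),
            nn.Conv2d(16, 32, 3, stride=2, padding=1), nn.ReLU(),
            nn.AdaptiveAvgPool2d(4), nn.Flatten(),
            nn.Linear(32 * 16, latent),
        )
        self.trunk = nn.Sequential(              # encodes source location/frequency
            nn.Linear(n_src_params, 64), nn.ReLU(), nn.Linear(64, latent),
        )
        self.decoder = nn.Sequential(            # reconstructs a velocity map
            nn.Linear(latent, 32 * 8 * 8), nn.Unflatten(1, (32, 8, 8)),
            nn.ConvTranspose2d(32, 16, 4, stride=2, padding=1), nn.ReLU(),
            nn.ConvTranspose2d(16, 1, 4, stride=2, padding=1),
        )

    def forward(self, seismic, src_params):
        z = self.branch(seismic) * self.trunk(src_params)   # DeepONet-style product
        return self.decoder(z)

model = InversionDeepONetSketch()
vel = model(torch.randn(4, 1, 64, 64), torch.randn(4, 2))
print(vel.shape)  # torch.Size([4, 1, 32, 32])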
+
+
+
+
+ + ☆ Experimental evaluation of offline reinforcement learning for HVAC + control in buildings + + +
+ Reinforcement learning (RL) techniques have been increasingly investigated for dynamic HVAC control in buildings. However, most studies focus on exploring solutions in online or off-policy scenarios without discussing in detail the implementation feasibility or effectiveness of dealing with purely offline datasets or trajectories. This gap limits the real-world deployment of RL-based HVAC controllers, especially considering the abundance of historical data. To this end, this paper comprehensively evaluates the strengths and limitations of state-of-the-art offline RL algorithms by conducting analytical and numerical studies. The analysis is conducted from two perspectives: algorithms and dataset characteristics. As a prerequisite, the necessity of applying offline RL algorithms is first confirmed in two building environments. The ability of observation history modeling to reduce violations and enhance performance is subsequently studied. Next, the performance of RL-based controllers under datasets with different qualitative and quantitative conditions is investigated, including constraint satisfaction and power consumption. Finally, the sensitivity of certain hyperparameters is also evaluated. The results indicate that datasets of a certain suboptimality level and relatively small scale can be utilized to effectively train a well-performing RL-based HVAC controller. Specifically, such controllers can reduce indoor temperature violation ratios by up to 28.5% and achieve up to 12.1% power savings compared to the baseline controller. In summary, this paper presents our well-structured investigations and new findings when applying offline reinforcement learning to building HVAC systems.
+
+
+
+
+ + ☆ Analytical Uncertainty-Based Loss Weighting in Multi-Task Learning + + +
+ With the rise of neural networks in various domains, multi-task learning +(MTL) gained significant relevance. A key challenge in MTL is balancing +individual task losses during neural network training to improve performance +and efficiency through knowledge sharing across tasks. To address these +challenges, we propose a novel task-weighting method by building on the most +prevalent approach of Uncertainty Weighting and computing analytically optimal +uncertainty-based weights, normalized by a softmax function with tunable +temperature. Our approach yields comparable results to the combinatorially +prohibitive, brute-force approach of Scalarization while offering a more +cost-effective yet high-performing alternative. We conduct an extensive +benchmark on various datasets and architectures. Our method consistently +outperforms six other common weighting methods. Furthermore, we report +noteworthy experimental findings for the practical application of MTL. For +example, larger networks diminish the influence of weighting methods, and +tuning the weight decay has a low impact compared to the learning rate. + +
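One plausible reading of the weighting scheme described above, offered only as a hedged sketch rather than the paper's exact derivation: compute per-task uncertainty weights analytically from the current losses (for classic Uncertainty Weighting the per-task optimum is sigma_i^2 = L_i, giving raw weights proportional to 1/L_i) and normalize them with a softmax at a tunable temperature.

import torch

def analytic_task_weights(task_losses, temperature=1.0):
    losses = torch.stack([l.detach() for l in task_losses])
    raw = 1.0 / (2.0 * losses.clamp_min(1e-8))   # analytic optimum 1/(2*sigma_i^2)
    weights = torch.softmax(torch.log(raw) / temperature, dim=0)
    return weights * len(task_losses)            # keep the average weight at 1

# Usage inside a multi-task training step (per-task scalar losses):
loss_a, loss_b = torch.tensor(0.7), torch.tensor(2.3)
w = analytic_task_weights([loss_a, loss_b], temperature=2.0)
total_loss = w[0] * loss_a + w[1] * loss_b
print(w, total_loss)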
+
+
+
+
+ + ☆ Coupling without Communication and Drafter-Invariant Speculative + Decoding + + +
+ Suppose Alice has a distribution $P$ and Bob has a distribution $Q$. Alice wants to generate a sample $a\sim P$ and Bob a sample $b \sim Q$ such that $a = b$ with as high a probability as possible. It is well-known that, by sampling from an optimal coupling between the distributions, Alice and Bob can achieve $Pr[a = b] = 1 - D_{TV}(P,Q)$, where $D_{TV}(P,Q)$ is the total variation distance. What if Alice and Bob must solve this same problem without communicating at all? Perhaps surprisingly, with access to public randomness, they can still achieve $Pr[a = b] \geq \frac{1 - D_{TV}(P,Q)}{1 + D_{TV}(P,Q)} \geq 1-2D_{TV}(P,Q)$. In fact, this bound can be obtained using a simple protocol based on the Weighted MinHash algorithm. In this work, we explore communication-free coupling in greater depth. First, we show that an equally simple protocol based on Gumbel sampling matches the worst-case guarantees of the Weighted MinHash approach, but tends to perform better in practice. Conversely, we prove that both approaches are actually sharp: no communication-free protocol can achieve $Pr[a=b]>\frac{1 - D_{TV}(P,Q)}{1 + D_{TV}(P,Q)}$ in the worst case. Finally, we prove that, for distributions over $n$ items, there exists a scheme that uses just $O(\log(n/\epsilon))$ bits of communication to achieve $Pr[a = b] = 1 - D_{TV}(P,Q) - \epsilon$, i.e., to essentially match optimal coupling. Beyond our theoretical results, we demonstrate an application of communication-free coupling to speculative decoding, a recent method for accelerating autoregressive large language models [Leviathan, Kalman, Matias, ICML 2023]. We show that communication-free protocols yield a variant of speculative decoding that we call Drafter-Invariant Speculative Decoding, which has the desirable property that the output of the method is fixed given a fixed random seed, regardless of what drafter is used for speculation.
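The Gumbel-sampling protocol mentioned above is simple enough to demonstrate directly; in the small numpy experiment below (with arbitrary toy distributions), Alice and Bob share common Gumbel noise as public randomness, each adds it to the log of her or his own distribution and outputs the argmax, and the empirical agreement rate is compared with the $(1 - D_{TV})/(1 + D_{TV})$ lower bound.

import numpy as np

rng = np.random.default_rng(0)
P = np.array([0.5, 0.3, 0.2])        # Alice's distribution
Q = np.array([0.4, 0.4, 0.2])        # Bob's distribution

def gumbel_sample(dist, gumbels):
    return int(np.argmax(np.log(dist) + gumbels))

agree, trials = 0, 100_000
for _ in range(trials):
    g = rng.gumbel(size=3)           # shared public randomness
    a = gumbel_sample(P, g)          # Alice's sample, marginally a ~ P
    b = gumbel_sample(Q, g)          # Bob's sample, marginally b ~ Q
    agree += (a == b)

d_tv = 0.5 * np.abs(P - Q).sum()
print("empirical Pr[a=b]:", agree / trials)
print("lower bound (1-dtv)/(1+dtv):", (1 - d_tv) / (1 + d_tv))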
+
+ comment: 16 pages +
+
+
+
+
+ + ☆ Addressing Skewed Heterogeneity via Federated Prototype Rectification + with Personalization + + +
+ Federated learning is an efficient framework designed to facilitate collaborative model training across multiple distributed devices while preserving user data privacy. A significant challenge of federated learning is data-level heterogeneity, i.e., skewed or long-tailed distribution of private data. Although various methods have been proposed to address this challenge, most of them assume that the underlying global data is uniformly distributed across all clients. This paper briefly reviews federated learning under data-level heterogeneity and defines a more practical and challenging setting called Skewed Heterogeneous Federated Learning (SHFL). Accordingly, we propose a novel Federated Prototype Rectification with Personalization approach, which consists of two parts: Federated Personalization and Federated Prototype Rectification. The former aims to construct balanced decision boundaries between dominant and minority classes based on private data, while the latter exploits both inter-class discrimination and intra-class consistency to rectify empirical prototypes. Experiments on three popular benchmarks show that the proposed approach outperforms current state-of-the-art methods and achieves balanced performance in both personalization and generalization.
+
+
+
+
+ + ☆ Meta SAC-Lag: Towards Deployable Safe Reinforcement Learning via + MetaGradient-based Hyperparameter Tuning IROS + + +
+ Safe Reinforcement Learning (Safe RL) is one of the prevalently studied +subcategories of trial-and-error-based methods with the intention to be +deployed on real-world systems. In safe RL, the goal is to maximize reward +performance while minimizing constraints, often achieved by setting bounds on +constraint functions and utilizing the Lagrangian method. However, deploying +Lagrangian-based safe RL in real-world scenarios is challenging due to the +necessity of threshold fine-tuning, as imprecise adjustments may lead to +suboptimal policy convergence. To mitigate this challenge, we propose a unified +Lagrangian-based model-free architecture called Meta Soft Actor-Critic +Lagrangian (Meta SAC-Lag). Meta SAC-Lag uses meta-gradient optimization to +automatically update the safety-related hyperparameters. The proposed method is +designed to address safe exploration and threshold adjustment with minimal +hyperparameter tuning requirement. In our pipeline, the inner parameters are +updated through the conventional formulation and the hyperparameters are +adjusted using the meta-objectives which are defined based on the updated +parameters. Our results show that the agent can reliably adjust the safety +performance due to the relatively fast convergence rate of the safety +threshold. We evaluate the performance of Meta SAC-Lag in five simulated +environments against Lagrangian baselines, and the results demonstrate its +capability to create synergy between parameters, yielding better or competitive +results. Furthermore, we conduct a real-world experiment involving a robotic +arm tasked with pouring coffee into a cup without spillage. Meta SAC-Lag is +successfully trained to execute the task, while minimizing effort constraints. + +
+
+ comment: Main text accepted to the IEEE/RSJ International Conference on + Intelligent Robots and Systems (IROS) 2024, 10 pages, 4 figures, 3 tables +
+
+
+
+
+ + ☆ RandomNet: Clustering Time Series Using Untrained Deep Neural Networks + + +
+ Neural networks are widely used in machine learning and data mining. +Typically, these networks need to be trained, implying the adjustment of +weights (parameters) within the network based on the input data. In this work, +we propose a novel approach, RandomNet, that employs untrained deep neural +networks to cluster time series. RandomNet uses different sets of random +weights to extract diverse representations of time series and then ensembles +the clustering relationships derived from these different representations to +build the final clustering results. By extracting diverse representations, our +model can effectively handle time series with different characteristics. Since +all parameters are randomly generated, no training is required during the +process. We provide a theoretical analysis of the effectiveness of the method. +To validate its performance, we conduct extensive experiments on all of the 128 +datasets in the well-known UCR time series archive and perform statistical +analysis of the results. These datasets have different sizes, sequence lengths, +and they are from diverse fields. The experimental results show that the +proposed method is competitive compared with existing state-of-the-art methods. + +
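The following simplified sketch, assuming scikit-learn and SciPy, illustrates the ensembling idea described above: several untrained random encoders each produce a representation that is clustered with k-means, and the resulting partitions are merged through a co-association matrix. The encoder form, ensemble rule, and sizes are illustrative rather than RandomNet's actual design.

import numpy as np
from sklearn.cluster import KMeans
from scipy.cluster.hierarchy import linkage, fcluster

def random_encoder(X, rng, hidden=64):
    W = rng.normal(size=(X.shape[1], hidden))    # untrained random weights
    return np.tanh(X @ W)

def randomnet_cluster(X, n_clusters, n_members=10, seed=0):
    rng = np.random.default_rng(seed)
    n = X.shape[0]
    coassoc = np.zeros((n, n))
    for _ in range(n_members):
        Z = random_encoder(X, rng)
        labels = KMeans(n_clusters=n_clusters, n_init=10,
                        random_state=int(rng.integers(1 << 31))).fit_predict(Z)
        coassoc += (labels[:, None] == labels[None, :])
    # Hierarchical clustering on the ensemble's co-association distances.
    dist = 1.0 - coassoc / n_members
    Z_link = linkage(dist[np.triu_indices(n, 1)], method="average")
    return fcluster(Z_link, t=n_clusters, criterion="maxclust")

X = np.random.default_rng(1).normal(size=(60, 100))   # 60 series of length 100
print(randomnet_cluster(X, n_clusters=3))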
+
+ comment: 25 pages, 10 figures +
+
+
+
+
+ + ☆ Robust Offline Active Learning on Graphs + + +
+ We consider the problem of active learning on graphs, which has crucial applications in many real-world networks where labeling node responses is expensive. In this paper, we propose an offline active learning method that selects nodes to query by explicitly incorporating information from both the network structure and node covariates. Building on graph signal recovery theories and the random spectral sparsification technique, the proposed method adopts a two-stage biased sampling strategy that takes both informativeness and representativeness into consideration for node querying. Informativeness refers to the complexity of graph signals that are learnable from the responses of queried nodes, while representativeness refers to the capacity of queried nodes to control generalization errors given noisy node-level information. We establish a theoretical relationship between generalization error and the number of nodes selected by the proposed method. Our theoretical results demonstrate the trade-off between informativeness and representativeness in active learning. Extensive numerical experiments show that the proposed method is competitive with existing graph-based active learning methods, especially when node covariates and responses contain noise. Additionally, the proposed method is applicable to both regression and classification tasks on graphs.
+
+
+
+
+ + ☆ MobileMEF: Fast and Efficient Method for Multi-Exposure Fusion + + +
+ Recent advances in camera design and imaging technology have enabled the +capture of high-quality images using smartphones. However, due to the limited +dynamic range of digital cameras, the quality of photographs captured in +environments with highly imbalanced lighting often results in poor-quality +images. To address this issue, most devices capture multi-exposure frames and +then use some multi-exposure fusion method to merge those frames into a final +fused image. Nevertheless, most traditional and current deep learning +approaches are unsuitable for real-time applications on mobile devices due to +their heavy computational and memory requirements. We propose a new method for +multi-exposure fusion based on an encoder-decoder deep learning architecture +with efficient building blocks tailored for mobile devices. This efficient +design makes our model capable of processing 4K resolution images in less than +2 seconds on mid-range smartphones. Our method outperforms state-of-the-art +techniques regarding full-reference quality measures and computational +efficiency (runtime and memory usage), making it ideal for real-time +applications on hardware-constrained devices. Our code is available at: +https://github.com/LucasKirsten/MobileMEF. + +
+
+
+
+
+ + ☆ A Single Channel-Based Neonatal Sleep-Wake Classification using Hjorth + Parameters and Improved Gradient Boosting + + +
+ Sleep plays a crucial role in neonatal development. Monitoring the sleep +patterns in neonates in a Neonatal Intensive Care Unit (NICU) is imperative for +understanding the maturation process. While polysomnography (PSG) is considered +the best practice for sleep classification, its expense and reliance on human +annotation pose challenges. Existing research often relies on multichannel EEG +signals; however, concerns arise regarding the vulnerability of neonates and +the potential impact on their sleep quality. This paper introduces a novel +approach to neonatal sleep stage classification using a single-channel gradient +boosting algorithm with Hjorth features. The gradient boosting parameters are +fine-tuned using random search cross-validation (randomsearchCV), achieving an +accuracy of 82.35% for neonatal sleep-wake classification. Validation is +conducted through 5-fold cross-validation. The proposed algorithm not only +enhances existing neonatal sleep algorithms but also opens avenues for broader +applications. + +
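For reference, the Hjorth parameters used as features above have standard closed-form definitions (activity, mobility, complexity); a short numpy helper is given below for a single-channel EEG segment. The gradient-boosting classification stage itself is not reproduced, and the segment length is an arbitrary example.

import numpy as np

def hjorth_parameters(x):
    dx = np.diff(x)                      # first discrete derivative
    ddx = np.diff(dx)                    # second discrete derivative
    activity = np.var(x)
    mobility = np.sqrt(np.var(dx) / np.var(x))
    complexity = np.sqrt(np.var(ddx) / np.var(dx)) / mobility
    return activity, mobility, complexity

eeg_segment = np.random.default_rng(0).normal(size=3000)   # e.g. 30 s at 100 Hz
print(hjorth_parameters(eeg_segment))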
+
+ comment: 8 pages, 5 figures, 3 tables, International Polydisciplinary + Conference on Artificial Intelligence and New Technologies +
+
+
+
+
+ + ☆ A Deep Features-Based Approach Using Modified ResNet50 and Gradient + Boosting for Visual Sentiments Classification + + +
+ The versatile nature of Visual Sentiment Analysis (VSA) is one reason for its rising profile. It is not easy to efficiently manage social media data containing visual information, since previous research has concentrated on Sentiment Analysis (SA) of single modalities, such as text. In addition, most visual sentiment studies fail to adequately classify sentiment because they mainly focus on simply merging modal attributes without investigating their intricate relationships. This motivates a fusion of deep learning and machine learning algorithms. In this research, a deep feature-based method for multiclass classification has been used to extract deep features from a modified ResNet50. Furthermore, a gradient boosting algorithm has been used to classify photos containing emotional content. The approach is thoroughly evaluated on two benchmark datasets, CrowdFlower and GAPED. Finally, the proposed strategy is compared against cutting-edge deep learning and machine learning models. Compared to state-of-the-art approaches, the proposed method demonstrates exceptional performance on the presented datasets.
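A hedged end-to-end sketch of the pipeline described above, assuming torchvision and scikit-learn: deep features are extracted from a pretrained ResNet50 (here unmodified, whereas the paper modifies it) and classified with gradient boosting; the random tensors stand in for the CrowdFlower/GAPED images.

import torch
import torchvision.models as models
from sklearn.ensemble import GradientBoostingClassifier

backbone = models.resnet50(weights=models.ResNet50_Weights.DEFAULT)
backbone.fc = torch.nn.Identity()        # keep the 2048-d pooled features
backbone.eval()

@torch.no_grad()
def deep_features(images):               # images: (N, 3, 224, 224), normalized
    return backbone(images).numpy()

# Toy stand-in data; in practice these come from the benchmark images.
X_train = deep_features(torch.randn(32, 3, 224, 224))
y_train = torch.randint(0, 3, (32,)).numpy()
clf = GradientBoostingClassifier().fit(X_train, y_train)
print(clf.predict(deep_features(torch.randn(4, 3, 224, 224))))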
+
+ comment: 4 pages, 4 figures, 3 tables, IEEE International Conference on + Multimedia Information Processing and Retrieval (MIPR) 2024 +
+
+
+
+
+ + ☆ Physics-Informed Neural Network for Predicting Out-of-Training-Range + TCAD Solution with Minimized Domain Expertise + + +
+ Machine learning (ML) is promising for assisting technology computer-aided design (TCAD) simulations, alleviating convergence difficulties and prolonged simulation times. While ML is widely used in TCAD, existing approaches either require access to the internal solver, require extensive domain expertise, are trained only on terminal quantities such as currents and voltages, and/or lack out-of-training-range prediction capability. In this paper, using a Si nanowire as an example, we demonstrate that it is possible to use a physics-informed neural network (PINN) to predict out-of-training-range TCAD solutions without accessing the internal solver and with minimal domain expertise. The network can not only predict a range 2.5 times larger than the training range but also predict the inversion region despite being trained only on subthreshold-region data. The physics-informed module is also trained with data, without the need for human-coded equations, making the approach easier to extend to more sophisticated systems.
+
+
+
+
+ + ☆ CEGRL-TKGR: A Causal Enhanced Graph Representation Learning Framework + for Improving Temporal Knowledge Graph Extrapolation Reasoning + + +
+ Temporal knowledge graph reasoning (TKGR) is increasingly gaining attention +for its ability to extrapolate new events from historical data, thereby +enriching the inherently incomplete temporal knowledge graphs. Existing +graph-based representation learning frameworks have made significant strides in +developing evolving representations for both entities and relational +embeddings. Despite these achievements, there's a notable tendency in these +models to inadvertently learn biased data representations and mine spurious +correlations, consequently failing to discern the causal relationships between +events. This often leads to incorrect predictions based on these false +correlations. To address this, we propose an innovative causal enhanced graph +representation learning framework for TKGR (named CEGRL-TKGR). This framework +introduces causal structures in graph-based representation learning to unveil +the essential causal relationships between events, ultimately enhancing task +performance. Specifically, we first disentangle the evolutionary +representations of entities and relations in a temporal graph sequence into two +distinct components, namely causal representations and confounding +representations. Then, drawing on causal intervention theory, we advocate the +utilization of causal representations for predictions, aiming to mitigate the +effects of erroneous correlations caused by confounding features, thus +achieving more robust and accurate predictions. Finally, extensive experimental +results on six benchmark datasets demonstrate the superior performance of our +model in the link prediction task. + +
+
+
+
+
+ + ☆ KAN versus MLP on Irregular or Noisy Functions + + +
+ In this paper, we compare the performance of Kolmogorov-Arnold Networks (KAN) +and Multi-Layer Perceptron (MLP) networks on irregular or noisy functions. We +control the number of parameters and the size of the training samples to ensure +a fair comparison. For clarity, we categorize the functions into six types: +regular functions, continuous functions with local non-differentiable points, +functions with jump discontinuities, functions with singularities, functions +with coherent oscillations, and noisy functions. Our experimental results +indicate that KAN does not always perform best. For some types of functions, +MLP outperforms or performs comparably to KAN. Furthermore, increasing the size +of training samples can improve performance to some extent. When noise is added +to functions, the irregular features are often obscured by the noise, making it +challenging for both MLP and KAN to extract these features effectively. We hope +these experiments provide valuable insights for future neural network research +and encourage further investigations to overcome these challenges. + +
+
+
+
+
+ + ☆ The Nah Bandit: Modeling User Non-compliance in Recommendation Systems + + +
+ Recommendation systems now pervade the digital world, ranging from advertising to entertainment. However, it remains challenging to implement effective recommendation systems in the physical world, such as in mobility or health. This work focuses on a key challenge: in the physical world, it is often easy for users to opt out of any recommendation that is not to their liking and to fall back to their baseline behavior. It is thus crucial in cyber-physical recommendation systems to operate with an interaction model that is aware of such user behavior, lest the user abandon the recommendations altogether. This paper thus introduces the Nah Bandit, a tongue-in-cheek reference to describe a bandit problem where users can say `nah' to the recommendation and opt for their preferred option instead. As such, this problem lies in between a typical bandit setup and supervised learning. We model user non-compliance by parameterizing an anchoring effect of recommendations on users. We then propose the Expert with Clustering (EWC) algorithm, a hierarchical approach that incorporates feedback from both recommended and non-recommended options to accelerate user preference learning. In a recommendation scenario with $N$ users, $T$ rounds per user, and $K$ clusters, EWC achieves a regret bound of $O(N\sqrt{T\log K} + NT)$, achieving superior theoretical performance in the short term compared to the LinUCB algorithm. Experimental results also highlight that EWC outperforms both supervised learning and traditional contextual bandit approaches. This advancement reveals that effective use of non-compliance feedback can accelerate preference learning and improve recommendation accuracy. This work lays the foundation for future research in the Nah Bandit, providing a robust framework for more effective recommendation systems.
+
+ comment: 12 pages, 8 figures, under review +
+
+
+
+
+ + ☆ System States Forecasting of Microservices with Dynamic Spatio-Temporal + Data + + +
+ In the AIOps (Artificial Intelligence for IT Operations) era, accurately forecasting system states is crucial. In microservices systems, this task encounters the challenge of dynamic and complex spatio-temporal relationships among microservice instances, primarily due to dynamic deployments, diverse call paths, and cascading effects among instances. Current time-series forecasting methods, which focus mainly on intrinsic patterns, are insufficient in environments where spatial relationships are critical. Similarly, spatio-temporal graph approaches often neglect the nature of temporal trends, concentrating mostly on message passing between nodes. Moreover, current research in the microservices domain frequently underestimates the importance of network metrics and topological structures in capturing the evolving dynamics of systems. This paper introduces STMformer, a model tailored for forecasting system states in microservices environments, capable of handling multi-node and multivariate time series. Our method leverages dynamic network connection data and topological information to assist in modeling the intricate spatio-temporal relationships within the system. Additionally, we integrate the PatchCrossAttention module to compute the impact of cascading effects globally. We have developed a dataset based on a microservices system and conducted comprehensive experiments with STMformer against leading methods. In both short-term and long-term forecasting tasks, our model consistently achieved an 8.6% reduction in MAE (Mean Absolute Error) and a 2.2% reduction in MSE (Mean Squared Error). The source code is available at https://github.com/xuyifeiiie/STMformer.
+
+
+
+
+ + ☆ Quantum-inspired Interpretable Deep Learning Architecture for Text + Sentiment Analysis + + +
+ Text has become the predominant form of communication on social media, +embedding a wealth of emotional nuances. Consequently, the extraction of +emotional information from text is of paramount importance. Despite previous +research making some progress, existing text sentiment analysis models still +face challenges in integrating diverse semantic information and lack +interpretability. To address these issues, we propose a quantum-inspired deep +learning architecture that combines fundamental principles of quantum mechanics +(QM principles) with deep learning models for text sentiment analysis. +Specifically, we analyze the commonalities between text representation and QM +principles to design a quantum-inspired text representation method and further +develop a quantum-inspired text embedding layer. Additionally, we design a +feature extraction layer based on long short-term memory (LSTM) networks and +self-attention mechanisms (SAMs). Finally, we calculate the text density matrix +using the quantum complex numbers principle and apply 2D-convolution neural +networks (CNNs) for feature condensation and dimensionality reduction. Through +a series of visualization, comparative, and ablation experiments, we +demonstrate that our model not only shows significant advantages in accuracy +and efficiency compared to previous related models but also achieves a certain +level of interpretability by integrating QM principles. Our code is available +at QISA. + +
+
+
+
+
+ + ☆ Local Causal Discovery with Background Knowledge + + +
+ Causality plays a pivotal role in various fields of study. Based on the framework of causal graphical models, previous works have proposed identifying whether a variable is a cause or non-cause of a target in every Markov equivalent graph solely by learning a local structure. However, the presence of prior knowledge, often represented as a partially known causal graph, is common in many causal modeling applications. Leveraging this prior knowledge allows for the further identification of causal relationships. In this paper, we first propose a method for learning the local structure using all types of causal background knowledge, including direct causal information, non-ancestral information and ancestral information. Then we introduce criteria for identifying causal relationships based solely on the local structure in the presence of prior knowledge. We also apply our method to fair machine learning, and experiments involving local structure learning, causal relationship identification, and fair machine learning demonstrate that our method is both effective and efficient.
+
+
+
+
+ + ☆ IReCa: Intrinsic Reward-enhanced Context-aware Reinforcement Learning + for Human-AI Coordination + + +
+ In human-AI coordination scenarios, human agents usually exhibit asymmetric +behaviors that are significantly sparse and unpredictable compared to those of +AI agents. These characteristics introduce two primary challenges to human-AI +coordination: the effectiveness of obtaining sparse rewards and the efficiency +of training the AI agents. To tackle these challenges, we propose an Intrinsic +Reward-enhanced Context-aware (IReCa) reinforcement learning (RL) algorithm, +which leverages intrinsic rewards to facilitate the acquisition of sparse +rewards and utilizes environmental context to enhance training efficiency. Our +IReCa RL algorithm introduces three unique features: (i) it encourages the +exploration of sparse rewards by incorporating intrinsic rewards that +supplement traditional extrinsic rewards from the environment; (ii) it improves +the acquisition of sparse rewards by prioritizing the corresponding sparse +state-action pairs; and (iii) it enhances the training efficiency by optimizing +the exploration and exploitation through innovative context-aware weights of +extrinsic and intrinsic rewards. Extensive simulations executed in the +Overcooked layouts demonstrate that our IReCa RL algorithm can increase the +accumulated rewards by approximately 20% and reduce the epochs required for +convergence by approximately 67% compared to state-of-the-art baselines. + +
+
+
+
+
+ + ☆ Incremental Structure Discovery of Classification via Sequential Monte + Carlo + + +
+ Gaussian Processes (GPs) provide a powerful framework for making predictions and understanding uncertainty for classification with kernels and Bayesian non-parametric learning. Building such models typically requires strong prior knowledge to preselect kernels, which can be ineffective for online classification applications that process data sequentially, because features of the data may shift during the process. To alleviate the requirement of prior knowledge used in GPs and learn new features from data that arrive successively, this paper presents a novel method to automatically discover models of classification on complex data with little prior knowledge. Our method adapts a recently proposed technique for GP-based time-series structure discovery, which integrates GPs and Sequential Monte Carlo (SMC). We extend the technique to handle extra latent variables in GP classification, such that our method can effectively and adaptively learn a priori unknown classification structures from a continuous stream of input. In addition, our method adapts to new batches of data by updating the model structures. Our experiments show that our method is able to automatically incorporate various kernel features on synthetic and real-world data for classification. In the experiments on real-world data, our method outperforms various classification methods in both online and offline settings, achieving a 10% accuracy improvement on one benchmark.
+
+
+
+
+ + ☆ A Systematic Evaluation of Generated Time Series and Their Effects in + Self-Supervised Pretraining CIKM 2024 + + +
+ Self-supervised Pretrained Models (PTMs) have demonstrated remarkable +performance in computer vision and natural language processing tasks. These +successes have prompted researchers to design PTMs for time series data. In our +experiments, most self-supervised time series PTMs were surpassed by simple +supervised models. We hypothesize this undesired phenomenon may be caused by +data scarcity. In response, we test six time series generation methods, use the +generated data in pretraining in lieu of the real data, and examine the effects +on classification performance. Our results indicate that replacing a real-data +pretraining set with a greater volume of only generated samples produces +noticeable improvement. + +
+
+ comment: To appear in CIKM 2024 as a short paper; the version here is the + self-contained version that includes the non-mandatory supplementary material + available on the paper's companion website +
+
+
+
+
+ + ☆ Capturing the Complexity of Human Strategic Decision-Making with Machine + Learning + + +
+ Understanding how people behave in strategic settings--where they make +decisions based on their expectations about the behavior of others--is a +long-standing problem in the behavioral sciences. We conduct the largest study +to date of strategic decision-making in the context of initial play in +two-player matrix games, analyzing over 90,000 human decisions across more than +2,400 procedurally generated games that span a much wider space than previous +datasets. We show that a deep neural network trained on these data predicts +people's choices better than leading theories of strategic behavior, indicating +that there is systematic variation that is not explained by those theories. We +then modify the network to produce a new, interpretable behavioral model, +revealing what the original network learned about people: their ability to +optimally respond and their capacity to reason about others are dependent on +the complexity of individual games. This context-dependence is critical in +explaining deviations from the rational Nash equilibrium, response times, and +uncertainty in strategic decisions. More broadly, our results demonstrate how +machine learning can be applied beyond prediction to further help generate +novel explanations of complex human behavior. + +
+
+
+
+
+ + ☆ JPEG-LM: LLMs as Image Generators with Canonical Codec Representations + + +
+ Recent work in image and video generation has been adopting the +autoregressive LLM architecture due to its generality and potentially easy +integration into multi-modal systems. The crux of applying autoregressive +training in language generation to visual generation is discretization -- +representing continuous data like images and videos as discrete tokens. Common +methods of discretizing images and videos include modeling raw pixel values, +which are prohibitively lengthy, or vector quantization, which requires +convoluted pre-hoc training. In this work, we propose to directly model images +and videos as compressed files saved on computers via canonical codecs (e.g., +JPEG, AVC/H.264). Using the default Llama architecture without any +vision-specific modifications, we pretrain JPEG-LM from scratch to generate +images (and AVC-LM to generate videos as a proof of concept), by directly +outputting compressed file bytes in JPEG and AVC formats. Evaluation of image +generation shows that this simple and straightforward approach is more +effective than pixel-based modeling and sophisticated vector quantization +baselines (on which our method yields a 31% reduction in FID). Our analysis +shows that JPEG-LM has an especial advantage over vector quantization models in +generating long-tail visual elements. Overall, we show that using canonical +codec representations can help lower the barriers between language generation +and visual generation, facilitating future research on multi-modal +language/image/video LLMs. + +
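The representation choice described above needs no visual tokenizer at all, since a JPEG file is already a discrete byte sequence; the toy helper below, with a hypothetical file path, shows how such bytes can be treated as LM tokens and written back out to decode. The actual JPEG-LM training setup is, of course, far larger.

from pathlib import Path

def jpeg_to_tokens(path):
    data = Path(path).read_bytes()         # raw canonical-codec representation
    return list(data)                      # one token per byte, vocab size 256

def tokens_to_jpeg(tokens, path):
    Path(path).write_bytes(bytes(tokens))  # decoding is just writing bytes back

# tokens = jpeg_to_tokens("example.jpg")   # hypothetical file
# print(len(tokens), tokens[:4])           # JPEG files begin with bytes FF D8 FF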
+
+
+
+
+ + ☆ Efficient Data-Sketches and Fine-Tuning for Early Detection of + Distributional Drift in Medical Imaging + + +
+ Distributional drift detection is important in medical applications as it helps ensure the accuracy and reliability of models by identifying changes in the underlying data distribution that could affect diagnostic or treatment decisions. However, current methods have limitations in detecting drift; for example, the inclusion of abnormal datasets can lead to unfair comparisons. This paper presents an accurate and sensitive approach to detect distributional drift in CT-scan medical images by leveraging data-sketching and fine-tuning techniques. We developed a robust baseline library model for real-time anomaly detection, allowing for efficient comparison of incoming images and identification of anomalies. Additionally, we fine-tuned a pre-trained vision transformer model to extract relevant features, using breast cancer images as an example, significantly enhancing model accuracy to 99.11%. Combining data-sketches with fine-tuning, our feature-extraction evaluation demonstrated that cosine similarity scores between similar datasets improve substantially, increasing from around 50% to 100%. Finally, the sensitivity evaluation shows that our solution is highly sensitive to even 1% salt-and-pepper and speckle noise, while remaining insensitive to lighting noise (e.g., lighting conditions have no impact on data drift). The proposed methods offer a scalable and reliable solution for maintaining the accuracy of diagnostic models in dynamic clinical environments.
+
+
+
+
+ + ☆ Beyond Uniform Query Distribution: Key-Driven Grouped Query Attention + + +
+ The Transformer architecture has revolutionized deep learning through its +Self-Attention mechanism, which effectively captures contextual information. +However, the memory footprint of Self-Attention presents significant challenges +for long-sequence tasks. Grouped Query Attention (GQA) addresses this issue by +grouping queries and mean-pooling the corresponding key-value heads - reducing +the number of overall parameters and memory requirements in a flexible manner +without adversely compromising model accuracy. In this work, we introduce +enhancements to GQA, focusing on two novel approaches that deviate from the +static nature of grouping: Key-Distributed GQA (KDGQA) and Dynamic +Key-Distributed GQA (DGQA), which leverage information from the norms of the +key heads to inform query allocation. Specifically, KDGQA looks at the ratios +of the norms of the key heads during each forward pass, while DGQA examines the +ratios of the norms as they evolve through training. Additionally, we present +Perturbed GQA (PGQA) as a case-study, which introduces variability in (static) +group formation via subtracting noise from the attention maps. Our experiments +with up-trained Vision Transformers, for Image Classification on datasets such +as CIFAR-10, CIFAR-100, Food101, and Tiny ImageNet, demonstrate the promise of +these variants in improving upon the original GQA through more informed and +adaptive grouping mechanisms: specifically ViT-L experiences accuracy gains of +up to 8% when utilizing DGQA in comparison to GQA and other variants. We +further analyze the impact of the number of Key-Value Heads on performance, +underscoring the importance of utilizing query-key affinities. + +
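+ A rough sketch of the key-norm-driven allocation idea: compute one norm per
+key head and assign query heads to key-value heads in proportion to those
+norms. The helper name, head counts, and random tensors below are illustrative
+assumptions, not the authors' implementation.
+```python
+# Sketch: allocate query heads to KV heads proportionally to key-head norms.
+import numpy as np
+
+def allocate_queries_by_key_norms(key_heads: np.ndarray, num_query_heads: int) -> list[int]:
+    """key_heads: (num_kv_heads, seq_len, head_dim) -> queries assigned per KV head."""
+    norms = np.linalg.norm(key_heads, axis=(1, 2))         # one norm per key head
+    alloc = np.floor(norms / norms.sum() * num_query_heads).astype(int)
+    remainder = num_query_heads - alloc.sum()
+    for idx in np.argsort(-norms)[:remainder]:             # give leftovers to largest norms
+        alloc[idx] += 1
+    return alloc.tolist()
+
+rng = np.random.default_rng(0)
+keys = rng.normal(size=(4, 128, 64))   # 4 KV heads, sequence length 128, head dim 64
+keys[0] *= 3.0                         # make one key head dominate
+print(allocate_queries_by_key_norms(keys, num_query_heads=12))   # e.g. [6, 2, 2, 2]
+```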
+
+ comment: 11 pages, 9 figures +
+
+
+
+
+ + ☆ Exploring Cross-model Neuronal Correlations in the Context of Predicting + Model Performance and Generalizability + + +
+ As Artificial Intelligence (AI) models are increasingly integrated into +critical systems, the need for a robust framework to establish the +trustworthiness of AI is increasingly paramount. While collaborative efforts +have established conceptual foundations for such a framework, there remains a +significant gap in developing concrete, technically robust methods for +assessing AI model quality and performance. A critical drawback in the +traditional methods for assessing the validity and generalizability of models +is their dependence on internal developer datasets, rendering it challenging to +independently assess and verify their performance claims. This paper introduces +a novel approach for assessing a newly trained model's performance based on +another known model by calculating correlation between neural networks. The +proposed method evaluates correlations by determining if, for each neuron in +one network, there exists a neuron in the other network that produces similar +output. This approach has implications for memory efficiency, allowing for the +use of smaller networks when high correlation exists between networks of +different sizes. Additionally, the method provides insights into robustness, +suggesting that if two highly correlated networks are compared and one +demonstrates robustness when operating in production environments, the other is +likely to exhibit similar robustness. This contribution advances the technical +toolkit for responsible AI, supporting more comprehensive and nuanced +evaluations of AI models to ensure their safe and effective deployment. + +
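+ A simplified sketch of the pairing step described above: over a shared
+evaluation set, check for each neuron in network A whether some neuron in
+network B produces highly correlated activations. Shapes and the synthetic
+activations are illustrative assumptions.
+```python
+# Sketch: best-match neuron correlation between two networks' activations.
+import numpy as np
+
+def cross_model_correlation(acts_a: np.ndarray, acts_b: np.ndarray) -> np.ndarray:
+    """acts_*: (n_samples, n_neurons). Returns, per neuron in A, the max
+    absolute Pearson correlation with any neuron in B."""
+    a = (acts_a - acts_a.mean(0)) / (acts_a.std(0) + 1e-12)
+    b = (acts_b - acts_b.mean(0)) / (acts_b.std(0) + 1e-12)
+    corr = a.T @ b / a.shape[0]          # (n_neurons_a, n_neurons_b)
+    return np.abs(corr).max(axis=1)
+
+rng = np.random.default_rng(0)
+acts_a = rng.normal(size=(1000, 32))
+acts_b = np.concatenate([acts_a[:, :16] * 2.0 + 1.0,       # 16 neurons mirror A's
+                         rng.normal(size=(1000, 16))], axis=1)
+scores = cross_model_correlation(acts_a, acts_b)
+print(scores[:16].min(), scores[16:].max())   # mirrored neurons ~1.0, the rest small
+```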
+
+
+
+
+ + ☆ Lifelong Reinforcement Learning via Neuromodulation + + +
+ Navigating multiple tasks$\unicode{x2014}$for instance in succession as in +continual or lifelong learning, or in distributions as in meta or multi-task +learning$\unicode{x2014}$requires some notion of adaptation. Evolution over +timescales of millennia has imbued humans and other animals with highly +effective adaptive learning and decision-making strategies. Central to these +functions are so-called neuromodulatory systems. In this work we introduce an +abstract framework for integrating theories and evidence from neuroscience and +the cognitive sciences into the design of adaptive artificial reinforcement +learning algorithms. We give a concrete instance of this framework built on +literature surrounding the neuromodulators Acetylcholine (ACh) and +Noradrenaline (NA), and empirically validate the effectiveness of the resulting +adaptive algorithm in a non-stationary multi-armed bandit problem. We conclude +with a theory-based experiment proposal providing an avenue to link our +framework back to efforts in experimental neuroscience. + +
+
+
+
+
+ + ☆ W-RAG: Weakly Supervised Dense Retrieval in RAG for Open-domain Question + Answering + + +
+ In knowledge-intensive tasks such as open-domain question answering (OpenQA), +Large Language Models (LLMs) often struggle to generate factual answers relying +solely on their internal (parametric) knowledge. To address this limitation, +Retrieval-Augmented Generation (RAG) systems enhance LLMs by retrieving +relevant information from external sources, thereby positioning the retriever +as a pivotal component. Although dense retrieval demonstrates state-of-the-art +performance, its training poses challenges due to the scarcity of ground-truth +evidence, largely attributed to the high costs of human annotation. In this +paper, we propose W-RAG by utilizing the ranking capabilities of LLMs to create +weakly labeled data for training dense retrievers. Specifically, we rerank the +top-$K$ passages retrieved via BM25 by assessing the probability that LLMs will +generate the correct answer based on the question and each passage. The +highest-ranking passages are then used as positive training examples for dense +retrieval. Our comprehensive experiments across four publicly available OpenQA +datasets demonstrate that our approach enhances both retrieval and OpenQA +performance compared to baseline models. + +
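+ A schematic sketch of the weak-labeling step: score each BM25-retrieved
+passage by how likely the LLM is to produce the gold answer given the question
+and that passage, and keep the top-scoring passages as positives for dense
+retriever training. The `answer_logprob` function below is a toy token-overlap
+stand-in for the real LLM scoring, so everything here is an illustrative
+assumption rather than the paper's pipeline.
+```python
+# Sketch of W-RAG-style weak labeling with a placeholder answer scorer.
+import math
+
+def answer_logprob(question: str, passage: str, answer: str) -> float:
+    """Toy stand-in: log of smoothed overlap between answer tokens and passage tokens."""
+    answer_tokens = set(answer.lower().split())
+    passage_tokens = set(passage.lower().split())
+    overlap = len(answer_tokens & passage_tokens) / max(len(answer_tokens), 1)
+    return math.log(overlap + 1e-6)
+
+def weak_label_passages(question: str, answer: str, bm25_passages: list[str], top_n: int = 1):
+    scored = sorted(bm25_passages,
+                    key=lambda p: answer_logprob(question, p, answer),
+                    reverse=True)
+    return scored[:top_n]   # positives for training the dense retriever
+
+passages = ["Paris is the capital of France.",
+            "The Eiffel Tower was completed in 1889.",
+            "Berlin is the capital of Germany."]
+print(weak_label_passages("What is the capital of France?", "Paris", passages))
+```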
+
+
+
+
+ + ☆ A semi-centralized multi-agent RL framework for efficient irrigation + scheduling + + +
+ This paper proposes a Semi-Centralized Multi-Agent Reinforcement Learning
+(SCMARL) approach for irrigation scheduling in spatially variable agricultural
+fields, where management zones address spatial variability. The SCMARL
+framework is hierarchical in nature, with a centralized coordinator agent at
+the top level and decentralized local agents at the second level. The
+coordinator agent makes daily binary irrigation decisions based on field-wide
+conditions, which are communicated to the local agents. Local agents determine
+appropriate irrigation amounts for specific management zones using local
+conditions. The framework employs a state augmentation approach to handle
+non-stationarity in the local agents' environments. An extensive evaluation on
+a large-scale field in Lethbridge, Canada, compares the SCMARL approach with a
+learning-based multi-agent model predictive control scheduling approach,
+highlighting its enhanced performance, which results in water conservation and
+improved Irrigation Water Use Efficiency (IWUE). Notably, the proposed approach
+achieved 4.0% savings in irrigation water while enhancing the IWUE by 6.3%.
+
+</p>
+
+
+
+
+ + ☆ D5RL: Diverse Datasets for Data-Driven Deep Reinforcement Learning + + +
+ Offline reinforcement learning algorithms hold the promise of enabling +data-driven RL methods that do not require costly or dangerous real-world +exploration and benefit from large pre-collected datasets. This in turn can +facilitate real-world applications, as well as a more standardized approach to +RL research. Furthermore, offline RL methods can provide effective +initializations for online finetuning to overcome challenges with exploration. +However, evaluating progress on offline RL algorithms requires effective and +challenging benchmarks that capture properties of real-world tasks, provide a +range of task difficulties, and cover a range of challenges both in terms of +the parameters of the domain (e.g., length of the horizon, sparsity of rewards) +and the parameters of the data (e.g., narrow demonstration data or broad +exploratory data). While considerable progress in offline RL in recent years +has been enabled by simpler benchmark tasks, the most widely used datasets are +increasingly saturating in performance and may fail to reflect properties of +realistic tasks. We propose a new benchmark for offline RL that focuses on +realistic simulations of robotic manipulation and locomotion environments, +based on models of real-world robotic systems, and comprising a variety of data +sources, including scripted data, play-style data collected by human +teleoperators, and other data sources. Our proposed benchmark covers +state-based and image-based domains, and supports both offline RL and online +fine-tuning evaluation, with some of the tasks specifically designed to require +both pre-training and fine-tuning. We hope that our proposed benchmark will +facilitate further progress on both offline RL and fine-tuning algorithms. +Website with code, examples, tasks, and data is available at +\url{https://sites.google.com/view/d5rl/} + +
+
+ comment: RLC 2024 +
+
+
+
+
+ + ☆ Predictive uncertainty estimation in deep learning for lung carcinoma + classification in digital pathology under real dataset shifts + + +
+ Deep learning has shown tremendous progress in a wide range of digital +pathology and medical image classification tasks. Its integration into safe +clinical decision-making support requires robust and reliable models. However, +real-world data comes with diversities that often lie outside the intended +source distribution. Moreover, when test samples are dramatically different, +clinical decision-making is greatly affected. Quantifying predictive +uncertainty in models is crucial for well-calibrated predictions and +determining when (or not) to trust a model. Unfortunately, many works have +overlooked the importance of predictive uncertainty estimation. This paper +evaluates whether predictive uncertainty estimation adds robustness to deep +learning-based diagnostic decision-making systems. We investigate the effect of +various carcinoma distribution shift scenarios on predictive performance and +calibration. We first systematically investigate three popular methods for +improving predictive uncertainty: Monte Carlo dropout, deep ensemble, and +few-shot learning on lung adenocarcinoma classification as a primary disease in +whole slide images. Secondly, we compare the effectiveness of the methods in +terms of performance and calibration under clinically relevant distribution +shifts such as in-distribution shifts comprising primary disease sub-types and +other characterization analysis data; out-of-distribution shifts comprising +well-differentiated cases, different organ origin, and imaging modality shifts. +While studies on uncertainty estimation exist, to our best knowledge, no +rigorous large-scale benchmark compares predictive uncertainty estimation +including these dataset shifts for lung carcinoma classification. + +
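+ A minimal sketch of one of the three methods compared, Monte Carlo dropout:
+keep dropout active at test time and average the softmax outputs over several
+stochastic forward passes, using the spread across passes as an uncertainty
+signal. The model, feature sizes, and number of passes are placeholder
+assumptions.
+```python
+# Sketch: Monte Carlo dropout for predictive uncertainty (placeholder model).
+import torch
+import torch.nn as nn
+
+model = nn.Sequential(nn.Linear(128, 64), nn.ReLU(), nn.Dropout(p=0.5), nn.Linear(64, 3))
+
+def mc_dropout_predict(model: nn.Module, x: torch.Tensor, passes: int = 20):
+    model.train()                      # keep dropout stochastic at inference time
+    with torch.no_grad():
+        probs = torch.stack([torch.softmax(model(x), dim=-1) for _ in range(passes)])
+    return probs.mean(dim=0), probs.std(dim=0)   # prediction, per-class uncertainty
+
+x = torch.randn(4, 128)                # e.g. four tile-level feature vectors
+mean, std = mc_dropout_predict(model, x)
+print(mean.argmax(dim=-1), std.max(dim=-1).values)
+```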
+
+ comment: 17 pages, 2 figures, 5 tables +
+
+
+
+
+ + ☆ Random Gradient Masking as a Defensive Measure to Deep Leakage in + Federated Learning + + +
+ Federated Learning (FL), in theory, preserves the privacy of individual
+clients' data while producing quality machine learning models. However, attacks
+such as Deep Leakage from Gradients (DLG) severely question the practicality of
+FL. In this paper, we empirically evaluate the efficacy of four defensive
+methods against DLG: Masking, Clipping, Pruning, and Noising. Masking, while
+only previously studied as a way to compress information during parameter
+transfer, shows surprisingly robust defensive utility when compared to the
+other three established methods. Our experimentation is two-fold. We first
+evaluate the minimum hyperparameter threshold for each method across the MNIST,
+CIFAR-10, and LFW datasets. Then, we train FL clients with each method and
+their minimum threshold values to investigate the trade-off between DLG defense
+and training performance. Results reveal that Masking and Clipping show little
+to no degradation in performance while obfuscating enough information to
+effectively defend against DLG.
+
+</p>
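+ A small sketch of the masking defense evaluated above: before a client shares
+its update, zero out a random fraction of the gradient entries, limiting what a
+DLG-style attacker can reconstruct. The mask ratio and tensor shapes are
+illustrative assumptions.
+```python
+# Sketch: random gradient masking before parameter transfer in FL.
+import torch
+
+def mask_gradients(grads: list[torch.Tensor], mask_ratio: float = 0.8) -> list[torch.Tensor]:
+    """Zero out roughly `mask_ratio` of the entries in each gradient tensor."""
+    return [g * (torch.rand_like(g) >= mask_ratio).float() for g in grads]
+
+grads = [torch.randn(64, 32), torch.randn(64)]     # fake per-layer gradients
+masked = mask_gradients(grads, mask_ratio=0.8)
+kept = sum((m != 0).sum().item() for m in masked) / sum(g.numel() for g in grads)
+print(f"fraction of gradient entries shared: {kept:.2f}")   # roughly 0.20
+```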
+
+ comment: 13 pages, 5 figures, to be submitted to Applied Intelligence +
+
+
+
+
+ + ♻ ☆ On Model Compression for Neural Networks: Framework, Algorithm, and + Convergence Guarantee + + +
+ Model compression is a crucial part of deploying neural networks (NNs), +especially when the memory and storage of computing devices are limited in many +applications. This paper focuses on two model compression techniques: low-rank +approximation and weight pruning in neural networks, which are very popular +nowadays. However, training NN with low-rank approximation and weight pruning +always suffers significant accuracy loss and convergence issues. In this paper, +a holistic framework is proposed for model compression from a novel perspective +of nonconvex optimization by designing an appropriate objective function. Then, +we introduce NN-BCD, a block coordinate descent (BCD) algorithm to solve the +nonconvex optimization. One advantage of our algorithm is that an efficient +iteration scheme can be derived with closed-form, which is gradient-free. +Therefore, our algorithm will not suffer from vanishing/exploding gradient +problems. Furthermore, with the Kurdyka-{\L}ojasiewicz (K{\L}) property of our +objective function, we show that our algorithm globally converges to a critical +point at the rate of O(1/k), where k denotes the number of iterations. Lastly, +extensive experiments with tensor train decomposition and weight pruning +demonstrate the efficiency and superior performance of the proposed framework. +Our code implementation is available at https://github.com/ChenyangLi-97/NN-BCD + +
+
+ comment: 44 pages +
+
+
+
+
+ + ♻ ☆ A Distributed Privacy Preserving Model for the Detection of Alzheimer's + Disease + + +
+ BACKGROUND: Segmentation of medical data, concerns about personal health
+information (PHI) breaches, and the direct and indirect costs of consolidating
+and managing such segmented data should motivate diagnostic machine learning
+(DML) researchers to identify privacy-preserving machine learning algorithms
+that can train on distributed or decentralized datasets of different
+modalities. Federated learning models provide such a decentralized machine
+learning framework in which multiple investigators in possession of disparate
+datasets and working on different devices or servers can collaboratively train
+a global machine learning model without ever having to exchange local data and
+thus can meet statutory PHI protections. To this end, a vertical federated
+learning model is devised and tested for efficacy in the detection of
+Alzheimer's Disease (AD).
+ METHODS: The second version of Open Access Series of Imaging Studies -- with
+its panoply of demographic, imaging, and clinical assessment datasets -- was
+used to test a multimodal vertical federated learning (VFL) model for AD
+detection.
+ RESULTS: By training and validating this VFL model on the demographic,
+clinical, and MRI data in OASIS-2, an 82.9\% accuracy rate is achieved,
+consistent with previously reported results.
+ CONCLUSIONS: The VFL architecture proposed herein offers a novel distributed
+architecture, enabling collaborative learning across diverse sources of medical
+data while respecting statutory privacy constraints. By leveraging multiple
+modalities of data, the robustness and accuracy of AD detection can be
+enhanced. This model not only contributes to the advancement of federated
+learning techniques but also holds promise for overcoming the hurdles posed by
+data segmentation in medical research.
+
+</p>
+
+ comment: 17 pages, 7 figures, 2 tables +
+
+
+
+
+ + ♻ ☆ On the Impact of Uncertainty and Calibration on Likelihood-Ratio + Membership Inference Attacks + + +
+ In a membership inference attack (MIA), an attacker exploits the +overconfidence exhibited by typical machine learning models to determine +whether a specific data point was used to train a target model. In this paper, +we analyze the performance of the state-of-the-art likelihood ratio attack +(LiRA) within an information-theoretical framework that allows the +investigation of the impact of the aleatoric uncertainty in the true data +generation process, of the epistemic uncertainty caused by a limited training +data set, and of the calibration level of the target model. We compare three +different settings, in which the attacker receives decreasingly informative +feedback from the target model: confidence vector (CV) disclosure, in which the +output probability vector is released; true label confidence (TLC) disclosure, +in which only the probability assigned to the true label is made available by +the model; and decision set (DS) disclosure, in which an adaptive prediction +set is produced as in conformal prediction. We derive bounds on the advantage +of an MIA adversary with the aim of offering insights into the impact of +uncertainty and calibration on the effectiveness of MIAs. Simulation results +demonstrate that the derived analytical bounds predict well the effectiveness +of MIAs. + +
+
+ comment: 13 pages, 20 figures +
+
+
+
+
+ + ♻ ☆ Evetac: An Event-based Optical Tactile Sensor for Robotic Manipulation + + +
+ Optical tactile sensors have recently become popular. They provide high +spatial resolution, but struggle to offer fine temporal resolutions. To +overcome this shortcoming, we study the idea of replacing the RGB camera with +an event-based camera and introduce a new event-based optical tactile sensor +called Evetac. Along with hardware design, we develop touch processing +algorithms to process its measurements online at 1000 Hz. We devise an +efficient algorithm to track the elastomer's deformation through the imprinted +markers despite the sensor's sparse output. Benchmarking experiments +demonstrate Evetac's capabilities of sensing vibrations up to 498 Hz, +reconstructing shear forces, and significantly reducing data rates compared to +RGB optical tactile sensors. Moreover, Evetac's output and the marker tracking +provide meaningful features for learning data-driven slip detection and +prediction models. The learned models form the basis for a robust and adaptive +closed-loop grasp controller capable of handling a wide range of objects. We +believe that fast and efficient event-based tactile sensors like Evetac will be +essential for bringing human-like manipulation capabilities to robotics. The +sensor design is open-sourced at https://sites.google.com/view/evetac . + +
+
+ comment: Accepted at IEEE Transactions On Robotics. Project Website: + https://sites.google.com/view/evetac +
+
+
+
+
+ + ♻ ☆ The AI Scientist: Towards Fully Automated Open-Ended Scientific + Discovery + + +
+ One of the grand challenges of artificial general intelligence is developing +agents capable of conducting scientific research and discovering new knowledge. +While frontier models have already been used as aides to human scientists, e.g. +for brainstorming ideas, writing code, or prediction tasks, they still conduct +only a small part of the scientific process. This paper presents the first +comprehensive framework for fully automatic scientific discovery, enabling +frontier large language models to perform research independently and +communicate their findings. We introduce The AI Scientist, which generates +novel research ideas, writes code, executes experiments, visualizes results, +describes its findings by writing a full scientific paper, and then runs a +simulated review process for evaluation. In principle, this process can be +repeated to iteratively develop ideas in an open-ended fashion, acting like the +human scientific community. We demonstrate its versatility by applying it to +three distinct subfields of machine learning: diffusion modeling, +transformer-based language modeling, and learning dynamics. Each idea is +implemented and developed into a full paper at a cost of less than $15 per +paper. To evaluate the generated papers, we design and validate an automated +reviewer, which we show achieves near-human performance in evaluating paper +scores. The AI Scientist can produce papers that exceed the acceptance +threshold at a top machine learning conference as judged by our automated +reviewer. This approach signifies the beginning of a new era in scientific +discovery in machine learning: bringing the transformative benefits of AI +agents to the entire research process of AI itself, and taking us closer to a +world where endless affordable creativity and innovation can be unleashed on +the world's most challenging problems. Our code is open-sourced at +https://github.com/SakanaAI/AI-Scientist + +
+
+
+
+
+ + ♻ ☆ DPM: Clustering Sensitive Data through Separation + + +
+ Clustering is an important tool for data exploration where the goal is to
+subdivide a data set into disjoint clusters that fit well into the underlying
+data structure. When dealing with sensitive data, privacy-preserving algorithms
+aim to approximate the non-private baseline while minimising the leakage of
+sensitive information. State-of-the-art privacy-preserving clustering
+algorithms tend to output clusters that are good in terms of the standard
+metrics (inertia, silhouette score, and clustering accuracy); however, the
+clustering result strongly deviates from the non-private KMeans baseline. In
+this work, we present a privacy-preserving clustering algorithm called DPM
+that recursively separates a data set into clusters based on a geometrical
+clustering approach. In addition, DPM estimates most of the data-dependent
+hyper-parameters in a privacy-preserving way. We prove that DPM preserves
+Differential Privacy and analyse the utility guarantees of DPM. Finally, we
+conduct an extensive empirical evaluation for synthetic and real-life data
+sets. We show that DPM achieves state-of-the-art utility on the standard
+clustering metrics and yields a clustering result much closer to that of the
+popular non-private KMeans algorithm without requiring the number of classes.
+
+</p>
+
+ comment: The first two authors equally contributed to this work +
+
+
+
+
+ + ♻ ☆ Deep Learning Innovations for Underwater Waste Detection: An In-Depth + Analysis + + +
+ Addressing the issue of submerged underwater trash is crucial for
+safeguarding aquatic ecosystems and preserving marine life. While identifying
+debris present on the surface of water bodies is straightforward, assessing the
+underwater submerged waste is a challenge due to the image distortions caused
+by factors such as light refraction, absorption, suspended particles, color
+shifts, and occlusion. This paper conducts a comprehensive review of
+state-of-the-art architectures and existing datasets to establish a baseline
+for submerged waste and trash detection. The primary goal remains to establish
+a benchmark for the object localization techniques to be leveraged by advanced
+underwater sensors and autonomous underwater vehicles. The ultimate objective
+is to explore the underwater environment and to identify and remove underwater
+debris. The absence of benchmarks (dataset or algorithm) in many studies
+emphasizes the need for a more robust algorithmic solution. Through this
+research, we aim to give a comparative performance analysis of various
+underwater trash detection algorithms.
+
+</p>
+
+
+
+
+ + ♻ ☆ Examining Common Paradigms in Multi-Task Learning + + +
+ While multi-task learning (MTL) has gained significant attention in recent
+years, its underlying mechanisms remain poorly understood. Recent methods did
+not yield consistent performance improvements over single task learning (STL)
+baselines, underscoring the importance of gaining more profound insights about
+challenges specific to MTL. In our study, we investigate paradigms in MTL in
+the context of STL: First, the impact of the choice of optimizer has only been
+mildly investigated in MTL. We empirically show the pivotal role of common STL
+tools such as the Adam optimizer in MTL in various experiments. To further
+investigate Adam's effectiveness, we theoretically derive a partial loss-scale
+invariance under mild assumptions. Second, the notion of gradient conflicts has
+often been phrased as a specific problem in MTL. We delve into the role of
+gradient conflicts in MTL and compare it to STL. For angular gradient
+alignment, we find no evidence that this is a problem unique to MTL. We
+emphasize differences in gradient magnitude as the main distinguishing factor.
+Overall, we find surprising similarities between STL and MTL, suggesting that
+methods from both fields should be considered in a broader context.
+
+</p>
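+ A small sketch of the quantities contrasted above: the cosine alignment and
+magnitude ratio between two task gradients on shared parameters, the usual
+diagnostics for "gradient conflicts". The toy losses and parameter shapes are
+illustrative assumptions.
+```python
+# Sketch: angular alignment and magnitude ratio between task gradients.
+import torch
+
+def gradient_alignment(g1: torch.Tensor, g2: torch.Tensor):
+    cos = torch.nn.functional.cosine_similarity(g1.flatten(), g2.flatten(), dim=0)
+    return cos.item(), (g1.norm() / (g2.norm() + 1e-12)).item()
+
+shared = torch.randn(256, requires_grad=True)
+loss_task1 = (shared ** 2).sum()                 # placeholder task losses
+loss_task2 = (shared * 3.0).sum()
+g1 = torch.autograd.grad(loss_task1, shared, retain_graph=True)[0]
+g2 = torch.autograd.grad(loss_task2, shared)[0]
+cos, ratio = gradient_alignment(g1, g2)
+print(f"cosine alignment: {cos:.3f}, magnitude ratio: {ratio:.3f}")
+```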
+
+ comment: Accepted for publication in German Conference for Pattern Recognition + (GCPR), 2024 +
+
+
+
+
+ + ♻ ☆ Exact Tensor Completion Powered by Slim Transforms + + +
+ In this work, a tensor completion problem is studied, which aims to perfectly +recover the tensor from partial observations. The existing theoretical +guarantee requires the involved transform to be orthogonal, which hinders its +applications. In this paper, jumping out of the constraints of isotropy and +self-adjointness, the theoretical guarantee of exact tensor completion with +arbitrary linear transforms is established by directly operating the tensors in +the transform domain. With the enriched choices of transforms, a new analysis +obtained by the proof discloses why slim transforms outperform their square +counterparts from a theoretical level. Our model and proof greatly enhance the +flexibility of tensor completion and extensive experiments validate the +superiority of the proposed method. + +
+
+
+
+
+ + ♻ ☆ Deep Learning: a Heuristic Three-stage Mechanism for Grid Searches to + Optimize the Future Risk Prediction of Breast Cancer Metastasis Using + EHR-based Clinical Data + + +
+ A grid search, at the cost of training and testing a large number of models, +is an effective way to optimize the prediction performance of deep learning +models. A challenging task concerning grid search is the time management. +Without a good time management scheme, a grid search can easily be set off as a +mission that will not finish in our lifetime. In this study, we introduce a +heuristic three-stage mechanism for managing the running time of low-budget +grid searches, and the sweet-spot grid search (SSGS) and randomized grid search +(RGS) strategies for improving model prediction performance, in predicting the +5-year, 10-year, and 15-year risk of breast cancer metastasis. We develop deep +feedforward neural network (DFNN) models and optimize them through grid +searches. We conduct eight cycles of grid searches by applying our three-stage +mechanism and SSGS and RGS strategies. We conduct various SHAP analyses +including unique ones that interpret the importance of the DFNN-model +hyperparameters. Our results show that grid search can greatly improve model +prediction. The grid searches we conducted improved the risk prediction of +5-year, 10-year, and 15-year breast cancer metastasis by 18.6%, 16.3%, and +17.3% respectively, over the average performance of all corresponding models we +trained using the RGS strategy. We not only demonstrate best model performance +but also characterize grid searches from various aspects such as their +capabilities of discovering decent models and the unit grid search time. The +three-stage mechanism worked effectively. It made our low-budget grid searches +feasible and manageable, and in the meantime helped improve model prediction +performance. Our SHAP analyses identified both clinical risk factors important +for the prediction of future risk of breast cancer metastasis, and DFNN-model +hyperparameters important to the prediction of performance scores. + +
+
+
+
+
+ + ♻ ☆ Separable Hamiltonian Neural Networks + + +
+ Hamiltonian neural networks (HNNs) are state-of-the-art models that regress +the vector field of a dynamical system under the learning bias of Hamilton's +equations. A recent observation is that embedding a bias regarding the additive +separability of the Hamiltonian reduces the regression complexity and improves +regression performance. We propose separable HNNs that embed additive +separability within HNNs using observational, learning, and inductive biases. +We show that the proposed models are more effective than the HNN at regressing +the Hamiltonian and the vector field. Consequently, the proposed models predict +the dynamics and conserve the total energy of the Hamiltonian system more +accurately. + +
+
+ comment: 13 pages +
+
+
+
+
+ + ♻ ☆ PlainMamba: Improving Non-Hierarchical Mamba in Visual Recognition BMVC 2024 + + +
+ We present PlainMamba: a simple non-hierarchical state space model (SSM)
+designed for general visual recognition. The recent Mamba model has shown how
+SSMs can be highly competitive with other architectures on sequential data and
+initial attempts have been made to apply it to images. In this paper, we
+further adapt the selective scanning process of Mamba to the visual domain,
+enhancing its ability to learn features from two-dimensional images by (i) a
+continuous 2D scanning process that improves spatial continuity by ensuring
+adjacency of tokens in the scanning sequence, and (ii) direction-aware updating
+which enables the model to discern the spatial relations of tokens by encoding
+directional information. Our architecture is designed to be easy to use and
+easy to scale, formed by stacking identical PlainMamba blocks, resulting in a
+model with constant width throughout all layers. The architecture is further
+simplified by removing the need for special tokens. We evaluate PlainMamba on a
+variety of visual recognition tasks, achieving performance gains over previous
+non-hierarchical models while remaining competitive with hierarchical
+alternatives. For tasks requiring high-resolution inputs, in particular,
+PlainMamba requires much less computing while maintaining high performance.
+Code and models are available at: https://github.com/ChenhongyiYang/PlainMamba .
+
+</p>
+
+ comment: Accepted to BMVC 2024 +
+
+
+
+
+ + ♻ ☆ Two Completely Parameter-Free Alternating Gradient Projection Algorithms + for Nonconvex-(strongly) Concave Minimax Problems + + +
+ Due to their importance in various emerging applications, efficient
+algorithms for solving minimax problems have recently received increasing
+attention. However, many existing algorithms require prior knowledge of the
+problem parameters in order to achieve optimal iteration complexity. In this
+paper, we propose two completely parameter-free alternating gradient projection
+algorithms, i.e., the PF-AGP-NSC algorithm and the PF-AGP-NC algorithm, to
+solve the smooth nonconvex-strongly concave and nonconvex-concave minimax
+problems respectively using a backtracking strategy, which does not require
+prior knowledge of parameters such as the Lipschitz constant $L$ or the
+strongly concave constant $\mu$. Moreover, we show that the total number of
+gradient calls of the PF-AGP-NSC algorithm and the PF-AGP-NC algorithm to
+obtain an $\varepsilon$-stationary point is upper bounded by $\mathcal{O}\left(
+L\kappa^3\varepsilon^{-2} \right)$ and $\mathcal{O}\left( L^4\varepsilon^{-4}
+\right)$ respectively, where $\kappa$ is the condition number. As far as we
+know, the PF-AGP-NSC algorithm and the PF-AGP-NC algorithm are the first
+completely parameter-free algorithms for solving nonconvex-strongly concave
+minimax problems and nonconvex-concave minimax problems respectively. Numerical
+results validate the efficiency of the proposed PF-AGP algorithms.
+
+</p>
+
+
+
+
+ + ♻ ☆ Characterizing Multimodal Long-form Summarization: A Case Study on + Financial Reports + + +
+ As large language models (LLMs) expand the power of natural language +processing to handle long inputs, rigorous and systematic analyses are +necessary to understand their abilities and behavior. A salient application is +summarization, due to its ubiquity and controversy (e.g., researchers have +declared the death of summarization). In this paper, we use financial report +summarization as a case study because financial reports are not only long but +also use numbers and tables extensively. We propose a computational framework +for characterizing multimodal long-form summarization and investigate the +behavior of Claude 2.0/2.1, GPT-4/3.5, and Cohere. We find that GPT-3.5 and +Cohere fail to perform this summarization task meaningfully. For Claude 2 and +GPT-4, we analyze the extractiveness of the summary and identify a position +bias in LLMs. This position bias disappears after shuffling the input for +Claude, which suggests that Claude seems to recognize important information. We +also conduct a comprehensive investigation on the use of numeric data in +LLM-generated summaries and offer a taxonomy of numeric hallucination. We +employ prompt engineering to improve GPT-4's use of numbers with limited +success. Overall, our analyses highlight the strong capability of Claude 2 in +handling long multimodal inputs compared to GPT-4. The generated summaries and +evaluation code are available at +https://github.com/ChicagoHAI/characterizing-multimodal-long-form-summarization. + +
+
+
+
+
+ + ♻ ☆ End-to-end Autonomous Driving: Challenges and Frontiers + + +
+ The autonomous driving community has witnessed a rapid growth in approaches
+that embrace an end-to-end algorithm framework, utilizing raw sensor input to
+generate vehicle motion plans, instead of concentrating on individual tasks
+such as detection and motion prediction. End-to-end systems, in comparison to
+modular pipelines, benefit from joint feature optimization for perception and
+planning. This field has flourished due to the availability of large-scale
+datasets, closed-loop evaluation, and the increasing need for autonomous
+driving algorithms to perform effectively in challenging scenarios. In this
+survey, we provide a comprehensive analysis of more than 270 papers, covering
+the motivation, roadmap, methodology, challenges, and future trends in
+end-to-end autonomous driving. We delve into several critical challenges,
+including multi-modality, interpretability, causal confusion, robustness, and
+world models, amongst others. Additionally, we discuss current advancements in
+foundation models and visual pre-training, as well as how to incorporate these
+techniques within the end-to-end driving framework. We maintain an active
+repository that contains up-to-date literature and open-source projects at
+https://github.com/OpenDriveLab/End-to-end-Autonomous-Driving.
+
+</p>
+
+ comment: Accepted by IEEE TPAMI +
+
+
+
+
+ + ♻ ☆ Inferring Effect Ordering Without Causal Effect Estimation + + +
+ Predictive models are often employed to guide interventions across various +domains, such as advertising, customer retention, and personalized medicine. +These models often do not estimate the actual effects of interventions but +serve as proxies, suggesting potential effectiveness based on predicted +outcomes. Our paper addresses the critical question of when and how these +predictive models can be interpreted causally, specifically focusing on using +the models for inferring effect ordering rather than precise effect sizes. We +formalize two assumptions, full latent mediation and latent monotonicity, that +are jointly sufficient for inferring effect ordering without direct causal +effect estimation. We explore the utility of these assumptions in assessing the +feasibility of proxies for inferring effect ordering in scenarios where there +is no data on how individuals behave when intervened or no data on the primary +outcome of interest. Additionally, we provide practical guidelines for +practitioners to make their own assessments about proxies. Our findings reveal +not only when it is possible to reasonably infer effect ordering from proxies, +but also conditions under which modeling these proxies can outperform direct +effect estimation. This study underscores the importance of broadening causal +inference to encompass alternative causal interpretations beyond effect +estimation, offering a foundation for future research to enhance +decision-making processes when direct effect estimation is not feasible. + +
+
+
+
+
+ + ♻ ☆ Learning From Scenarios for Stochastic Repairable Scheduling + + +
+ When optimizing problems with uncertain parameter values in a linear +objective, decision-focused learning enables end-to-end learning of these +values. We are interested in a stochastic scheduling problem, in which +processing times are uncertain, which brings uncertain values in the +constraints, and thus repair of an initial schedule may be needed. Historical +realizations of the stochastic processing times are available. We show how +existing decision-focused learning techniques based on stochastic smoothing can +be adapted to this scheduling problem. We include an extensive experimental +evaluation to investigate in which situations decision-focused learning +outperforms the state of the art for such situations: scenario-based stochastic +optimization. + +
+
+ comment: 8 pages, updated according to camera-ready version CPAIOR'24 +
+
+
+
+
+ + ♻ ☆ Enabling Regional Explainability by Automatic and Model-agnostic Rule + Extraction + + +
+ In Explainable AI, rule extraction translates model knowledge into logical +rules, such as IF-THEN statements, crucial for understanding patterns learned +by black-box models. This could significantly aid in fields like disease +diagnosis, disease progression estimation, or drug discovery. However, such +application domains often contain imbalanced data, with the class of interest +underrepresented. Existing methods inevitably compromise the performance of +rules for the minor class to maximise the overall performance. As the first +attempt in this field, we propose a model-agnostic approach for extracting +rules from specific subgroups of data, featuring automatic rule generation for +numerical features. This method enhances the regional explainability of machine +learning models and offers wider applicability compared to existing methods. We +additionally introduce a new method for selecting features to compose rules, +reducing computational costs in high-dimensional spaces. Experiments across +various datasets and models demonstrate the effectiveness of our methods. + +
+
+
+
+
+ + ♻ ☆ Identifying Important Group of Pixels using Interactions CVPR 2024 + + +
+ To better understand the behavior of image classifiers, it is useful to +visualize the contribution of individual pixels to the model prediction. In +this study, we propose a method, MoXI ($\textbf{Mo}$del e$\textbf{X}$planation +by $\textbf{I}$nteractions), that efficiently and accurately identifies a group +of pixels with high prediction confidence. The proposed method employs +game-theoretic concepts, Shapley values and interactions, taking into account +the effects of individual pixels and the cooperative influence of pixels on +model confidence. Theoretical analysis and experiments demonstrate that our +method better identifies the pixels that are highly contributing to the model +outputs than widely-used visualization by Grad-CAM, Attention rollout, and +Shapley value. While prior studies have suffered from the exponential +computational cost in the computation of Shapley value and interactions, we +show that this can be reduced to quadratic cost for our task. The code is +available at https://github.com/KosukeSumiyasu/MoXI. + +
+
+ comment: CVPR 2024 (update: minor typos, new references, Eqs. (12) and (13)) +
+
+
+
+
+ + ♻ ☆ A Spitting Image: Modular Superpixel Tokenization in Vision Transformers ECCV + + +
+ Vision Transformer (ViT) architectures traditionally employ a grid-based +approach to tokenization independent of the semantic content of an image. We +propose a modular superpixel tokenization strategy which decouples tokenization +and feature extraction; a shift from contemporary approaches where these are +treated as an undifferentiated whole. Using on-line content-aware tokenization +and scale- and shape-invariant positional embeddings, we perform experiments +and ablations that contrast our approach with patch-based tokenization and +randomized partitions as baselines. We show that our method significantly +improves the faithfulness of attributions, gives pixel-level granularity on +zero-shot unsupervised dense prediction tasks, while maintaining predictive +performance in classification tasks. Our approach provides a modular +tokenization framework commensurable with standard architectures, extending the +space of ViTs to a larger class of semantically-rich models. + +
+
+ comment: To appear in ECCV (MELEX) 2024 Workshop Proceedings +
+
+
+
+
+ + ♻ ☆ Label Dropout: Improved Deep Learning Echocardiography Segmentation + Using Multiple Datasets With Domain Shift and Partial Labelling MICCAI 2024 + + +
+ Echocardiography (echo) is the first imaging modality used when assessing +cardiac function. The measurement of functional biomarkers from echo relies +upon the segmentation of cardiac structures and deep learning models have been +proposed to automate the segmentation process. However, in order to translate +these tools to widespread clinical use it is important that the segmentation +models are robust to a wide variety of images (e.g. acquired from different +scanners, by operators with different levels of expertise etc.). To achieve +this level of robustness it is necessary that the models are trained with +multiple diverse datasets. A significant challenge faced when training with +multiple diverse datasets is the variation in label presence, i.e. the combined +data are often partially-labelled. Adaptations of the cross entropy loss +function have been proposed to deal with partially labelled data. In this paper +we show that training naively with such a loss function and multiple diverse +datasets can lead to a form of shortcut learning, where the model associates +label presence with domain characteristics, leading to a drop in performance. +To address this problem, we propose a novel label dropout scheme to break the +link between domain characteristics and the presence or absence of labels. We +demonstrate that label dropout improves echo segmentation Dice score by 62% and +25% on two cardiac structures when training using multiple diverse partially +labelled datasets. + +
+
+ comment: 10 pages, 5 figures, ASMUS 2024, Held in Conjunction with MICCAI 2024 +
+
+
+
+
+ + ♻ ☆ EUvsDisinfo: a Dataset for Multilingual Detection of Pro-Kremlin + Disinformation in News Articles CIKM 2024 + + +
+ This work introduces EUvsDisinfo, a multilingual dataset of trustworthy and +disinformation articles related to pro-Kremlin themes. It is sourced directly +from the debunk articles written by experts leading the EUvsDisinfo project. +Our dataset is the largest to-date resource in terms of the overall number of +articles and distinct languages. It also provides the largest topical and +temporal coverage. Using this dataset, we investigate the dissemination of +pro-Kremlin disinformation across different languages, uncovering +language-specific patterns targeting specific disinformation topics. We further +analyse the evolution of topic distribution over an eight-year period, noting a +significant surge in disinformation content before the full-scale invasion of +Ukraine in 2022. Lastly, we demonstrate the dataset's applicability in training +models to effectively distinguish between disinformation and trustworthy +content in multilingual settings. + +
+
+ comment: Published at CIKM 2024 +
+
+
+
+
+ + ♻ ☆ Scalable Learning of Item Response Theory Models AISTATS 2024 + + +
+ Item Response Theory (IRT) models aim to assess latent abilities of $n$ +examinees along with latent difficulty characteristics of $m$ test items from +categorical data that indicates the quality of their corresponding answers. +Classical psychometric assessments are based on a relatively small number of +examinees and items, say a class of $200$ students solving an exam comprising +$10$ problems. More recent global large scale assessments such as PISA, or +internet studies, may lead to significantly increased numbers of participants. +Additionally, in the context of Machine Learning where algorithms take the role +of examinees and data analysis problems take the role of items, both $n$ and +$m$ may become very large, challenging the efficiency and scalability of +computations. To learn the latent variables in IRT models from large data, we +leverage the similarity of these models to logistic regression, which can be +approximated accurately using small weighted subsets called coresets. We +develop coresets for their use in alternating IRT training algorithms, +facilitating scalable learning from large data. + +
+
+ comment: Published in AISTATS 2024. V2: References updated +
+
+
+
+
+ + ♻ ☆ Asymmetrical estimator for training encapsulated deep photonic neural + networks + + +
+ Scalable isomorphic physical neural networks (PNNs) are an emerging NN
+acceleration paradigm owing to their high-bandwidth, in-propagation
+computation. Although backpropagation (BP)-based training is often the industry
+standard for its robustness and fast gradient convergence, existing BP-PNN
+training methods need to truncate the propagation of the analogue signal at
+each layer and acquire accurate hidden neuron readouts for deep networks. This
+compromises the incentive of PNNs for fast in-propagation processing. In
+addition, the required readouts introduce massive bottlenecks due to the
+conversions between the analogue-digital interfaces to shuttle information
+across. These factors limit both the time and energy efficiency during
+training. Here we introduce the asymmetrical training (AT) method, a BP-based
+method that can perform training on an encapsulated deep network, where the
+information propagation is maintained within the analogue domain until the
+output layer. AT's minimal information access bypasses the analogue-digital
+interface bottleneck wherever possible. For any deep network structure, AT
+offers significantly improved time and energy efficiency compared to existing
+BP-PNN methods, and scales well for large network sizes. We demonstrated AT's
+error-tolerant and calibration-free training for encapsulated integrated
+photonic deep networks to achieve near-ideal BP performance. AT's well-behaved
+training is demonstrated repeatably across different datasets and network
+structures.
+
+</p>
+
+ comment: 21 pages, 6 figures +
+
+
+
+
+ + ♻ ☆ A Semantic Space is Worth 256 Language Descriptions: Make Stronger + Segmentation Models with Descriptive Properties ECCV 2024 + + +
+ This paper introduces ProLab, a novel approach using property-level label +space for creating strong interpretable segmentation models. Instead of relying +solely on category-specific annotations, ProLab uses descriptive properties +grounded in common sense knowledge for supervising segmentation models. It is +based on two core designs. First, we employ Large Language Models (LLMs) and +carefully crafted prompts to generate descriptions of all involved categories +that carry meaningful common sense knowledge and follow a structured format. +Second, we introduce a description embedding model preserving semantic +correlation across descriptions and then cluster them into a set of descriptive +properties (e.g., 256) using K-Means. These properties are based on +interpretable common sense knowledge consistent with theories of human +recognition. We empirically show that our approach makes segmentation models +perform stronger on five classic benchmarks (e.g., ADE20K, COCO-Stuff, Pascal +Context, Cityscapes, and BDD). Our method also shows better scalability with +extended training steps than category-level supervision. Our interpretable +segmentation framework also emerges with the generalization ability to segment +out-of-domain or unknown categories using only in-domain descriptive +properties. Code is available at https://github.com/lambert-x/ProLab. + +
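+ A rough sketch of the second design step: embed the LLM-generated category
+descriptions and cluster them with K-Means into a fixed set of descriptive
+properties. TF-IDF stands in here for the description embedding model, the six
+descriptions are invented, and only the idea of clustering into a small
+property space mirrors the paper.
+```python
+# Sketch: cluster category descriptions into a property-level label space.
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.cluster import KMeans
+
+descriptions = [
+    "a hard flat surface for vehicles to drive on",
+    "a paved path along the side of a street for pedestrians",
+    "a four-wheeled motor vehicle used for transportation",
+    "a two-wheeled pedal-powered vehicle",
+    "a tall woody plant with a trunk and leaves",
+    "low green plants covering the ground",
+]
+embeddings = TfidfVectorizer().fit_transform(descriptions)
+num_properties = 3     # the paper clusters into e.g. 256 descriptive properties
+kmeans = KMeans(n_clusters=num_properties, n_init=10, random_state=0).fit(embeddings)
+for prop, desc in zip(kmeans.labels_, descriptions):
+    print(prop, desc)  # each description mapped to a property id
+```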
+
+ comment: Accepted to ECCV 2024. Code is available at + https://github.com/lambert-x/ProLab +
+
+
+
+
+ + ♻ ☆ Conditional Fairness for Generative AIs + + +
+ The deployment of generative AI (GenAI) models raises significant fairness +concerns, addressed in this paper through novel characterization and +enforcement techniques specific to GenAI. Unlike standard AI performing +specific tasks, GenAI's broad functionality requires "conditional fairness" +tailored to the context being generated, such as demographic fairness in +generating images of poor people versus successful business leaders. We define +two fairness levels: the first evaluates fairness in generated outputs, +independent of prompts and models; the second assesses inherent fairness with +neutral prompts. Given the complexity of GenAI and challenges in fairness +specifications, we focus on bounding the worst case, considering a GenAI system +unfair if the distance between appearances of a specific group exceeds preset +thresholds. We also explore combinatorial testing for accessing relative +completeness in intersectional fairness. By bounding the worst case, we develop +a prompt injection scheme within an agent-based framework to enforce +conditional fairness with minimal intervention, validated on state-of-the-art +GenAI systems. + +
+
+
+
+
+ + ♻ ☆ Compressed Federated Reinforcement Learning with a Generative Model ECML-PKDD 2024 + + +
+ Reinforcement learning has recently gained unprecedented popularity, yet it +still grapples with sample inefficiency. Addressing this challenge, federated +reinforcement learning (FedRL) has emerged, wherein agents collaboratively +learn a single policy by aggregating local estimations. However, this +aggregation step incurs significant communication costs. In this paper, we +propose CompFedRL, a communication-efficient FedRL approach incorporating both +\textit{periodic aggregation} and (direct/error-feedback) compression +mechanisms. Specifically, we consider compressed federated $Q$-learning with a +generative model setup, where a central server learns an optimal $Q$-function +by periodically aggregating compressed $Q$-estimates from local agents. For the +first time, we characterize the impact of these two mechanisms (which have +remained elusive) by providing a finite-time analysis of our algorithm, +demonstrating strong convergence behaviors when utilizing either direct or +error-feedback compression. Our bounds indicate improved solution accuracy +concerning the number of agents and other federated hyperparameters while +simultaneously reducing communication costs. To corroborate our theory, we also +conduct in-depth numerical experiments to verify our findings, considering +Top-$K$ and Sparsified-$K$ sparsification operators. + +
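+ A small sketch of the Top-$K$ sparsification operator considered in the
+experiments: keep only the K largest-magnitude entries of a local Q-estimate
+(or its update) before sending it to the server. Shapes and K are illustrative
+assumptions.
+```python
+# Sketch: Top-K sparsification of a local Q-estimate before communication.
+import torch
+
+def top_k_sparsify(q_values: torch.Tensor, k: int) -> torch.Tensor:
+    """Keep the k largest-magnitude entries of q_values and zero the rest."""
+    flat = q_values.flatten()
+    idx = flat.abs().topk(k).indices
+    sparse = torch.zeros_like(flat)
+    sparse[idx] = flat[idx]
+    return sparse.view_as(q_values)
+
+q_local = torch.randn(20, 4)            # 20 states x 4 actions
+compressed = top_k_sparsify(q_local, k=8)
+print((compressed != 0).sum().item(), "of", q_local.numel(), "entries transmitted")
+```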
+
+ comment: European Conference on Machine Learning and Principles and Practice + of Knowledge Discovery in Databases (ECML-PKDD 2024) +
+
+
+
+
+ + ♻ ☆ RNNs, CNNs and Transformers in Human Action Recognition: A Survey and a + Hybrid Model + + +
+ Human Action Recognition (HAR) encompasses the task of monitoring human +activities across various domains, including but not limited to medical, +educational, entertainment, visual surveillance, video retrieval, and the +identification of anomalous activities. Over the past decade, the field of HAR +has witnessed substantial progress by leveraging Convolutional Neural Networks +(CNNs) to effectively extract and comprehend intricate information, thereby +enhancing the overall performance of HAR systems. Recently, the domain of +computer vision has witnessed the emergence of Vision Transformers (ViTs) as a +potent solution. The efficacy of transformer architecture has been validated +beyond the confines of image analysis, extending their applicability to diverse +video-related tasks. Notably, within this landscape, the research community has +shown keen interest in HAR, acknowledging its manifold utility and widespread +adoption across various domains. This article aims to present an encompassing +survey that focuses on CNNs and the evolution of Recurrent Neural Networks +(RNNs) to ViTs given their importance in the domain of HAR. By conducting a +thorough examination of existing literature and exploring emerging trends, this +study undertakes a critical analysis and synthesis of the accumulated knowledge +in this field. Additionally, it investigates the ongoing efforts to develop +hybrid approaches. Following this direction, this article presents a novel +hybrid model that seeks to integrate the inherent strengths of CNNs and ViTs. + +
+
+
+
+
+ + ♻ ☆ Research on the Spatial Data Intelligent Large Model + + +
+ This report focuses on spatial data intelligent large models, delving into +the principles, methods, and cutting-edge applications of these models. It +provides an in-depth discussion on the definition, development history, current +status, and trends of spatial data intelligent large models, as well as the +challenges they face. The report systematically elucidates the key technologies +of spatial data intelligent large models and their applications in urban +environments, aerospace remote sensing, geography, transportation, and other +scenarios. Additionally, it summarizes the latest application cases of spatial +data intelligent large models in themes such as urban development, multimodal +systems, remote sensing, smart transportation, and resource environments. +Finally, the report concludes with an overview and outlook on the development +prospects of spatial data intelligent large models. + +
+
+ comment: V1 and V2 are in Chinese language, other versions are in English +
+
+
+
+
+ + ♻ ☆ Training-free Graph Neural Networks and the Power of Labels as Features + + +
+ We propose training-free graph neural networks (TFGNNs), which can be used
+without training and can also be improved with optional training, for
+transductive node classification. We first advocate labels as features (LaF),
+an admissible but largely unexplored technique. We show that LaF provably
+enhances the expressive power of graph neural networks. We design TFGNNs based
+on this analysis. In the experiments, we confirm that TFGNNs outperform
+existing GNNs in the training-free setting and converge with far fewer
+training iterations than traditional GNNs.
+
+</p>
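+ A small sketch of the labels-as-features (LaF) idea: append one-hot encodings
+of the training labels to the node features, with all-zero label vectors for
+nodes outside the training set. The shapes and example graph below are
+illustrative assumptions.
+```python
+# Sketch: labels as features for transductive node classification.
+import numpy as np
+
+def labels_as_features(x: np.ndarray, y: np.ndarray, train_mask: np.ndarray,
+                       num_classes: int) -> np.ndarray:
+    label_feats = np.zeros((x.shape[0], num_classes))
+    label_feats[train_mask, y[train_mask]] = 1.0   # one-hot only for training nodes
+    return np.concatenate([x, label_feats], axis=1)
+
+x = np.random.rand(6, 4)                           # 6 nodes, 4 original features
+y = np.array([0, 2, 1, 1, 0, 2])
+train_mask = np.array([True, True, False, True, False, False])
+print(labels_as_features(x, y, train_mask, num_classes=3).shape)   # (6, 7)
+```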
+
+ comment: TMLR 2024 +
+
+
+
+
+ + ♻ ☆ LiD-FL: Towards List-Decodable Federated Learning + + +
+ Federated learning is often used in environments with many unverified +participants. Therefore, federated learning under adversarial attacks receives +significant attention. This paper proposes an algorithmic framework for +list-decodable federated learning, where a central server maintains a list of +models, with at least one guaranteed to perform well. The framework has no +strict restriction on the fraction of honest workers, extending the +applicability of Byzantine federated learning to the scenario with more than +half adversaries. Under proper assumptions on the loss function, we prove a +convergence theorem for our method. Experimental results, including image +classification tasks with both convex and non-convex losses, demonstrate that +the proposed algorithm can withstand the malicious majority under various +attacks. + +
+
+ comment: 26 pages, 5 figures +
+
+
+
+
+ + ♻ ☆ Probabilistic Load Forecasting Based on Adaptive Online Learning + + +
+ Load forecasting is crucial for multiple energy management tasks such as +scheduling generation capacity, planning supply and demand, and minimizing +energy trade costs. Such relevance has increased even more in recent years due +to the integration of renewable energies, electric cars, and microgrids. +Conventional load forecasting techniques obtain single-value load forecasts by +exploiting consumption patterns of past load demand. However, such techniques +cannot assess intrinsic uncertainties in load demand, and cannot capture +dynamic changes in consumption patterns. To address these problems, this paper +presents a method for probabilistic load forecasting based on the adaptive +online learning of hidden Markov models. We propose learning and forecasting +techniques with theoretical guarantees, and experimentally assess their +performance in multiple scenarios. In particular, we develop adaptive online +learning techniques that update model parameters recursively, and sequential +prediction techniques that obtain probabilistic forecasts using the most recent +parameters. The performance of the method is evaluated using multiple datasets +corresponding with regions that have different sizes and display assorted +time-varying consumption patterns. The results show that the proposed method +can significantly improve the performance of existing techniques for a wide +range of scenarios. + +
+
+ comment: © 2021 IEEE. Personal use of this material is permitted. + Permission from IEEE must be obtained for all other uses, in any current or + future media, including reprinting/republishing this material for advertising + or promotional purposes, creating new collective works, for resale or + redistribution to servers or lists, or reuse of any copyrighted component of + this work in other works +
+
+
+
+
+ + ♻ ☆ PsyDI: Towards a Personalized and Progressively In-depth Chatbot for + Psychological Measurements + + +
+ In the field of psychology, traditional assessment methods, such as +standardized scales, are frequently critiqued for their static nature, lack of +personalization, and reduced participant engagement, while comprehensive +counseling evaluations are often inaccessible. The complexity of quantifying +psychological traits further limits these methods. Despite advances with large +language models (LLMs), many still depend on single-round Question-and-Answer +interactions. To bridge this gap, we introduce PsyDI, a personalized and +progressively in-depth chatbot designed for psychological measurements, +exemplified by its application in the Myers-Briggs Type Indicator (MBTI) +framework. PsyDI leverages user-related multi-modal information and engages in +customized, multi-turn interactions to provide personalized, easily accessible +measurements, while ensuring precise MBTI type determination. To address the +challenge of unquantifiable psychological traits, we introduce a novel training +paradigm that involves learning the ranking of proxy variables associated with +these traits, culminating in a robust score model for MBTI measurements. The +score model enables PsyDI to conduct comprehensive and precise measurements +through multi-turn interactions within a unified estimation context. Through +various experiments, we validate the efficacy of both the score model and the +PsyDI pipeline, demonstrating its potential to serve as a general framework for +psychological measurements. Furthermore, the online deployment of PsyDI has +garnered substantial user engagement, with over 3,000 visits, resulting in the +collection of numerous multi-turn dialogues annotated with MBTI types, which +facilitates further research. The source code for the training and web service +components is publicly available as a part of OpenDILab at: +https://github.com/opendilab/PsyDI + +
+
+ comment: 29 pages, 15 figures +
+
+
+
+
+ + ♻ ☆ Judgement Citation Retrieval using Contextual Similarity + + +
+ Traditionally in the domain of legal research, the retrieval of pertinent +citations from intricate case descriptions has demanded manual effort and +keyword-based search applications that mandate expertise in understanding legal +jargon. Legal case descriptions hold pivotal information for legal +professionals and researchers, necessitating more efficient and automated +approaches. We propose a methodology that combines natural language processing +(NLP) and machine learning techniques to enhance the organization and +utilization of legal case descriptions. This approach revolves around the +creation of textual embeddings with the help of state-of-the-art embedding models. +Our methodology addresses two primary objectives: unsupervised clustering and +supervised citation retrieval, both designed to automate the citation +extraction process. Although the proposed methodology can be used for any +dataset, we employed the Supreme Court of the United States (SCOTUS) dataset, +yielding remarkable results. Our methodology achieved an impressive accuracy +rate of 90.9%. By automating labor-intensive processes, we pave the way for a +more efficient, time-saving, and accessible landscape in legal research, +benefiting legal professionals, academics, and researchers. + +
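A minimal sketch of the two stated objectives, unsupervised clustering and similarity-based citation retrieval, is shown below; it substitutes TF-IDF vectors for the stronger neural embedding models the paper uses, and the toy case texts are invented for illustration.

```python
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity

# Toy corpus standing in for case descriptions; the paper works with SCOTUS
# data and neural embedding models, so TF-IDF is only a lightweight stand-in.
cases = [
    "dispute over search and seizure under the fourth amendment",
    "free speech restrictions on public school students",
    "equal protection challenge to state voting district maps",
    "warrantless wiretap evidence admitted at criminal trial",
]

vec = TfidfVectorizer(stop_words="english")
emb = vec.fit_transform(cases)                        # textual embeddings

clusters = KMeans(n_clusters=2, n_init=10, random_state=0).fit_predict(emb)
print("cluster assignments:", clusters)               # unsupervised clustering

query = vec.transform(["police searched the car without a warrant"])
scores = cosine_similarity(query, emb).ravel()        # citation retrieval by similarity
print("most similar case:", cases[scores.argmax()])
```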
+
+ comment: 14 pages, 16 images +
+
+
+
+
+ + ♻ ☆ Contrastive Learning and Abstract Concepts: The Case of Natural Numbers + + +
+ Contrastive Learning (CL) has been successfully applied to classification and +other downstream tasks related to concrete concepts, such as objects contained +in the ImageNet dataset. No attempts seem to have been made so far in applying +this promising scheme to more abstract entities. A prominent example of these +could be the concept of (discrete) Quantity. CL can be frequently interpreted +as a self-supervised scheme guided by some profound and ubiquitous conservation +principle (e.g. conservation of identity in object classification tasks). In +this introductory work we apply a suitable conservation principle to the +semi-abstract concept of natural numbers by which discrete quantities can be +estimated or predicted. We experimentally show, by means of a toy problem, that +contrastive learning can be trained to count at a glance with high accuracy +both at human as well as at super-human ranges. We compare this with the +results of a trained-to-count at a glance supervised learning (SL) neural +network scheme of similar architecture. We show that both schemes exhibit +similarly good performance on baseline experiments, where the distributions of +the training and testing stages are equal. Importantly, we demonstrate that in +some generalization scenarios, where training and testing distributions differ, +CL boasts more robust and much better error performance. + +
+
+
+
+
+ + ♻ ☆ Multimodal Emotion Recognition using Audio-Video Transformer Fusion with + Cross Attention + + +
+ Understanding emotions is a fundamental aspect of human communication. +Integrating audio and video signals offers a more comprehensive understanding +of emotional states compared to traditional methods that rely on a single data +source, such as speech or facial expressions. Despite its potential, multimodal +emotion recognition faces significant challenges, particularly in +synchronization, feature extraction, and fusion of diverse data sources. To +address these issues, this paper introduces a novel transformer-based model +named Audio-Video Transformer Fusion with Cross Attention (AVT-CA). The AVT-CA +model employs a transformer fusion approach to effectively capture and +synchronize interlinked features from both audio and video inputs, thereby +resolving synchronization problems. Additionally, the Cross Attention mechanism +within AVT-CA selectively extracts and emphasizes critical features while +discarding irrelevant ones from both modalities, addressing feature extraction +and fusion challenges. Extensive experimental analysis conducted on the +CMU-MOSEI, RAVDESS and CREMA-D datasets demonstrates the efficacy of the +proposed model. The results underscore the importance of AVT-CA in developing +precise and reliable multimodal emotion recognition systems for practical +applications. + +
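The cross-attention mechanism described here can be sketched generically as one modality querying the other; the block below is an illustrative stand-in rather than the actual AVT-CA architecture, and the class name, dimensions, and sequence lengths are assumptions.

```python
import torch
import torch.nn as nn

class CrossModalAttention(nn.Module):
    """Illustrative cross-attention block (not the exact AVT-CA design):
    audio tokens attend over video tokens, so each audio frame can emphasize
    the visual features most relevant to it and suppress irrelevant ones."""
    def __init__(self, dim=256, heads=4):
        super().__init__()
        self.attn = nn.MultiheadAttention(dim, heads, batch_first=True)
        self.norm = nn.LayerNorm(dim)

    def forward(self, audio_tokens, video_tokens):
        fused, weights = self.attn(query=audio_tokens,
                                   key=video_tokens,
                                   value=video_tokens)
        return self.norm(audio_tokens + fused), weights   # residual + norm

audio = torch.randn(2, 50, 256)   # (batch, audio frames, dim)
video = torch.randn(2, 30, 256)   # (batch, video frames, dim)
fused, attn_weights = CrossModalAttention()(audio, video)
print(fused.shape, attn_weights.shape)  # (2, 50, 256), (2, 50, 30)
```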
+
+ comment: 38 Pages, 9 Tables, 12 Figures +
+
+
+
+
+ + ♻ ☆ MathBridge: A Large Corpus Dataset for Translating Spoken Mathematical + Expressions into $LaTeX$ Formulas for Improved Readability + + +
+ Understanding sentences that contain mathematical expressions in text form +poses significant challenges. To address this, the importance of converting +these expressions into a compiled formula is highlighted. For instance, the +expression ``x equals minus b plus or minus the square root of b squared minus +four a c, all over two a'' from automatic speech recognition (ASR) is more +readily comprehensible when displayed as a compiled formula $x = \frac{-b \pm +\sqrt{b^2 - 4ac}}{2a}$. To develop a text-to-formula conversion system, we can +break down the process into text-to-LaTeX and LaTeX-to-formula conversions, +with the latter managed by various existing LaTeX engines. However, the former +approach has been notably hindered by the severe scarcity of text-to-LaTeX +paired data, which presents a significant challenge in this field. In this +context, we introduce MathBridge, the first extensive dataset for translating +mathematical spoken expressions into LaTeX, to establish a robust baseline for +future research on text-to-LaTeX translation. MathBridge comprises +approximately 23 million LaTeX formulas paired with the corresponding spoken +English expressions. Through comprehensive evaluations, including fine-tuning +and testing with data, we discovered that MathBridge significantly enhances the +capabilities of pretrained language models for text-to-LaTeX translation. +Specifically, for the T5-large model, the sacreBLEU score increased from 4.77 +to 46.8, demonstrating substantial enhancement. Our findings indicate the need +for a new metric, specifically for text-to-LaTeX conversion evaluations. + +
+
+ comment: 9 pages, 6 figures +
+
+
+
+
+ + ♻ ☆ A Non-negative VAE:the Generalized Gamma Belief Network + + +
+ The gamma belief network (GBN), often regarded as a deep topic model, has +demonstrated its potential for uncovering multi-layer interpretable latent +representations in text data. Its notable capability to acquire interpretable +latent factors is partially attributed to sparse and non-negative +gamma-distributed latent variables. However, the existing GBN and its +variations are constrained by the linear generative model, thereby limiting +their expressiveness and applicability. To address this limitation, we +introduce the generalized gamma belief network (Generalized GBN) in this paper, +which extends the original linear generative model to a more expressive +non-linear generative model. Since the parameters of the Generalized GBN no +longer possess an analytic conditional posterior, we further propose an +upward-downward Weibull inference network to approximate the posterior +distribution of the latent variables. The parameters of both the generative +model and the inference network are jointly trained within the variational +inference framework. Finally, we conduct comprehensive experiments on both +expressivity and disentangled representation learning tasks to evaluate the +performance of the Generalized GBN against state-of-the-art Gaussian +variational autoencoders serving as baselines. + +
+
+
+
+
+ + ♻ ☆ Hedge Fund Portfolio Construction Using PolyModel Theory and + iTransformer + + +
+ When constructing portfolios, a key problem is that a lot of financial time +series data are sparse, making it challenging to apply machine learning +methods. PolyModel theory can solve this issue and demonstrate superiority in +portfolio construction from various aspects. To implement the PolyModel theory +for constructing a hedge fund portfolio, we begin by identifying an asset pool, +utilizing over 10,000 hedge funds for the past 29 years' data. PolyModel theory +also involves choosing a wide-ranging set of risk factors, which includes +various financial indices, currencies, and commodity prices. This comprehensive +selection mirrors the complexities of the real-world environment. Leveraging +the PolyModel theory, we create quantitative measures such as Long-term Alpha, +Long-term Ratio, and SVaR. We also use more classical measures like the Sharpe +ratio or Morningstar's MRAR. To enhance the performance of the constructed +portfolio, we also employ the latest deep learning techniques (iTransformer) to +capture the upward trend, while efficiently controlling the downside, using all +the features. The iTransformer model is specifically designed to address the +challenges in high-dimensional time series forecasting and could largely +improve our strategies. More precisely, our strategies achieve a better Sharpe +ratio and annualized return. The above process enables us to create multiple +portfolio strategies aiming for high returns and low risks when compared to +various benchmarks. + +
+
+
+
+
+ + ♻ ☆ SurvReLU: Inherently Interpretable Survival Analysis via Deep ReLU + Networks + + +
+ Survival analysis models time-to-event distributions with censorship. +Recently, deep survival models using neural networks have dominated due to +their representational power and state-of-the-art performance. However, their +"black-box" nature hinders interpretability, which is crucial in real-world +applications. In contrast, "white-box" tree-based survival models offer better +interpretability but struggle to converge to global optima due to greedy +expansion. In this paper, we bridge the gap between previous deep survival +models and traditional tree-based survival models through deep rectified linear +unit (ReLU) networks. We show that a deliberately constructed deep ReLU network +(SurvReLU) can harness the interpretability of tree-based structures with the +representational power of deep survival models. Empirical studies on both +simulated and real survival benchmark datasets show the effectiveness of the +proposed SurvReLU in terms of performance and interpretability. The code is +available at https://github.com/xs018/SurvReLU. + +
+
+
+
+
+ + ♻ ☆ There is No Silver Bullet: Benchmarking Methods in Predictive + Combinatorial Optimization + + +
+ Predictive combinatorial optimization, where the parameters of combinatorial +optimization (CO) are unknown at the decision-making time, is the precise +modeling of many real-world applications, including energy cost-aware +scheduling and budget allocation on advertising. Tackling such a problem +usually involves a prediction model and a CO solver. These two modules are +integrated into the predictive CO pipeline following two design principles: +``Predict-then-Optimize (PtO)'', which learns predictions by supervised +training and subsequently solves CO using the predicted coefficients, and +``Predict-and-Optimize (PnO)'', which directly optimizes towards the +ultimate decision quality and is claimed to yield better decisions than traditional +PtO approaches. However, a systematic benchmark of both approaches is still +lacking, one that covers the specific design choices at the module level as well as an +evaluation dataset of representative real-world scenarios. To this +end, we develop a modular framework to benchmark 11 existing PtO/PnO methods on +8 problems, including a new industrial dataset for combinatorial advertising +that will be released. Our study shows that PnO approaches are better than PtO +on 7 out of 8 benchmarks, but there is no silver bullet found for the specific +design choices of PnO. A comprehensive categorization of current approaches and +integration of typical scenarios are provided under a unified benchmark. +Therefore, this paper could serve as a comprehensive benchmark for future PnO +approach development and also offer fast prototyping for application-focused +development. + +
+
+
+
+
+ + ♻ ☆ Decentralized and Uncoordinated Learning of Stable Matchings: A + Game-Theoretic Approach + + +
+ We consider the problem of learning stable matchings with unknown preferences +in a decentralized and uncoordinated manner, where "decentralized" means that +players make decisions individually without the influence of a central +platform, and "uncoordinated" means that players do not need to synchronize +their decisions using pre-specified rules. First, we provide a game formulation +for this problem with known preferences, where the set of pure Nash equilibria +(NE) coincides with the set of stable matchings, and mixed NE can be rounded to +a stable matching. Then, we show that for hierarchical markets, applying the +exponential weight (EXP) learning algorithm to the stable matching game +achieves logarithmic regret in a fully decentralized and uncoordinated fashion. +Moreover, we show that EXP converges locally and exponentially fast to a stable +matching in general markets. We also introduce another decentralized and +uncoordinated learning algorithm that globally converges to a stable matching +with arbitrarily high probability. Finally, we provide stronger feedback +conditions under which it is possible to drive the market faster toward an +approximate stable matching. Our proposed game-theoretic framework bridges the +discrete problem of learning stable matchings with the problem of learning NE +in continuous-action games. + +
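An exponential-weights (EXP/Hedge-style) update of the kind referenced above can be sketched as follows; this is only the generic learning rule with an invented reward model, not the paper's game-theoretic construction or its regret analysis.

```python
import numpy as np

rng = np.random.default_rng(0)

def exp_weights_step(weights, rewards, eta=0.1):
    """One exponential-weights step: a player keeps a weight per candidate
    partner, samples a proposal proportionally to the weights, and boosts the
    weights of partners that yielded high reward (e.g., an accepted match)."""
    probs = weights / weights.sum()
    choice = rng.choice(len(weights), p=probs)   # propose to one partner
    new_weights = weights * np.exp(eta * rewards)
    return new_weights, choice

weights = np.ones(3)                             # three candidate partners
true_preference = np.array([0.2, 0.9, 0.5])      # latent reward of each match
for _ in range(200):
    rewards = (rng.random(3) < true_preference).astype(float)
    weights, _ = exp_weights_step(weights, rewards)
print("learned proposal distribution:", np.round(weights / weights.sum(), 3))
```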
+
+
+
+
+ + ♻ ☆ The Power of Combining Data and Knowledge: GPT-4o is an Effective + Interpreter of Machine Learning Models in Predicting Lymph Node Metastasis of + Lung Cancer + + +
+ Lymph node metastasis (LNM) is a crucial factor in determining the initial +treatment for patients with lung cancer, yet accurate preoperative diagnosis of +LNM remains challenging. Recently, large language models (LLMs) have garnered +significant attention due to their remarkable text generation capabilities. +Leveraging the extensive medical knowledge learned from vast corpora, LLMs can +estimate probabilities for clinical problems, though their performance has +historically been inferior to data-driven machine learning models. In this +paper, we propose a novel ensemble method that combines the medical knowledge +acquired by LLMs with the latent patterns identified by machine learning models +to enhance LNM prediction performance. Initially, we developed machine learning +models using patient data. We then designed a prompt template to integrate the +patient data with the predicted probability from the machine learning model. +Subsequently, we instructed GPT-4o, the most advanced LLM developed by OpenAI, +to estimate the likelihood of LNM based on patient data and then adjust the +estimate using the machine learning output. Finally, we collected three outputs +from the GPT-4o using the same prompt and ensembled these results as the final +prediction. Using the proposed method, our models achieved an AUC value of +0.778 and an AP value of 0.426 for LNM prediction, significantly improving +predictive performance compared to baseline machine learning models. The +experimental results indicate that GPT-4o can effectively leverage its medical +knowledge and the probabilities predicted by machine learning models to achieve +more accurate LNM predictions. These findings demonstrate that LLMs can perform +well in clinical risk prediction tasks, offering a new paradigm for integrating +medical knowledge and patient data in clinical predictions. + +
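The prompt-plus-ensemble recipe in the abstract can be sketched roughly as below; query_llm is a placeholder for whatever chat-completion client is used, the prompt wording is invented, and the stubbed response exists only so the example runs.

```python
import statistics

def build_prompt(patient: dict, ml_probability: float) -> str:
    """Combine structured patient data with the machine-learning model's
    predicted probability, mirroring the prompt-template idea in the abstract."""
    features = "\n".join(f"- {k}: {v}" for k, v in patient.items())
    return (
        "You are assisting with preoperative lymph node metastasis (LNM) risk assessment.\n"
        f"Patient data:\n{features}\n"
        f"A machine learning model estimates the probability of LNM at {ml_probability:.2f}.\n"
        "Using both the patient data and this estimate, reply with a single "
        "adjusted probability between 0 and 1."
    )

def ensemble_llm_estimate(patient, ml_probability, query_llm, n_samples=3):
    # query_llm stands in for the actual LLM client; the abstract collects
    # three outputs for the same prompt and ensembles them.
    prompt = build_prompt(patient, ml_probability)
    answers = [float(query_llm(prompt)) for _ in range(n_samples)]
    return statistics.mean(answers)

# Example with a stubbed LLM that simply returns a fixed adjusted estimate.
fake_llm = lambda prompt: "0.45"
patient = {"age": 63, "tumor_size_cm": 2.8, "CEA": 6.1}
print(ensemble_llm_estimate(patient, ml_probability=0.38, query_llm=fake_llm))
```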
+
+
+
+
+ + ♻ ☆ A Study on Large Language Models' Limitations in Multiple-Choice + Question Answering + + +
+ The adoption of Large Language Models (LLMs) has become widespread, +particularly with the emergence of open-source models. More +importantly, smaller models are well-suited for integration into consumer +devices and are frequently employed either as standalone solutions or as +subroutines in various AI tasks. Despite their ubiquitous use, there is no +systematic analysis of their specific capabilities and limitations. In this +study, we tackle one of the most widely used tasks - answering Multiple Choice +Questions (MCQs). We analyze 26 small open-source models and find that 65% of the +models do not understand the task, only 4 models properly select an answer from +the given choices, and only 5 of these models are choice order independent. +These results are rather alarming given the extensive use of MCQ tests with +these models. We recommend exercising caution and testing task understanding +before using MCQs to evaluate LLMs in any field whatsoever. + +
+
+
+
+
+ + ♻ ☆ Detecting Anomalies in Dynamic Graphs via Memory enhanced Normality + + +
+ Anomaly detection in dynamic graphs presents a significant challenge due to +the temporal evolution of graph structures and attributes. The conventional +approaches that tackle this problem typically employ an unsupervised learning +framework, capturing normality patterns with exclusive normal data during +training and identifying deviations as anomalies during testing. However, these +methods face critical drawbacks: they either only depend on proxy tasks for +representation without directly pinpointing normal patterns, or they neglect to +differentiate between spatial and temporal normality patterns. More recent +methods that use contrastive learning with negative sampling also face high +computational costs, limiting their scalability to large graphs. To address +these challenges, we introduce a novel Spatial-Temporal memories-enhanced graph +autoencoder (STRIPE). Initially, STRIPE employs Graph Neural Networks (GNNs) +and gated temporal convolution layers to extract spatial and temporal features. +Then STRIPE incorporates separate spatial and temporal memory networks to +capture and store prototypes of normal patterns, respectively. These stored +patterns are retrieved and integrated with encoded graph embeddings through a +mutual attention mechanism. Finally, the integrated features are fed into the +decoder to reconstruct the graph streams which serve as the proxy task for +anomaly detection. This comprehensive approach not only minimizes +reconstruction errors but also emphasizes the compactness and distinctiveness +of the embeddings w.r.t. the nearest memory prototypes. Extensive experiments +on six benchmark datasets demonstrate the effectiveness and efficiency of +STRIPE, where STRIPE significantly outperforms existing methods with 5.8% +improvement in AUC scores and 4.62X faster in training time. + +
+
+
+
+
+ + ♻ ☆ Model Merging in LLMs, MLLMs, and Beyond: Methods, Theories, + Applications and Opportunities + + +
+ Model merging is an efficient empowerment technique in the machine learning +community that does not require the collection of raw training data and does +not require expensive computation. As model merging becomes increasingly +prevalent across various fields, it is crucial to understand the available +model merging techniques comprehensively. However, there is a significant gap +in the literature regarding a systematic and thorough review of these +techniques. This survey provides a comprehensive overview of model merging +methods and theories, their applications in various domains and settings, and +future research directions. Specifically, we first propose a new taxonomic +approach that exhaustively discusses existing model merging methods. Secondly, +we discuss the application of model merging techniques in large language +models, multimodal large language models, and 10+ machine learning subfields, +including continual learning, multi-task learning, few-shot learning, etc. +Finally, we highlight the remaining challenges of model merging and discuss +future research directions. A comprehensive list of papers about model merging +is available at +\url{https://github.com/EnnengYang/Awesome-Model-Merging-Methods-Theories-Applications}. + +
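As a concrete anchor for the simplest technique in this space, the sketch below averages the parameters of models fine-tuned from a shared initialization; more elaborate merging methods covered by such surveys (task arithmetic, TIES-style pruning, etc.) build on this basic operation, and the helper names here are invented.

```python
import torch
import torch.nn as nn

def average_merge(state_dicts, weights=None):
    """Weighted parameter averaging across models that share an architecture
    and initialization; the most basic form of model merging."""
    weights = weights or [1.0 / len(state_dicts)] * len(state_dicts)
    merged = {}
    for key in state_dicts[0]:
        merged[key] = sum(w * sd[key] for w, sd in zip(weights, state_dicts))
    return merged

# Toy demonstration with two small MLPs sharing an architecture.
def make_model():
    return nn.Sequential(nn.Linear(4, 8), nn.ReLU(), nn.Linear(8, 2))

m1, m2, merged_model = make_model(), make_model(), make_model()
merged_model.load_state_dict(average_merge([m1.state_dict(), m2.state_dict()]))
print(merged_model(torch.randn(1, 4)))
```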
+
+
+
+
+ + ♻ ☆ Robust Active Learning (RoAL): Countering Dynamic Adversaries in Active + Learning with Elastic Weight Consolidation + + +
+ Despite significant advancements in active learning and adversarial attacks, +the intersection of these two fields remains underexplored, particularly in +developing robust active learning frameworks against dynamic adversarial +threats. The challenge of developing robust active learning frameworks under +dynamic adversarial attacks is critical, as these attacks can lead to +catastrophic forgetting within the active learning cycle. This paper introduces +Robust Active Learning (RoAL), a novel approach designed to address this issue +by integrating Elastic Weight Consolidation (EWC) into the active learning +process. Our contributions are threefold: First, we propose a new dynamic +adversarial attack that poses significant threats to active learning +frameworks. Second, we introduce a novel method that combines EWC with active +learning to mitigate catastrophic forgetting caused by dynamic adversarial +attacks. Finally, we conduct extensive experimental evaluations to demonstrate +the efficacy of our approach. The results show that RoAL not only effectively +counters dynamic adversarial threats but also significantly reduces the impact +of catastrophic forgetting, thereby enhancing the robustness and performance of +active learning systems in adversarial environments. + +
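The Elastic Weight Consolidation regularizer itself can be written compactly; the sketch below shows the standard penalty term with a placeholder Fisher estimate, not RoAL's integration of it into the active-learning loop, and the regularization strength is an arbitrary example value.

```python
import torch
import torch.nn as nn

def ewc_penalty(model, fisher, anchor, lam=100.0):
    """Standard EWC penalty: penalize movement away from previously learned
    parameters (anchor), weighted by their diagonal Fisher information."""
    loss = torch.tensor(0.0)
    for name, param in model.named_parameters():
        loss = loss + (fisher[name] * (param - anchor[name]) ** 2).sum()
    return 0.5 * lam * loss

model = nn.Linear(3, 2)
anchor = {n: p.detach().clone() for n, p in model.named_parameters()}
fisher = {n: torch.ones_like(p) for n, p in model.named_parameters()}  # placeholder Fisher
with torch.no_grad():
    model.weight += 0.1                                                # simulate parameter drift
print(ewc_penalty(model, fisher, anchor))
# In training, this term is added to the task loss before backpropagation.
```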
+
+
+
+
+ + ♻ ☆ Efficient Data-Driven MPC for Demand Response of Commercial Buildings + + +
+ Model predictive control (MPC) has been shown to significantly improve the +energy efficiency of buildings while maintaining thermal comfort. Data-driven +approaches based on neural networks have been proposed to facilitate system +modelling. However, such approaches are generally nonconvex and result in +computationally intractable optimization problems. In this work, we design a +readily implementable energy management method for small commercial buildings. +We then leverage our approach to formulate a real-time demand bidding strategy. +We propose a data-driven and mixed-integer convex MPC which is solved via +derivative-free optimization given a limited computational time of 5 minutes to +respect operational constraints. We consider rooftop unit heating, ventilation, +and air conditioning systems with discrete controls to accurately model the +operation of most commercial buildings. Our approach uses an input convex +recurrent neural network to model the thermal dynamics. We apply our approach +in several demand response (DR) settings, including a demand bidding, a +time-of-use, and a critical peak rebate program. Controller performance is +evaluated on a state-of-the-art building simulation. The proposed approach +improves thermal comfort while reducing energy consumption and cost through DR +participation, when compared to other data-driven approaches or a set-point +controller. + +
+
+
+
+
+ + ♻ ☆ Harm Amplification in Text-to-Image Models + + +
+ Text-to-image (T2I) models have emerged as a significant advancement in +generative AI; however, there exist safety concerns regarding their potential +to produce harmful image outputs even when users input seemingly safe prompts. +This phenomenon, where T2I models generate harmful representations that were +not explicit in the input prompt, poses a potentially greater risk than +adversarial prompts, leaving users unintentionally exposed to harms. Our paper +addresses this issue by formalizing a definition for this phenomenon which we +term harm amplification. We further contribute to the field by developing a +framework of methodologies to quantify harm amplification in which we consider +the harm of the model output in the context of user input. We then empirically +examine how to apply these different methodologies to simulate real-world +deployment scenarios including a quantification of disparate impacts across +genders resulting from harm amplification. Together, our work aims to offer +researchers tools to comprehensively address safety challenges in T2I systems +and contribute to the responsible deployment of generative AI models. + +
+
+
+
+
+ + ♻ ☆ MMP++: Motion Manifold Primitives with Parametric Curve Models + + +
+ Motion Manifold Primitives (MMP), a manifold-based approach for encoding +basic motion skills, can produce diverse trajectories, enabling the system to +adapt to unseen constraints. Nonetheless, we argue that current MMP models lack +crucial functionalities of movement primitives, such as temporal and via-points +modulation, found in traditional approaches. This shortfall primarily stems +from MMP's reliance on discrete-time trajectories. To overcome these +limitations, we introduce Motion Manifold Primitives++ (MMP++), a new model +that integrates the strengths of both MMP and traditional methods by +incorporating parametric curve representations into the MMP framework. +Furthermore, we identify a significant challenge with MMP++: performance +degradation due to geometric distortions in the latent space, meaning that +similar motions are not closely positioned. To address this, Isometric Motion +Manifold Primitives++ (IMMP++) is proposed to ensure the latent space +accurately preserves the manifold's geometry. Our experimental results across +various applications, including 2-DoF planar motions, 7-DoF robot arm motions, +and SE(3) trajectory planning, show that MMP++ and IMMP++ outperform existing +methods in trajectory generation tasks, achieving substantial improvements in +some cases. Moreover, they enable the modulation of latent coordinates and +via-points, thereby allowing efficient online adaptation to dynamic +environments. + +
+
+ comment: 15 pages. The paper will appear in the IEEE Transactions on Robotics +
+
+
+
+
+ + ♻ ☆ CMDA-OT: Collaborative Multi-source Domain Adaptation Through Optimal + Transport + + +
+ Multi-source Domain Adaptation (MDA) seeks to adapt models trained on data +from multiple labeled source domains to perform effectively on unlabeled +target-domain data, assuming access to the source data. To address the challenges +of model adaptation and data privacy, we introduce Collaborative MDA Through +Optimal Transport (CMDA-OT), a novel framework consisting of two key phases. In +the first phase, each source domain is independently adapted to the target +domain using optimal transport methods. In the second phase, a centralized +collaborative learning architecture is employed, which aggregates the N models +from the N sources without accessing their data, thereby safeguarding privacy. +During this process, the server leverages a small set of pseudo-labeled samples +from the target domain, known as the target validation subset, to refine and +guide the adaptation. This dual-phase approach not only improves model +performance on the target domain but also addresses vital privacy challenges +inherent in domain adaptation. + +
+
+
+
+
+ + ♻ ☆ DP-MemArc: Differential Privacy Transfer Learning for Memory Efficient + Language Models + + +
+ Large language models have repeatedly shown outstanding performance across +diverse applications. However, deploying these models can inadvertently risk +user privacy. The significant memory demands during training pose a major +challenge in terms of resource consumption. This substantial size places a +heavy load on memory resources, raising considerable practical concerns. In +this paper, we introduce DP-MemArc, a novel training framework aimed at +reducing the memory costs of large language models while emphasizing the +protection of user data privacy. DP-MemArc incorporates side network or +reversible network designs to support a variety of differential privacy +memory-efficient fine-tuning schemes. Our approach not only achieves memory +optimization but also ensures robust privacy protection, keeping user data +secure and confidential. Extensive experiments have demonstrated that DP-MemArc +effectively provides differential privacy-efficient fine-tuning across +different task scenarios. + +
+
+ comment: 9 pages, second version +
+
+
+
+
+ + ♻ ☆ Uncertainty Quantification using Variational Inference for Biomedical + Image Segmentation + + +
+ Deep learning motivated by convolutional neural networks has been highly +successful in a range of medical imaging problems like image classification, +image segmentation, image synthesis, etc. However, for validation and +interpretability, we need not only the predictions made by the model but +also how confident it is while making those predictions. This is important in +safety-critical applications for the people to accept it. In this work, we used +an encoder-decoder architecture based on variational inference techniques for +segmenting brain tumour images. We evaluate our work on the publicly available +BRATS dataset using Dice Similarity Coefficient (DSC) and Intersection Over +Union (IOU) as the evaluation metrics. Our model is able to segment brain +tumours while taking into account both aleatoric uncertainty and epistemic +uncertainty in a principled Bayesian manner. + +
+
+
+
+
+ + ♻ ☆ RAGSys: Item-Cold-Start Recommender as RAG System + + +
+ Large Language Models (LLM) hold immense promise for real-world applications, +but their generic knowledge often falls short of domain-specific needs. +Fine-tuning, a common approach, can suffer from catastrophic forgetting and +hinder generalizability. In-Context Learning (ICL) offers an alternative, which +can leverage Retrieval-Augmented Generation (RAG) to provide LLMs with relevant +demonstrations for few-shot learning tasks. This paper explores the desired +qualities of a demonstration retrieval system for ICL. We argue that ICL +retrieval in this context resembles item-cold-start recommender systems, +prioritizing discovery and maximizing information gain over strict relevance. +We propose a novel evaluation method that measures the LLM's subsequent +performance on NLP tasks, eliminating the need for subjective diversity scores. +Our findings demonstrate the critical role of diversity and quality bias in +retrieved demonstrations for effective ICL, and highlight the potential of +recommender system techniques in this domain. + +
+
+
+
+
+ + ♻ ☆ Efficient Imitation Learning with Conservative World Models + + +
+ We tackle the problem of policy learning from expert demonstrations without a +reward function. A central challenge in this space is that these policies fail +upon deployment due to issues of distributional shift, environment +stochasticity, or compounding errors. Adversarial imitation learning alleviates +this issue but requires additional on-policy training samples for stability, +which presents a challenge in realistic domains due to inefficient learning and +high sample complexity. One approach to this issue is to learn a world model of +the environment, and use synthetic data for policy training. While successful +in prior works, we argue that this is sub-optimal due to additional +distribution shifts between the learned model and the real environment. +Instead, we re-frame imitation learning as a fine-tuning problem, rather than a +pure reinforcement learning one. Drawing theoretical connections to offline RL +and fine-tuning algorithms, we argue that standard online world model +algorithms are not well suited to the imitation learning problem. We derive a +principled conservative optimization bound and demonstrate empirically that it +leads to improved performance on two very challenging manipulation environments +from high-dimensional raw pixel observations. We set a new state-of-the-art +performance on the Franka Kitchen environment from images, requiring only 10 +demos and no reward labels, as well as solving a complex dexterous manipulation +task. + +
+
+ comment: Oral presentation, L4DC 2024 +
+
+
+
+
+ + ♻ ☆ Deep Convolutional Autoencoder for Assessment of Anomalies in + Multi-stream Sensor Data SC + + +
+ This work investigates a practical and novel method for automated +unsupervised fault detection in vehicles using a fully convolutional +autoencoder. The results demonstrate the algorithm we developed can detect +anomalies which correspond to powertrain faults by learning patterns in the +multivariate time-series data of hybrid-electric vehicle powertrain sensors. +Data was collected by engineers at Ford Motor Company from numerous sensors +over several drive cycle variations. This study provides evidence of the +anomaly detecting capability of our trained autoencoder and investigates the +suitability of our autoencoder relative to other unsupervised methods for +automatic fault detection in this data set. Preliminary results of testing the +autoencoder on the powertrain sensor data indicate the data reconstruction +approach availed by the autoencoder is a robust technique for identifying the +abnormal sequences in the multivariate series. These results support that +irregularities in hybrid-electric vehicles' powertrains are conveyed via sensor +signals in the embedded electronic communication system, and therefore can be +identified mechanistically with a trained algorithm. Additional unsupervised +methods are tested and show the autoencoder performs better at fault detection +than outlier detectors and other novel deep learning techniques. + +
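Reconstruction-based detectors of this kind are typically post-processed by thresholding per-window reconstruction error; the sketch below shows that generic step on synthetic data, omitting the convolutional autoencoder itself, and the quantile threshold is an illustrative choice.

```python
import numpy as np

def flag_anomalies(original, reconstructed, quantile=0.99):
    """Score each time window by its reconstruction error and flag windows
    whose error exceeds a high quantile of the observed error distribution."""
    errors = np.mean((original - reconstructed) ** 2, axis=(1, 2))  # per-window MSE
    threshold = np.quantile(errors, quantile)
    return errors > threshold, errors, threshold

# Toy example: 100 windows of 64 timesteps x 8 sensors, one corrupted window.
rng = np.random.default_rng(0)
windows = rng.normal(size=(100, 64, 8))
recon = windows + rng.normal(scale=0.05, size=windows.shape)  # near-perfect reconstruction
recon[17] += 2.0                                              # simulate a poorly reconstructed fault
flags, errors, thr = flag_anomalies(windows, recon)
print("flagged windows:", np.where(flags)[0])
```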
+
+ comment: SSCI2022, 7 pages, 3 Tables, 3 Figures +
+
+
+
+
+
+
+
+ + Multimedia 5 + +
+
+
+ + ☆ A Multi-task Adversarial Attack Against Face Authentication + + +
+ Deep-learning-based identity management systems, such as face authentication +systems, are vulnerable to adversarial attacks. However, existing attacks are +typically designed for single-task purposes, which means they are tailored to +exploit vulnerabilities unique to the individual target rather than being +adaptable for multiple users or systems. This limitation makes them unsuitable +for certain attack scenarios, such as morphing, universal, transferable, and +counter attacks. In this paper, we propose a multi-task adversarial attack +algorithm called MTADV that is adaptable to multiple users or systems. By +interpreting these scenarios as multi-task attacks, MTADV is applicable to both +single- and multi-task attacks, and feasible in the white- and gray-box +settings. Furthermore, MTADV is effective against various face datasets, +including LFW, CelebA, and CelebA-HQ, and can work with different deep learning +models, such as FaceNet, InsightFace, and CurricularFace. Importantly, MTADV +retains its feasibility as a single-task attack targeting a single user/system. +To the best of our knowledge, MTADV is the first adversarial attack method that +can target all of the aforementioned scenarios in one algorithm. + +
+
+ comment: Accepted by ACM Transactions on Multimedia Computing, Communications, + and Applications +
+
+
+
+
+ + ☆ When Video Coding Meets Multimodal Large Language Models: A Unified + Paradigm for Video Coding + + +
+ Existing codecs are designed to eliminate intrinsic redundancies to create a +compact representation for compression. However, strong external priors from +Multimodal Large Language Models (MLLMs) have not been explicitly explored in +video compression. Herein, we introduce a unified paradigm for Cross-Modality +Video Coding (CMVC), which is a pioneering approach to explore multimodality +representation and video generative models in video coding. Specifically, on +the encoder side, we disentangle a video into spatial content and motion +components, which are subsequently transformed into distinct modalities to +achieve very compact representation by leveraging MLLMs. During decoding, +previously encoded components and video generation models are leveraged to +create multiple encoding-decoding modes that optimize video reconstruction +quality for specific decoding requirements, including Text-Text-to-Video (TT2V) +mode to ensure high-quality semantic information and Image-Text-to-Video (IT2V) +mode to achieve superb perceptual consistency. In addition, we propose an +efficient frame interpolation model for IT2V mode via Low-Rank Adaption (LoRA) +tuning to guarantee perceptual quality, which allows the generated motion cues +to behave smoothly. Experiments on benchmarks indicate that TT2V achieves +effective semantic reconstruction, while IT2V exhibits competitive perceptual +consistency. These results highlight potential directions for future research +in video coding. + +
+
+
+
+
+ + ☆ Joint Optimization of Buffer Delay and HARQ for Video Communications + + +
+ To improve the quality of experience (QoE) in video communication over lossy +networks, this paper presents a transmission method that jointly optimizes +buffer delay and Hybrid Automatic Repeat request (HARQ), referred to as +BD-HARQ. This method operates on packet group and employs dynamic buffer delay +combined with HARQ strategy for transmission. By defining the QoE based on +metrics such as buffer delay, Forward Error Correction (FEC) redundancy, and +data recovery rate, the proposed method derives its closed-form expression +through rigorous mathematical modeling and analysis. The optimal transmission +parameters, i.e., the buffer delay and the FEC redundancy, are then determined +and implemented, guaranteeing the real-time performance, transmission +efficiency, and data recovery rate of video communication. Experimental results +demonstrate that the proposed method aligns well with its theoretical +expectations, and that it can provide up to 13.7% higher QoE compared to +existing methods and increase the tolerance for packet loss rate from 15%-22% +to up to 31% while maintaining a high QoE. + +
+
+ comment: 6 pages, 5 figures +
+
+
+
+
+ + ♻ ☆ A Multi-Stream Fusion Approach with One-Class Learning for Audio-Visual + Deepfake Detection + + +
+ This paper addresses the challenge of developing a robust audio-visual +deepfake detection model. In practical use cases, new generation algorithms are +continually emerging, and these algorithms are not encountered during the +development of detection methods. This calls for the generalization ability of +the method. Additionally, to ensure the credibility of detection methods, it is +beneficial for the model to interpret which cues from the video indicate it is +fake. Motivated by these considerations, we then propose a multi-stream fusion +approach with one-class learning as a representation-level regularization +technique. We study the generalization problem of audio-visual deepfake +detection by creating a new benchmark by extending and re-splitting the +existing FakeAVCeleb dataset. The benchmark contains four categories of fake +videos (Real Audio-Fake Visual, Fake Audio-Fake Visual, Fake Audio-Real Visual, +and Unsynchronized videos). The experimental results demonstrate that our +approach surpasses the previous models by a large margin. Furthermore, our +proposed framework offers interpretability, indicating which modality the model +identifies as more likely to be fake. The source code is released at +https://github.com/bok-bok/MSOC. + +
+
+
+
+
+ + ♻ ☆ Multimodal Emotion Recognition using Audio-Video Transformer Fusion with + Cross Attention + + +
+ Understanding emotions is a fundamental aspect of human communication. +Integrating audio and video signals offers a more comprehensive understanding +of emotional states compared to traditional methods that rely on a single data +source, such as speech or facial expressions. Despite its potential, multimodal +emotion recognition faces significant challenges, particularly in +synchronization, feature extraction, and fusion of diverse data sources. To +address these issues, this paper introduces a novel transformer-based model +named Audio-Video Transformer Fusion with Cross Attention (AVT-CA). The AVT-CA +model employs a transformer fusion approach to effectively capture and +synchronize interlinked features from both audio and video inputs, thereby +resolving synchronization problems. Additionally, the Cross Attention mechanism +within AVT-CA selectively extracts and emphasizes critical features while +discarding irrelevant ones from both modalities, addressing feature extraction +and fusion challenges. Extensive experimental analysis conducted on the +CMU-MOSEI, RAVDESS and CREMA-D datasets demonstrates the efficacy of the +proposed model. The results underscore the importance of AVT-CA in developing +precise and reliable multimodal emotion recognition systems for practical +applications. + +
+
+ comment: 38 Pages, 9 Tables, 12 Figures +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computation and Language 65 + +
+
+
+ + ☆ The Death of Schema Linking? Text-to-SQL in the Age of Well-Reasoned + Language Models + + +
+ Schema linking is a crucial step in Text-to-SQL pipelines, which translate +natural language queries into SQL. The goal of schema linking is to retrieve +relevant tables and columns (signal) while disregarding irrelevant ones +(noise). However, imperfect schema linking can often exclude essential columns +needed for accurate query generation. In this work, we revisit the need for +schema linking when using the latest generation of large language models +(LLMs). We find empirically that newer models are adept at identifying relevant +schema elements during generation, without the need for explicit schema +linking. This allows Text-to-SQL pipelines to bypass schema linking entirely +and instead pass the full database schema to the LLM, eliminating the risk of +excluding necessary information. Furthermore, as alternatives to schema +linking, we propose techniques that improve Text-to-SQL accuracy without +compromising on essential schema information. Our approach achieves 71.83\% +execution accuracy on the BIRD benchmark, ranking first at the time of +submission. + +
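The "pass the full schema" setup can be sketched as a simple prompt builder; the schema, question, and prompt wording below are invented for illustration and do not reproduce the paper's actual pipeline or prompt.

```python
def build_text_to_sql_prompt(schema_ddl: list[str], question: str) -> str:
    """Sketch of the no-schema-linking setup: instead of retrieving a subset of
    tables/columns, the full schema is placed in the prompt so no potentially
    relevant column can be dropped."""
    schema_block = "\n\n".join(schema_ddl)
    return (
        "Given the following database schema, write a SQL query that answers "
        "the question.\n\n"
        f"{schema_block}\n\n"
        f"Question: {question}\n"
        "SQL:"
    )

schema = [
    "CREATE TABLE customers (id INT, name TEXT, city TEXT);",
    "CREATE TABLE orders (id INT, customer_id INT, total REAL, placed_at DATE);",
]
prompt = build_text_to_sql_prompt(schema, "Total order value per city in 2023?")
print(prompt)  # pass this prompt to any sufficiently capable LLM
```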
+
+
+
+
+ + ☆ Quantifying over Optimum Answer Sets + + +
+ Answer Set Programming with Quantifiers (ASP(Q)) has been introduced to +provide a natural extension of ASP modeling to problems in the polynomial +hierarchy (PH). However, ASP(Q) lacks a method for encoding in an elegant and +compact way problems requiring a polynomial number of calls to an oracle in +$\Sigma_n^p$ (that is, problems in $\Delta_{n+1}^p$). Such problems include, in +particular, optimization problems. In this paper we propose an extension of +ASP(Q), in which component programs may contain weak constraints. Weak +constraints can be used both for expressing local optimization within +quantified component programs and for modeling global optimization criteria. We +showcase the modeling capabilities of the new formalism through various +application scenarios. Further, we study its computational properties obtaining +complexity results and unveiling non-obvious characteristics of ASP(Q) programs +with weak constraints. + +
+
+
+
+
+ + ☆ Enhanced Detection of Conversational Mental Manipulation Through + Advanced Prompting Techniques EMNLP 2024 + + +
+ This study presents a comprehensive, long-term project to explore the +effectiveness of various prompting techniques in detecting dialogical mental +manipulation. We implement Chain-of-Thought prompting with Zero-Shot and +Few-Shot settings on a binary mental manipulation detection task, building upon +existing work conducted with Zero-Shot and Few-Shot prompting. Our primary +objective is to decipher why certain prompting techniques display superior +performance, so as to craft a novel framework tailored for detection of mental +manipulation. Preliminary findings suggest that advanced prompting techniques +may not be suitable for more complex models, if they are not trained through +example-based learning. + +
+
+ comment: Accepted at WiNLP @ EMNLP 2024 +
+
+
+
+
+ + ☆ Model Merging in LLMs, MLLMs, and Beyond: Methods, Theories, + Applications and Opportunities + + +
+ Model merging is an efficient empowerment technique in the machine learning +community that does not require the collection of raw training data and does +not require expensive computation. As model merging becomes increasingly +prevalent across various fields, it is crucial to understand the available +model merging techniques comprehensively. However, there is a significant gap +in the literature regarding a systematic and thorough review of these +techniques. This survey provides a comprehensive overview of model merging +methods and theories, their applications in various domains and settings, and +future research directions. Specifically, we first propose a new taxonomic +approach that exhaustively discusses existing model merging methods. Secondly, +we discuss the application of model merging techniques in large language +models, multimodal large language models, and 10+ machine learning subfields, +including continual learning, multi-task learning, few-shot learning, etc. +Finally, we highlight the remaining challenges of model merging and discuss +future research directions. A comprehensive list of papers about model merging +is available at +\url{https://github.com/EnnengYang/Awesome-Model-Merging-Methods-Theories-Applications}. + +
+
+
+
+
+ + ☆ Spoken Stereoset: On Evaluating Social Bias Toward Speaker in Speech + Large Language Models + + +
+ Warning: This paper may contain texts with uncomfortable content. + Large Language Models (LLMs) have achieved remarkable performance in various +tasks, including those involving multimodal data like speech. However, these +models often exhibit biases due to the nature of their training data. Recently, +more Speech Large Language Models (SLLMs) have emerged, underscoring the urgent +need to address these biases. This study introduces Spoken Stereoset, a dataset +specifically designed to evaluate social biases in SLLMs. By examining how +different models respond to speech from diverse demographic groups, we aim to +identify these biases. Our experiments reveal significant insights into their +performance and bias levels. The findings indicate that while most models show +minimal bias, some still exhibit slightly stereotypical or anti-stereotypical +tendencies. + +
+
+
+
+
+ + ☆ Alignment-Enhanced Decoding:Defending via Token-Level Adaptive Refining + of Probability Distributions + + +
+ Large language models are susceptible to jailbreak attacks, which can result +in the generation of harmful content. While prior defenses mitigate these risks +by perturbing or inspecting inputs, they ignore competing objectives, the +underlying cause of alignment failures. In this paper, we propose +Alignment-Enhanced Decoding (AED), a novel defense that employs adaptive +decoding to address the root causes of jailbreak issues. We first define the +Competitive Index to quantify alignment failures and utilize feedback from +self-evaluation to compute post-alignment logits. Then, AED adaptively combines +the post-alignment logits with the original logits to obtain harmless and +helpful distributions. Consequently, our method enhances safety alignment while +maintaining helpfulness. We conduct experiments across five models and four +common jailbreaks, with the results validating the effectiveness of our +approach. Code is available at https://github.com/GIGABaozi/AED.git. + +
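A loose sketch of the decoding-time combination is given below; the paper's actual definitions of the Competitive Index and the combination rule are not reproduced here, so the linear interpolation and the alpha weighting are assumptions made for illustration.

```python
import torch
import torch.nn.functional as F

def adaptive_combine(original_logits, post_alignment_logits, competitive_index):
    """Illustrative adaptive mixing of next-token logits: when the competitive
    index signals an alignment failure (value near 1), lean more on the
    post-alignment distribution; otherwise keep the original one."""
    alpha = float(torch.clamp(torch.as_tensor(competitive_index), 0.0, 1.0))
    combined = (1.0 - alpha) * original_logits + alpha * post_alignment_logits
    return F.log_softmax(combined, dim=-1)

vocab = 8
original = torch.randn(vocab)
post_alignment = torch.randn(vocab)
print(adaptive_combine(original, post_alignment, competitive_index=0.7))
```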
+
+ comment: 15 pages, 5 figures +
+
+
+
+
+ + ☆ See It All: Contextualized Late Aggregation for 3D Dense Captioning ACL 2024 + + +
+ 3D dense captioning is a task to localize objects in a 3D scene and generate +descriptive sentences for each object. Recent approaches in 3D dense captioning +have adopted transformer encoder-decoder frameworks from object detection to +build an end-to-end pipeline without hand-crafted components. However, these +approaches struggle with contradicting objectives where a single query +attention has to simultaneously view both the tightly localized object regions +and contextual environment. To overcome this challenge, we introduce SIA +(See-It-All), a transformer pipeline that engages in 3D dense captioning with a +novel paradigm called late aggregation. SIA simultaneously decodes two sets of +queries-context query and instance query. The instance query focuses on +localization and object attribute descriptions, while the context query +versatilely captures the region-of-interest of relationships between multiple +objects or with the global scene, then aggregated afterwards (i.e., late +aggregation) via simple distance-based measures. To further enhance the quality +of contextualized caption generation, we design a novel aggregator to generate +a fully informed caption based on the surrounding context, the global +environment, and object instances. Extensive experiments on two of the most +widely-used 3D dense captioning datasets demonstrate that our proposed method +achieves a significant improvement over prior methods. + +
+
+ comment: Accepted to ACL 2024 Findings +
+
+
+
+
+ + ☆ Hierarchical Working Memory and a New Magic Number + + +
+ The extremely limited working memory span, typically around four items, +contrasts sharply with our everyday experience of processing much larger +streams of sensory information concurrently. This disparity suggests that +working memory can organize information into compact representations such as +chunks, yet the underlying neural mechanisms remain largely unknown. Here, we +propose a recurrent neural network model for chunking within the framework of +the synaptic theory of working memory. We showed that by selectively +suppressing groups of stimuli, the network can maintain and retrieve the +stimuli in chunks, hence exceeding the basic capacity. Moreover, we show that +our model can dynamically construct hierarchical representations within working +memory through hierarchical chunking. A consequence of this proposed mechanism +is a new limit on the number of items that can be stored and subsequently +retrieved from working memory, depending only on the basic working memory +capacity when chunking is not invoked. Predictions from our model were +confirmed by analyzing single-unit responses in epileptic patients and memory +experiments with verbal material. Our work provides a novel conceptual and +analytical framework for understanding the on-the-fly organization of +information in the brain that is crucial for cognition. + +
+
+ comment: 16 pages, 7 figures +
+
+
+
+
+ + ☆ WeKnow-RAG: An Adaptive Approach for Retrieval-Augmented Generation + Integrating Web Search and Knowledge Graphs KDD + + +
+ Large Language Models (LLMs) have greatly contributed to the development of +adaptive intelligent agents and are positioned as an important way to achieve +Artificial General Intelligence (AGI). However, LLMs are prone to produce +factually incorrect information and often produce "phantom" content that +undermines their reliability, which poses a serious challenge for their +deployment in real-world scenarios. Enhancing LLMs by combining external +databases and information retrieval mechanisms is an effective path. To address +the above challenges, we propose a new approach called WeKnow-RAG, which +integrates Web search and Knowledge Graphs into a "Retrieval-Augmented +Generation (RAG)" system. First, the accuracy and reliability of LLM responses +are improved by combining the structured representation of Knowledge Graphs +with the flexibility of dense vector retrieval. WeKnow-RAG then utilizes +domain-specific knowledge graphs to satisfy a variety of queries and domains, +thereby improving performance on factual information and complex reasoning +tasks by employing multi-stage web page retrieval techniques using both sparse +and dense retrieval methods. Our approach effectively balances the efficiency +and accuracy of information retrieval, thus improving the overall retrieval +process. Finally, we also integrate a self-assessment mechanism for the LLM to +evaluate the trustworthiness of the answers it generates. Our approach proves +its outstanding effectiveness in a wide range of offline experiments and online +submissions. + +
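The sparse-plus-dense retrieval fusion mentioned above can be sketched as a weighted combination of normalized BM25 and dense-retriever scores; this sketch uses the rank_bm25 package for the sparse side, stands in placeholder numbers for the dense scores, and omits the web-search, knowledge-graph, and self-assessment stages of the full system.

```python
import numpy as np
from rank_bm25 import BM25Okapi  # sparse lexical scorer

def hybrid_scores(query_tokens, corpus_tokens, dense_scores, alpha=0.5):
    """Min-max normalize BM25 and dense-retriever scores, then mix them with
    weight alpha; one common way to combine sparse and dense retrieval."""
    bm25 = BM25Okapi(corpus_tokens)
    sparse = np.asarray(bm25.get_scores(query_tokens), dtype=float)

    def norm(x):
        span = x.max() - x.min()
        return (x - x.min()) / span if span > 0 else np.zeros_like(x)

    return alpha * norm(sparse) + (1 - alpha) * norm(np.asarray(dense_scores, dtype=float))

docs = ["the eiffel tower is in paris",
        "the capital of france is paris",
        "bm25 is a lexical ranking function"]
corpus_tokens = [d.split() for d in docs]
dense = [0.62, 0.91, 0.10]  # stand-in scores from any dense retriever
print(hybrid_scores("capital of france".split(), corpus_tokens, dense))
```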
+
+ comment: 8 pages, 2 figures, technical report for 3rd place in Task 3 of Meta + KDD Cup 2024 CRAG Challenge +
+
+
+
+
+ + ☆ Assessing the Role of Lexical Semantics in Cross-lingual Transfer + through Controlled Manipulations + + +
+ While cross-linguistic model transfer is effective in many settings, there is +still limited understanding of the conditions under which it works. In this +paper, we focus on assessing the role of lexical semantics in cross-lingual +transfer, as we compare its impact to that of other language properties. +Examining each language property individually, we systematically analyze how +differences between English and a target language influence the capacity to +align the language with an English pretrained representation space. We do so by +artificially manipulating the English sentences in ways that mimic specific +characteristics of the target language, and reporting the effect of each +manipulation on the quality of alignment with the representation space. We show +that while properties such as the script or word order only have a limited +impact on alignment quality, the degree of lexical matching between the two +languages, which we define using a measure of translation entropy, greatly +affects it. + +
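A translation-entropy measure of lexical matching can be illustrated as below; the exact estimator, corpus, and aggregation used in the paper may differ, and the toy lexicon counts are invented for the example.

```python
import math
from collections import Counter

def translation_entropy(translation_counts):
    """Entropy of a source word's translation distribution: 0 when the word
    always maps to a single target word, higher when its translations are
    spread across many target words."""
    total = sum(translation_counts.values())
    probs = [c / total for c in translation_counts.values()]
    return -sum(p * math.log2(p) for p in probs if p > 0)

# Toy aligned-lexicon example: English words with counts of aligned translations.
lexicon = {
    "dog": Counter({"perro": 98, "can": 2}),                           # nearly one-to-one
    "run": Counter({"correr": 40, "funcionar": 35, "dirigir": 25}),    # spread out
}
for word, counts in lexicon.items():
    print(word, round(translation_entropy(counts), 3))
print("average entropy:",
      round(sum(translation_entropy(c) for c in lexicon.values()) / len(lexicon), 3))
```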
+
+
+
+
+ + ☆ Transformers and Large Language Models for Efficient Intrusion Detection + Systems: A Comprehensive Survey + + +
+ With significant advancements in Transformers LLMs, NLP has extended its +reach into many research fields due to its enhanced capabilities in text +generation and user interaction. One field benefiting greatly from these +advancements is cybersecurity. In cybersecurity, many parameters that need to +be protected and exchanged between senders and receivers are in the form of +text and tabular data, making NLP a valuable tool in enhancing the security +measures of communication protocols. This survey paper provides a comprehensive +analysis of the utilization of Transformers and LLMs in cyber-threat detection +systems. The methodology of paper selection and bibliometric analysis is +outlined to establish a rigorous framework for evaluating existing research. +The fundamentals of Transformers are discussed, including background +information on various cyber-attacks and datasets commonly used in this field. +The survey explores the application of Transformers in IDSs, focusing on +different architectures such as Attention-based models, LLMs like BERT and GPT, +CNN/LSTM-Transformer hybrids, emerging approaches like ViTs, among others. +Furthermore, it explores the diverse environments and applications where +Transformers and LLMs-based IDS have been implemented, including computer +networks, IoT devices, critical infrastructure protection, cloud computing, +SDN, as well as in autonomous vehicles. The paper also addresses research +challenges and future directions in this area, identifying key issues such as +interpretability, scalability, and adaptability to evolving threats, and more. +Finally, the conclusion summarizes the findings and highlights the significance +of Transformers and LLMs in enhancing cyber-threat detection capabilities, +while also outlining potential avenues for further research and development. + +
+
+ comment: arXiv admin note: text overlap with arXiv:2405.04760 by other authors +
+
+
+
+
+ + ☆ MathScape: Evaluating MLLMs in multimodal Math Scenarios through a + Hierarchical Benchmark + + +
+ With the development of Multimodal Large Language Models (MLLMs), the
+evaluation of multimodal models in the context of mathematical problems has
+become a valuable research field. Multimodal visual-textual mathematical
+reasoning serves as a critical indicator for evaluating the comprehension and
+complex multi-step quantitative reasoning abilities of MLLMs. However,
+previous multimodal math benchmarks have not sufficiently integrated visual
+and textual information. To address this gap, we propose MathScape, a new
+benchmark that emphasizes the understanding and application of combined
+visual and textual information. MathScape is designed to evaluate photo-based
+math problem scenarios, assessing the theoretical understanding and
+application ability of MLLMs through a categorical hierarchical approach. We
+conduct a multi-dimensional evaluation on 11 advanced MLLMs, revealing that
+our benchmark is challenging even for the most sophisticated models. By
+analyzing the evaluation results, we identify the limitations of MLLMs,
+offering valuable insights for enhancing model performance.
+
+
+
+
+
+
+ + ☆ Development of a Multi-Agent Clinical Decision Support System for Korean + Triage and Acuity Scale (KTAS)-Based Triage and Treatment Planning in + Emergency Departments + + +
+ Emergency department (ED) overcrowding and the complexity of rapid +decision-making in critical care settings pose significant challenges to +healthcare systems worldwide. While clinical decision support systems (CDSS) +have shown promise, the integration of large language models (LLMs) offers new +possibilities for enhancing triage accuracy and clinical decision-making. This +study presents an LLM-driven CDSS designed to assist ED physicians and nurses +in patient triage, treatment planning, and overall emergency care management. + We developed a multi-agent CDSS utilizing Llama-3-70b as the base LLM, +orchestrated by CrewAI and Langchain. The system comprises four AI agents +emulating key ED roles: Triage Nurse, Emergency Physician, Pharmacist, and ED +Coordinator. It incorporates the Korean Triage and Acuity Scale (KTAS) for +triage assessment and integrates with the RxNorm API for medication management. + The model was evaluated using the Asclepius dataset, with performance +assessed by a clinical emergency medicine specialist. The CDSS demonstrated +high accuracy in triage decision-making compared to the baseline of a +single-agent system. Furthermore, the system exhibited strong performance in +critical areas, including primary diagnosis, critical findings identification, +disposition decision-making, treatment planning, and resource allocation. + Our multi-agent CDSS demonstrates significant potential for supporting +comprehensive emergency care management. By leveraging state-of-the-art AI +technologies, this system offers a scalable and adaptable tool that could +enhance emergency medical care delivery, potentially alleviating ED +overcrowding and improving patient outcomes. This work contributes to the +growing field of AI applications in emergency medicine and offers a promising +direction for future research and clinical implementation. + +
+
+
+
+
+ + ☆ Large Language Models Know What Makes Exemplary Contexts + + +
+ In-context learning (ICL) has proven to be a significant capability with the +advancement of Large Language models (LLMs). By instructing LLMs using few-shot +demonstrative examples, ICL enables them to perform a wide range of tasks +without needing to update millions of parameters. This paper presents a unified +framework for LLMs that allows them to self-select influential in-context +examples to compose their contexts; self-rank candidates with different +demonstration compositions; self-optimize the demonstration selection and +ordering through reinforcement learning. Specifically, our method designs a +parameter-efficient retrieval head that generates the optimized demonstration +after training with rewards from LLM's own preference. Experimental results +validate the proposed method's effectiveness in enhancing ICL performance. +Additionally, our approach effectively identifies and selects the most +representative examples for the current task, and includes more diversity in +retrieval. + +
+
+ comment: 12 pages, 3 figures +
+
+
+
+
+ + ☆ A Study on Bias Detection and Classification in Natural Language + Processing + + +
+ Human biases have been shown to influence the performance of models and +algorithms in various fields, including Natural Language Processing. While the +study of this phenomenon is garnering focus in recent years, the available +resources are still relatively scarce, often focusing on different forms or +manifestations of biases. The aim of our work is twofold: 1) gather +publicly-available datasets and determine how to better combine them to +effectively train models in the task of hate speech detection and +classification; 2) analyse the main issues with these datasets, such as +scarcity, skewed resources, and reliance on non-persistent data. We discuss +these issues in tandem with the development of our experiments, in which we +show that the combinations of different datasets greatly impact the models' +performance. + +
+
+ comment: 31 pages, 15 Tables, 4 Figures +
+
+
+
+
+ + ☆ Bridging and Modeling Correlations in Pairwise Data for Direct + Preference Optimization + + +
+ Direct preference optimization (DPO), a widely adopted offline preference
+optimization algorithm, aims to align large language models (LLMs) with
+human-desired behaviors using pairwise preference data. However, the winning
+response and the losing response within pairwise data are generated in
+isolation, leading to weak correlations between them as well as suboptimal
+alignment performance. To address this issue, we propose an effective
+framework named BMC for bridging and modeling correlations in pairwise data.
+Firstly, we increase the consistency and informativeness of the pairwise
+preference signals through targeted modifications, synthesizing a
+pseudo-winning response by improving the losing response based on the winning
+response. Secondly, we identify that DPO alone is insufficient to model these
+correlations and capture nuanced variations. Therefore, we propose learning
+token-level correlations by dynamically leveraging the policy model's
+confidence during training. Comprehensive experiments on QA, math, and
+instruction-following tasks demonstrate the effectiveness of our approach,
+which significantly surpasses competitive baselines, including DPO.
+Additionally, our in-depth quantitative analysis reveals the reasons behind
+our method's superior performance over DPO and showcases its versatility when
+applied to other DPO variants.
+
+
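+ For readers unfamiliar with the objective that BMC builds on, a minimal
+sketch of the vanilla DPO loss (not the BMC token-level weighting itself) is
+shown below; beta and the log-probabilities are placeholders.
+
+import torch
+import torch.nn.functional as F
+
+def dpo_loss(pi_logp_w, pi_logp_l, ref_logp_w, ref_logp_l, beta=0.1):
+    # Sequence-level log-probs of the winning (w) and losing (l) responses
+    # under the policy (pi) and the frozen reference model (ref).
+    margin = (pi_logp_w - ref_logp_w) - (pi_logp_l - ref_logp_l)
+    return -F.logsigmoid(beta * margin).mean()
+
+loss = dpo_loss(torch.tensor([-12.3, -8.1]), torch.tensor([-15.0, -9.4]),
+                torch.tensor([-13.0, -8.0]), torch.tensor([-14.2, -9.5]))
+print(loss.item())
+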
+
+ comment: 18 pages, 8 figures, 8 tables, work in progress
+
+
+
+
+
+ + ☆ Large Language Models Prompting With Episodic Memory + + +
+ Prompt optimization is essential for enhancing the performance of Large +Language Models (LLMs) in a range of Natural Language Processing (NLP) tasks, +particularly in scenarios of few-shot learning where training examples are +incorporated directly into the prompt. Despite the growing interest in +optimizing prompts with few-shot examples, existing methods for prompt +optimization are often resource-intensive or perform inadequately. In this +work, we propose PrOmpting with Episodic Memory (POEM), a novel prompt +optimization technique that is simple, efficient, and demonstrates strong +generalization capabilities. We approach prompt optimization as a Reinforcement +Learning (RL) challenge, using episodic memory to archive combinations of input +data, permutations of few-shot examples, and the rewards observed during +training. In the testing phase, we optimize the sequence of examples for each +test query by selecting the sequence that yields the highest total rewards from +the top-k most similar training examples in the episodic memory. Our results +show that POEM outperforms recent techniques like TEMPERA and RLPrompt by over +5.3% in various text classification tasks. Furthermore, our approach adapts +well to broader language understanding tasks, consistently outperforming +conventional heuristic methods for ordering examples. + +
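+ One possible reading of the test-time selection step is sketched below: the
+episodic memory holds (query embedding, example ordering, reward) triples, and
+for a new query the ordering with the highest total reward among the k most
+similar stored queries is reused. This is an illustrative simplification, not
+the authors' implementation.
+
+import numpy as np
+from collections import defaultdict
+
+def select_ordering(memory, query_emb, k=5):
+    sims = [float(np.dot(query_emb, emb) /
+                  (np.linalg.norm(query_emb) * np.linalg.norm(emb)))
+            for emb, _, _ in memory]
+    top_k = sorted(range(len(memory)), key=lambda i: sims[i], reverse=True)[:k]
+    totals = defaultdict(float)
+    for i in top_k:                       # aggregate rewards per ordering
+        totals[memory[i][1]] += memory[i][2]
+    return max(totals, key=totals.get)
+
+memory = [(np.random.rand(8), ("ex3", "ex1", "ex2"), 0.7),
+          (np.random.rand(8), ("ex1", "ex2", "ex3"), 0.9)]
+print(select_ordering(memory, np.random.rand(8), k=2))
+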
+
+
+
+
+ + ☆ From Brazilian Portuguese to European Portuguese + + +
+ Brazilian Portuguese and European Portuguese are two varieties of the same +language and, despite their close similarities, they exhibit several +differences. However, there is a significant disproportion in the availability +of resources between the two variants, with Brazilian Portuguese having more +abundant resources. This inequity can impact the quality of translation +services accessible to European Portuguese speakers. To address this issue, we +propose the development of a Brazilian Portuguese to European Portuguese +translation system, leveraging recent advancements in neural architectures and +models. To evaluate the performance of such systems, we manually curated a gold +test set comprising 500 sentences across five different topics. Each sentence +in the gold test set has two distinct references, facilitating a +straightforward evaluation of future translation models. We experimented with +various models by fine-tuning existing Large Language Models using parallel +data extracted from movie subtitles and TED Talks transcripts in both Brazilian +and European Portuguese. Our evaluation involved the use of conventional +automatic metrics as well as a human evaluation. In addition, all models were +compared against ChatGPT 3.5 Turbo, which currently yields the best results. + +
+
+ comment: 12 pages, 8 tables +
+
+
+
+
+ + ☆ Fact or Fiction? Improving Fact Verification with Knowledge Graphs + through Simplified Subgraph Retrievals + + +
+ Despite recent success in natural language processing (NLP), fact +verification still remains a difficult task. Due to misinformation spreading +increasingly fast, attention has been directed towards automatically verifying +the correctness of claims. In the domain of NLP, this is usually done by +training supervised machine learning models to verify claims by utilizing +evidence from trustworthy corpora. We present efficient methods for verifying +claims on a dataset where the evidence is in the form of structured knowledge +graphs. We use the FactKG dataset, which is constructed from the DBpedia +knowledge graph extracted from Wikipedia. By simplifying the evidence retrieval +process, from fine-tuned language models to simple logical retrievals, we are +able to construct models that both require less computational resources and +achieve better test-set accuracy. + +
+
+ comment: 10 pages, 3 figures, appendix +
+
+
+
+
+ + ☆ CMU's IWSLT 2024 Simultaneous Speech Translation System + + +
+ This paper describes CMU's submission to the IWSLT 2024 Simultaneous Speech
+Translation (SST) task for translating English speech to German text in a
+streaming manner. Our end-to-end speech-to-text (ST) system integrates the
+WavLM speech encoder, a modality adapter, and the Llama2-7B-Base model as the
+decoder. We employ a two-stage training approach: initially, we align the
+representations of speech and text, followed by full fine-tuning. Both stages
+are trained on MuST-C v2 data with cross-entropy loss. We adapt our offline ST
+model for SST using a simple fixed hold-n policy. Experiments show that our
+model obtains an offline BLEU score of 31.1 and a BLEU score of 29.5 under 2
+seconds of latency on the MuST-C v2 tst-COMMON set.
+
+
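+ The fixed hold-n policy mentioned above can be sketched as follows: after
+each new speech chunk, the offline model re-translates the prefix and all but
+the last n tokens of the hypothesis are committed. The translate stub and the
+chunking below are placeholders, not CMU's actual system.
+
+def hold_n_stream(speech_chunks, translate, n=3):
+    committed, audio = [], []
+    for i, chunk in enumerate(speech_chunks):
+        audio.append(chunk)
+        hyp = translate(audio)                       # offline re-translation
+        last = (i == len(speech_chunks) - 1)
+        stable = hyp if last else hyp[:max(0, len(hyp) - n)]
+        new_tokens = stable[len(committed):]
+        committed.extend(new_tokens)
+        yield new_tokens
+
+fake_model = lambda chunks: [f"tok{j}" for j in range(2 * len(chunks))]
+for emitted in hold_n_stream(range(4), fake_model, n=3):
+    print(emitted)
+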
+
+
+
+
+ + ☆ LiveFC: A System for Live Fact-Checking of Audio Streams + + +
+ The advances in the digital era have led to rapid dissemination of
+information. This has also aggravated the spread of misinformation and
+disinformation, which has potentially serious consequences, such as civil
+unrest. While fact-checking aims to combat this, manual fact-checking is
+cumbersome and not scalable. While automated fact-checking approaches exist,
+they do not operate in real-time and do not always account for the spread of
+misinformation through different modalities. This is particularly important
+as proactive fact-checking on live streams in real-time can help people be
+informed of false narratives and prevent catastrophic consequences that may
+cause civil unrest. This is particularly relevant with the rapid
+dissemination of information through video on social media platforms or other
+streams like political rallies and debates. Hence, in this work we develop a
+platform named LiveFC, which can aid in fact-checking live audio streams in
+real-time. LiveFC has a user-friendly interface that displays the claims
+detected in a live stream, along with their veracity, supporting evidence,
+and the speaker associated with each segment. The app can be accessed at
+http://livefc.factiverse.ai and a screen recording of the demo can be found
+at https://bit.ly/3WVAoIw.
+
+
+
+ comment: Under Review, 11 pages +
+
+
+
+
+ + ☆ Exploring Retrieval Augmented Generation in Arabic + + +
+ Recently, Retrieval Augmented Generation (RAG) has emerged as a powerful +technique in natural language processing, combining the strengths of +retrieval-based and generation-based models to enhance text generation tasks. +However, the application of RAG in Arabic, a language with unique +characteristics and resource constraints, remains underexplored. This paper +presents a comprehensive case study on the implementation and evaluation of RAG +for Arabic text. The work focuses on exploring various semantic embedding +models in the retrieval stage and several LLMs in the generation stage, in +order to investigate what works and what doesn't in the context of Arabic. The +work also touches upon the issue of variations between document dialect and +query dialect in the retrieval stage. Results show that existing semantic +embedding models and LLMs can be effectively employed to build Arabic RAG +pipelines. + +
+
+
+
+
+ + ☆ Knowledge in Superposition: Unveiling the Failures of Lifelong Knowledge + Editing for Large Language Models + + +
+ Knowledge editing aims to update outdated or incorrect knowledge in large +language models (LLMs). However, current knowledge editing methods have limited +scalability for lifelong editing. This study explores the fundamental reason +why knowledge editing fails in lifelong editing. We begin with the closed-form +solution derived from linear associative memory, which underpins +state-of-the-art knowledge editing methods. We extend the solution from single +editing to lifelong editing, and through rigorous mathematical derivation, +identify an interference term in the final solution, suggesting that editing +knowledge may impact irrelevant knowledge. Further analysis of the interference +term reveals a close relationship with superposition between knowledge +representations. When knowledge superposition does not exist in language +models, the interference term vanishes, allowing for lossless knowledge +editing. Experiments across numerous language models reveal that knowledge +superposition is universal, exhibiting high kurtosis, zero mean, and +heavy-tailed distributions with clear scaling laws. Ultimately, by combining +theory and experiments, we demonstrate that knowledge superposition is the +fundamental reason for the failure of lifelong editing. Moreover, this is the +first study to investigate knowledge editing from the perspective of +superposition and provides a comprehensive observation of superposition across +numerous real-world language models. Code available at +https://github.com/ChenhuiHu/knowledge_in_superposition. + +
+
+
+
+
+ + ☆ Aquila2 Technical Report + + +
+ This paper introduces the Aquila2 series, which comprises a wide range of +bilingual models with parameter sizes of 7, 34, and 70 billion. These models +are trained based on an innovative framework named HeuriMentor (HM), which +offers real-time insights into model convergence and enhances the training +process and data management. The HM System, comprising the Adaptive Training +Engine (ATE), Training State Monitor (TSM), and Data Management Unit (DMU), +allows for precise monitoring of the model's training progress and enables +efficient optimization of data distribution, thereby enhancing training +effectiveness. Extensive evaluations show that the Aquila2 model series +performs comparably well on both English and Chinese benchmarks. Specifically, +Aquila2-34B demonstrates only a slight decrease in performance when quantized +to Int4. Furthermore, we have made our training code +(https://github.com/FlagOpen/FlagScale) and model weights +(https://github.com/FlagAI-Open/Aquila2) publicly available to support ongoing +research and the development of applications. + +
+
+
+
+
+ + ☆ A Quantum-Inspired Analysis of Human Disambiguation Processes + + +
+ Formal languages are essential for computer programming and are constructed +to be easily processed by computers. In contrast, natural languages are much +more challenging and instigated the field of Natural Language Processing (NLP). +One major obstacle is the ubiquity of ambiguities. Recent advances in NLP have +led to the development of large language models, which can resolve ambiguities +with high accuracy. At the same time, quantum computers have gained much +attention in recent years as they can solve some computational problems faster +than classical computers. This new computing paradigm has reached the fields of +machine learning and NLP, where hybrid classical-quantum learning algorithms +have emerged. However, more research is needed to identify which NLP tasks +could benefit from a genuine quantum advantage. In this thesis, we applied +formalisms arising from foundational quantum mechanics, such as contextuality +and causality, to study ambiguities arising from linguistics. By doing so, we +also reproduced psycholinguistic results relating to the human disambiguation +process. These results were subsequently used to predict human behaviour and +outperformed current NLP methods. + +
+
+ comment: PhD thesis +
+
+
+
+
+ + ☆ DataVisT5: A Pre-trained Language Model for Jointly Understanding Text + and Data Visualization + + +
+ Data visualization (DV) is a fundamental tool for efficiently conveying the
+insights behind big data and has been widely adopted in today's data-driven
+world. Task automation in DV, such as converting natural language queries to
+visualizations (i.e., text-to-vis), generating explanations from
+visualizations (i.e., vis-to-text), answering DV-related questions in free
+form (i.e., FeVisQA), and explicating tabular data (i.e., table-to-text), is
+vital for advancing the field. Despite their potential, the application of
+pre-trained language models (PLMs) like T5 and BERT in DV has been limited by
+high costs and challenges in handling cross-modal information, leading to few
+studies on PLMs for DV. We introduce \textbf{DataVisT5}, a novel PLM tailored
+for DV that enhances the T5 architecture through a hybrid objective
+pre-training and multi-task fine-tuning strategy, integrating text and DV
+datasets to effectively interpret cross-modal semantics. Extensive evaluations
+on public datasets show that DataVisT5 consistently outperforms current
+state-of-the-art models on various DV-related tasks. We anticipate that
+DataVisT5 will not only inspire further research on vertical PLMs but also
+expand the range of applications for PLMs.
+
+
+
+
+
+
+ + ☆ Do GPT Language Models Suffer From Split Personality Disorder? The + Advent Of Substrate-Free Psychometrics + + +
+ Previous research on emergence in large language models shows that these
+models display apparent human-like abilities and psychological latent traits.
+However, the results partly contradict one another in the expression and
+magnitude of these latent traits, yet agree on the worrisome tendency to score
+high on the Dark Triad of narcissism, psychopathy, and Machiavellianism,
+which, together with a track record of derailments, demands more rigorous
+research on the safety of these models. We provided a state-of-the-art
+language model with the same personality questionnaire in nine languages and
+performed a Bayesian analysis with a Gaussian Mixture Model, finding evidence
+for a deeper-rooted issue. Our results suggest both interlingual and
+intralingual instabilities, which indicate that current language models do
+not develop a consistent core personality. This can lead to unsafe behaviour
+of artificial intelligence systems that are based on these foundation models
+and are increasingly integrated into human life. We subsequently discuss the
+shortcomings of modern psychometrics, abstract it, and provide a framework
+for its species-neutral, substrate-free formulation.
+
+
+
+ comment: 37 pages, 7 figures, 3 tables, date v1: Mar 26 2023 +
+
+
+
+
+ + ☆ Only One Relation Possible? Modeling the Ambiguity in Event Temporal + Relation Extraction + + +
+ Event Temporal Relation Extraction (ETRE) aims to identify the temporal +relationship between two events, which plays an important role in natural +language understanding. Most previous works follow a single-label +classification style, classifying an event pair into either a specific temporal +relation (e.g., \textit{Before}, \textit{After}), or a special label +\textit{Vague} when there may be multiple possible temporal relations between +the pair. In our work, instead of directly making predictions on +\textit{Vague}, we propose a multi-label classification solution for ETRE +(METRE) to infer the possibility of each temporal relation independently, where +we treat \textit{Vague} as the cases when there is more than one possible +relation between two events. We design a speculation mechanism to explore the +possible relations hidden behind \textit{Vague}, which enables the latent +information to be used efficiently. Experiments on TB-Dense, MATRES and UDS-T +show that our method can effectively utilize the \textit{Vague} instances to +improve the recognition for specific temporal relations and outperforms most +state-of-the-art methods. + +
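+ The multi-label treatment of Vague described above can be pictured with a
+simple decoding rule: each relation gets an independent probability, and Vague
+is predicted whenever the set of likely relations is not exactly one. The
+label set, threshold, and scores below are illustrative only.
+
+RELATIONS = ["Before", "After", "Includes", "IsIncluded", "Simultaneous"]
+
+def decode(probs, threshold=0.5):
+    # Keep relations whose independent probability passes the threshold.
+    active = [r for r, p in zip(RELATIONS, probs) if p >= threshold]
+    return active[0] if len(active) == 1 else "Vague"
+
+print(decode([0.92, 0.08, 0.11, 0.05, 0.03]))  # -> Before
+print(decode([0.71, 0.66, 0.10, 0.05, 0.03]))  # -> Vague (two plausible relations)
+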
+
+
+
+
+ + ☆ Enhancing Visual Question Answering through Ranking-Based Hybrid + Training and Multimodal Fusion + + +
+ Visual Question Answering (VQA) is a challenging task that requires systems +to provide accurate answers to questions based on image content. Current VQA +models struggle with complex questions due to limitations in capturing and +integrating multimodal information effectively. To address these challenges, we +propose the Rank VQA model, which leverages a ranking-inspired hybrid training +strategy to enhance VQA performance. The Rank VQA model integrates high-quality +visual features extracted using the Faster R-CNN model and rich semantic text +features obtained from a pre-trained BERT model. These features are fused +through a sophisticated multimodal fusion technique employing multi-head +self-attention mechanisms. Additionally, a ranking learning module is +incorporated to optimize the relative ranking of answers, thus improving answer +accuracy. The hybrid training strategy combines classification and ranking +losses, enhancing the model's generalization ability and robustness across +diverse datasets. Experimental results demonstrate the effectiveness of the +Rank VQA model. Our model significantly outperforms existing state-of-the-art +models on standard VQA datasets, including VQA v2.0 and COCO-QA, in terms of +both accuracy and Mean Reciprocal Rank (MRR). The superior performance of Rank +VQA is evident in its ability to handle complex questions that require +understanding nuanced details and making sophisticated inferences from the +image and text. This work highlights the effectiveness of a ranking-based +hybrid training strategy in improving VQA performance and lays the groundwork +for further research in multimodal learning methods. + +
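+ A minimal sketch of combining a classification loss with a ranking loss, in
+the spirit of the hybrid training strategy above, is given below; the loss
+weight, margin, and tensor shapes are assumptions rather than the paper's
+actual settings.
+
+import torch
+import torch.nn.functional as F
+
+def hybrid_loss(logits, target, pos_scores, neg_scores, margin=0.2, alpha=0.5):
+    cls_loss = F.cross_entropy(logits, target)            # answer classification
+    rank_loss = F.margin_ranking_loss(pos_scores, neg_scores,
+                                      torch.ones_like(pos_scores), margin=margin)
+    return cls_loss + alpha * rank_loss
+
+logits = torch.randn(4, 1000)          # fused multimodal features -> answer logits
+target = torch.randint(0, 1000, (4,))
+print(hybrid_loss(logits, target, torch.rand(4), torch.rand(4)).item())
+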
+
+ comment: Visual Question Answering, Rank VQA, Faster R-CNN, BERT, Multimodal + Fusion, Ranking Learning, Hybrid Training Strategy +
+
+
+
+
+ + ☆ Training Language Models on the Knowledge Graph: Insights on + Hallucinations and Their Detectability + + +
+ While many capabilities of language models (LMs) improve with increased +training budget, the influence of scale on hallucinations is not yet fully +understood. Hallucinations come in many forms, and there is no universally +accepted definition. We thus focus on studying only those hallucinations where +a correct answer appears verbatim in the training set. To fully control the +training data content, we construct a knowledge graph (KG)-based dataset, and +use it to train a set of increasingly large LMs. We find that for a fixed +dataset, larger and longer-trained LMs hallucinate less. However, hallucinating +on $\leq5$% of the training data requires an order of magnitude larger model, +and thus an order of magnitude more compute, than Hoffmann et al. (2022) +reported was optimal. Given this costliness, we study how hallucination +detectors depend on scale. While we see detector size improves performance on +fixed LM's outputs, we find an inverse relationship between the scale of the LM +and the detectability of its hallucinations. + +
+
+ comment: Published at COLM 2024. 16 pages, 11 figures +
+
+
+
+
+ + ☆ SER Evals: In-domain and Out-of-domain Benchmarking for Speech Emotion + Recognition INTERSPEECH 2024 + + +
+ Speech emotion recognition (SER) has made significant strides with the advent +of powerful self-supervised learning (SSL) models. However, the generalization +of these models to diverse languages and emotional expressions remains a +challenge. We propose a large-scale benchmark to evaluate the robustness and +adaptability of state-of-the-art SER models in both in-domain and out-of-domain +settings. Our benchmark includes a diverse set of multilingual datasets, +focusing on less commonly used corpora to assess generalization to new data. We +employ logit adjustment to account for varying class distributions and +establish a single dataset cluster for systematic evaluation. Surprisingly, we +find that the Whisper model, primarily designed for automatic speech +recognition, outperforms dedicated SSL models in cross-lingual SER. Our results +highlight the need for more robust and generalizable SER models, and our +benchmark serves as a valuable resource to drive future research in this +direction. + +
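+ Logit adjustment, as referenced above, can be applied post hoc by subtracting
+the scaled log class priors from the classifier logits so that rare emotion
+classes are not suppressed; tau and the priors below are placeholders.
+
+import numpy as np
+
+def adjust_logits(logits, class_priors, tau=1.0):
+    # Post-hoc correction: penalize classes in proportion to their prior.
+    return logits - tau * np.log(np.asarray(class_priors))
+
+logits = np.array([2.0, 1.5, 0.3, 0.1])      # e.g. [neutral, happy, sad, angry]
+priors = np.array([0.6, 0.25, 0.1, 0.05])    # empirical label distribution
+print(adjust_logits(logits, priors).argmax())
+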
+
+ comment: Accepted at INTERSPEECH 2024 +
+
+
+
+
+ + ☆ ONSEP: A Novel Online Neural-Symbolic Framework for Event Prediction + Based on Large Language Model ACL 2024 + + +
+ In the realm of event prediction, temporal knowledge graph forecasting (TKGF) +stands as a pivotal technique. Previous approaches face the challenges of not +utilizing experience during testing and relying on a single short-term history, +which limits adaptation to evolving data. In this paper, we introduce the +Online Neural-Symbolic Event Prediction (ONSEP) framework, which innovates by +integrating dynamic causal rule mining (DCRM) and dual history augmented +generation (DHAG). DCRM dynamically constructs causal rules from real-time +data, allowing for swift adaptation to new causal relationships. In parallel, +DHAG merges short-term and long-term historical contexts, leveraging a +bi-branch approach to enrich event prediction. Our framework demonstrates +notable performance enhancements across diverse datasets, with significant +Hit@k (k=1,3,10) improvements, showcasing its ability to augment large language +models (LLMs) for event prediction without necessitating extensive retraining. +The ONSEP framework not only advances the field of TKGF but also underscores +the potential of neural-symbolic approaches in adapting to dynamic data +environments. + +
+
+ comment: 16 pages, ACL 2024 Findings +
+
+
+
+
+ + ☆ CodeMirage: Hallucinations in Code Generated by Large Language Models IJCAI 2024 + + +
+ Large Language Models (LLMs) have shown promising potential in program
+generation and no-code automation. However, LLMs are prone to generating
+hallucinations, i.e., text that sounds plausible but is incorrect. Although
+there has been a recent surge in research on LLM hallucinations in text
+generation, a similar hallucination phenomenon can occur in code generation.
+Sometimes the generated code can have syntactical or logical errors as well
+as more advanced issues like security vulnerabilities, memory leaks, etc.
+Given the wide adoption of LLMs to enhance efficiency in code generation and
+development in general, it becomes imperative to investigate hallucinations
+in code generation. To the best of our knowledge, this is the first attempt
+at studying hallucinations in the code generated by LLMs. We start by
+introducing a definition of code hallucination and a comprehensive taxonomy
+of code hallucination types. We propose CodeMirage, the first benchmark
+dataset for code hallucinations. The benchmark contains 1,137 GPT-3.5
+generated hallucinated code snippets for Python programming problems from two
+base datasets - HumanEval and MBPP. We then propose a methodology for code
+hallucination detection and experiment with open-source LLMs such as
+CodeLLaMA as well as OpenAI's GPT-3.5 and GPT-4 models using one-shot
+prompting. We find that GPT-4 performs best on the HumanEval dataset and
+gives results comparable to the fine-tuned CodeBERT baseline on the MBPP
+dataset. Towards the end, we discuss various mitigation strategies for code
+hallucinations and conclude our work.
+
+
+
+ comment: Accepted at AutoMates @ IJCAI 2024 +
+
+
+
+
+ + ♻ ☆ Agent Instructs Large Language Models to be General Zero-Shot Reasoners ICML 2024 + + +
+ We introduce a method to improve the zero-shot reasoning abilities of large +language models on general language understanding tasks. Specifically, we build +an autonomous agent to instruct the reasoning process of large language models. +We show this approach further unleashes the zero-shot reasoning abilities of +large language models to more tasks. We study the performance of our method on +a wide set of datasets spanning generation, classification, and reasoning. We +show that our method generalizes to most tasks and obtains state-of-the-art +zero-shot performance on 20 of the 29 datasets that we evaluate. For instance, +our method boosts the performance of state-of-the-art large language models by +a large margin, including Vicuna-13b (13.3%), Llama-2-70b-chat (23.2%), and +GPT-3.5 Turbo (17.0%). Compared to zero-shot chain of thought, our improvement +in reasoning is striking, with an average increase of 10.5%. With our method, +Llama-2-70b-chat outperforms zero-shot GPT-3.5 Turbo by 10.2%. + +
+
+ comment: Accepted to ICML 2024 +
+
+
+
+
+ + ♻ ☆ Massive Activations in Large Language Models + + +
+ We observe an empirical phenomenon in Large Language Models (LLMs) -- very +few activations exhibit significantly larger values than others (e.g., 100,000 +times larger). We call them massive activations. First, we demonstrate the +widespread existence of massive activations across various LLMs and +characterize their locations. Second, we find their values largely stay +constant regardless of the input, and they function as indispensable bias terms +in LLMs. Third, these massive activations lead to the concentration of +attention probabilities to their corresponding tokens, and further, implicit +bias terms in the self-attention output. Last, we also study massive +activations in Vision Transformers. Code is available at +https://github.com/locuslab/massive-activations. + +
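+ A quick way to surface such outliers in practice is to flag hidden-state
+entries whose magnitude dwarfs the median magnitude; the ratio below is an
+arbitrary illustrative threshold, not the paper's criterion.
+
+import torch
+
+def find_massive(hidden, ratio=1000.0):
+    # Return indices of activations whose |value| exceeds ratio * median |value|.
+    mags = hidden.abs()
+    return (mags > ratio * mags.median()).nonzero(as_tuple=False)
+
+h = torch.randn(4, 4096)
+h[0, 123] = 5000.0          # planted outlier for demonstration
+print(find_massive(h))
+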
+
+ comment: First Conference on Language Modeling (COLM), 2024. Website at + https://eric-mingjie.github.io/massive-activations/index.html +
+
+
+
+
+ + ♻ ☆ An Event Structure-aware Generative Model for Biomedical Event + Extraction + + +
+ Biomedical Event Extraction (BEE) is a challenging task that involves +modeling complex relationships between fine-grained entities in biomedical +text. Most existing BEE models rely on classification methods that ignore label +semantics and argument dependencies in the data. Although generative models +that use prompts are increasingly being used for event extraction, they face +two main challenges: creating effective prompts for the biomedical domain and +dealing with events with complex structures in the text. To address these +limitations, we propose GenBEE, a generative model enhanced with +structure-aware prefixes for biomedical event extraction. GenBEE constructs +event prompts that leverage knowledge distilled from large language models +(LLMs), thereby incorporating both label semantics and argument dependency +relationships. Additionally, GenBEE introduces a structural prefix learning +module that generates structure-aware prefixes with structural prompts, +enriching the generation process with structural features. Extensive +experiments on three benchmark datasets demonstrate the effectiveness of GenBEE +and it achieves state-of-the-art performance on the MLEE and GE11 datasets. +Moreover, our analysis shows that the structural prefixes effectively bridge +the gap between structural prompts and the representation space of generative +models, enabling better integration of event structural information. + +
+
+ comment: 8 pages, 4 figures, 6 tables +
+
+
+
+
+ + ♻ ☆ Exploring LLM Multi-Agents for ICD Coding + + +
+ In the International Classification of Diseases (ICD) coding task, Large
+Language Models (LLMs) often produce inaccurate and incomplete predictions
+due to the high-dimensional and skewed distribution of ICD codes, and they
+often lack interpretability and reliability as well. To address these
+limitations, we introduce an innovative multi-agent approach for ICD coding
+which mimics the ICD coding assignment procedure in real-world settings,
+comprising five distinct agents: the patient, physician, coder, reviewer, and
+adjuster. Each agent utilizes an LLM-based model tailored to its specific
+role within the coding process. We also integrate the system with the
+Electronic Health Record (EHR)'s SOAP (subjective, objective, assessment and
+plan) structure to boost performance. We compare our method with a system of
+agents designed solely by LLMs and other strong baselines and evaluate it
+using the Medical Information Mart for Intensive Care III (MIMIC-III)
+dataset. Our multi-agent coding framework significantly outperforms Zero-shot
+Chain of Thought (CoT) prompting and self-consistency with CoT (CoT-SC) in
+coding common and rare ICD codes. An ablation study validates the
+effectiveness of the designated agent roles. Our framework also outperforms
+the LLM-designed agent system. Moreover, our method achieves results
+comparable to state-of-the-art ICD coding methods that require extensive
+pre-training or fine-tuning, and outperforms them in rare code accuracy and
+explainability. Additionally, we demonstrate the method's practical
+applicability by presenting its performance in scenarios not limited by
+common or rare ICD code constraints. The proposed multi-agent method for ICD
+coding effectively mimics the real-world coding process and improves
+performance on both common and rare codes.
+
+
+
+ comment: 12pages +
+
+
+
+
+ + ♻ ☆ Amuro & Char: Analyzing the Relationship between Pre-Training and + Fine-Tuning of Large Language Models + + +
+ The development of large language models has led to the formation of a
+pre-train-then-align paradigm, in which the model is typically pre-trained on
+a large text corpus and undergoes a tuning stage to align it with human
+preferences or downstream tasks. In this work, we investigate the
+relationship between pre-training and fine-tuning by fine-tuning multiple
+intermediate pre-trained model checkpoints. Our results on 18 datasets
+suggest that i) continual pre-training improves the model in a latent way
+that is only revealed after fine-tuning; ii) with extra fine-tuning, the
+datasets on which the model does not demonstrate capability during
+pre-training gain much more than those on which it already performs well;
+iii) although the model benefits significantly from supervised fine-tuning,
+it may forget previously known domain knowledge and tasks that are not seen
+during fine-tuning; iv) the model exhibits high sensitivity to evaluation
+prompts after supervised fine-tuning, but this sensitivity can be alleviated
+by more pre-training.
+
+
+
+
+
+
+ + ♻ ☆ $\texttt{COSMIC}$: Mutual Information for Task-Agnostic Summarization + Evaluation ACL 2024 + + +
+ Assessing the quality of summarizers poses significant challenges. In +response, we propose a novel task-oriented evaluation approach that assesses +summarizers based on their capacity to produce summaries that are useful for +downstream tasks, while preserving task outcomes. We theoretically establish a +direct relationship between the resulting error probability of these tasks and +the mutual information between source texts and generated summaries. We +introduce $\texttt{COSMIC}$ as a practical implementation of this metric, +demonstrating its strong correlation with human judgment-based metrics and its +effectiveness in predicting downstream task performance. Comparative analyses +against established metrics like $\texttt{BERTScore}$ and $\texttt{ROUGE}$ +highlight the competitive performance of $\texttt{COSMIC}$. + +
+
+ comment: ACL 2024 +
+
+
+
+
+ + ♻ ☆ LLM Voting: Human Choices and AI Collective Decision Making AAAI + + +
+ This paper investigates the voting behaviors of Large Language Models (LLMs), +specifically GPT-4 and LLaMA-2, their biases, and how they align with human +voting patterns. Our methodology involved using a dataset from a human voting +experiment to establish a baseline for human preferences and conducting a +corresponding experiment with LLM agents. We observed that the choice of voting +methods and the presentation order influenced LLM voting outcomes. We found +that varying the persona can reduce some of these biases and enhance alignment +with human choices. While the Chain-of-Thought approach did not improve +prediction accuracy, it has potential for AI explainability in the voting +process. We also identified a trade-off between preference diversity and +alignment accuracy in LLMs, influenced by different temperature settings. Our +findings indicate that LLMs may lead to less diverse collective outcomes and +biased assumptions when used in voting scenarios, emphasizing the need for +cautious integration of LLMs into democratic processes. + +
+
+ comment: Accepted in AAAI Conference on AI, Ethics, and Society (AIES) +
+
+
+
+
+ + ♻ ☆ Lost in Overlap: Exploring Watermark Collision in LLMs + + +
+ The proliferation of large language models (LLMs) in generating content +raises concerns about text copyright. Watermarking methods, particularly +logit-based approaches, embed imperceptible identifiers into text to address +these challenges. However, the widespread usage of watermarking across diverse +LLMs has led to an inevitable issue known as watermark collision during common +tasks, such as paraphrasing or translation. In this paper, we introduce +watermark collision as a novel and general philosophy for watermark attacks, +aimed at enhancing attack performance on top of any other attacking methods. We +also provide a comprehensive demonstration that watermark collision poses a +threat to all logit-based watermark algorithms, impacting not only specific +attack scenarios but also downstream applications. + +
+
+ comment: Long Paper, 7 pages +
+
+
+
+
+ + ♻ ☆ Tree Attention: Topology-aware Decoding for Long-Context Attention on + GPU clusters + + +
+ Self-attention is the core mathematical operation of modern transformer
+architectures and is also a significant computational bottleneck due to its
+quadratic complexity in the sequence length. In this work, we derive the
+scalar energy function whose gradient computes the self-attention block, thus
+elucidating the theoretical underpinnings of self-attention, providing a
+Bayesian interpretation of the operation and linking it closely with
+energy-based models such as Hopfield Networks. Our formulation reveals that
+the reduction across the sequence axis can be efficiently computed in
+parallel through a tree reduction. Our algorithm for parallelizing attention
+computation across multiple GPUs enables cross-device decoding to be
+performed asymptotically faster (up to 8x faster in our experiments) than
+alternative approaches such as Ring Attention, while also requiring
+significantly less communication volume and incurring 2x less peak memory.
+Our code is publicly available here:
+\url{https://github.com/Zyphra/tree_attention}.
+
+
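+ The key property that enables a tree reduction is that partial attention
+results over sequence shards can be merged associatively. The sketch below
+shows one query being combined from two shards in a numerically stable way;
+it illustrates the idea only and is not the authors' multi-GPU kernel.
+
+import numpy as np
+
+def partial(q, K, V):
+    scores = K @ q
+    m = scores.max()
+    w = np.exp(scores - m)
+    return m, w.sum(), w @ V          # (running max, sum of exps, weighted values)
+
+def combine(a, b):
+    # Associative merge of two partial results for the same query.
+    m = max(a[0], b[0])
+    s = a[1] * np.exp(a[0] - m) + b[1] * np.exp(b[0] - m)
+    o = a[2] * np.exp(a[0] - m) + b[2] * np.exp(b[0] - m)
+    return m, s, o
+
+q = np.random.rand(8)
+K, V = np.random.rand(16, 8), np.random.rand(16, 4)
+m, s, o = combine(partial(q, K[:8], V[:8]), partial(q, K[8:], V[8:]))
+w = np.exp(K @ q - (K @ q).max()); w /= w.sum()
+assert np.allclose(o / s, w @ V)      # matches single-device softmax attention
+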
+
+
+
+
+ + ♻ ☆ WavLLM: Towards Robust and Adaptive Speech Large Language Model + + +
+ The recent advancements in large language models (LLMs) have revolutionized +the field of natural language processing, progressively broadening their scope +to multimodal perception and generation. However, effectively integrating +listening capabilities into LLMs poses significant challenges, particularly +with respect to generalizing across varied contexts and executing complex +auditory tasks. In this work, we introduce WavLLM, a robust and adaptive speech +large language model with dual encoders, and a prompt-aware LoRA weight +adapter, optimized by a two-stage curriculum learning approach. Leveraging dual +encoders, we decouple different types of speech information, utilizing a +Whisper encoder to process the semantic content of speech, and a WavLM encoder +to capture the unique characteristics of the speaker's identity. Within the +curriculum learning framework, WavLLM first builds its foundational +capabilities by optimizing on mixed elementary single tasks, followed by +advanced multi-task training on more complex tasks such as combinations of the +elementary tasks. To enhance the flexibility and adherence to different tasks +and instructions, a prompt-aware LoRA weight adapter is introduced in the +second advanced multi-task training stage. We validate the proposed model on +universal speech benchmarks including tasks such as ASR, ST, SV, ER, and also +apply it to specialized datasets like Gaokao English listening comprehension +set for SQA, and speech Chain-of-Thought (CoT) evaluation set. Experiments +demonstrate that the proposed model achieves state-of-the-art performance +across a range of speech tasks on the same model size, exhibiting robust +generalization capabilities in executing complex tasks using CoT approach. +Furthermore, our model successfully completes Gaokao tasks without specialized +training. The codes, models, audio, and Gaokao evaluation set can be accessed +at \url{aka.ms/wavllm}. + +
+
+
+
+
+ + ♻ ☆ FIPO: Free-form Instruction-oriented Prompt Optimization with Preference + Dataset and Modular Fine-tuning Schema + + +
+ When the quality of naive prompts is carefully optimized by human experts,
+the task performance of large language models (LLMs) can be significantly
+improved. However, expert-based prompt optimization is expensive. In
+response, some works have proposed Automatic Prompt Optimization (APO), which
+optimizes naive prompts according to the task outputs of given in-box testing
+models, with the help of advanced LLMs (e.g., GPT-4) in an ad-hoc way.
+Although effective, existing schemes suffer from poor generalization ability
+and privacy risk. To this end, we collect the first large-scale Prompt
+Optimization Preference dataset (POP), fine-tune offline local LLM-based
+optimizers, and then fairly test them with various downstream models. Our
+method allows accurate optimization of the core task instruction part within
+the naive prompt in a model-agnostic manner, and is thus named Free-form
+Instruction-oriented Prompt Optimization (FIPO). Specifically, FIPO uses a
+modular APO template that dynamically integrates the naive task instruction,
+optional instruction responses, and optional ground truth to produce finely
+optimized prompts. The POP dataset is meticulously constructed using advanced
+LLMs, undergoing rigorous cross-validation by human experts and analytical
+models. Leveraging insights from the data with Tulu2 models and diverse
+fine-tuning strategies, we validate the efficacy of the FIPO framework across
+five public benchmarks and six testing models. Check codes and data here:
+https://github.com/LuJunru/FIPO_Project.
+
+
+
+
+
+
+ + ♻ ☆ ML-Mamba: Efficient Multi-Modal Large Language Model Utilizing Mamba-2 + + +
+ Multimodal Large Language Models (MLLMs) have attracted much attention for
+their multifunctionality. However, traditional Transformer architectures
+incur significant overhead due to their quadratic computational complexity.
+To address this issue, we introduce ML-Mamba, a multimodal language model
+that utilizes the latest and efficient Mamba-2 model for inference. Mamba-2
+is known for its linear scalability and fast processing of long sequences. We
+replace the Transformer-based backbone with a pre-trained Mamba-2 model and
+explore methods for integrating 2D visual selective scanning mechanisms into
+multimodal learning, while also trying various visual encoders and Mamba-2
+model variants. Our extensive experiments on various multimodal benchmarks
+demonstrate the competitive performance of ML-Mamba and highlight the
+potential of state space models in multimodal tasks. The experimental results
+show that: (1) we empirically explore how to effectively apply the 2D vision
+selective scan mechanism for multimodal learning and propose a novel
+multimodal connector called the Mamba-2 Scan Connector (MSC), which enhances
+representational capabilities; (2) ML-Mamba achieves performance comparable
+to state-of-the-art methods such as TinyLLaVA and MobileVLM v2 through its
+linear sequential modeling, while offering faster inference; (3) compared to
+multimodal models utilizing Mamba-1, the Mamba-2-based ML-Mamba exhibits
+superior inference performance and effectiveness.
+
+
+
+ comment: arXiv admin note: text overlap with arXiv:2403.13600, + arXiv:2406.07537 by other authors +
+
+
+
+
+ + ♻ ☆ Iterative Improvement of an Additively Regularized Topic Model + + +
+ Topic modelling is fundamentally a soft clustering problem (of known objects +-- documents, over unknown clusters -- topics). That is, the task is +incorrectly posed. In particular, the topic models are unstable and incomplete. +All this leads to the fact that the process of finding a good topic model +(repeated hyperparameter selection, model training, and topic quality +assessment) can be particularly long and labor-intensive. We aim to simplify +the process, to make it more deterministic and provable. To this end, we +present a method for iterative training of a topic model. The essence of the +method is that a series of related topic models are trained so that each +subsequent model is at least as good as the previous one, i.e., that it retains +all the good topics found earlier. The connection between the models is +achieved by additive regularization. The result of this iterative training is +the last topic model in the series, which we call the iteratively updated +additively regularized topic model (ITAR). Experiments conducted on several +collections of natural language texts show that the proposed ITAR model +performs better than other popular topic models (LDA, ARTM, BERTopic), its +topics are diverse, and its perplexity (ability to "explain" the underlying +data) is moderate. + +
+
+ comment: Fix HTML view. That is, fix the heap (strikethrough) order of .tex + files using the auxiliary Arxiv Readme XXX +
+
+
+
+
+ + ♻ ☆ BioRAG: A RAG-LLM Framework for Biological Question Reasoning + + +
+ The question-answering system for Life science research, which is +characterized by the rapid pace of discovery, evolving insights, and complex +interactions among knowledge entities, presents unique challenges in +maintaining a comprehensive knowledge warehouse and accurate information +retrieval. To address these issues, we introduce BioRAG, a novel +Retrieval-Augmented Generation (RAG) with the Large Language Models (LLMs) +framework. Our approach starts with parsing, indexing, and segmenting an +extensive collection of 22 million scientific papers as the basic knowledge, +followed by training a specialized embedding model tailored to this domain. +Additionally, we enhance the vector retrieval process by incorporating a +domain-specific knowledge hierarchy, which aids in modeling the intricate +interrelationships among each query and context. For queries requiring the most +current information, BioRAG deconstructs the question and employs an iterative +retrieval process incorporated with the search engine for step-by-step +reasoning. Rigorous experiments have demonstrated that our model outperforms +fine-tuned LLM, LLM with search engines, and other scientific RAG frameworks +across multiple life science question-answering tasks. + +
+
+ comment: 12 pages, 7 figures +
+
+
+
+
+ + ♻ ☆ Model Attribution in LLM-Generated Disinformation: A Domain + Generalization Approach with Supervised Contrastive Learning + + +
+ Model attribution for LLM-generated disinformation poses a significant +challenge in understanding its origins and mitigating its spread. This task is +especially challenging because modern large language models (LLMs) produce +disinformation with human-like quality. Additionally, the diversity in +prompting methods used to generate disinformation complicates accurate source +attribution. These methods introduce domain-specific features that can mask the +fundamental characteristics of the models. In this paper, we introduce the +concept of model attribution as a domain generalization problem, where each +prompting method represents a unique domain. We argue that an effective +attribution model must be invariant to these domain-specific features. It +should also be proficient in identifying the originating models across all +scenarios, reflecting real-world detection challenges. To address this, we +introduce a novel approach based on Supervised Contrastive Learning. This +method is designed to enhance the model's robustness to variations in prompts +and focuses on distinguishing between different source LLMs. We evaluate our +model through rigorous experiments involving three common prompting methods: +``open-ended'', ``rewriting'', and ``paraphrasing'', and three advanced LLMs: +``llama 2'', ``chatgpt'', and ``vicuna''. Our results demonstrate the +effectiveness of our approach in model attribution tasks, achieving +state-of-the-art performance across diverse and unseen datasets. + +
+
+ comment: 10 pages, 2 figures, accepted at DSAA 2024 +
+
+
+
+
+ + ♻ ☆ Chain-of-Factors Paper-Reviewer Matching + + +
+ With the rapid increase in paper submissions to academic conferences, the +need for automated and accurate paper-reviewer matching is more critical than +ever. Previous efforts in this area have considered various factors to assess +the relevance of a reviewer's expertise to a paper, such as the semantic +similarity, shared topics, and citation connections between the paper and the +reviewer's previous works. However, most of these studies focus on only one +factor, resulting in an incomplete evaluation of the paper-reviewer relevance. +To address this issue, we propose a unified model for paper-reviewer matching +that jointly considers semantic, topic, and citation factors. To be specific, +during training, we instruction-tune a contextualized language model shared +across all factors to capture their commonalities and characteristics; during +inference, we chain the three factors to enable step-by-step, coarse-to-fine +search for qualified reviewers given a submission. Experiments on four datasets +(one of which is newly contributed by us) spanning various fields such as +machine learning, computer vision, information retrieval, and data mining +consistently demonstrate the effectiveness of our proposed Chain-of-Factors +model in comparison with state-of-the-art paper-reviewer matching methods and +scientific pre-trained language models. + +
+
+
+
+
+ + ♻ ☆ SentenceVAE: Enable Next-sentence Prediction for Large Language Models + with Faster Speed, Higher Accuracy and Longer Context + + +
+ Current large language models (LLMs) primarily utilize next-token prediction +method for inference, which significantly impedes their processing speed. In +this paper, we introduce a novel inference methodology termed next-sentence +prediction, aiming at enhancing the inference efficiency of LLMs. We present +Sentence Variational Autoencoder (SentenceVAE), which includes a Sentence +Encoder to compress multiple tokens in a sentence into a single token, and a +Sentence Decoder to reconstruct it. By integrating SentenceVAE into the input +and output layers of LLMs, we develop Sentence-level LLMs (SLLMs) that employ a +sentence-by-sentence inference method. In addition, the SentenceVAE module of +SLLMs can maintain the integrity of the original semantic content by segmenting +the context into sentences, thereby improving accuracy while boosting inference +speed. Moreover, compared to previous LLMs, SLLMs process fewer tokens over +equivalent context length, significantly reducing memory demands for +self-attention computation and facilitating the handling of longer context. +Extensive experiments on Wanjuan dataset have revealed that the proposed method +can accelerate inference speed by 204~365%, reduce perplexity (PPL) to 46~75% +of its original metric, and decrease memory overhead by 86~91% for the +equivalent context length, compared to previous token-by-token methods. + +
+
+ comment: update the article +
+
+
+
+
+ + ♻ ☆ Multi-layer Sequence Labeling-based Joint Biomedical Event Extraction NLPCC2024 + + +
+ In recent years, biomedical event extraction has been dominated by +complicated pipeline and joint methods, which need to be simplified. In +addition, existing work has not effectively utilized trigger word information +explicitly. Hence, we propose MLSL, a method based on multi-layer sequence +labeling for joint biomedical event extraction. MLSL does not introduce prior +knowledge and complex structures. Moreover, it explicitly incorporates the +information of candidate trigger words into the sequence labeling to learn the +interaction relationships between trigger words and argument roles. Based on +this, MLSL can learn well with just a simple workflow. Extensive +experimentation demonstrates the superiority of MLSL in terms of extraction +performance compared to other state-of-the-art methods. + +
+
+ comment: 13 pages, 3 figures, accepted by NLPCC2024 +
+
+
+
+
+ + ♻ ☆ \textit{re}CSE: Portable Reshaping Features for Sentence Embedding in + Self-supervised Contrastive Learning + + +
+ We propose \textit{re}CSE, a self-supervised contrastive learning sentence
+representation framework based on feature reshaping. Unlike current advanced
+models that use discrete data augmentation methods, our framework reshapes
+the input features of the original sentence, aggregates the global
+information of each token in the sentence, and alleviates the common problems
+of representation polarity and the linear increase of GPU memory consumption
+in current advanced models. In addition, \textit{re}CSE achieves competitive
+performance in semantic similarity tasks. Our experiments also show that the
+proposed feature reshaping method is highly general: it can be transplanted
+to other self-supervised contrastive learning frameworks and enhance their
+representation ability, even achieving state-of-the-art performance. Our code
+is available at https://github.com/heavenhellchen/reCSE.
+
+
+
+
+
+
+ + ♻ ☆ A Semantic Space is Worth 256 Language Descriptions: Make Stronger + Segmentation Models with Descriptive Properties + + +
+ This paper introduces ProLab, a novel approach using property-level label +space for creating strong interpretable segmentation models. Instead of relying +solely on category-specific annotations, ProLab uses descriptive properties +grounded in common sense knowledge for supervising segmentation models. It is +based on two core designs. First, we employ Large Language Models (LLMs) and +carefully crafted prompts to generate descriptions of all involved categories +that carry meaningful common sense knowledge and follow a structured format. +Second, we introduce a description embedding model preserving semantic +correlation across descriptions and then cluster them into a set of descriptive +properties (e.g., 256) using K-Means. These properties are based on +interpretable common sense knowledge consistent with theories of human +recognition. We empirically show that our approach makes segmentation models +perform stronger on five classic benchmarks (e.g., ADE20K, COCO-Stuff, Pascal +Context, Cityscapes, and BDD). Our method also shows better scalability with +extended training steps than category-level supervision. Our interpretable +segmentation framework also emerges with the generalization ability to segment +out-of-domain or unknown categories using only in-domain descriptive +properties. Code is available at https://github.com/lambert-x/ProLab. + +
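+ The clustering step described above amounts to embedding the LLM-generated
+descriptions and grouping them into a fixed number of descriptive properties.
+The sketch below uses random vectors in place of a real sentence encoder and a
+tiny cluster count; both are stand-ins for illustration.
+
+import numpy as np
+from sklearn.cluster import KMeans
+
+descriptions = ["has four legs and fur", "is used for sitting", "grows leaves"]
+embeddings = np.random.rand(len(descriptions), 384)   # placeholder for a description encoder
+
+kmeans = KMeans(n_clusters=2, n_init=10, random_state=0).fit(embeddings)
+for text, cluster in zip(descriptions, kmeans.labels_):
+    print(cluster, text)
+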
+
+ comment: Preprint. Code is available at https://github.com/lambert-x/ProLab +
+
+
+
+
+ + ♻ ☆ On the Hidden Mystery of OCR in Large Multimodal Models + + +
+ Large models have recently played a dominant role in natural language +processing and multimodal vision-language learning. However, their +effectiveness in text-related visual tasks remains relatively unexplored. In +this paper, we conducted a comprehensive evaluation of Large Multimodal Models, +such as GPT4V and Gemini, in various text-related visual tasks including Text +Recognition, Scene Text-Centric Visual Question Answering (VQA), +Document-Oriented VQA, Key Information Extraction (KIE), and Handwritten +Mathematical Expression Recognition (HMER). To facilitate the assessment of +Optical Character Recognition (OCR) capabilities in Large Multimodal Models, we +propose OCRBench, a comprehensive evaluation benchmark. OCRBench contains 29 +datasets, making it the most comprehensive OCR evaluation benchmark available. +Furthermore, our study reveals both the strengths and weaknesses of these +models, particularly in handling multilingual text, handwritten text, +non-semantic text, and mathematical expression recognition. Most importantly, +the baseline results presented in this study could provide a foundational +framework for the conception and assessment of innovative strategies targeted +at enhancing zero-shot multimodal techniques. The evaluation pipeline and +benchmark are available at https://github.com/Yuliang-Liu/MultimodalOCR. + +
+
+
+
+
+ + ♻ ☆ BSRBF-KAN: A combination of B-splines and Radial Basis Functions in + Kolmogorov-Arnold Networks + + +
+ In this paper, we introduce BSRBF-KAN, a Kolmogorov-Arnold Network (KAN) that combines B-splines and radial basis functions (RBFs) to fit input vectors during training. We perform experiments with BSRBF-KAN, a multi-layer perceptron (MLP), and other popular KANs, including EfficientKAN, FastKAN, FasterKAN, and GottliebKAN, on the MNIST and Fashion-MNIST datasets. BSRBF-KAN is stable across 5 training runs, achieves competitive average accuracies of 97.55% on MNIST and 89.33% on Fashion-MNIST, and converges better than the other networks. We expect BSRBF-KAN to open up many combinations of mathematical functions for designing KANs. Our repo is publicly available at: https://github.com/hoangthangta/BSRBF_KAN. + +
+
+ comment: 8 pages, 1 figure, 3 tables +
+
+
+
+
+ + ♻ ☆ V-STaR: Training Verifiers for Self-Taught Reasoners + + +
+ Common self-improvement approaches for large language models (LLMs), such as +STaR, iteratively fine-tune LLMs on self-generated solutions to improve their +problem-solving ability. However, these approaches discard the large amounts of +incorrect solutions generated during this process, potentially neglecting +valuable information in such solutions. To address this shortcoming, we propose +V-STaR that utilizes both the correct and incorrect solutions generated during +the self-improvement process to train a verifier using DPO that judges +correctness of model-generated solutions. This verifier is used at inference +time to select one solution among many candidate solutions. Running V-STaR for +multiple iterations results in progressively better reasoners and verifiers, +delivering a 4% to 17% test accuracy improvement over existing self-improvement +and verification approaches on common code generation and math reasoning +benchmarks with LLaMA2 models. + +
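+
+ The inference-time use of the verifier reduces to best-of-N selection; a hedged sketch (generate_candidates and verifier_score are hypothetical stand-ins, and the DPO verifier training itself is not shown):
+
+     def select_solution(problem, generate_candidates, verifier_score, n=16):
+         candidates = generate_candidates(problem, n)             # n self-generated solutions
+         scores = [verifier_score(problem, c) for c in candidates]
+         best = max(range(len(candidates)), key=lambda i: scores[i])
+         return candidates[best]                                   # keep the highest-scored solution
+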
+
+
+
+
+ + ♻ ☆ OpenEP: Open-Ended Future Event Prediction + + +
+ Future event prediction (FEP) is a long-standing and crucial task, as understanding the evolution of events enables early risk identification, informed decision-making, and strategic planning. Existing work typically treats event prediction as a classification task and confines the outcomes of future events to a fixed scope, such as yes/no questions, candidate sets, or taxonomies, which makes it difficult to cover all possible outcomes of future events. In this paper, we introduce OpenEP (an Open-Ended Future Event Prediction task), which generates flexible and diverse predictions aligned with real-world scenarios. This is mainly reflected in two aspects: first, the predictive questions are diverse, covering different stages of event development and different perspectives; second, the outcomes are flexible, without constraints on scope or format. To facilitate the study of this task, we construct OpenEPBench, an open-ended future event prediction dataset. For question construction, we pose questions from seven perspectives, including location, time, event development, event outcome, event impact, event response, and others, to facilitate an in-depth analysis and understanding of the comprehensive evolution of events. For outcome construction, we collect free-form text containing the outcomes as ground truth to provide semantically complete and detail-enriched outcomes. Furthermore, we propose StkFEP, a stakeholder-enhanced future event prediction framework that incorporates event characteristics for open-ended settings. Our method extracts the stakeholders involved in events to extend the questions and gather diverse information. We also collect historical events that are relevant and similar to the question to reveal potential evolutionary patterns. Experimental results indicate that accurately predicting future events in open-ended settings is challenging for existing LLMs. + +
+
+
+
+
+ + ♻ ☆ On Speeding Up Language Model Evaluation + + +
+ Developing prompt-based methods with Large Language Models (LLMs) requires making numerous decisions, which give rise to a combinatorial search problem. For example, selecting the right pre-trained LLM, prompt, and hyperparameters to attain the best performance for a task typically necessitates evaluating an exponential number of candidates on large validation sets. This exhaustive evaluation can be time-consuming and costly, as both inference and evaluation of LLM-based approaches are resource-intensive. Worse, a lot of computation is wasted: many hyperparameter settings are non-competitive, and many samples from the validation set are highly correlated, providing little or no new information. So, if the goal is to identify the best method, it can be done far more efficiently if the validation samples and methods are selected adaptively. In this paper, we propose a novel method to address this challenge. We lean on low-rank matrix factorization to fill in missing evaluations and on multi-armed bandits to sequentially identify the next (method, validation sample) pair to evaluate. We carefully assess the efficacy of our approach on several competitive benchmark problems and show that it can identify the top-performing method using only 5-15% of the typically needed resources, resulting in a staggering 85-95% savings in LLM cost. + +
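+
+ To make the two ingredients concrete, here is an illustrative sketch of low-rank completion of a partially observed (method x validation-sample) score matrix and a UCB-style choice of the next method to evaluate; it is not the paper's algorithm, and the rank and exploration weight are arbitrary assumptions:
+
+     import numpy as np
+
+     def lowrank_fill(scores, observed, rank=3, iters=50):
+         # scores: (methods, samples); observed: boolean mask of evaluated entries
+         filled = np.where(observed, scores, scores[observed].mean())
+         for _ in range(iters):
+             u, s, vt = np.linalg.svd(filled, full_matrices=False)
+             approx = (u[:, :rank] * s[:rank]) @ vt[:rank]     # best rank-r approximation
+             filled = np.where(observed, scores, approx)       # keep real evaluations fixed
+         return filled
+
+     def ucb_pick_method(filled, counts, t, c=1.0):
+         means = filled.mean(axis=1)                           # estimated quality per method
+         bonus = c * np.sqrt(np.log(t + 1) / (counts + 1e-9))  # exploration bonus
+         return int(np.argmax(means + bonus))
+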
+
+
+
+
+ + ♻ ☆ How Much are Large Language Models Contaminated? A Comprehensive Survey + and the LLMSanitize Library + + +
+ With the rise of Large Language Models (LLMs) in recent years, abundant new opportunities are emerging, but also new challenges, among which contamination is quickly becoming critical. Business applications and fundraising in AI have reached a scale at which a few percentage points gained on popular question-answering benchmarks could translate into tens of millions of dollars, placing high pressure on model integrity. At the same time, it is becoming harder and harder, if not impossible, to keep track of the data that LLMs have seen, since closed-source models like GPT-4 and Claude-3 divulge no information about their training sets. As a result, contamination becomes a major issue: LLMs' performance may not be reliable anymore, as their high performance may be at least partly due to previous exposure to the data. This limitation jeopardizes overall progress in the field of NLP, yet there remains a lack of methods for efficiently detecting contamination. In this paper, we survey all recent work on contamination detection with LLMs, and help the community track contamination levels of LLMs by releasing an open-source Python library named LLMSanitize that implements major contamination detection algorithms. + +
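+
+ As a toy illustration of what a contamination check can look like (this is only a generic n-gram overlap heuristic, not the LLMSanitize implementation):
+
+     def ngram_overlap(benchmark_text: str, training_text: str, n: int = 8) -> float:
+         def ngrams(s):
+             toks = s.split()
+             return {tuple(toks[i:i + n]) for i in range(max(len(toks) - n + 1, 0))}
+         bench, train = ngrams(benchmark_text), ngrams(training_text)
+         return len(bench & train) / max(len(bench), 1)  # fraction of benchmark n-grams seen in training
+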
+
+ comment: 8 pages, 1 figure, 1 table +
+
+
+
+
+ + ♻ ☆ CoTFormer: A Chain-of-Thought Driven Architecture with Budget-Adaptive + Computation Cost at Inference + + +
+ Scaling language models to larger and deeper sizes has led to significant +boosts in performance. Even though the size of these models limits their +application in compute-constrained environments, the race to continually +develop ever larger and deeper foundational models is underway. At the same +time -- regardless of the model size -- task-specific techniques continue to +play a pivotal role in achieving optimal downstream performance. One of these +techniques, called Chain-of-Thought (CoT), is particularly interesting since, +as we point out in this work, it resembles employing a deeper transformer +through re-applying the model multiple times. However, a key subtlety in +computing the attention of past tokens differentiates CoT from simply applying +the model several times. Based on this insight, we propose CoTFormer, a novel +architecture which closely mimics CoT at the token level, allowing us to obtain +significantly improved accuracies close to much larger models. While applying +CoT introduces additional computation costs, we compensate for it by leveraging +CoTFormer's special compatibility with token-wise variable depth. Through a +compute adaptive model -- which automatically allocates the compute to tokens +that need it most -- we show that it is possible to reduce the computation cost +significantly without any reduction in accuracy, and with further compute cost +reductions possible while maintaining a competitive accuracy. + +
+
+
+
+
+ + ♻ ☆ On Tables with Numbers, with Numbers + + +
+ This paper is a critical reflection on the epistemic culture of contemporary +computational linguistics, framed in the context of its growing obsession with +tables with numbers. We argue against tables with numbers on the basis of their +epistemic irrelevance, their environmental impact, their role in enabling and +exacerbating social inequalities, and their deep ties to commercial +applications and profit-driven research. We substantiate our arguments with +empirical evidence drawn from a meta-analysis of computational linguistics +research over the last decade. + +
+
+ comment: v2: corrected Figure 2 scale and caption (thanks go to Ernest Davis) +
+
+
+
+
+ + ♻ ☆ Can LLMs Replace Economic Choice Prediction Labs? The Case of + Language-based Persuasion Games + + +
+ Human choice prediction in economic contexts is crucial for applications in +marketing, finance, public policy, and more. This task, however, is often +constrained by the difficulties in acquiring human choice data. With most +experimental economics studies focusing on simple choice settings, the AI +community has explored whether LLMs can substitute for humans in these +predictions and examined more complex experimental economics settings. However, +a key question remains: can LLMs generate training data for human choice +prediction? We explore this in language-based persuasion games, a complex +economic setting involving natural language in strategic interactions. Our +experiments show that models trained on LLM-generated data can effectively +predict human behavior in these games and even outperform models trained on +actual human data. + +
+
+
+
+
+ + ♻ ☆ Introducing the NewsPaLM MBR and QE Dataset: LLM-Generated High-Quality + Parallel Data Outperforms Traditional Web-Crawled Data + + +
+ Recent research in neural machine translation (NMT) has shown that training on high-quality machine-generated data can outperform training on human-generated data. This work accompanies the first-ever release of an LLM-generated, MBR-decoded, and QE-reranked dataset with both sentence-level and multi-sentence examples. We perform extensive experiments to demonstrate the quality of our dataset in terms of its downstream impact on NMT model performance. We find that training from scratch on our (machine-generated) dataset outperforms training on the (web-crawled) WMT'23 training dataset (which is 300 times larger), and also outperforms training on the top-quality subset of the WMT'23 training dataset. We also find that performing self-distillation by finetuning the LLM which generated this dataset outperforms the LLM's strong few-shot baseline. These findings corroborate the quality of our dataset and demonstrate the value of high-quality machine-generated data in improving the performance of NMT models. + +
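+
+ For context, MBR decoding selects the candidate with the highest expected utility against the other candidates; a minimal sketch in which `utility` is a placeholder for a metric such as chrF or a neural QE model (this is not the paper's pipeline):
+
+     def mbr_select(candidates, utility):
+         def expected_utility(hyp):
+             others = [ref for ref in candidates if ref is not hyp]
+             return sum(utility(hyp, ref) for ref in others) / max(len(others), 1)
+         return max(candidates, key=expected_utility)   # candidate with highest consensus utility
+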
+
+
+
+
+ + ♻ ☆ Risks from Language Models for Automated Mental Healthcare: Ethics and + Structure for Implementation + + +
+ Amidst the growing interest in developing task-autonomous AI for automated +mental health care, this paper addresses the ethical and practical challenges +associated with the issue and proposes a structured framework that delineates +levels of autonomy, outlines ethical requirements, and defines beneficial +default behaviors for AI agents in the context of mental health support. We +also evaluate fourteen state-of-the-art language models (ten off-the-shelf, +four fine-tuned) using 16 mental health-related questionnaires designed to +reflect various mental health conditions, such as psychosis, mania, depression, +suicidal thoughts, and homicidal tendencies. The questionnaire design and +response evaluations were conducted by mental health clinicians (M.D.s). We +find that existing language models are insufficient to match the standard +provided by human professionals who can navigate nuances and appreciate +context. This is due to a range of issues, including overly cautious or +sycophantic responses and the absence of necessary safeguards. Alarmingly, we +find that most of the tested models could cause harm if accessed in mental +health emergencies, failing to protect users and potentially exacerbating +existing symptoms. We explore solutions to enhance the safety of current +models. Before the release of increasingly task-autonomous AI systems in mental +health, it is crucial to ensure that these models can reliably detect and +manage symptoms of common psychiatric disorders to prevent harm to users. This +involves aligning with the ethical framework and default behaviors outlined in +our study. We contend that model developers are responsible for refining their +systems per these guidelines to safeguard against the risks posed by current AI +technologies to user mental health and safety. + Trigger warning: Contains and discusses examples of sensitive mental health +topics, including suicide and self-harm. + +
+
+ comment: Updated with fine-tuned model results to match CoLM accepted + camera-ready version +
+
+
+
+
+ + ♻ ☆ Persona Inconstancy in Multi-Agent LLM Collaboration: Conformity, + Confabulation, and Impersonation + + +
+ Multi-agent AI systems can be used for simulating collective decision-making +in scientific and practical applications. They can also be used to introduce a +diverse group discussion step in chatbot pipelines, enhancing the cultural +sensitivity of the chatbot's responses. These applications, however, are +predicated on the ability of AI agents to reliably adopt assigned personas and +mimic human interactions. To see whether LLM agents satisfy these requirements, +we examine AI agent ensembles engaged in cross-national collaboration and +debate by analyzing their private responses and chat transcripts. Our findings +suggest that multi-agent discussions can support collective AI decisions that +more often reflect diverse perspectives, yet this effect is tempered by the +agents' susceptibility to conformity due to perceived peer pressure and +occasional challenges in maintaining consistent personas and opinions. +Instructions that encourage debate in support of one's opinions rather than +collaboration increase the rate of inconstancy. Without addressing the factors +we identify, the full potential of multi-agent frameworks for producing more +culturally diverse AI outputs or more realistic simulations of group +decision-making may remain untapped. + +
+
+ comment: 16 pages, 8 figures, 3 tables +
+
+
+
+
+
+
+
+ + Computer Vision and Pattern Recognition 124 + +
+
+
+ + ☆ Knowledge Distillation with Refined Logits + + +
+ Recent research on knowledge distillation has increasingly focused on logit distillation because of its simplicity, effectiveness, and versatility in model compression. In this paper, we introduce Refined Logit Distillation (RLD) to address the limitations of current logit distillation methods. Our approach is motivated by the observation that even high-performing teacher models can make incorrect predictions, creating a conflict between the standard distillation loss and the cross-entropy loss. This conflict can undermine the consistency of the student model's learning objectives. Previous attempts to use labels to empirically correct teacher predictions may undermine the class correlation. In contrast, our RLD employs labeling information to dynamically refine teacher logits. In this way, our method can effectively eliminate misleading information from the teacher while preserving crucial class correlations, thus enhancing the value and efficiency of distilled knowledge. Experimental results on CIFAR-100 and ImageNet demonstrate its superiority over existing methods. The code is provided at https://github.com/zju-SWJ/RLD. + +
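+
+ For reference, this is the standard logit-distillation objective whose conflict with cross-entropy is discussed above; it is shown for context only, and RLD's label-driven refinement of the teacher logits is not reproduced here:
+
+     import torch.nn.functional as F
+
+     def kd_loss(student_logits, teacher_logits, labels, T=4.0, alpha=0.5):
+         soft = F.kl_div(
+             F.log_softmax(student_logits / T, dim=-1),
+             F.softmax(teacher_logits / T, dim=-1),
+             reduction="batchmean",
+         ) * (T * T)                                       # temperature-scaled KL to the teacher
+         hard = F.cross_entropy(student_logits, labels)    # ordinary cross-entropy to the labels
+         return alpha * soft + (1 - alpha) * hard
+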
+
+ comment: 11 pages, 7 figures +
+
+
+
+
+ + ☆ End-to-end Semantic-centric Video-based Multimodal Affective Computing + + +
+ On the pathway toward Artificial General Intelligence (AGI), understanding human affect is essential for enhancing machines' cognitive abilities. To achieve more natural human-AI interaction, Multimodal Affective Computing (MAC) in human-spoken videos has attracted increasing attention. However, previous methods are mainly devoted to designing multimodal fusion algorithms and suffer from two issues: semantic imbalance caused by diverse pre-processing operations, and semantic mismatch arising from inconsistent affective content across modalities compared with the multimodal ground truth. Besides, the use of manually designed feature extractors prevents them from building end-to-end pipelines for multiple MAC downstream tasks. To address the above challenges, we propose a novel end-to-end framework named SemanticMAC to compute multimodal semantic-centric affect for human-spoken videos. We first employ a pre-trained Transformer model for multimodal data pre-processing and design an Affective Perceiver module to capture unimodal affective information. Moreover, we present a semantic-centric approach that unifies multimodal representation learning in three ways: gated feature interaction, multi-task pseudo-label generation, and intra-/inter-sample contrastive learning. Finally, SemanticMAC effectively learns specific- and shared-semantic representations under the guidance of semantic-centric labels. Extensive experimental results demonstrate that our approach surpasses state-of-the-art methods on 7 public datasets across four MAC downstream tasks. + +
+
+ comment: Under Review +
+
+
+
+
+ + ☆ Detecting Near-Duplicate Face Images + + +
+ Near-duplicate images are often generated by applying repeated photometric and geometric transformations that produce imperceptible variants of the original image. Consequently, a deluge of near-duplicates can be circulated online, raising copyright infringement concerns. The concerns are more severe when biometric data is altered through such nuanced transformations. In this work, we address the challenge of near-duplicate detection in face images by, firstly, identifying the original image from a set of near-duplicates and, secondly, deducing the relationship between the original image and the near-duplicates. We construct a tree-like structure, called an Image Phylogeny Tree (IPT), using a graph-theoretic approach to estimate the relationship, i.e., to determine the sequence in which the near-duplicates have been generated. We further extend our method to create an ensemble of IPTs, known as Image Phylogeny Forests (IPFs). We rigorously evaluate our method to demonstrate robustness across other modalities, unseen transformations from the latest generative models, and IPT configurations, significantly advancing state-of-the-art IPF reconstruction accuracy by 42%. + +
+
+ comment: Under review +
+
+
+
+
+ + ☆ RSD-DOG : A New Image Descriptor based on Second Order Derivatives + + +
+ This paper introduces a new and powerful image patch descriptor based on second-order image statistics/derivatives. Here, the image patch is treated as a 3D surface with intensity as the third dimension. The considered 3D surface has a rich set of second-order features/statistics, such as ridges, valleys, and cliffs, that can be easily captured by using the difference of rotating semi-Gaussian filters. The originality of this method lies in successfully combining the response of the directional filters with that of the Difference of Gaussians (DOG) approach. The obtained descriptor shows good discriminative power when dealing with variations in illumination, scale, rotation, blur, viewpoint, and compression. Experiments on image matching demonstrate the advantage of the obtained descriptor compared to its first-order counterparts such as SIFT, DAISY, GLOH, GIST, and LIDRIC. + +
+
+
+
+
+ + ☆ A Spitting Image: Modular Superpixel Tokenization in Vision Transformers ECCV + + +
+ Vision Transformer (ViT) architectures traditionally employ a grid-based +approach to tokenization independent of the semantic content of an image. We +propose a modular superpixel tokenization strategy which decouples tokenization +and feature extraction; a shift from contemporary approaches where these are +treated as an undifferentiated whole. Using on-line content-aware tokenization +and scale- and shape-invariant positional embeddings, we perform experiments +and ablations that contrast our approach with patch-based tokenization and +randomized partitions as baselines. We show that our method significantly +improves the faithfulness of attributions, gives pixel-level granularity on +zero-shot unsupervised dense prediction tasks, while maintaining predictive +performance in classification tasks. Our approach provides a modular +tokenization framework commensurable with standard architectures, extending the +space of ViTs to a larger class of semantically-rich models. + +
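+
+ A simplified sketch of content-aware tokenization in this spirit (not the authors' implementation): SLIC superpixels replace the fixed grid, and each token is the mean feature over one superpixel.
+
+     import numpy as np
+     from skimage.segmentation import slic
+
+     def superpixel_tokens(image, features, n_segments=196):
+         # image: (H, W, 3) floats in [0, 1]; features: (H, W, C) per-pixel features
+         segments = slic(image, n_segments=n_segments, start_label=0)
+         tokens = np.stack([features[segments == s].mean(axis=0) for s in np.unique(segments)])
+         return tokens, segments   # (num_superpixels, C) tokens plus the assignment map
+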
+
+ comment: To appear in ECCV (MELEX) 2024 Workshop Proceedings +
+
+
+
+
+ + ☆ G$^2$V$^2$former: Graph Guided Video Vision Transformer for Face + Anti-Spoofing + + +
+ In videos containing spoofed faces, we may uncover the spoofing evidence +based on either photometric or dynamic abnormality, even a combination of both. +Prevailing face anti-spoofing (FAS) approaches generally concentrate on the +single-frame scenario, however, purely photometric-driven methods overlook the +dynamic spoofing clues that may be exposed over time. This may lead FAS systems +to conclude incorrect judgments, especially in cases where it is easily +distinguishable in terms of dynamics but challenging to discern in terms of +photometrics. To this end, we propose the Graph Guided Video Vision Transformer +(G$^2$V$^2$former), which combines faces with facial landmarks for photometric +and dynamic feature fusion. We factorize the attention into space and time, and +fuse them via a spatiotemporal block. Specifically, we design a novel temporal +attention called Kronecker temporal attention, which has a wider receptive +field, and is beneficial for capturing dynamic information. Moreover, we +leverage the low-semantic motion of facial landmarks to guide the high-semantic +change of facial expressions based on the motivation that regions containing +landmarks may reveal more dynamic clues. Extensive experiments on nine +benchmark datasets demonstrate that our method achieves superior performance +under various scenarios. The codes will be released soon. + +
+
+ comment: 11 pages, 5 figures +
+
+
+
+
+ + ☆ Model Merging in LLMs, MLLMs, and Beyond: Methods, Theories, + Applications and Opportunities + + +
+ Model merging is an efficient empowerment technique in the machine learning community that requires neither the collection of raw training data nor expensive computation. As model merging becomes increasingly prevalent across various fields, it is crucial to understand the available model merging techniques comprehensively. However, there is a significant gap in the literature regarding a systematic and thorough review of these techniques. This survey provides a comprehensive overview of model merging methods and theories, their applications in various domains and settings, and future research directions. Specifically, we first propose a new taxonomic approach that exhaustively discusses existing model merging methods. Second, we discuss the application of model merging techniques in large language models, multimodal large language models, and 10+ machine learning subfields, including continual learning, multi-task learning, few-shot learning, etc. Finally, we highlight the remaining challenges of model merging and discuss future research directions. A comprehensive list of papers about model merging is available at https://github.com/EnnengYang/Awesome-Model-Merging-Methods-Theories-Applications. + +
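+
+ The simplest instance of model merging covered by such surveys is uniform parameter averaging of models fine-tuned from the same initialization ("model soup" style); a sketch, with the caveat that integer buffers such as batch-norm counters would need special handling in practice:
+
+     import torch
+
+     def average_state_dicts(state_dicts):
+         merged = {}
+         for key in state_dicts[0]:
+             merged[key] = torch.stack([sd[key].float() for sd in state_dicts]).mean(dim=0)
+         return merged    # load into a model with the same architecture
+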
+
+
+
+
+ + ☆ See It All: Contextualized Late Aggregation for 3D Dense Captioning ACL 2024 + + +
+ 3D dense captioning is the task of localizing objects in a 3D scene and generating descriptive sentences for each object. Recent approaches in 3D dense captioning have adopted transformer encoder-decoder frameworks from object detection to build an end-to-end pipeline without hand-crafted components. However, these approaches struggle with contradictory objectives, where a single query's attention has to simultaneously cover both tightly localized object regions and the contextual environment. To overcome this challenge, we introduce SIA (See-It-All), a transformer pipeline that engages in 3D dense captioning with a novel paradigm called late aggregation. SIA simultaneously decodes two sets of queries: a context query and an instance query. The instance query focuses on localization and object attribute descriptions, while the context query versatilely captures regions of interest covering relationships between multiple objects or with the global scene; the two are then aggregated afterwards (i.e., late aggregation) via simple distance-based measures. To further enhance the quality of contextualized caption generation, we design a novel aggregator to generate a fully informed caption based on the surrounding context, the global environment, and object instances. Extensive experiments on two of the most widely used 3D dense captioning datasets demonstrate that our proposed method achieves a significant improvement over prior methods. + +
+
+ comment: Accepted to ACL 2024 Findings +
+
+
+
+
+ + ☆ Boosting Unconstrained Face Recognition with Targeted Style Adversary + + +
+ While deep face recognition models have demonstrated remarkable performance, they often struggle on inputs from domains beyond their training data. Recent attempts aim to expand the training set by relying on computationally expensive and inherently challenging image-space augmentation via image generation modules. In an orthogonal direction, we present a simple yet effective method to expand the training data by interpolating between instance-level feature statistics across labeled and unlabeled sets. Our method, dubbed Targeted Style Adversary (TSA), is motivated by two observations: (i) the input domain is reflected in feature statistics, and (ii) face recognition model performance is influenced by style information. Shifting towards an unlabeled style implicitly synthesizes challenging training instances. We devise a recognizability metric to constrain our framework to preserve the inherent identity-related information of labeled instances. The efficacy of our method is demonstrated through evaluations on unconstrained benchmarks, outperforming or being on par with its competitors while offering nearly a 70% improvement in training speed and 40% less memory consumption. + +
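+
+ A hedged sketch of interpolating instance-level feature statistics (channel-wise mean and standard deviation) between a labeled and an unlabeled batch, in the spirit of the description above; the mixing weight and where the operation is applied are assumptions, not the paper's recipe:
+
+     import torch
+
+     def mix_feature_stats(feat_labeled, feat_unlabeled, lam=0.5, eps=1e-6):
+         # feat_*: (B, C, H, W) intermediate feature maps
+         mu_l = feat_labeled.mean(dim=(2, 3), keepdim=True)
+         std_l = feat_labeled.std(dim=(2, 3), keepdim=True)
+         mu_u = feat_unlabeled.mean(dim=(2, 3), keepdim=True)
+         std_u = feat_unlabeled.std(dim=(2, 3), keepdim=True)
+         mu_mix = lam * mu_l + (1 - lam) * mu_u
+         std_mix = lam * std_l + (1 - lam) * std_u
+         normalized = (feat_labeled - mu_l) / (std_l + eps)   # strip the original style
+         return normalized * std_mix + mu_mix                 # re-style towards the unlabeled domain
+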
+
+
+
+
+ + ☆ Rethinking the Key Factors for the Generalization of Remote Sensing + Stereo Matching Networks + + +
+ Stereo matching, a critical step of 3D reconstruction, has fully shifted towards deep learning due to its strong feature representation of remote sensing images. However, ground truth for the stereo matching task relies on expensive airborne LiDAR data, making it difficult to obtain enough samples for supervised learning. To improve the generalization ability of stereo matching networks on cross-domain data from different sensors and scenarios, in this paper we study key training factors from three perspectives. (1) For the selection of the training dataset, it is important to select data whose regional target distribution is similar to that of the test set, rather than data from the same sensor. (2) For the model structure, a cascaded structure that flexibly adapts to features of different sizes is preferred. (3) For the training manner, unsupervised methods generalize better than supervised methods, and we design an unsupervised early-stop strategy to help retain the best model, using pre-trained weights as the basis. Extensive experiments are conducted to support these findings, on the basis of which we present an unsupervised stereo matching network with good generalization performance. We release the source code and the datasets at https://github.com/Elenairene/RKF_RSSM to reproduce the results and encourage future work. + +
+
+ comment: submitted to IEEE jstars +
+
+
+
+
+ + ☆ Panacea+: Panoramic and Controllable Video Generation for Autonomous + Driving + + +
+ The field of autonomous driving increasingly demands high-quality annotated video training data. In this paper, we propose Panacea+, a powerful and universally applicable framework for generating video data in driving scenes. Built upon the foundation of our previous work, Panacea, Panacea+ adopts a multi-view appearance noise prior mechanism and a super-resolution module for enhanced consistency and increased resolution. Extensive experiments show that the video samples generated by Panacea+ greatly benefit a wide range of tasks on different datasets, including 3D object tracking, 3D object detection, and lane detection on the nuScenes and Argoverse 2 datasets. These results strongly prove Panacea+ to be a valuable data generation framework for autonomous driving. + +
+
+ comment: Project page: https://panacea-ad.github.io/. arXiv admin note: text + overlap with arXiv:2311.16813 +
+
+
+
+
+ + ☆ Disentangle and denoise: Tackling context misalignment for video moment + retrieval + + +
+ Video Moment Retrieval, which aims to locate in-context video moments according to a natural language query, is an essential task for cross-modal grounding. Existing methods focus on enhancing the cross-modal interactions between all moments and the textual description for video understanding. However, constantly interacting with all locations is unreasonable because of the uneven semantic distribution across the timeline and noisy visual backgrounds. This paper proposes a cross-modal Context Denoising Network (CDNet) for accurate moment retrieval by disentangling complex correlations and denoising irrelevant dynamics. Specifically, we propose a query-guided semantic disentanglement (QSD) to decouple video moments by estimating alignment levels according to the global and fine-grained correlation. A Context-aware Dynamic Denoisement (CDD) is proposed to enhance understanding of aligned spatial-temporal details by learning a group of query-relevant offsets. Extensive experiments on public benchmarks demonstrate that the proposed CDNet achieves state-of-the-art performance. + +
+
+
+
+
+ + ☆ Progressive Radiance Distillation for Inverse Rendering with Gaussian + Splatting + + +
+ We propose progressive radiance distillation, an inverse rendering method that combines physically-based rendering with Gaussian-based radiance field rendering using a distillation progress map. Taking multi-view images as input, our method starts from pre-trained radiance field guidance and distills physically-based light and material parameters from the radiance field using an image-fitting process. The distillation progress map is initialized to a small value, which favors radiance field rendering. During early iterations, when fitted light and material parameters are far from convergence, the radiance field fallback ensures the sanity of image loss gradients and avoids local minima that attract under-fit states. As fitted parameters converge, the physical model gradually takes over and the distillation progress increases correspondingly. In the presence of light paths unmodeled by the physical model, the distillation progress never finishes on affected pixels and the learned radiance field stays in the final rendering. With this designed tolerance for physical model limitations, we prevent unmodeled color components from leaking into light and material parameters, alleviating relighting artifacts. Meanwhile, the remaining radiance field compensates for the limitations of the physical model, guaranteeing high-quality novel view synthesis. Experimental results demonstrate that our method significantly outperforms state-of-the-art techniques in quality for both novel view synthesis and relighting. The idea of progressive radiance distillation is not limited to Gaussian splatting. We show that it also has positive effects for prominently specular scenes when adapted to a mesh-based inverse rendering method. + +
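+
+ The per-pixel role of the progress map can be summarized in one line; a sketch of the blending implied by the description above (the learned progress values themselves are not modeled here):
+
+     def blend(radiance_rgb, physical_rgb, progress):
+         # progress in [0, 1]: 0 keeps the radiance-field rendering, 1 uses the physical model
+         return (1.0 - progress) * radiance_rgb + progress * physical_rgb
+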
+
+
+
+
+ + ☆ Transformers and Large Language Models for Efficient Intrusion Detection + Systems: A Comprehensive Survey + + +
+ With significant advancements in Transformers and LLMs, NLP has extended its reach into many research fields due to its enhanced capabilities in text generation and user interaction. One field benefiting greatly from these advancements is cybersecurity. In cybersecurity, many parameters that need to be protected and exchanged between senders and receivers are in the form of text and tabular data, making NLP a valuable tool in enhancing the security measures of communication protocols. This survey paper provides a comprehensive analysis of the utilization of Transformers and LLMs in cyber-threat detection systems. The methodology of paper selection and bibliometric analysis is outlined to establish a rigorous framework for evaluating existing research. The fundamentals of Transformers are discussed, including background information on various cyber-attacks and datasets commonly used in this field. The survey explores the application of Transformers in IDSs, focusing on different architectures such as attention-based models, LLMs like BERT and GPT, CNN/LSTM-Transformer hybrids, and emerging approaches like ViTs, among others. Furthermore, it explores the diverse environments and applications where Transformer- and LLM-based IDSs have been implemented, including computer networks, IoT devices, critical infrastructure protection, cloud computing, SDN, and autonomous vehicles. The paper also addresses research challenges and future directions in this area, identifying key issues such as interpretability, scalability, and adaptability to evolving threats. Finally, the conclusion summarizes the findings and highlights the significance of Transformers and LLMs in enhancing cyber-threat detection capabilities, while also outlining potential avenues for further research and development. + +
+
+ comment: arXiv admin note: text overlap with arXiv:2405.04760 by other authors +
+
+
+
+
+ + ☆ MetaSeg: MetaFormer-based Global Contexts-aware Network for Efficient + Semantic Segmentation WACV 2024 + + +
+ Beyond the Transformer, it is important to explore how to exploit the capacity of the MetaFormer, an architecture that is fundamental to the performance improvements of the Transformer. Previous studies have exploited it only for the backbone network. Unlike previous studies, we explore the capacity of the MetaFormer architecture more extensively in the semantic segmentation task. We propose a powerful semantic segmentation network, MetaSeg, which leverages the MetaFormer architecture from the backbone to the decoder. Our MetaSeg shows that the MetaFormer architecture plays a significant role in capturing useful contexts for the decoder as well as for the backbone. In addition, recent segmentation methods have shown that using a CNN-based backbone for extracting the spatial information and a decoder for extracting the global information is more effective than using a transformer-based backbone with a CNN-based decoder. This motivates us to adopt the CNN-based backbone using the MetaFormer block and to design our MetaFormer-based decoder, which consists of a novel self-attention module to capture the global contexts. To consider both global context extraction and the computational efficiency of self-attention for semantic segmentation, we propose a Channel Reduction Attention (CRA) module that reduces the channel dimension of the query and key to one dimension. In this way, our proposed MetaSeg outperforms the previous state-of-the-art methods with more efficient computational costs on popular semantic segmentation benchmarks and a medical image segmentation benchmark, including ADE20K, Cityscapes, COCO-Stuff, and Synapse. The code is available at https://github.com/hyunwoo137/MetaSeg. + +
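+
+ A hedged sketch of the channel-reduction idea behind CRA: queries and keys are projected to a single channel so the token-to-token attention map stays cheap to compute. Head counts, normalization, and other details are simplifications, not the paper's exact module.
+
+     import torch.nn as nn
+
+     class ChannelReductionAttention(nn.Module):
+         def __init__(self, dim):
+             super().__init__()
+             self.to_q = nn.Linear(dim, 1)    # query reduced to one channel
+             self.to_k = nn.Linear(dim, 1)    # key reduced to one channel
+             self.to_v = nn.Linear(dim, dim)
+             self.proj = nn.Linear(dim, dim)
+
+         def forward(self, x):                # x: (B, N, C) tokens
+             q, k, v = self.to_q(x), self.to_k(x), self.to_v(x)
+             attn = (q @ k.transpose(1, 2)).softmax(dim=-1)   # (B, N, N)
+             return self.proj(attn @ v)
+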
+
+ comment: Accepted by WACV 2024 +
+
+
+
+
+ + ☆ Sonic: Fast and Transferable Data Poisoning on Clustering Algorithms + + +
+ Data poisoning attacks on clustering algorithms have received limited +attention, with existing methods struggling to scale efficiently as dataset +sizes and feature counts increase. These attacks typically require +re-clustering the entire dataset multiple times to generate predictions and +assess the attacker's objectives, significantly hindering their scalability. +This paper addresses these limitations by proposing Sonic, a novel genetic data +poisoning attack that leverages incremental and scalable clustering algorithms, +e.g., FISHDBC, as surrogates to accelerate poisoning attacks against +graph-based and density-based clustering methods, such as HDBSCAN. We +empirically demonstrate the effectiveness and efficiency of Sonic in poisoning +the target clustering algorithms. We then conduct a comprehensive analysis of +the factors affecting the scalability and transferability of poisoning attacks +against clustering algorithms, and we conclude by examining the robustness of +hyperparameters in our attack strategy Sonic. + +
+
+ comment: preprint paper +
+
+
+
+
+ + ☆ MathScape: Evaluating MLLMs in multimodal Math Scenarios through a + Hierarchical Benchmark + + +
+ With the development of Multimodal Large Language Models (MLLMs), the evaluation of multimodal models in the context of mathematical problems has become a valuable research field. Multimodal visual-textual mathematical reasoning serves as a critical indicator for evaluating the comprehension and complex multi-step quantitative reasoning abilities of MLLMs. However, previous multimodal math benchmarks have not sufficiently integrated visual and textual information. To address this gap, we propose MathScape, a new benchmark that emphasizes the understanding and application of combined visual and textual information. MathScape is designed to evaluate photo-based math problem scenarios, assessing the theoretical understanding and application ability of MLLMs through a categorical hierarchical approach. We conduct a multi-dimensional evaluation on 11 advanced MLLMs, revealing that our benchmark is challenging even for the most sophisticated models. By analyzing the evaluation results, we identify the limitations of MLLMs, offering valuable insights for enhancing model performance. + +
+
+
+
+
+ + ☆ DifuzCam: Replacing Camera Lens with a Mask and a Diffusion Model + + +
+ The flat lensless camera design significantly reduces the camera size and weight. In this design, the camera lens is replaced by another optical element that interferes with the incoming light. The image is recovered from the raw sensor measurements using a reconstruction algorithm. Yet, the quality of the reconstructed images is not satisfactory. To mitigate this, we propose utilizing a pre-trained diffusion model with a control network and a learned separable transformation for reconstruction. This allows us to build a prototype flat camera with high-quality imaging, presenting state-of-the-art results in terms of both quality and perceptual fidelity. We also demonstrate its ability to leverage textual descriptions of the captured scene to further enhance reconstruction. Our reconstruction method, which leverages the strong capabilities of a pre-trained diffusion model, can be used in other imaging systems for improved reconstruction results. + +
+
+
+
+
+ + ☆ 3D Gaussian Editing with A Single Image + + +
+ The modeling and manipulation of 3D scenes captured from the real world are +pivotal in various applications, attracting growing research interest. While +previous works on editing have achieved interesting results through +manipulating 3D meshes, they often require accurately reconstructed meshes to +perform editing, which limits their application in 3D content generation. To +address this gap, we introduce a novel single-image-driven 3D scene editing +approach based on 3D Gaussian Splatting, enabling intuitive manipulation via +directly editing the content on a 2D image plane. Our method learns to optimize +the 3D Gaussians to align with an edited version of the image rendered from a +user-specified viewpoint of the original scene. To capture long-range object +deformation, we introduce positional loss into the optimization process of 3D +Gaussian Splatting and enable gradient propagation through reparameterization. +To handle occluded 3D Gaussians when rendering from the specified viewpoint, we +build an anchor-based structure and employ a coarse-to-fine optimization +strategy capable of handling long-range deformation while maintaining +structural stability. Furthermore, we design a novel masking strategy to +adaptively identify non-rigid deformation regions for fine-scale modeling. +Extensive experiments show the effectiveness of our method in handling +geometric details, long-range, and non-rigid deformation, demonstrating +superior editing flexibility and quality compared to previous approaches. + +
+
+ comment: 10 pages, 12 figures +
+
+
+
+
+ + ☆ Cross-aware Early Fusion with Stage-divided Vision and Language + Transformer Encoders for Referring Image Segmentation + + +
+ Referring segmentation aims to segment a target object related to a natural +language expression. Key challenges of this task are understanding the meaning +of complex and ambiguous language expressions and determining the relevant +regions in the image with multiple objects by referring to the expression. +Recent models have focused on the early fusion with the language features at +the intermediate stage of the vision encoder, but these approaches have a +limitation that the language features cannot refer to the visual information. +To address this issue, this paper proposes a novel architecture, Cross-aware +early fusion with stage-divided Vision and Language Transformer encoders +(CrossVLT), which allows both language and vision encoders to perform the early +fusion for improving the ability of the cross-modal context modeling. Unlike +previous methods, our method enables the vision and language features to refer +to each other's information at each stage to mutually enhance the robustness of +both encoders. Furthermore, unlike the conventional scheme that relies solely +on the high-level features for the cross-modal alignment, we introduce a +feature-based alignment scheme that enables the low-level to high-level +features of the vision and language encoders to engage in the cross-modal +alignment. By aligning the intermediate cross-modal features in all encoder +stages, this scheme leads to effective cross-modal fusion. In this way, the +proposed approach is simple but effective for referring image segmentation, and +it outperforms the previous state-of-the-art methods on three public +benchmarks. + +
+
+ comment: Published in IEEE Transactions on Multimedia (TMM) +
+
+
+
+
+ + ☆ Improved 3D Whole Heart Geometry from Sparse CMR Slices + + +
+ Cardiac magnetic resonance (CMR) imaging and computed tomography (CT) are two common non-invasive imaging methods for assessing patients with cardiovascular disease. CMR typically acquires multiple sparse 2D slices, with unavoidable respiratory motion artefacts between slices, whereas CT acquires isotropic dense data but uses ionising radiation. In this study, we explore the combination of Slice Shifting Algorithm (SSA), Spatial Transformer Network (STN), and Label Transformer Network (LTN) to: 1) correct respiratory motion between segmented slices, and 2) transform sparse segmentation data into dense segmentation. All combinations were validated using synthetic motion-corrupted CMR slice segmentation generated from CT in 1699 cases, where the dense CT serves as the ground truth. In 199 testing cases, SSA-LTN achieved the best results for Dice score and Hausdorff distance (94.0% and 4.7 mm respectively, averaged over 5 labels) but gave topological errors in 8 cases. STN was effective as a plug-in tool for correcting all topological errors with minimal impact on overall performance (93.5% and 5.0 mm respectively). SSA also proved to be a valuable plug-in tool, enhancing performance over both STN-based and LTN-based models. The code for these different combinations is available at https://github.com/XESchong/STACOM2024. + +
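+
+ For readers unfamiliar with the metrics, the Dice score reported above has a standard definition; a minimal sketch (not code from the study):
+
+     import numpy as np
+
+     def dice(pred_mask, gt_mask, eps=1e-8):
+         pred, gt = pred_mask.astype(bool), gt_mask.astype(bool)
+         intersection = np.logical_and(pred, gt).sum()
+         return 2.0 * intersection / (pred.sum() + gt.sum() + eps)
+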
+
+ comment: 13 pages, STACOM2024 +
+
+
+
+
+ + ☆ Towards Real-time Video Compressive Sensing on Mobile Devices ACM MM 2024 + + +
+ Video Snapshot Compressive Imaging (SCI) uses a low-speed 2D camera to +capture high-speed scenes as snapshot compressed measurements, followed by a +reconstruction algorithm to retrieve the high-speed video frames. The fast +evolving mobile devices and existing high-performance video SCI reconstruction +algorithms motivate us to develop mobile reconstruction methods for real-world +applications. Yet, it is still challenging to deploy previous reconstruction +algorithms on mobile devices due to the complex inference process, let alone +real-time mobile reconstruction. To the best of our knowledge, there is no +video SCI reconstruction model designed to run on the mobile devices. Towards +this end, in this paper, we present an effective approach for video SCI +reconstruction, dubbed MobileSCI, which can run at real-time speed on the +mobile devices for the first time. Specifically, we first build a U-shaped 2D +convolution-based architecture, which is much more efficient and +mobile-friendly than previous state-of-the-art reconstruction methods. Besides, +an efficient feature mixing block, based on the channel splitting and shuffling +mechanisms, is introduced as a novel bottleneck block of our proposed MobileSCI +to alleviate the computational burden. Finally, a customized knowledge +distillation strategy is utilized to further improve the reconstruction +quality. Extensive results on both simulated and real data show that our +proposed MobileSCI can achieve superior reconstruction quality with high +efficiency on the mobile devices. Particularly, we can reconstruct a 256 X 256 +X 8 snapshot compressed measurement with real-time performance (about 35 FPS) +on an iPhone 15. Code is available at https://github.com/mcao92/MobileSCI. + +
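+
+ The channel splitting and shuffling mentioned above builds on the channel-shuffle operation popularized by ShuffleNet; a sketch of that basic operation only (the MobileSCI block itself is not reproduced):
+
+     import torch
+
+     def channel_shuffle(x, groups):
+         b, c, h, w = x.shape
+         x = x.view(b, groups, c // groups, h, w)
+         x = x.transpose(1, 2).contiguous()    # interleave channels across groups
+         return x.view(b, c, h, w)
+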
+
+ comment: 9 pages, Accepted by ACM MM 2024 +
+
+
+
+
+ + ☆ Evidential Graph Contrastive Alignment for Source-Free Blending-Target + Domain Adaptation + + +
+ In this paper, we tackle a more realistic Domain Adaptation (DA) setting: Source-Free Blending-Target Domain Adaptation (SF-BTDA), where we cannot access source domain data and face a mixture of multiple target domains without any prior domain labels. Compared to existing DA scenarios, SF-BTDA generally faces the co-existence of different label shifts in different targets, along with noisy target pseudo labels generated from the source model. We propose a new method called Evidential Contrastive Alignment (ECA) to decouple the blending target domain and alleviate the effect of noisy target pseudo labels. First, to improve the quality of pseudo target labels, we propose a calibrated evidential learning module to iteratively improve both the accuracy and certainty of the resulting model and adaptively generate high-quality pseudo target labels. Second, we design graph contrastive learning with a domain distance matrix and a confidence-uncertainty criterion to minimize the distribution gap of samples of the same class in the blended target domains, which alleviates the co-existence of different label shifts in blended targets. We construct a new benchmark based on three standard DA datasets; ECA outperforms other methods with considerable gains and achieves results comparable to methods that have access to domain labels or source data in advance. + +
+
+
+
+
+ + ☆ Whitening Consistently Improves Self-Supervised Learning + + +
+ Self-supervised learning (SSL) has been shown to be a powerful approach for +learning visual representations. In this study, we propose incorporating ZCA +whitening as the final layer of the encoder in self-supervised learning to +enhance the quality of learned features by normalizing and decorrelating them. +Although whitening has been utilized in SSL in previous works, its potential to +universally improve any SSL model has not been explored. We demonstrate that +adding whitening as the last layer of SSL pretrained encoders is independent of +the self-supervised learning method and encoder architecture, thus it improves +performance for a wide range of SSL methods across multiple encoder +architectures and datasets. Our experiments show that whitening is capable of +improving linear and k-NN probing accuracy by 1-5%. Additionally, we propose +metrics that allow for a comprehensive analysis of the learned features, +provide insights into the quality of the representations and help identify +collapse patterns. + +
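+
+ For concreteness, ZCA whitening of a feature batch can be written as follows; this is the standard operation, with batch statistics and the epsilon being implementation assumptions rather than the paper's exact layer:
+
+     import torch
+
+     def zca_whiten(features, eps=1e-5):
+         # features: (N, D) encoder outputs
+         centered = features - features.mean(dim=0, keepdim=True)
+         cov = centered.T @ centered / (features.shape[0] - 1)
+         eigvals, eigvecs = torch.linalg.eigh(cov)                      # cov = U diag(l) U^T
+         whitening = eigvecs @ torch.diag((eigvals + eps).rsqrt()) @ eigvecs.T
+         return centered @ whitening          # decorrelated, unit-variance features
+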
+
+ comment: Preprint +
+
+
+
+
+ + ♻ ☆ DiffSteISR: Harnessing Diffusion Prior for Superior Real-world Stereo + Image Super-Resolution + + +
+ We introduce DiffSteISR, a pioneering framework for reconstructing real-world +stereo images. DiffSteISR utilizes the powerful prior knowledge embedded in +pre-trained text-to-image model to efficiently recover the lost texture details +in low-resolution stereo images. Specifically, DiffSteISR implements a +time-aware stereo cross attention with temperature adapter (TASCATA) to guide +the diffusion process, ensuring that the generated left and right views exhibit +high texture consistency thereby reducing disparity error between the +super-resolved images and the ground truth (GT) images. Additionally, a stereo +omni attention control network (SOA ControlNet) is proposed to enhance the +consistency of super-resolved images with GT images in the pixel, perceptual, +and distribution space. Finally, DiffSteISR incorporates a stereo semantic +extractor (SSE) to capture unique viewpoint soft semantic information and +shared hard tag semantic information, thereby effectively improving the +semantic accuracy and consistency of the generated left and right images. +Extensive experimental results demonstrate that DiffSteISR accurately +reconstructs natural and precise textures from low-resolution stereo images +while maintaining a high consistency of semantic and texture between the left +and right views. + +
+
+
+
+
+ + ☆ CNN-JEPA: Self-Supervised Pretraining Convolutional Neural Networks + Using Joint Embedding Predictive Architecture + + +
+ Self-supervised learning (SSL) has become an important approach in +pretraining large neural networks, enabling unprecedented scaling of model and +dataset sizes. While recent advances like I-JEPA have shown promising results +for Vision Transformers, adapting such methods to Convolutional Neural Networks +(CNNs) presents unique challenges. In this paper, we introduce CNN-JEPA, a +novel SSL method that successfully applies the joint embedding predictive +architecture approach to CNNs. Our method incorporates a sparse CNN encoder to +handle masked inputs, a fully convolutional predictor using depthwise separable +convolutions, and an improved masking strategy. We demonstrate that CNN-JEPA +outperforms I-JEPA with ViT architectures on ImageNet-100, achieving 73.3% +linear top-1 accuracy with a standard ResNet-50 encoder. Compared to other +CNN-based SSL methods, CNN-JEPA requires 17-35% less training time for the same +number of epochs and approaches the linear and k-NN top-1 accuracies of BYOL, +SimCLR, and VICReg. Our approach offers a simpler, more efficient alternative +to existing SSL methods for CNNs, requiring minimal augmentations and no +separate projector network. + +
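+
+ A depthwise separable convolution block of the kind the predictor above is described as using; the exact depth, width, and masking logic of CNN-JEPA are not reproduced here:
+
+     import torch.nn as nn
+
+     def depthwise_separable(in_ch, out_ch, kernel_size=3):
+         return nn.Sequential(
+             nn.Conv2d(in_ch, in_ch, kernel_size, padding=kernel_size // 2, groups=in_ch),  # depthwise
+             nn.Conv2d(in_ch, out_ch, kernel_size=1),                                       # pointwise
+         )
+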
+
+ comment: Preprint +
+
+
+
+
+ + ☆ Cross-Platform Video Person ReID: A New Benchmark Dataset and Adaptation + Approach + + +
+ In this paper, we construct a large-scale benchmark dataset for +Ground-to-Aerial Video-based person Re-Identification, named G2A-VReID, which +comprises 185,907 images and 5,576 tracklets, featuring 2,788 distinct +identities. To our knowledge, this is the first dataset for video ReID under +Ground-to-Aerial scenarios. G2A-VReID dataset has the following +characteristics: 1) Drastic view changes; 2) Large number of annotated +identities; 3) Rich outdoor scenarios; 4) Huge difference in resolution. +Additionally, we propose a new benchmark approach for cross-platform ReID by +transforming the cross-platform visual alignment problem into visual-semantic +alignment through vision-language model (i.e., CLIP) and applying a +parameter-efficient Video Set-Level-Adapter module to adapt image-based +foundation model to video ReID tasks, termed VSLA-CLIP. Besides, to further +reduce the great discrepancy across the platforms, we also devise the +platform-bridge prompts for efficient visual feature alignment. Extensive +experiments demonstrate the superiority of the proposed method on all existing +video ReID datasets and our proposed G2A-VReID dataset. + +
+
+
+
+
+ + ☆ Attention-Guided Perturbation for Unsupervised Image Anomaly Detection + + +
+ Reconstruction-based methods have significantly advanced modern unsupervised anomaly detection. However, the strong capacity of neural networks often violates the underlying assumptions by reconstructing abnormal samples well. To alleviate this issue, we present a simple yet effective reconstruction framework named Attention-Guided Perturbation Network (AGPNet), which learns to add perturbation noise with an attention mask for accurate unsupervised anomaly detection. Specifically, it consists of two branches, i.e., a plain reconstruction branch and an auxiliary attention-based perturbation branch. The reconstruction branch is simply a plain reconstruction network that learns to reconstruct normal samples, while the auxiliary branch aims to produce attention masks to guide the noise perturbation process for normal samples from easy to hard. By doing so, we expect to synthesize hard yet more informative anomalies for training, which enable the reconstruction branch to learn important inherent normal patterns both comprehensively and efficiently. Extensive experiments are conducted on three popular benchmarks covering MVTec-AD, VisA, and MVTec-3D, and show that our framework obtains leading anomaly detection performance under various setups, including few-shot, one-class, and multi-class settings. + +
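+
+ The core perturbation step suggested by the description above, in its simplest form: noise is injected only where an attention mask says to. The mask and the noise scale here are placeholders, not AGPNet's learned quantities.
+
+     import torch
+
+     def attention_guided_perturb(x, attention_mask, noise_std=0.1):
+         # x: (B, C, H, W) normal samples; attention_mask: values in [0, 1], broadcastable to x
+         noise = torch.randn_like(x) * noise_std
+         return x + attention_mask * noise
+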
+
+
+
+
+ + ☆ OMR: Occlusion-Aware Memory-Based Refinement for Video Lane Detection ECCV 2024 + + +
+ A novel algorithm for video lane detection is proposed in this paper. First, +we extract a feature map for a current frame and detect a latent mask for +obstacles occluding lanes. Then, we enhance the feature map by developing an +occlusion-aware memory-based refinement (OMR) module. It takes the obstacle +mask and feature map from the current frame, previous output, and memory +information as input, and processes them recursively in a video. Moreover, we +apply a novel data augmentation scheme for training the OMR module effectively. +Experimental results show that the proposed algorithm outperforms existing +techniques on video lane datasets. Our codes are available at +https://github.com/dongkwonjin/OMR. + +
+
+ comment: Accepted to ECCV 2024 +
+
+
+
+
+ + ☆ GRFormer: Grouped Residual Self-Attention for Lightweight Single Image + Super-Resolution ACM MM 2024 + + +
+ Previous works have shown that reducing the parameter overhead and computation of transformer-based single image super-resolution (SISR) models (e.g., SwinIR) usually leads to a reduction in performance. In this paper, we present GRFormer, an efficient and lightweight method that not only reduces parameter overhead and computation but also greatly improves performance. The core of GRFormer is Grouped Residual Self-Attention (GRSA), which is specifically oriented towards two fundamental components. First, it introduces a novel grouped residual layer (GRL) to replace the Query, Key, Value (QKV) linear layer in self-attention, efficiently reducing parameter overhead, computation, and performance loss at the same time. Second, it integrates a compact Exponential-Space Relative Position Bias (ES-RPB) as a substitute for the original relative position bias to improve the ability to represent position information while further minimizing the parameter count. Extensive experimental results demonstrate that GRFormer outperforms state-of-the-art transformer-based methods on $\times$2, $\times$3 and $\times$4 SISR tasks, notably surpassing SOTA by a maximum PSNR of 0.23dB when trained on the DIV2K dataset, while reducing the number of parameters and MACs in the self-attention module alone by about \textbf{60\%} and \textbf{49\%}, respectively. We hope that our simple and effective method, which can be easily applied to SR models based on window-division self-attention, can serve as a useful tool for further research in image super-resolution. The code is available at \url{https://github.com/sisrformer/GRFormer}.
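+ To make the parameter-saving intuition concrete, here is a hypothetical sketch (not the authors' implementation) of a grouped projection with a residual path standing in for a dense QKV linear layer; with g groups the weight count drops by a factor of g, and the residual path preserves an identity component.
+
+    import torch
+    import torch.nn as nn
+
+    class GroupedResidualProjection(nn.Module):
+        def __init__(self, dim, groups=4):
+            super().__init__()
+            assert dim % groups == 0
+            self.groups = groups
+            # One small matrix per group instead of a dense dim x dim matrix:
+            # parameters drop from dim*dim to dim*dim/groups.
+            self.weight = nn.Parameter(
+                torch.randn(groups, dim // groups, dim // groups) * 0.02)
+
+        def forward(self, x):                       # x: (batch, tokens, dim)
+            b, n, d = x.shape
+            xg = x.view(b, n, self.groups, d // self.groups)
+            proj = torch.einsum('bngc,gcd->bngd', xg, self.weight)
+            return proj.reshape(b, n, d) + x        # residual path
+
+    proj = GroupedResidualProjection(dim=64, groups=4)
+    q = proj(torch.randn(2, 16, 64))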
+
+ comment: Accepted for ACM MM 2024 +
+
+
+
+
+ + ☆ DeCo: Decoupled Human-Centered Diffusion Video Editing with Motion + Consistency + + +
+ Diffusion models usher in a new era of video editing, flexibly manipulating video content with text prompts. Despite the widespread demand for editing human-centered videos, these models face significant challenges in handling complex objects like humans. In this paper, we introduce DeCo, a novel video editing framework specifically designed to treat humans and the background as separate editable targets, ensuring global spatial-temporal consistency by maintaining the coherence of each individual component. Specifically, we propose a decoupled dynamic human representation that utilizes a parametric human body prior to generate tailored humans while preserving motions consistent with the original video. In addition, we treat the background as a layered atlas and apply text-guided image editing approaches to it. To further enhance the geometry and texture of humans during optimization, we extend the calculation of score distillation sampling into normal space and image space. Moreover, we leverage a lighting-aware video harmonizer to tackle inconsistent lighting between the edited targets, a problem previously overlooked in decompose-edit-combine approaches. Extensive qualitative and numerical experiments demonstrate that DeCo outperforms prior video editing methods on human-centered videos, especially on longer videos.
+
+ comment: European Conference on Computer Vision +
+
+
+
+
+ + ☆ One Step Diffusion-based Super-Resolution with Time-Aware Distillation + + +
+ Diffusion-based image super-resolution (SR) methods have shown promise in +reconstructing high-resolution images with fine details from low-resolution +counterparts. However, these approaches typically require tens or even hundreds +of iterative samplings, resulting in significant latency. Recently, techniques +have been devised to enhance the sampling efficiency of diffusion-based SR +models via knowledge distillation. Nonetheless, when aligning the knowledge of +student and teacher models, these solutions either solely rely on pixel-level +loss constraints or neglect the fact that diffusion models prioritize varying +levels of information at different time steps. To accomplish effective and +efficient image super-resolution, we propose a time-aware diffusion +distillation method, named TAD-SR. Specifically, we introduce a novel score +distillation strategy to align the data distribution between the outputs of the +student and teacher models after minor noise perturbation. This distillation +strategy enables the student network to concentrate more on the high-frequency +details. Furthermore, to mitigate performance limitations stemming from +distillation, we integrate a latent adversarial loss and devise a time-aware +discriminator that leverages diffusion priors to effectively distinguish +between real images and generated images. Extensive experiments conducted on +synthetic and real-world datasets demonstrate that the proposed method achieves +comparable or even superior performance compared to both previous +state-of-the-art (SOTA) methods and the teacher model in just one sampling +step. Codes are available at https://github.com/LearningHx/TAD-SR. + +
+
+ comment: 18 pages +
+
+
+
+
+ + ☆ Domain-invariant Representation Learning via Segment Anything Model for + Blood Cell Classification + + +
+ Accurate classification of blood cells is of vital significance in the diagnosis of hematological disorders. However, in real-world scenarios, domain shifts caused by variability in laboratory procedures and settings result in rapid deterioration of a model's generalization performance. To address this issue, we propose a novel framework of domain-invariant representation learning (DoRL) via the segment anything model (SAM) for blood cell classification. DoRL comprises two main components: a LoRA-based SAM (LoRA-SAM) and a cross-domain autoencoder (CAE). The advantage of DoRL is that it can extract domain-invariant representations from various blood cell datasets in an unsupervised manner. Specifically, we first leverage the large-scale foundation model SAM, fine-tuned with LoRA, to learn general image embeddings and segment blood cells. We then introduce the CAE to learn domain-invariant representations across datasets from different domains while mitigating image artifacts. To validate the effectiveness of the domain-invariant representations, we employ five widely used machine learning classifiers to construct blood cell classification models. Experimental results on two public blood cell datasets and a private real-world dataset demonstrate that our proposed DoRL achieves new state-of-the-art cross-domain performance, surpassing existing methods by a significant margin. The source code is available at https://github.com/AnoK3111/DoRL.
+
+
+
+
+ + ☆ Infra-YOLO: Efficient Neural Network Structure with Model Compression + for Real-Time Infrared Small Object Detection + + +
+ Although convolutional neural networks have achieved outstanding results in visible-light target detection, infrared small object detection remains challenging because of the low signal-to-noise ratio, incomplete object structure, and the lack of a reliable infrared small object dataset. To address the dataset limitation, we construct a new dataset named InfraTiny, in which more than 85% of the bounding boxes are smaller than 32x32 pixels (3,218 images and 20,893 bounding boxes in total). We further propose a multi-scale attention mechanism module (MSAM) and a Feature Fusion Augmentation Pyramid Module (FFAFPM) and deploy them on embedded devices. The MSAM enables the network to obtain scale-perception information by acquiring different receptive fields while suppressing background noise to enhance feature extraction. The FFAFPM enriches semantic information and enhances the fusion of shallow and deep features, significantly reducing false positives. Integrating the proposed modules into the YOLO model yields Infra-YOLO, which improves infrared small object detection performance: on the InfraTiny dataset, mAP@0.5 improves by 2.7% over yolov3 and by 2.5% over yolov4. Infra-YOLO is also transferred onto an embedded device in an unmanned aerial vehicle (UAV) for real application scenarios, where channel pruning is adopted to reduce FLOPs and trade off speed against accuracy. Even when the parameters of Infra-YOLO are reduced by 88% through pruning, it still gains 0.7% mAP@0.5 over yolov3 and 0.5% over yolov4. Experimental results show that the proposed MSAM and FFAFPM improve infrared small object detection performance compared with previous benchmark methods.
+
+
+
+
+ + ☆ Modality Invariant Multimodal Learning to Handle Missing Modalities: A + Single-Branch Approach + + +
+ Multimodal networks have demonstrated remarkable performance improvements over their unimodal counterparts. Existing multimodal networks are designed in a multi-branch fashion and, due to their reliance on fusion strategies, exhibit deteriorated performance if one or more modalities are missing. In this work, we propose a modality-invariant multimodal learning method that is less susceptible to the impact of missing modalities. It consists of a single-branch network that shares weights across multiple modalities to learn inter-modality representations, maximizing performance as well as robustness to missing modalities. Extensive experiments are performed on four challenging datasets covering textual-visual (UPMC Food-101, Hateful Memes, Ferramenta) and audio-visual (VoxCeleb1) modalities. Our proposed method achieves superior performance both when all modalities are present and when modalities are missing during training or testing, compared to existing state-of-the-art methods.
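+ As a hedged, purely illustrative sketch of the single-branch idea (all module names and dimensions are invented, not the paper's code), each available modality can be projected to a common width and then passed through one shared encoder, so a missing modality simply contributes nothing to the fused representation.
+
+    import torch
+    import torch.nn as nn
+
+    class SingleBranchMultimodal(nn.Module):
+        def __init__(self, text_dim=768, image_dim=2048, hidden=512, num_classes=101):
+            super().__init__()
+            self.text_proj = nn.Linear(text_dim, hidden)
+            self.image_proj = nn.Linear(image_dim, hidden)
+            # One shared encoder instead of one branch per modality.
+            self.shared = nn.Sequential(nn.Linear(hidden, hidden), nn.ReLU(),
+                                        nn.Linear(hidden, hidden), nn.ReLU())
+            self.head = nn.Linear(hidden, num_classes)
+
+        def forward(self, text=None, image=None):
+            feats = []
+            if text is not None:
+                feats.append(self.shared(self.text_proj(text)))
+            if image is not None:
+                feats.append(self.shared(self.image_proj(image)))
+            # Average whatever is available, so a missing modality degrades gracefully.
+            fused = torch.stack(feats, dim=0).mean(dim=0)
+            return self.head(fused)
+
+    model = SingleBranchMultimodal()
+    logits_full = model(text=torch.randn(4, 768), image=torch.randn(4, 2048))
+    logits_text_only = model(text=torch.randn(4, 768))   # image modality missing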
+
+
+
+
+ + ☆ Costal Cartilage Segmentation with Topology Guided Deformable Mamba: + Method and Benchmark + + +
+ Costal cartilage segmentation is crucial to various medical applications, necessitating precise and reliable techniques due to its complex anatomy and the importance of accurate diagnosis and surgical planning. We propose a novel deep learning-based approach called topology-guided deformable Mamba (TGDM) for costal cartilage segmentation. TGDM is tailored to capture the intricate long-range relationships of costal cartilage. Our method leverages a deformable model that integrates topological priors to enhance the adaptability and accuracy of the segmentation process. Furthermore, we develop a comprehensive benchmark containing 165 cases for costal cartilage segmentation. This benchmark sets a new standard for evaluating costal cartilage segmentation techniques and provides a valuable resource for future research. Extensive experiments conducted on both in-domain benchmarks and out-of-domain test sets demonstrate the superiority of our approach over existing methods, showing significant improvements in segmentation precision and robustness.
+
+
+
+
+ + ☆ BAPLe: Backdoor Attacks on Medical Foundational Models using Prompt + Learning MICCAI 2024 + + +
+ Medical foundation models are gaining prominence in the medical community for their ability to derive general representations from extensive collections of medical image-text pairs. Recent research indicates that these models are susceptible to backdoor attacks, which allow them to classify clean images accurately but fail when specific triggers are introduced. However, traditional backdoor attacks necessitate a considerable amount of additional data to maliciously pre-train a model. This requirement is often impractical in medical imaging applications due to the usual scarcity of data. Inspired by the latest developments in learnable prompts, this work introduces a method to embed a backdoor into a medical foundation model during the prompt learning phase. By incorporating learnable prompts within the text encoder and introducing an imperceptible learnable noise trigger into the input images, we exploit the full capabilities of medical foundation models (Med-FMs). Our method, BAPLe, requires only a minimal subset of data to adjust the noise trigger and the text prompts for downstream tasks, enabling the creation of an effective backdoor attack. Through extensive experiments with four medical foundation models, each pre-trained on different modalities and evaluated across six downstream datasets, we demonstrate the efficacy of our approach. BAPLe achieves a high backdoor success rate across all models and datasets, outperforming baseline backdoor attack methods. Our work highlights the vulnerability of Med-FMs to backdoor attacks and strives to promote the safe adoption of Med-FMs before their deployment in real-world applications. Code is available at https://asif-hanif.github.io/baple/.
+
+ comment: MICCAI 2024 +
+
+
+
+
+ + ☆ Achieving Data Efficient Neural Networks with Hybrid Concept-based + Models + + +
+ Most datasets used for supervised machine learning consist of a single label +per data point. However, in cases where more information than just the class +label is available, would it be possible to train models more efficiently? We +introduce two novel model architectures, which we call hybrid concept-based +models, that train using both class labels and additional information in the +dataset referred to as concepts. In order to thoroughly assess their +performance, we introduce ConceptShapes, an open and flexible class of datasets +with concept labels. We show that the hybrid concept-based models outperform +standard computer vision models and previously proposed concept-based models +with respect to accuracy, especially in sparse data settings. We also introduce +an algorithm for performing adversarial concept attacks, where an image is +perturbed in a way that does not change a concept-based model's concept +predictions, but changes the class prediction. The existence of such +adversarial examples raises questions about the interpretable qualities +promised by concept-based models. + +
+
+ comment: 11 pages, 8 figures, appendix +
+
+
+
+
+ + ☆ MagicFace: Training-free Universal-Style Human Image Customized + Synthesis + + +
+ Existing human image personalized generation methods often require tedious training: either fine-tuning with a few images or retraining on large-scale datasets. In such cases, these methods are prone to overfitting and encounter difficulties when personalizing individuals of diverse styles. Moreover, these training-based approaches also struggle with multi-concept human image customization. To this end, we propose MagicFace, the first method for universal-style human image personalized synthesis that enables single- and multi-concept customization for humans of any style in a training-free manner. MagicFace introduces a coarse-to-fine generation pipeline involving two sequential stages: semantic scene construction and concept feature injection. This is achieved by our Reference-aware Self-Attention (RSA) and Region-grouped Blend Attention (RBA) mechanisms. Specifically, in the first stage, RSA enables the latent image to query features from reference concepts simultaneously, extracting a coarse-grained overall semantic understanding to facilitate the initial semantic layout. In the second stage, we employ an attention-based semantic segmentation method to pinpoint the generated regions of all concepts in the latent image at each step. Following this, RBA divides the pixels of the latent image into semantic groups, with each group querying fine-grained features from its reference concept, which ensures precise attribute alignment and feature injection. Throughout the two-stage process, a weight mask strategy is employed to ensure the model focuses more on the reference concepts. Extensive experiments demonstrate our superiority in both human-centric subject-to-image synthesis and multi-concept human image customization. Our approach can also be applied to texture transformation, further enhancing its versatility and applicability.
+
+ comment: project page: https://codegoat24.github.io/MagicFace +
+
+
+
+
+ + ☆ UAHOI: Uncertainty-aware Robust Interaction Learning for HOI Detection + + +
+ This paper focuses on Human-Object Interaction (HOI) detection, addressing the challenge of identifying and understanding the interactions between humans and objects within a given image or video frame. Spearheaded by Detection Transformer (DETR), recent developments have led to significant improvements by replacing traditional region proposals with a set of learnable queries. However, despite the powerful representation capabilities provided by Transformers, existing HOI detection methods still yield low confidence levels when dealing with complex interactions and are prone to overlooking interactive actions. To address these issues, we propose \textsc{UAHOI}, Uncertainty-aware Robust Human-Object Interaction Learning, which explicitly estimates prediction uncertainty during training to refine both detection and interaction predictions. Our model not only predicts the HOI triplets but also quantifies the uncertainty of these predictions. Specifically, we model this uncertainty through the variance of predictions and incorporate it into the optimization objective, allowing the model to adaptively adjust its confidence threshold based on prediction variance. This integration mitigates the adverse effects of incorrect or ambiguous predictions that are common in traditional methods, without any hand-designed components, serving as an automatic confidence threshold. Our method is compatible with existing HOI detection methods and demonstrates improved accuracy. We evaluate \textsc{UAHOI} on two standard benchmarks, V-COCO and HICO-DET, which represent challenging scenarios for HOI detection. Through extensive experiments, we demonstrate that \textsc{UAHOI} achieves significant improvements over existing state-of-the-art methods, enhancing both the accuracy and robustness of HOI detection.
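+ To give a sense of how a predicted variance can enter an objective, here is a generic heteroscedastic-style weighting sketch (an assumption for illustration, not the paper's exact loss): high predicted variance down-weights the data term but is penalized by a regularizer.
+
+    import torch
+    import torch.nn.functional as F
+
+    def uncertainty_weighted_loss(logits, log_var, targets):
+        """logits, log_var: (batch, num_interactions); targets: binary labels (float)."""
+        per_sample = F.binary_cross_entropy_with_logits(logits, targets, reduction='none')
+        precision = torch.exp(-log_var)
+        # Heteroscedastic trade-off: high variance lowers the data term
+        # but pays a penalty through the +log_var regularizer.
+        return (precision * per_sample + log_var).mean()
+
+    logits = torch.randn(8, 117, requires_grad=True)
+    log_var = torch.zeros(8, 117, requires_grad=True)
+    targets = torch.randint(0, 2, (8, 117)).float()
+    loss = uncertainty_weighted_loss(logits, log_var, targets)
+    loss.backward()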
+
+ comment: Accepted by CVIU +
+
+
+
+
+ + ☆ LLMI3D: Empowering LLM with 3D Perception from a Single 2D Image + + +
+ Recent advancements in autonomous driving, augmented reality, robotics, and embodied intelligence have necessitated 3D perception algorithms. However, current 3D perception methods, particularly small models, struggle with logical reasoning, question answering, and handling open scenario categories. On the other hand, generative multimodal large language models (MLLMs) excel in general capability but underperform on 3D tasks due to weak spatial and local object perception, poor text-based geometric numerical output, and an inability to handle camera focal variations. To address these challenges, we propose the following solutions: Spatial-Enhanced Local Feature Mining for better spatial feature extraction, 3D Query Token-Derived Info Decoding for precise geometric regression, and Geometry Projection-Based 3D Reasoning for handling camera focal length variations. We employ parameter-efficient fine-tuning of a pre-trained MLLM to develop LLMI3D, a powerful 3D perception MLLM. Additionally, we construct the IG3D dataset, which provides fine-grained descriptions and question-answer annotations. Extensive experiments demonstrate that LLMI3D achieves state-of-the-art performance, significantly outperforming existing methods.
+
+
+
+
+ + ☆ Unsupervised Stereo Matching Network For VHR Remote Sensing Images Based + On Error Prediction + + +
+ Stereo matching in remote sensing has recently garnered increased attention, with work focusing primarily on supervised learning. However, datasets with ground truth generated by expensive airborne LiDAR are limited in quantity and diversity, constraining the effectiveness of supervised networks. In contrast, unsupervised learning methods can leverage the increasing availability of very-high-resolution (VHR) remote sensing images, offering considerable potential in the realm of stereo matching. Motivated by this intuition, we propose a novel unsupervised stereo matching network for VHR remote sensing images. A lightweight module that bridges confidence with predicted error is introduced to refine the core model, and robust unsupervised losses are formulated to enhance network convergence. Experimental results on the US3D and WHU-Stereo datasets demonstrate that the proposed network achieves superior accuracy compared to other unsupervised networks and exhibits better generalization capabilities than supervised models. Our code will be available at https://github.com/Elenairene/CBEM.
+
+ comment: Accepted to International Geoscience and Remote Sensing Symposium + (IGARSS), 2024 +
+
+
+
+
+ + ☆ Rethinking Open-Vocabulary Segmentation of Radiance Fields in 3D Space + + +
+ Understanding the 3D semantics of a scene is a fundamental problem for +various scenarios such as embodied agents. While NeRFs and 3DGS excel at +novel-view synthesis, previous methods for understanding their semantics have +been limited to incomplete 3D understanding: their segmentation results are 2D +masks and their supervision is anchored at 2D pixels. This paper revisits the +problem set to pursue a better 3D understanding of a scene modeled by NeRFs and +3DGS as follows. 1) We directly supervise the 3D points to train the language +embedding field. It achieves state-of-the-art accuracy without relying on +multi-scale language embeddings. 2) We transfer the pre-trained language field +to 3DGS, achieving the first real-time rendering speed without sacrificing +training time or accuracy. 3) We introduce a 3D querying and evaluation +protocol for assessing the reconstructed geometry and semantics together. Code, +checkpoints, and annotations will be available online. Project page: +https://hyunji12.github.io/Open3DRF + +
+
+ comment: Project page: https://hyunji12.github.io/Open3DRF +
+
+
+
+
+ + ☆ Segment Using Just One Example + + +
+ Semantic segmentation is an important topic in computer vision with many relevant applications in Earth observation. While supervised methods exist, the constraints of limited annotated data have encouraged the development of unsupervised approaches. However, existing unsupervised methods resemble clustering and cannot be directly mapped to explicit target classes. In this paper, we address single-shot semantic segmentation, where one example of the target class is provided and used to segment that class in query/test images. Our approach exploits the recently popular Segment Anything Model (SAM), a promptable foundation model. We design several techniques to automatically generate prompts from the single example/key image such that segmentation succeeds on a stitch or concatenation of the example/key and query/test images. The proposed technique does not involve any training phase and requires just one example image to grasp the concept. Furthermore, no text-based prompt is required. We evaluate the proposed techniques on building and car classes.
+
+
+
+
+ + ☆ Automated Retinal Image Analysis and Medical Report Generation through + Deep Learning + + +
+ The increasing prevalence of retinal diseases poses a significant challenge +to the healthcare system, as the demand for ophthalmologists surpasses the +available workforce. This imbalance creates a bottleneck in diagnosis and +treatment, potentially delaying critical care. Traditional methods of +generating medical reports from retinal images rely on manual interpretation, +which is time-consuming and prone to errors, further straining +ophthalmologists' limited resources. This thesis investigates the potential of +Artificial Intelligence (AI) to automate medical report generation for retinal +images. AI can quickly analyze large volumes of image data, identifying subtle +patterns essential for accurate diagnosis. By automating this process, AI +systems can greatly enhance the efficiency of retinal disease diagnosis, +reducing doctors' workloads and enabling them to focus on more complex cases. +The proposed AI-based methods address key challenges in automated report +generation: (1) Improved methods for medical keyword representation enhance the +system's ability to capture nuances in medical terminology; (2) A multi-modal +deep learning approach captures interactions between textual keywords and +retinal images, resulting in more comprehensive medical reports; (3) Techniques +to enhance the interpretability of the AI-based report generation system, +fostering trust and acceptance in clinical practice. These methods are +rigorously evaluated using various metrics and achieve state-of-the-art +performance. This thesis demonstrates AI's potential to revolutionize retinal +disease diagnosis by automating medical report generation, ultimately improving +clinical efficiency, diagnostic accuracy, and patient care. +[https://github.com/Jhhuangkay/DeepOpht-Medical-Report-Generation-for-Retinal-Images-via-Deep-Models-and-Visual-Explanation] + +
+
+ comment: Ph.D. thesis, 124 pages +
+
+
+
+
+ + ☆ RTAT: A Robust Two-stage Association Tracker for Multi-Object Tracking ICPR2024 + + +
+ Data association is an essential part of tracking-by-detection based Multi-Object Tracking (MOT). Most trackers focus on designing a better data association strategy to improve tracking performance. Rule-based handcrafted association methods are simple and highly efficient but lack the generalization capability to deal with complex scenes, while learnt association methods can exploit high-order contextual information to handle various complex scenes but come with higher complexity and cost. To address these limitations, we propose a Robust Two-stage Association Tracker, named RTAT. The first-stage association is performed between tracklets and detections to generate tracklets with high purity, and the second-stage association is performed between tracklets to form complete trajectories. In the first stage, we use a simple data association strategy that generates high-purity tracklets by setting a low threshold on the matching cost in the assignment process. In the second stage, we conduct tracklet association within a message-passing GNN framework, modeling the association as a series of edge classification problems in hierarchical graphs, which can recursively merge short tracklets into longer ones. Our tracker RTAT ranks first on the test sets of the MOT17 and MOT20 benchmarks on most of the main MOT metrics: HOTA, IDF1, and AssA. We achieve 67.2 HOTA, 84.7 IDF1, and 69.7 AssA on MOT17, and 66.2 HOTA, 82.5 IDF1, and 68.1 AssA on MOT20.
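+ For readers unfamiliar with the first-stage setup, a minimal sketch of low-threshold tracklet-detection assignment might look as follows (the cost threshold and helper names are assumptions, not taken from the paper): Hungarian matching on a cost matrix, keeping only low-cost pairs so tracklets stay pure.
+
+    import numpy as np
+    from scipy.optimize import linear_sum_assignment
+
+    def associate(cost_matrix, max_cost=0.3):
+        """cost_matrix[i, j]: matching cost between tracklet i and detection j."""
+        rows, cols = linear_sum_assignment(cost_matrix)
+        matches = [(r, c) for r, c in zip(rows, cols) if cost_matrix[r, c] <= max_cost]
+        matched_t = {r for r, _ in matches}
+        matched_d = {c for _, c in matches}
+        unmatched_tracklets = [i for i in range(cost_matrix.shape[0]) if i not in matched_t]
+        unmatched_detections = [j for j in range(cost_matrix.shape[1]) if j not in matched_d]
+        return matches, unmatched_tracklets, unmatched_detections
+
+    cost = np.random.rand(5, 6)
+    matches, un_t, un_d = associate(cost, max_cost=0.3)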
+
+ comment: ICPR2024 +
+
+
+
+
+ + ☆ Gradient Alignment Improves Test-Time Adaptation for Medical Image + Segmentation + + +
+ Although recent years have witnessed significant advancements in medical +image segmentation, the pervasive issue of domain shift among medical images +from diverse centres hinders the effective deployment of pre-trained models. +Many Test-time Adaptation (TTA) methods have been proposed to address this +issue by fine-tuning pre-trained models with test data during inference. These +methods, however, often suffer from less-satisfactory optimization due to +suboptimal optimization direction (dictated by the gradient) and fixed +step-size (predicated on the learning rate). In this paper, we propose the +Gradient alignment-based Test-time adaptation (GraTa) method to improve both +the gradient direction and learning rate in the optimization procedure. Unlike +conventional TTA methods, which primarily optimize the pseudo gradient derived +from a self-supervised objective, our method incorporates an auxiliary gradient +with the pseudo one to facilitate gradient alignment. Such gradient alignment +enables the model to excavate the similarities between different gradients and +correct the gradient direction to approximate the empirical gradient related to +the current segmentation task. Additionally, we design a dynamic learning rate +based on the cosine similarity between the pseudo and auxiliary gradients, +thereby empowering the adaptive fine-tuning of pre-trained models on diverse +test data. Extensive experiments establish the effectiveness of the proposed +gradient alignment and dynamic learning rate and substantiate the superiority +of our GraTa method over other state-of-the-art TTA methods on a benchmark +medical image segmentation task. The code and weights of pre-trained source +models will be available. + +
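+ A rough, hypothetical sketch of the gradient-alignment idea follows (the update rule and function names are illustrative assumptions, not the authors' code): compute a pseudo gradient from the self-supervised loss and an auxiliary gradient from a second objective, then scale the step by their cosine similarity so the update shrinks when the two gradients disagree.
+
+    import torch
+    import torch.nn.functional as F
+
+    def grata_style_step(model, pseudo_loss, aux_loss, base_lr=1e-3):
+        params = [p for p in model.parameters() if p.requires_grad]
+        g_pseudo = torch.autograd.grad(pseudo_loss, params, retain_graph=True)
+        g_aux = torch.autograd.grad(aux_loss, params)
+        cos = F.cosine_similarity(torch.cat([g.flatten() for g in g_pseudo]),
+                                  torch.cat([g.flatten() for g in g_aux]), dim=0)
+        lr = base_lr * torch.clamp(cos, min=0.0)   # shrink the step when gradients disagree
+        with torch.no_grad():
+            for p, gp, ga in zip(params, g_pseudo, g_aux):
+                p -= lr * (gp + ga) / 2            # illustrative combined update direction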
+
+
+
+
+ + ☆ Robust Semi-supervised Multimodal Medical Image Segmentation via Cross + Modality Collaboration + + +
+ Multimodal learning leverages complementary information derived from +different modalities, thereby enhancing performance in medical image +segmentation. However, prevailing multimodal learning methods heavily rely on +extensive well-annotated data from various modalities to achieve accurate +segmentation performance. This dependence often poses a challenge in clinical +settings due to limited availability of such data. Moreover, the inherent +anatomical misalignment between different imaging modalities further +complicates the endeavor to enhance segmentation performance. To address this +problem, we propose a novel semi-supervised multimodal segmentation framework +that is robust to scarce labeled data and misaligned modalities. Our framework +employs a novel cross modality collaboration strategy to distill +modality-independent knowledge, which is inherently associated with each +modality, and integrates this information into a unified fusion layer for +feature amalgamation. With a channel-wise semantic consistency loss, our +framework ensures alignment of modality-independent information from a +feature-wise perspective across modalities, thereby fortifying it against +misalignments in multimodal scenarios. Furthermore, our framework effectively +integrates contrastive consistent learning to regulate anatomical structures, +facilitating anatomical-wise prediction alignment on unlabeled data in +semi-supervised segmentation tasks. Our method achieves competitive performance +compared to other multimodal methods across three tasks: cardiac, abdominal +multi-organ, and thyroid-associated orbitopathy segmentations. It also +demonstrates outstanding robustness in scenarios involving scarce labeled data +and misaligned modalities. + +
+
+
+
+
+ + ☆ KIND: Knowledge Integration and Diversion in Diffusion Models + + +
+ Pre-trained models have become the preferred backbone due to the expansion of model parameters, with techniques like Parameter-Efficient Fine-Tuning (PEFT) typically fixing the parameters of these models. However, pre-trained models may not always be optimal, especially when there are discrepancies between training tasks and target tasks, potentially resulting in negative transfer. To address this, we introduce \textbf{KIND}, which performs \textbf{K}nowledge \textbf{IN}tegration and \textbf{D}iversion in diffusion models. KIND first integrates knowledge by decomposing the parameter matrices of models into $U$, $\Sigma$, and $V$ matrices, formally inspired by singular value decomposition (SVD). It then explicitly partitions the components of these matrices into \textbf{learngenes} and \textbf{tailors} to condense common and class-specific knowledge, respectively, through a class gate. In this way, KIND redefines traditional pre-training by shifting the training objective from maximizing model performance on the current task to condensing transferable common knowledge, leveraging the \textit{Learngene} framework. We conduct experiments on ImageNet-1K and compare KIND with PEFT and other learngene methods. The results indicate that KIND achieves state-of-the-art performance compared to other PEFT and learngene methods. Specifically, the images generated by KIND achieve decreases of more than 6.54 in FID and 1.07 in sFID on DiT-L/2, using only 45.4M trainable parameters and saving at least 35.4G FLOPs in computational cost.
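+ The SVD-style split can be illustrated with a minimal sketch (rank choice and naming are assumptions, not the paper's recipe): the leading singular components act as shared "learngene" knowledge and the remainder as class-specific "tailor" knowledge, with the two parts summing back to the original weight.
+
+    import torch
+
+    def split_weight(weight, shared_rank):
+        U, S, Vh = torch.linalg.svd(weight, full_matrices=False)
+        learngene = U[:, :shared_rank] @ torch.diag(S[:shared_rank]) @ Vh[:shared_rank]
+        tailor = U[:, shared_rank:] @ torch.diag(S[shared_rank:]) @ Vh[shared_rank:]
+        return learngene, tailor   # learngene + tailor reconstructs the original weight
+
+    W = torch.randn(512, 512)
+    common, specific = split_weight(W, shared_rank=64)
+    print(torch.allclose(common + specific, W, atol=1e-3))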
+
+
+
+
+ + ☆ Enhancing Visual Question Answering through Ranking-Based Hybrid + Training and Multimodal Fusion + + +
+ Visual Question Answering (VQA) is a challenging task that requires systems +to provide accurate answers to questions based on image content. Current VQA +models struggle with complex questions due to limitations in capturing and +integrating multimodal information effectively. To address these challenges, we +propose the Rank VQA model, which leverages a ranking-inspired hybrid training +strategy to enhance VQA performance. The Rank VQA model integrates high-quality +visual features extracted using the Faster R-CNN model and rich semantic text +features obtained from a pre-trained BERT model. These features are fused +through a sophisticated multimodal fusion technique employing multi-head +self-attention mechanisms. Additionally, a ranking learning module is +incorporated to optimize the relative ranking of answers, thus improving answer +accuracy. The hybrid training strategy combines classification and ranking +losses, enhancing the model's generalization ability and robustness across +diverse datasets. Experimental results demonstrate the effectiveness of the +Rank VQA model. Our model significantly outperforms existing state-of-the-art +models on standard VQA datasets, including VQA v2.0 and COCO-QA, in terms of +both accuracy and Mean Reciprocal Rank (MRR). The superior performance of Rank +VQA is evident in its ability to handle complex questions that require +understanding nuanced details and making sophisticated inferences from the +image and text. This work highlights the effectiveness of a ranking-based +hybrid training strategy in improving VQA performance and lays the groundwork +for further research in multimodal learning methods. + +
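+ As an illustrative sketch of a hybrid classification-plus-ranking objective in this spirit (the margin, weights, and function names are assumptions, not the paper's exact formulation), a cross-entropy term over answers can be combined with a margin loss that pushes the correct answer's score above the hardest incorrect one.
+
+    import torch
+    import torch.nn.functional as F
+
+    def hybrid_loss(answer_scores, target_idx, margin=0.5, alpha=0.5):
+        """answer_scores: (batch, num_answers); target_idx: (batch,) ground-truth index."""
+        cls_loss = F.cross_entropy(answer_scores, target_idx)
+        one_hot = F.one_hot(target_idx, answer_scores.size(1)).bool()
+        pos = answer_scores.gather(1, target_idx.unsqueeze(1))             # correct-answer score
+        hardest_neg = answer_scores.masked_fill(one_hot, float('-inf')).max(dim=1, keepdim=True).values
+        rank_loss = F.relu(margin - (pos - hardest_neg)).mean()            # push pos above hardest neg
+        return alpha * cls_loss + (1 - alpha) * rank_loss
+
+    scores = torch.randn(4, 3129, requires_grad=True)
+    labels = torch.randint(0, 3129, (4,))
+    loss = hybrid_loss(scores, labels)
+    loss.backward()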
+
+ comment: Visual Question Answering, Rank VQA, Faster R-CNN, BERT, Multimodal + Fusion, Ranking Learning, Hybrid Training Strategy +
+
+
+
+
+ + ☆ Image-Based Leopard Seal Recognition: Approaches and Challenges in + Current Automated Systems + + +
+ This paper examines the challenges and advancements in recognizing seals within their natural habitats using conventional photography, underscored by the emergence of machine learning technologies. We use the leopard seal, \emph{Hydrurga leptonyx}, a key species within Antarctic ecosystems, to review the available methods. As apex predators, leopard seals are characterized by their significant ecological role and elusive nature, so studying them is crucial to understanding the health of their ecosystem. Traditional methods of monitoring seal species are often constrained by labor-intensive and time-consuming data collection processes, compounded by the limited insights these methods provide. The advent of machine learning, particularly through the application of vision transformers, heralds a new era of efficiency and precision in species monitoring. By leveraging state-of-the-art approaches in detection, segmentation, and recognition within digital imaging, this paper presents a synthesis of the current landscape, highlighting both the cutting-edge methodologies and the predominant challenges faced in accurately identifying seals through photographic data.
+
+ comment: 28th International Conference on Image Processing, Computer Vision, & + Pattern Recognition (IPCV'24), Las Vegas, USA +
+
+
+
+
+ + ☆ Enhanced Scale-aware Depth Estimation for Monocular Endoscopic Scenes + with Geometric Modeling + + +
+ Scale-aware monocular depth estimation poses a significant challenge in computer-aided endoscopic navigation. Existing depth estimation methods that do not consider geometric priors struggle to learn the absolute scale when trained on monocular endoscopic sequences. Additionally, conventional methods face difficulties in accurately estimating details at tissue and instrument boundaries. In this paper, we tackle these problems by proposing a novel enhanced scale-aware framework that uses only monocular images with geometric modeling for depth estimation. Specifically, we first propose a multi-resolution depth fusion strategy to enhance the quality of monocular depth estimation. To recover the precise scale between relative depth and real-world values, we further calculate the 3D poses of instruments in the endoscopic scenes via algebraic geometry based on image-only geometric primitives (i.e., boundaries and tips of instruments). The 3D poses of surgical instruments then enable the scale recovery of relative depth maps. By coupling scale factors with relative depth estimation, the scale-aware depth of monocular endoscopic scenes can be estimated. We evaluate the pipeline on in-house endoscopic surgery videos and simulated data. The results demonstrate that our method can learn the absolute scale with geometric modeling and accurately estimate scale-aware depth for monocular scenes.
+
+
+
+
+ + ☆ Lesion-aware network for diabetic retinopathy diagnosis + + +
+ Deep learning has brought significant advances to automated diabetic retinopathy (DR) diagnosis, greatly helping ophthalmologists with early disease detection and thus contributing to preventing disease deterioration that may eventually lead to blindness. It has been shown that convolutional neural network (CNN)-aided lesion identification and segmentation benefit automated DR screening. The key to fine-grained lesion tasks mainly lies in: (1) extracting features that are both sensitive to tiny lesions and robust against DR-irrelevant interference, and (2) exploiting and re-using encoded information to restore lesion locations under extremely imbalanced data distributions. To this end, we propose a CNN-based DR diagnosis network with an attention mechanism, termed the lesion-aware network, to better capture lesion information from imbalanced data. Specifically, we design the lesion-aware module (LAM) to capture noise-like lesion areas across deeper layers, and the feature-preserve module (FPM) to assist shallow-to-deep feature fusion. The proposed lesion-aware network (LANet) is constructed by embedding the LAM and FPM into the CNN decoders for DR-related information utilization, and is further extended to a DR screening network by adding a classification layer. Through experiments on three public fundus datasets with pixel-level annotations, our method outperforms mainstream methods with an area under the curve of 0.967 in DR screening and increases the overall average precision in lesion segmentation by 7.6%, 2.1%, and 1.2% on the three datasets. An ablation study further validates the effectiveness of the proposed sub-modules.
+
+ comment: This is the submitted version without improvements from reviewers. The final version is published in the International Journal of Imaging Systems and Technology (https://onlinelibrary.wiley.com/doi/10.1002/ima.22933)
+
+
+
+
+ + ☆ Ensemble architecture in polyp segmentation + + +
+ In this research, we revisit the architecture of semantic segmentation and +evaluate the models excelling in polyp segmentation. We introduce an integrated +framework that harnesses the advantages of different models to attain an +optimal outcome. More specifically, we fuse the learned features from +convolutional and transformer models for prediction, and we view this approach +as an ensemble technique to enhance model performance. Our experiments on polyp +segmentation reveal that the proposed architecture surpasses other top models, +exhibiting improved learning capacity and resilience. The code is available at +https://github.com/HuangDLab/EnFormer. + +
+
+
+
+
+ + ☆ GRIF-DM: Generation of Rich Impression Fonts using Diffusion Models ECAI2024 + + +
+ Fonts are integral to creative endeavors, design processes, and artistic productions. The appropriate selection of a font can significantly enhance artwork and endow advertisements with a higher level of expressivity. Despite the availability of numerous diverse font designs online, traditional retrieval-based methods for font selection are increasingly being supplanted by generation-based approaches. These newer methods offer enhanced flexibility, catering to specific user preferences and capturing unique stylistic impressions. However, current impression font techniques based on Generative Adversarial Networks (GANs) necessitate multiple auxiliary losses to provide guidance during generation. Furthermore, these methods commonly employ weighted summation to fuse impression-related keywords, which produces increasingly generic vectors as more impression keywords are added and ultimately limits the capacity to generate details. In this paper, we introduce a diffusion-based method, termed GRIF-DM, to generate fonts that vividly embody specific impressions, using an input consisting of a single letter and a set of descriptive impression keywords. The core innovation of GRIF-DM lies in the development of dual cross-attention modules, which process the characteristics of the letters and impression keywords independently but synergistically, ensuring effective integration of both types of information. Our experimental results, conducted on the MyFonts dataset, affirm that this method is capable of producing realistic, vibrant, and high-fidelity fonts that are closely aligned with user specifications. This confirms the potential of our approach to revolutionize font generation by accommodating a broad spectrum of user-driven design requirements. Our code is publicly available at \url{https://github.com/leitro/GRIF-DM}.
+
+ comment: Accepted to ECAI2024 +
+
+
+
+
+ + ☆ All-around Neural Collapse for Imbalanced Classification + + +
+ Neural Collapse (NC) presents an elegant geometric structure that enables +individual activations (features), class means and classifier (weights) vectors +to reach \textit{optimal} inter-class separability during the terminal phase of +training on a \textit{balanced} dataset. Once shifted to imbalanced +classification, such an optimal structure of NC can be readily destroyed by the +notorious \textit{minority collapse}, where the classifier vectors +corresponding to the minority classes are squeezed. In response, existing works +endeavor to recover NC typically by optimizing classifiers. However, we +discover that this squeezing phenomenon is not only confined to classifier +vectors but also occurs with class means. + Consequently, reconstructing NC solely at the classifier aspect may be +futile, as the feature means remain compressed, leading to the violation of +inherent \textit{self-duality} in NC (\textit{i.e.}, class means and classifier +vectors converge mutually) and incidentally, resulting in an unsatisfactory +collapse of individual activations towards the corresponding class means. To +shake off these dilemmas, we present a unified \textbf{All}-around +\textbf{N}eural \textbf{C}ollapse framework (AllNC), aiming to comprehensively +restore NC across multiple aspects including individual activations, class +means and classifier vectors. We thoroughly analyze its effectiveness and +verify on multiple benchmark datasets that it achieves state-of-the-art in both +balanced and imbalanced settings. + +
+
+
+
+
+ + ☆ GQE: Generalized Query Expansion for Enhanced Text-Video Retrieval + + +
+ In the rapidly expanding domain of web video content, the task of text-video +retrieval has become increasingly critical, bridging the semantic gap between +textual queries and video data. This paper introduces a novel data-centric +approach, Generalized Query Expansion (GQE), to address the inherent +information imbalance between text and video, enhancing the effectiveness of +text-video retrieval systems. Unlike traditional model-centric methods that +focus on designing intricate cross-modal interaction mechanisms, GQE aims to +expand the text queries associated with videos both during training and testing +phases. By adaptively segmenting videos into short clips and employing +zero-shot captioning, GQE enriches the training dataset with comprehensive +scene descriptions, effectively bridging the data imbalance gap. Furthermore, +during retrieval, GQE utilizes Large Language Models (LLM) to generate a +diverse set of queries and a query selection module to filter these queries +based on relevance and diversity, thus optimizing retrieval performance while +reducing computational overhead. Our contributions include a detailed +examination of the information imbalance challenge, a novel approach to query +expansion in video-text datasets, and the introduction of a query selection +strategy that enhances retrieval accuracy without increasing computational +costs. GQE achieves state-of-the-art performance on several benchmarks, +including MSR-VTT, MSVD, LSMDC, and VATEX, demonstrating the effectiveness of +addressing text-video retrieval from a data-centric perspective. + +
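+ One plausible form for the relevance-plus-diversity query selection described above is a maximal-marginal-relevance-style loop over embeddings of the LLM-expanded queries; this sketch is an illustrative assumption, not the paper's module, and the trade-off weight and names are invented.
+
+    import numpy as np
+
+    def select_queries(query_embs, original_emb, k=3, lam=0.7):
+        """query_embs: (n, d) candidate expansions; original_emb: (d,) the user query."""
+        def cos(a, b):
+            return float(a @ b / (np.linalg.norm(a) * np.linalg.norm(b) + 1e-8))
+
+        selected, remaining = [], list(range(len(query_embs)))
+        while remaining and len(selected) < k:
+            scores = []
+            for i in remaining:
+                relevance = cos(query_embs[i], original_emb)
+                redundancy = max((cos(query_embs[i], query_embs[j]) for j in selected),
+                                 default=0.0)
+                scores.append(lam * relevance - (1 - lam) * redundancy)
+            best = remaining[int(np.argmax(scores))]
+            selected.append(best)
+            remaining.remove(best)
+        return selected
+
+    embs = np.random.randn(10, 256)
+    chosen = select_queries(embs, np.random.randn(256), k=3)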
+
+ comment: 18 pages including appendix +
+
+
+
+
+ + ☆ Seeing and Understanding: Bridging Vision with Chemical Knowledge Via + ChemVLM + + +
+ In this technical report, we propose ChemVLM, the first open-source multimodal large language model dedicated to the field of chemistry, designed to address the incompatibility between chemical image understanding and text analysis. Built upon the ViT-MLP-LLM architecture, we leverage ChemLLM-20B as the foundational large model, endowing our model with robust capabilities in understanding and utilizing chemical text knowledge. Additionally, we employ InternViT-6B as a powerful image encoder. We have curated high-quality data from the chemical domain, including molecules, reaction formulas, and chemistry examination data, and compiled these into a bilingual multimodal question-answering dataset. We test the performance of our model on multiple open-source benchmarks and three custom evaluation sets. Experimental results demonstrate that our model achieves excellent performance, securing state-of-the-art results in five out of the six tasks involved. Our model can be found at https://huggingface.co/AI4Chem/ChemVLM-26B.
+
+ comment: Technical report
+
+
+
+
+ + ☆ Sign language recognition based on deep learning and low-cost + handcrafted descriptors + + +
+ In recent years, deep learning techniques have been used to develop sign +language recognition systems, potentially serving as a communication tool for +millions of hearing-impaired individuals worldwide. However, there are inherent +challenges in creating such systems. Firstly, it is important to consider as +many linguistic parameters as possible in gesture execution to avoid ambiguity +between words. Moreover, to facilitate the real-world adoption of the created +solution, it is essential to ensure that the chosen technology is realistic, +avoiding expensive, intrusive, or low-mobility sensors, as well as very complex +deep learning architectures that impose high computational requirements. Based +on this, our work aims to propose an efficient sign language recognition system +that utilizes low-cost sensors and techniques. To this end, an object detection +model was trained specifically for detecting the interpreter's face and hands, +ensuring focus on the most relevant regions of the image and generating inputs +with higher semantic value for the classifier. Additionally, we introduced a +novel approach to obtain features representing hand location and movement by +leveraging spatial information derived from centroid positions of bounding +boxes, thereby enhancing sign discrimination. The results demonstrate the +efficiency of our handcrafted features, increasing accuracy by 7.96% on the +AUTSL dataset, while adding fewer than 700 thousand parameters and incurring +less than 10 milliseconds of additional inference time. These findings +highlight the potential of our technique to strike a favorable balance between +computational cost and accuracy, making it a promising approach for practical +sign language recognition applications. + +
+
+ comment: 28 pages, 12 figures, submitted to Image and Vision Computing Journal +
+
+
+
+
+ + ☆ Leveraging Perceptual Scores for Dataset Pruning in Computer Vision + Tasks CVPR 2024 + + +
+ In this paper we propose a score for images to use in coreset selection for image classification and semantic segmentation tasks. The score is the entropy of an image as approximated by the bits-per-pixel of its compressed version. The score is thus intrinsic to an image and requires no supervision or training. It is very simple to compute and readily available, as all images are stored in a compressed format. The motivation behind our choice of score is that most other scores proposed in the literature are expensive to compute. More importantly, we want a score that captures the perceptual complexity of an image. Entropy is one such measure: images with clutter tend to have higher entropy. However, sampling only low-entropy iconic images, for example, leads to biased learning and an overall decrease in test performance with current deep learning models. To mitigate this bias we use a graph-based method that increases the spatial diversity of the selected samples. We show that this simple score yields good results, particularly for semantic segmentation tasks.
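+ A minimal sketch of the bits-per-pixel score follows: compressed size in bits divided by the number of pixels. Re-encoding with PNG via Pillow is an assumption used here for illustration; in practice the stored compressed file size can serve the same purpose.
+
+    import io
+    from PIL import Image
+
+    def bits_per_pixel(path):
+        img = Image.open(path).convert("RGB")
+        buf = io.BytesIO()
+        img.save(buf, format="PNG")                 # any lossless codec works as a proxy
+        compressed_bits = 8 * buf.getbuffer().nbytes
+        return compressed_bits / (img.width * img.height)
+
+    # Higher scores indicate cluttered (high-entropy) images; a diversity-aware
+    # sampling strategy is then applied on top of this score to build the coreset.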
+
+ comment: 1st workshop on Dataset Distillation CVPR 2024 +
+
+
+
+
+ + ☆ Enhancing Autonomous Vehicle Perception in Adverse Weather through Image + Augmentation during Semantic Segmentation Training + + +
+ Robust perception is crucial in autonomous vehicle navigation and +localization. Visual processing tasks, like semantic segmentation, should work +in varying weather conditions and during different times of day. Semantic +segmentation is where each pixel is assigned a class, which is useful for +locating overall features (1). Training a segmentation model requires large +amounts of data, and the labeling process for segmentation data is especially +tedious. Additionally, many large datasets include only images taken in clear +weather. This is a problem because training a model exclusively on clear +weather data hinders performance in adverse weather conditions like fog or +rain. We hypothesize that given a dataset of only clear days images, applying +image augmentation (such as random rain, fog, and brightness) during training +allows for domain adaptation to diverse weather conditions. We used CARLA, a 3D +realistic autonomous vehicle simulator, to collect 1200 images in clear weather +composed of 29 classes from 10 different towns (2). We also collected 1200 +images of random weather effects. We trained encoder-decoder UNet models to +perform semantic segmentation. Applying augmentations significantly improved +segmentation under weathered night conditions (p < 0.001). However, models +trained on weather data have significantly lower losses than those trained on +augmented data in all conditions except for clear days. This shows there is +room for improvement in the domain adaptation approach. Future work should test +more types of augmentations and also use real-life images instead of CARLA. +Ideally, the augmented model meets or exceeds the performance of the weather +model. + +
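+ A minimal sketch of this kind of weather augmentation, assuming the albumentations library (the specific transforms and probabilities below are illustrative choices, not the study's exact configuration), applies random rain, fog, or brightness perturbations to clear-weather training images while keeping the segmentation masks aligned.
+
+    import albumentations as A
+
+    train_transform = A.Compose([
+        A.OneOf([
+            A.RandomRain(p=1.0),
+            A.RandomFog(p=1.0),
+            A.RandomBrightnessContrast(brightness_limit=0.4, p=1.0),
+        ], p=0.7),          # roughly 70% of samples receive a weather perturbation
+        A.HorizontalFlip(p=0.5),
+    ])
+
+    # augmented = train_transform(image=image, mask=mask)
+    # image_aug, mask_aug = augmented["image"], augmented["mask"]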
+
+
+
+
+ + ☆ Algebraic Representations for Faster Predictions in Convolutional Neural + Networks SC 2024 + + +
+ Convolutional neural networks (CNNs) are a popular choice of model for tasks in computer vision. When CNNs are built with many layers, resulting in a deep neural network, skip connections may be added to create an easier gradient optimization problem while retaining model expressiveness. In this paper, we show that arbitrarily complex, trained, linear CNNs with skip connections can be simplified into a single-layer model, greatly reducing computational requirements at prediction time. We also present a method for training nonlinear models with skip connections that are gradually removed throughout training, giving the benefits of skip connections without incurring computational overhead at prediction time. These results are demonstrated with practical examples on the Residual Network (ResNet) architecture.
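+ The underlying algebraic identity can be checked numerically in a toy 1D setting (a sketch of the general principle, not the paper's procedure): for activation-free layers, two stacked convolutions plus an identity skip equal one convolution whose kernel is the composition of the two kernels plus a delta kernel, with lengths aligned via full convolutions.
+
+    import numpy as np
+    from scipy.signal import convolve
+
+    x = np.random.randn(64)
+    k1 = np.random.randn(5)
+    k2 = np.random.randn(5)
+
+    # Identity skip represented as a delta kernel of matching length.
+    delta = np.zeros(len(k1) + len(k2) - 1)
+    delta[0] = 1.0
+
+    # Two-layer linear network with a skip: y = k2 * (k1 * x) + delta * x
+    y_two_layer = convolve(convolve(x, k1, mode="full"), k2, mode="full") \
+                  + convolve(x, delta, mode="full")
+
+    # Single equivalent layer: kernel = k2 * k1 + delta
+    k_combined = convolve(k1, k2, mode="full") + delta
+    y_one_layer = convolve(x, k_combined, mode="full")
+
+    print(np.allclose(y_two_layer, y_one_layer))   # True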
+
+ comment: Accepted for publication in the proceedings of the 27th International + Workshop on Computer Algebra in Scientific Computing (CASC 2024) +
+
+
+
+
+ + ☆ An Efficient and Explanatory Image and Text Clustering System with + Multimodal Autoencoder Architecture + + +
+ We demonstrate the efficiencies and explanatory abilities of extensions to +the common tools of Autoencoders and LLM interpreters, in the novel context of +comparing different cultural approaches to the same international news event. +We develop a new Convolutional-Recurrent Variational Autoencoder (CRVAE) model +that extends the modalities of previous CVAE models, by using fully-connected +latent layers to embed in parallel the CNN encodings of video frames, together +with the LSTM encodings of their related text derived from audio. We +incorporate the model within a larger system that includes frame-caption +alignment, latent space vector clustering, and a novel LLM-based cluster +interpreter. We measure, tune, and apply this system to the task of summarizing +a video into three to five thematic clusters, with each theme described by ten +LLM-produced phrases. We apply this system to two news topics, COVID-19 and the +Winter Olympics, and five other topics are in progress. + +
+
+
+
+
+ + ☆ Cropper: Vision-Language Model for Image Cropping through In-Context + Learning + + +
+ The goal of image cropping is to identify visually appealing crops within an +image. Conventional methods rely on specialized architectures trained on +specific datasets, which struggle to be adapted to new requirements. Recent +breakthroughs in large vision-language models (VLMs) have enabled visual +in-context learning without explicit training. However, effective strategies +for vision downstream tasks with VLMs remain largely unclear and underexplored. +In this paper, we propose an effective approach to leverage VLMs for better +image cropping. First, we propose an efficient prompt retrieval mechanism for +image cropping to automate the selection of in-context examples. Second, we +introduce an iterative refinement strategy to iteratively enhance the predicted +crops. The proposed framework, named Cropper, is applicable to a wide range of +cropping tasks, including free-form cropping, subject-aware cropping, and +aspect ratio-aware cropping. Extensive experiments and a user study demonstrate +that Cropper significantly outperforms state-of-the-art methods across several +benchmarks. + +
+
+
+
+
+ + ☆ Perspectives: Comparison of Deep Learning Segmentation Models on + Biophysical and Biomedical Data + + +
+ Deep learning based approaches are now widely used across biophysics to help +automate a variety of tasks including image segmentation, feature selection, +and deconvolution. However, the presence of multiple competing deep learning +architectures, each with its own unique advantages and disadvantages, makes it +challenging to select an architecture best suited for a specific application. +As such, we present a comprehensive comparison of common models. Here, we focus +on the task of segmentation assuming the typically small training dataset sizes +available from biophysics experiments and compare the following four commonly +used architectures: convolutional neural networks, U-Nets, vision transformers, +and vision state space models. In doing so, we establish criteria for +determining optimal conditions under which each model excels, thereby offering +practical guidelines for researchers and practitioners in the field. + +
+
+
+
+
+ + ☆ NeuroPapyri: A Deep Attention Embedding Network for Handwritten Papyri + Retrieval + + +
+ The intersection of computer vision and machine learning has emerged as a +promising avenue for advancing historical research, facilitating a more +profound exploration of our past. However, the application of machine learning +approaches in historical palaeography is often met with criticism due to their +perceived ``black box'' nature. In response to this challenge, we introduce +NeuroPapyri, an innovative deep learning-based model specifically designed for +the analysis of images containing ancient Greek papyri. To address concerns +related to transparency and interpretability, the model incorporates an +attention mechanism. This attention mechanism not only enhances the model's +performance but also provides a visual representation of the image regions that +significantly contribute to the decision-making process. Specifically +calibrated for processing images of papyrus documents with lines of handwritten +text, the model utilizes individual attention maps to inform the presence or +absence of specific characters in the input image. This paper presents the +NeuroPapyri model, including its architecture and training methodology. Results +from the evaluation demonstrate NeuroPapyri's efficacy in document retrieval, +showcasing its potential to advance the analysis of historical manuscripts. + +
+
+
+
+
+ + ♻ ☆ Idea2Img: Iterative Self-Refinement with GPT-4V(ision) for Automatic + Image Design and Generation ECCV 2024 + + +
+ We introduce ``Idea to Image,'' a system that enables multimodal iterative +self-refinement with GPT-4V(ision) for automatic image design and generation. +Humans can quickly identify the characteristics of different text-to-image +(T2I) models via iterative explorations. This enables them to efficiently +convert their high-level generation ideas into effective T2I prompts that can +produce good images. We investigate if systems based on large multimodal models +(LMMs) can develop analogous multimodal self-refinement abilities that enable +exploring unknown models or environments via self-refining tries. Idea2Img +cyclically generates revised T2I prompts to synthesize draft images, and +provides directional feedback for prompt revision, both conditioned on its +memory of the probed T2I model's characteristics. The iterative self-refinement +brings Idea2Img various advantages over vanilla T2I models. Notably, Idea2Img +can process input ideas with interleaved image-text sequences, follow ideas +with design instructions, and generate images of better semantic and visual +qualities. The user preference study validates the efficacy of multimodal +iterative self-refinement on automatic image design and generation. + +
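The cyclic generate-draft-feedback process described above can be pictured as a schematic loop. All callables (`lmm_revise_prompt`, `t2i_generate`, `lmm_select_best`, `lmm_feedback`) and the round/draft counts are placeholders, not the authors' interface; this only mirrors the control flow the abstract describes.

```python
def idea_to_image_loop(idea, lmm_revise_prompt, t2i_generate, lmm_feedback,
                       lmm_select_best, n_rounds=3, drafts_per_round=2):
    """Schematic iterative self-refinement loop in the spirit of the abstract.

    All callables are stand-ins for LMM / T2I calls; their names and signatures
    are illustrative, not the authors' actual API.
    """
    memory = []                                   # record of prompts, drafts, feedback
    prompt = idea                                 # start from the raw idea
    for _ in range(n_rounds):
        prompts = [lmm_revise_prompt(prompt, memory) for _ in range(drafts_per_round)]
        drafts = [t2i_generate(p) for p in prompts]
        best = lmm_select_best(idea, prompts, drafts)        # pick the most promising draft
        feedback = lmm_feedback(idea, prompts[best], drafts[best])
        memory.append({"prompt": prompts[best], "image": drafts[best],
                       "feedback": feedback})
        prompt = prompts[best]
    return memory[-1]["image"], memory
```

How the memory conditions each call, and how many rounds and drafts are used, are design details of the actual system; the loop above only fixes the cycle of revise, synthesize, select, and give feedback.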
+
+ comment: ECCV 2024; Project page at https://idea2img.github.io/ +
+
+
+
+
+ + ♻ ☆ NIGHT -- Non-Line-of-Sight Imaging from Indirect Time of Flight Data ECCV + + +
+ The acquisition of objects outside the Line-of-Sight of cameras is a very intriguing but also extremely challenging research topic. Recent works have shown the feasibility of this idea by exploiting transient imaging data produced by custom direct Time of Flight sensors. In this paper, for the first time, we tackle this problem using only data from an off-the-shelf indirect Time of Flight sensor without any further hardware requirement. We introduce a Deep Learning model able to reframe the surfaces where light bounces occur as a virtual mirror. This modeling makes the task easier to handle and also facilitates the construction of annotated training data. From the obtained data it is possible to retrieve the depth information of the hidden scene. We also provide a first-in-its-kind synthetic dataset for the task and demonstrate the feasibility of the proposed idea on it.
+
+ comment: Submitted to MELEX 24 (ECCV workshop), 17 pages, 6 figures, 2 tables +
+
+
+
+
+ + ♻ ☆ Robust Curve Detection in Volumetric Medical Imaging via Attraction + Field MICCAI 2024 + + +
+ Understanding body part geometry is crucial for precise medical diagnostics. +Curves effectively describe anatomical structures and are widely used in +medical imaging applications related to cardiovascular, respiratory, and +skeletal diseases. Traditional curve detection methods are often task-specific, +relying heavily on domain-specific features, limiting their broader +applicability. This paper introduces a novel approach for detecting +non-branching curves, which does not require prior knowledge of the object's +orientation, shape, or position. Our method uses neural networks to predict (1) +an attraction field, which offers subpixel accuracy, and (2) a closeness map, +which limits the region of interest and essentially eliminates outliers far +from the desired curve. We tested our curve detector on several clinically +relevant tasks with diverse morphologies and achieved impressive subpixel-level +accuracy results that surpass existing methods, highlighting its versatility +and robustness. Additionally, to support further advancements in this field, we +provide our private annotations of aortic centerlines and masks, which can +serve as a benchmark for future research. The dataset can be found at +https://github.com/neuro-ml/curve-detection. + +
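A toy illustration of how the two predicted outputs might be combined at inference: the closeness map gates candidate pixels, and the attraction field shifts them to subpixel positions. The threshold, array shapes, and function name are illustrative assumptions; linking the shifted points into an ordered curve is a separate step not shown here.

```python
import numpy as np

def candidate_curve_points(attraction, closeness, thr=0.5):
    """Combine a 2-channel attraction field with a closeness map.

    attraction: (2, H, W) per-pixel offsets (dy, dx) pointing toward the curve.
    closeness:  (H, W) soft mask; pixels below `thr` are discarded as outliers.
    Returns subpixel (y, x) candidates.
    """
    ys, xs = np.nonzero(closeness > thr)
    dy = attraction[0, ys, xs]
    dx = attraction[1, ys, xs]
    return np.stack([ys + dy, xs + dx], axis=1)
```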
+
+ comment: Accepted to ShapeMI MICCAI 2024 +
+
+
+
+
+ + ♻ ☆ Data Science for Geographic Information Systems + + +
+ The integration of data science into Geographic Information Systems (GIS) has +facilitated the evolution of these tools into complete spatial analysis +platforms. The adoption of machine learning and big data techniques has +equipped these platforms with the capacity to handle larger amounts of +increasingly complex data, transcending the limitations of more traditional +approaches. This work traces the historical and technical evolution of data +science and GIS as fields of study, highlighting the critical points of +convergence between domains, and underlining the many sectors that rely on this +integration. A GIS application is presented as a case study in the disaster +management sector where we utilize aerial data from Tr\'oia, Portugal, to +emphasize the process of insight extraction from raw data. We conclude by +outlining prospects for future research in integration of these fields in +general, and the developed application in particular. + +
+
+ comment: The peer-reviewed version of this paper is published in IEEE Xplore + at https://doi.org/10.1109/YEF-ECE62614.2024.10624902. This version is + typeset by the author and differs only in pagination and typographical detail +
+
+
+
+
+ + ♻ ☆ Distilling the Knowledge in Data Pruning + + +
+ With the increasing size of datasets used for training neural networks, data +pruning becomes an attractive field of research. However, most current data +pruning algorithms are limited in their ability to preserve accuracy compared +to models trained on the full data, especially in high pruning regimes. In this +paper we explore the application of data pruning while incorporating knowledge +distillation (KD) when training on a pruned subset. That is, rather than +relying solely on ground-truth labels, we also use the soft predictions from a +teacher network pre-trained on the complete data. By integrating KD into +training, we demonstrate significant improvement across datasets, pruning +methods, and on all pruning fractions. We first establish a theoretical +motivation for employing self-distillation to improve training on pruned data. +Then, we empirically make a compelling and highly practical observation: using +KD, simple random pruning is comparable or superior to sophisticated pruning +methods across all pruning regimes. On ImageNet for example, we achieve +superior accuracy despite training on a random subset of only 50% of the data. +Additionally, we demonstrate a crucial connection between the pruning factor +and the optimal knowledge distillation weight. This helps mitigate the impact +of samples with noisy labels and low-quality images retained by typical pruning +algorithms. Finally, we make an intriguing observation: when using lower +pruning fractions, larger teachers lead to accuracy degradation, while +surprisingly, employing teachers with a smaller capacity than the student's may +improve results. Our code will be made available. + +
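The training objective sketched below combines the usual cross-entropy on ground-truth labels with a temperature-scaled KL term against the teacher's soft predictions, the standard way to add knowledge distillation when training on a pruned subset. The weight `alpha` and temperature `T` are generic knobs; the abstract links the optimal KD weight to the pruning fraction but gives no explicit schedule.

```python
import torch
import torch.nn.functional as F

def kd_loss(student_logits, teacher_logits, labels, alpha=0.5, T=4.0):
    """Cross-entropy on labels plus distillation from a teacher's soft predictions.

    `alpha` is the KD weight; the abstract notes its optimal value depends on the
    pruning fraction, but no specific mapping is given there.
    """
    ce = F.cross_entropy(student_logits, labels)
    kl = F.kl_div(F.log_softmax(student_logits / T, dim=1),
                  F.softmax(teacher_logits / T, dim=1),
                  reduction="batchmean") * (T * T)   # standard T^2 scaling
    return (1.0 - alpha) * ce + alpha * kl
```

Setting `alpha=0` recovers plain training on the pruned subset.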
+
+
+
+
+ + ♻ ☆ A Survey of Open Source User Activity Traces with Applications to User + Mobility Characterization and Modeling + + +
+ The current state-of-the-art in user mobility research has extensively relied on open-source mobility traces captured from pedestrian and vehicular activity through a variety of communication technologies as users engage in a wide range of applications, including connected healthcare, localization, social media, e-commerce, etc. Most of these traces are feature-rich and diverse, not only in the information they provide, but also in how they can be used and leveraged. This diversity poses two main challenges for researchers and practitioners who wish to make use of available mobility datasets. First, it is quite difficult to get a bird's eye view of the available traces without spending considerable time looking them up. Second, once they have found the traces, they still need to figure out whether the traces are adequate for their needs.
+ The purpose of this survey is three-fold. It proposes a taxonomy to classify open-source mobility traces, including their mobility mode, data source and collection technology. It then uses the proposed taxonomy to classify existing open-source mobility traces and, finally, highlights three case studies using popular publicly available datasets to showcase how our taxonomy can tease out feature sets in traces to help determine their applicability to specific use-cases.
+
+ comment: 23 pages, 6 pages references +
+
+
+
+
+ + ♻ ☆ Evolving from Single-modal to Multi-modal Facial Deepfake Detection: A + Survey + + +
+ This survey addresses the critical challenge of deepfake detection amidst the rapid advancements in artificial intelligence. As AI-generated media, including video, audio and text, become more realistic, the risk of misuse to spread misinformation and commit identity fraud increases. Focused on face-centric deepfakes, this work traces the evolution from traditional single-modality methods to sophisticated multi-modal approaches that handle audio-visual and text-visual scenarios. We provide comprehensive taxonomies of detection techniques, discuss the evolution of generative methods from auto-encoders and GANs to diffusion models, and categorize these technologies by their unique attributes. To our knowledge, this is the first survey of its kind. We also explore the challenges of adapting detection methods to new generative models and enhancing the reliability and robustness of deepfake detectors, proposing directions for future research. This survey offers a detailed roadmap for researchers, supporting the development of technologies to counter the deceptive use of AI in media creation, particularly facial forgery. A curated list of all related papers can be found at https://github.com/qiqitao77/Comprehensive-Advances-in-Deepfake-Detection-Spanning-Diverse-Modalities (also listed as https://github.com/qiqitao77/Awesome-Comprehensive-Deepfake-Detection).
+
+ comment: P. Liu is with the Department of Computer Science and Engineering, + University of Nevada, Reno, NV, 89512. Q. Tao and J. Zhou are with Centre for + Frontier AI Research (CFAR), and Institute of High Performance Computing + (IHPC), A*STAR, Singapore. J. Zhou is also with Centre for Advanced + Technologies in Online Safety (CATOS), A*STAR, Singapore. J. Zhou is the + corresponding author +
+
+
+
+
+ + ♻ ☆ Compact Model Training by Low-Rank Projection with Energy Transfer + + +
+ Low-rankness plays an important role in traditional machine learning, but is not so popular in deep learning. Most previous low-rank network compression methods compress networks by approximating pre-trained models and re-training. However, the optimal solution in the Euclidean space may be quite different from the one with low-rank constraint. A well-pre-trained model is not a good initialization for the model with low-rank constraints. Thus, the performance of a low-rank compressed network degrades significantly. Compared with other network compression methods such as pruning, low-rank methods have attracted less attention in recent years. In this paper, we devise a new training method, low-rank projection with energy transfer (LRPET), that trains low-rank compressed networks from scratch and achieves competitive performance. We propose to alternately perform stochastic gradient descent training and projection of each weight matrix onto the corresponding low-rank manifold. Compared to re-training on the compact model, this enables full utilization of model capacity since the solution space is relaxed back to the Euclidean space after projection. The matrix energy (the sum of squares of singular values) reduction caused by projection is compensated by energy transfer. We uniformly transfer the energy of the pruned singular values to the remaining ones. We theoretically show that energy transfer eases the trend of gradient vanishing caused by projection. In modern networks, a batch normalization (BN) layer can be merged into the previous convolution layer for inference, thereby influencing the optimal low-rank approximation of the previous layer. We propose BN rectification to cut off its effect on the optimal low-rank approximation, which further improves the performance.
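One projection step of the kind described above can be sketched as a truncated SVD followed by rescaling the kept singular values so that the total matrix energy (the sum of squared singular values) is preserved. The proportional rescaling used here is one simple reading of "energy transfer"; the paper's exact uniform scheme, the alternation with SGD steps, and BN rectification are not reproduced.

```python
import torch

def low_rank_project_energy_transfer(W, rank):
    """Project W onto a rank-`rank` matrix while preserving matrix energy.

    The kept singular values are rescaled so the sum of squared singular values
    matches the original. Interleaving this with SGD training, as described in
    the abstract, is omitted.
    """
    U, S, Vh = torch.linalg.svd(W, full_matrices=False)
    kept = S[:rank]
    total_energy = (S ** 2).sum()
    kept_energy = (kept ** 2).sum()
    scale = torch.sqrt(total_energy / kept_energy)   # simple proportional transfer
    return (U[:, :rank] * (kept * scale)) @ Vh[:rank]

W = torch.randn(256, 128)
W_lr = low_rank_project_energy_transfer(W, rank=32)
print(torch.linalg.matrix_rank(W_lr).item())  # 32
```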
+
+
+
+
+ + ♻ ☆ CLIP with Generative Latent Replay: a Strong Baseline for Incremental + Learning BMVC 2024 + + +
+ With the emergence of Transformers and Vision-Language Models (VLMs) such as +CLIP, fine-tuning large pre-trained models has recently become a prevalent +strategy in Continual Learning. This has led to the development of numerous +prompting strategies to adapt transformer-based models without incurring +catastrophic forgetting. However, these strategies often compromise the +original zero-shot capabilities of the pre-trained CLIP model and struggle to +adapt to domains that significantly deviate from the pre-training data. In this +work, we propose Continual Generative training for Incremental prompt-Learning, +a simple and novel approach to mitigate forgetting while adapting CLIP. +Briefly, we employ Variational Autoencoders (VAEs) to learn class-conditioned +distributions within the embedding space of the visual encoder. We then exploit +these distributions to sample new synthetic visual embeddings and train the +corresponding class-specific textual prompts during subsequent tasks. Through +extensive experiments on different domains, we show that such a generative +replay approach can adapt to new tasks while improving zero-shot capabilities, +evaluated using a novel metric tailored for CL scenarios. Notably, further +analysis reveals that our approach can bridge the gap with joint prompt tuning. +The codebase is available at https://github.com/aimagelab/mammoth. + +
+
+ comment: 15 pages, 1 figure. Accepted at the The 35th British Machine Vision + Conference 2024 (BMVC 2024), Glasgow, UK +
+
+
+
+
+ + ♻ ☆ DeepFace-Attention: Multimodal Face Biometrics for Attention Estimation + with Application to e-Learning + + +
+ This work introduces an innovative method for estimating attention levels +(cognitive load) using an ensemble of facial analysis techniques applied to +webcam videos. Our method is particularly useful, among others, in e-learning +applications, so we trained, evaluated, and compared our approach on the mEBAL2 +database, a public multi-modal database acquired in an e-learning environment. +mEBAL2 comprises data from 60 users who performed 8 different tasks. These +tasks varied in difficulty, leading to changes in their cognitive loads. Our +approach adapts state-of-the-art facial analysis technologies to quantify the +users' cognitive load in the form of high or low attention. Several behavioral +signals and physiological processes related to the cognitive load are used, +such as eyeblink, heart rate, facial action units, and head pose, among others. +Furthermore, we conduct a study to understand which individual features obtain +better results, the most efficient combinations, explore local and global +features, and how temporary time intervals affect attention level estimation, +among other aspects. We find that global facial features are more appropriate +for multimodal systems using score-level fusion, particularly as the temporal +window increases. On the other hand, local features are more suitable for +fusion through neural network training with score-level fusion approaches. Our +method outperforms existing state-of-the-art accuracies using the public mEBAL2 +benchmark. + +
+
+ comment: Article accepted in the IEEE Access journal. Accessible at + https://ieeexplore.ieee.org/document/10633208 +
+
+
+
+
+ + ♻ ☆ InternVideo2: Scaling Foundation Models for Multimodal Video + Understanding ECCV2024 + + +
+ We introduce InternVideo2, a new family of video foundation models (ViFM) that achieve state-of-the-art results in video recognition, video-text tasks, and video-centric dialogue. Our core design is a progressive training approach that unifies masked video modeling, crossmodal contrastive learning, and next-token prediction, scaling up the video encoder size to 6B parameters. At the data level, we prioritize spatiotemporal consistency by semantically segmenting videos and generating video-audio-speech captions. This improves the alignment between video and text. Through extensive experiments, we validate our designs and demonstrate superior performance on over 60 video and audio tasks. Notably, our model outperforms others on various video-related dialogue and long video understanding benchmarks, highlighting its ability to reason and comprehend longer contexts. Code and models are available at https://github.com/OpenGVLab/InternVideo/tree/main/InternVideo2/.
+
+ comment: a technical report about video understanding (accepted to ECCV2024) +
+
+
+
+
+ + ♻ ☆ Disentangled Representation Learning with Transmitted Information + Bottleneck + + +
+ Encoding only the task-related information from the raw data, i.e., disentangled representation learning, can greatly contribute to the robustness and generalizability of models. Although significant advances have been made by regularizing the information in representations with information theory, two major challenges remain: 1) the representation compression inevitably leads to a performance drop; 2) the disentanglement constraints on representations involve complicated optimization. To address these issues, we introduce Bayesian networks with transmitted information to formulate the interaction among input and representations during disentanglement. Building upon this framework, we propose DisTIB (Transmitted Information Bottleneck for Disentangled representation learning), a novel objective that navigates the balance between information compression and preservation. We employ variational inference to derive a tractable estimation for DisTIB. This estimation can be simply optimized via standard gradient descent with a reparameterization trick. Moreover, we theoretically prove that DisTIB can achieve optimal disentanglement, underscoring its superior efficacy. To solidify our claims, we conduct extensive experiments on various downstream tasks to demonstrate the appealing efficacy of DisTIB and validate our theoretical analyses.
+
+
+
+
+ + ♻ ☆ GS-Pose: Generalizable Segmentation-based 6D Object Pose Estimation with + 3D Gaussian Splatting + + +
+ This paper introduces GS-Pose, a unified framework for localizing and +estimating the 6D pose of novel objects. GS-Pose begins with a set of posed RGB +images of a previously unseen object and builds three distinct representations +stored in a database. At inference, GS-Pose operates sequentially by locating +the object in the input image, estimating its initial 6D pose using a retrieval +approach, and refining the pose with a render-and-compare method. The key +insight is the application of the appropriate object representation at each +stage of the process. In particular, for the refinement step, we leverage 3D +Gaussian splatting, a novel differentiable rendering technique that offers high +rendering speed and relatively low optimization time. Off-the-shelf toolchains +and commodity hardware, such as mobile phones, can be used to capture new +objects to be added to the database. Extensive evaluations on the LINEMOD and +OnePose-LowTexture datasets demonstrate excellent performance, establishing the +new state-of-the-art. Project page: https://dingdingcai.github.io/gs-pose. + +
+
+ comment: Project Page: https://dingdingcai.github.io/gs-pose +
+
+
+
+
+ + ♻ ☆ R2Human: Real-Time 3D Human Appearance Rendering from a Single Image + + +
+ Rendering 3D human appearance from a single image in real-time is crucial for +achieving holographic communication and immersive VR/AR. Existing methods +either rely on multi-camera setups or are constrained to offline operations. In +this paper, we propose R2Human, the first approach for real-time inference and +rendering of photorealistic 3D human appearance from a single image. The core +of our approach is to combine the strengths of implicit texture fields and +explicit neural rendering with our novel representation, namely Z-map. Based on +this, we present an end-to-end network that performs high-fidelity color +reconstruction of visible areas and provides reliable color inference for +occluded regions. To further enhance the 3D perception ability of our network, +we leverage the Fourier occupancy field as a prior for generating the texture +field and providing a sampling surface in the rendering stage. We also propose +a consistency loss and a spatial fusion strategy to ensure the multi-view +coherence. Experimental results show that our method outperforms the +state-of-the-art methods on both synthetic data and challenging real-world +images, in real-time. The project page can be found at +http://cic.tju.edu.cn/faculty/likun/projects/R2Human. + +
+
+
+
+
+ + ♻ ☆ On the Utility of 3D Hand Poses for Action Recognition ECCV 2024 + + +
+ 3D hand pose is an underexplored modality for action recognition. Poses are +compact yet informative and can greatly benefit applications with limited +compute budgets. However, poses alone offer an incomplete understanding of +actions, as they cannot fully capture objects and environments with which +humans interact. We propose HandFormer, a novel multimodal transformer, to +efficiently model hand-object interactions. HandFormer combines 3D hand poses +at a high temporal resolution for fine-grained motion modeling with sparsely +sampled RGB frames for encoding scene semantics. Observing the unique +characteristics of hand poses, we temporally factorize hand modeling and +represent each joint by its short-term trajectories. This factorized pose +representation combined with sparse RGB samples is remarkably efficient and +highly accurate. Unimodal HandFormer with only hand poses outperforms existing +skeleton-based methods at 5x fewer FLOPs. With RGB, we achieve new +state-of-the-art performance on Assembly101 and H2O with significant +improvements in egocentric action recognition. + +
+
+ comment: ECCV 2024; https://s-shamil.github.io/HandFormer/ +
+
+
+
+
+ + ♻ ☆ DynaMoN: Motion-Aware Fast and Robust Camera Localization for Dynamic + Neural Radiance Fields + + +
+ The accurate reconstruction of dynamic scenes with neural radiance fields is +significantly dependent on the estimation of camera poses. Widely used +structure-from-motion pipelines encounter difficulties in accurately tracking +the camera trajectory when faced with separate dynamics of the scene content +and the camera movement. To address this challenge, we propose Dynamic +Motion-Aware Fast and Robust Camera Localization for Dynamic Neural Radiance +Fields (DynaMoN). DynaMoN utilizes semantic segmentation and generic motion +masks to handle dynamic content for initial camera pose estimation and +statics-focused ray sampling for fast and accurate novel-view synthesis. Our +novel iterative learning scheme switches between training the NeRF and updating +the pose parameters for an improved reconstruction and trajectory estimation +quality. The proposed pipeline shows significant acceleration of the training +process. We extensively evaluate our approach on two real-world dynamic +datasets, the TUM RGB-D dataset and the BONN RGB-D Dynamic dataset. DynaMoN +improves over the state-of-the-art both in terms of reconstruction quality and +trajectory accuracy. We plan to make our code public to enhance research in +this area. + +
+
+
+
+
+ + ♻ ☆ Volley Revolver: A Novel Matrix-Encoding Method for Privacy-Preserving + Neural Networks (Inference) + + +
+ In this work, we present a novel matrix-encoding method that is particularly convenient for neural networks to make predictions in a privacy-preserving manner using homomorphic encryption. Based on this encoding method, we implement a convolutional neural network for handwritten image classification over encryption. To perform the homomorphic multiplication of two matrices $A$ and $B$, the main idea, in its simplest form, is to encrypt matrix $A$ and the transpose of matrix $B$ into two ciphertexts, respectively. With additional operations, the homomorphic matrix multiplication can be calculated over encrypted matrices efficiently. For the convolution operation, we expand each convolution kernel in advance into a matrix of the same size as the input image so as to generate several ciphertexts, each of which is later used together with the ciphertext encrypting the input images to calculate some of the final convolution results. We accumulate all these intermediate results and thus complete the convolution operation.
+ In a public cloud with 40 vCPUs, our convolutional neural network implementation on the MNIST testing dataset takes $\sim$ 287 seconds to compute ten likelihoods of 32 encrypted images of size $28 \times 28$ simultaneously. The data owner only needs to upload one ciphertext ($\sim 19.8$ MB) encrypting these 32 images to the public cloud.
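The encoding idea, computing each entry of the product $AB$ as an inner product between a row of $A$ and a row of $B^{T}$, can be illustrated in plaintext with NumPy. The sketch below only shows this data layout; the actual scheme performs the element-wise products and accumulations homomorphically over ciphertexts, which is not modeled here.

```python
import numpy as np

def matmul_via_transposed_rows(A, B):
    """Compute A @ B using only row-wise access to A and to B's transpose.

    Plaintext illustration of the row-wise encoding of A and B^T described in
    the abstract; no homomorphic encryption is involved in this sketch.
    """
    Bt = B.T
    C = np.empty((A.shape[0], B.shape[1]))
    for i, a_row in enumerate(A):
        for j, bt_row in enumerate(Bt):
            C[i, j] = np.sum(a_row * bt_row)   # inner product of two row vectors
    return C

A = np.random.rand(3, 4)
B = np.random.rand(4, 5)
assert np.allclose(matmul_via_transposed_rows(A, B), A @ B)
```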
+
+ comment: The encoding method we proposed in this work, $\texttt{Volley + Revolver}$, is particularly tailored for privacy-preserving neural networks. + There is a great chance that it can be used to assist the private neural + networks training, in which case for the backpropagation algorithm of the + fully-connected layer the first matrix $A$ is revolved while the second + matrix $B$ is settled to be still +
+
+
+
+
+ + ♻ ☆ VIRUS-NeRF -- Vision, InfraRed and UltraSonic based Neural Radiance + Fields + + +
+ Autonomous mobile robots are an increasingly integral part of modern factory +and warehouse operations. Obstacle detection, avoidance and path planning are +critical safety-relevant tasks, which are often solved using expensive LiDAR +sensors and depth cameras. We propose to use cost-effective low-resolution +ranging sensors, such as ultrasonic and infrared time-of-flight sensors by +developing VIRUS-NeRF - Vision, InfraRed, and UltraSonic based Neural Radiance +Fields. Building upon Instant Neural Graphics Primitives with a Multiresolution +Hash Encoding (Instant-NGP), VIRUS-NeRF incorporates depth measurements from +ultrasonic and infrared sensors and utilizes them to update the occupancy grid +used for ray marching. Experimental evaluation in 2D demonstrates that +VIRUS-NeRF achieves comparable mapping performance to LiDAR point clouds +regarding coverage. Notably, in small environments, its accuracy aligns with +that of LiDAR measurements, while in larger ones, it is bounded by the utilized +ultrasonic sensors. An in-depth ablation study reveals that adding ultrasonic +and infrared sensors is highly effective when dealing with sparse data and low +view variation. Further, the proposed occupancy grid of VIRUS-NeRF improves the +mapping capabilities and increases the training speed by 46% compared to +Instant-NGP. Overall, VIRUS-NeRF presents a promising approach for +cost-effective local mapping in mobile robotics, with potential applications in +safety and navigation tasks. The code can be found at +https://github.com/ethz-asl/virus nerf. + +
+
+
+
+
+ + ♻ ☆ Integrating Representational Gestures into Automatically Generated + Embodied Explanations and its Effects on Understanding and Interaction + Quality + + +
+ In human interaction, gestures serve various functions such as marking speech +rhythm, highlighting key elements, and supplementing information. These +gestures are also observed in explanatory contexts. However, the impact of +gestures on explanations provided by virtual agents remains underexplored. A +user study was carried out to investigate how different types of gestures +influence perceived interaction quality and listener understanding. This study +addresses the effect of gestures in explanation by developing an embodied +virtual explainer integrating both beat gestures and iconic gestures to enhance +its automatically generated verbal explanations. Our model combines beat +gestures generated by a learned speech-driven synthesis module with manually +captured iconic gestures, supporting the agent's verbal expressions about the +board game Quarto! as an explanation scenario. Findings indicate that neither +the use of iconic gestures alone nor their combination with beat gestures +outperforms the baseline or beat-only conditions in terms of understanding. +Nonetheless, compared to prior research, the embodied agent significantly +enhances understanding. + +
+
+
+
+
+ + ♻ ☆ A Comprehensive Survey on Synthetic Infrared Image synthesis + + +
+ Synthetic infrared (IR) scene and target generation is an important computer +vision problem as it allows the generation of realistic IR images and targets +for training and testing of various applications, such as remote sensing, +surveillance, and target recognition. It also helps reduce the cost and risk +associated with collecting real-world IR data. This survey paper aims to +provide a comprehensive overview of the conventional mathematical +modelling-based methods and deep learning-based methods used for generating +synthetic IR scenes and targets. The paper discusses the importance of +synthetic IR scene and target generation and briefly covers the mathematics of +blackbody and grey body radiations, as well as IR image-capturing methods. The +potential use cases of synthetic IR scenes and target generation are also +described, highlighting the significance of these techniques in various fields. +Additionally, the paper explores possible new ways of developing new techniques +to enhance the efficiency and effectiveness of synthetic IR scenes and target +generation while highlighting the need for further research to advance this +field. + +
+
+ comment: Submitted in Journal of Infrared Physics & Technology +
+
+
+
+
+ + ♻ ☆ Explicit Abnormality Extraction for Unsupervised Motion Artifact + Reduction in Magnetic Resonance Imaging + + +
+ Motion artifacts compromise the quality of magnetic resonance imaging (MRI) +and pose challenges to achieving diagnostic outcomes and image-guided +therapies. In recent years, supervised deep learning approaches have emerged as +successful solutions for motion artifact reduction (MAR). One disadvantage of +these methods is their dependency on acquiring paired sets of motion +artifact-corrupted (MA-corrupted) and motion artifact-free (MA-free) MR images +for training purposes. Obtaining such image pairs is difficult and therefore +limits the application of supervised training. In this paper, we propose a +novel UNsupervised Abnormality Extraction Network (UNAEN) to alleviate this +problem. Our network is capable of working with unpaired MA-corrupted and +MA-free images. It converts the MA-corrupted images to MA-reduced images by +extracting abnormalities from the MA-corrupted images using a proposed artifact +extractor, which intercepts the residual artifact maps from the MA-corrupted MR +images explicitly, and a reconstructor to restore the original input from the +MA-reduced images. The performance of UNAEN was assessed by experimenting with +various publicly available MRI datasets and comparing them with +state-of-the-art methods. The quantitative evaluation demonstrates the +superiority of UNAEN over alternative MAR methods and visually exhibits fewer +residual artifacts. Our results substantiate the potential of UNAEN as a +promising solution applicable in real-world clinical environments, with the +capability to enhance diagnostic accuracy and facilitate image-guided +therapies. Our codes are publicly available at +https://github.com/YuSheng-Zhou/UNAEN. + +
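The extractor/reconstructor arrangement described above can be pictured as a residual module: the extractor predicts an artifact map that is subtracted from the corrupted input, and the reconstructor maps the cleaned image back toward the input. Both sub-networks below are placeholder convolutional stacks, and the unpaired (adversarial) training losses are omitted; this is an assumed minimal layout, not the paper's architecture.

```python
import torch
import torch.nn as nn

class ResidualArtifactReduction(nn.Module):
    """Sketch of an extractor/reconstructor pair for artifact reduction.

    The sub-networks are tiny placeholder conv stacks; the real model and its
    unpaired training objectives are not given in the abstract.
    """
    def __init__(self, ch=1, width=32):
        super().__init__()
        def block():
            return nn.Sequential(nn.Conv2d(ch, width, 3, padding=1), nn.ReLU(),
                                 nn.Conv2d(width, ch, 3, padding=1))
        self.extractor = block()      # predicts the residual artifact map
        self.reconstructor = block()  # tries to restore the corrupted input

    def forward(self, x_corrupted):
        artifact = self.extractor(x_corrupted)
        x_reduced = x_corrupted - artifact            # MA-reduced image
        x_rebuilt = self.reconstructor(x_reduced)     # should resemble x_corrupted
        return x_reduced, artifact, x_rebuilt

model = ResidualArtifactReduction()
x = torch.randn(2, 1, 64, 64)
x_reduced, artifact, x_rebuilt = model(x)
print(x_reduced.shape, x_rebuilt.shape)
```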
+
+ comment: Accepted by IEEE Journal of Biomedical and Health Informatics +
+
+
+
+
+ + ♻ ☆ Switched auxiliary loss for robust training of transformer models for + histopathological image segmentation + + +
+ Functional tissue units (FTUs) are cell population neighborhoods local to a particular organ performing its main function. The FTUs provide crucial information to the pathologist in understanding the disease affecting a particular organ by providing information at the cellular level. In our research, we have developed a model to segment multi-organ FTUs across five organs, namely the kidney, large intestine, lung, prostate and spleen, by utilizing the 'HuBMAP + HPA - Hacking the Human Body' competition dataset. We propose adding a switched auxiliary loss for training models like transformers to overcome the diminishing gradient problem, which poses a challenge to the optimal training of deep models. Overall, our model achieved a dice score of 0.793 on the public dataset and 0.778 on the private dataset. The results support the robustness of the proposed training methodology. The findings also bolster the use of transformer models for dense prediction tasks in the field of medical image analysis. The study assists in understanding the relationships between cell and tissue organization, thereby providing a useful medium to look at the impact of cellular functions on human health.
+
+
+
+
+ + ♻ ☆ ML-Mamba: Efficient Multi-Modal Large Language Model Utilizing Mamba-2 + + +
+ Multimodal Large Language Models (MLLMs) have attracted much attention for their multifunctionality. However, traditional Transformer architectures incur significant overhead due to their quadratic computational complexity. To address this issue, we introduce ML-Mamba, a multimodal language model, which utilizes the latest and efficient Mamba-2 model for inference. Mamba-2 is known for its linear scalability and fast processing of long sequences. We replace the Transformer-based backbone with a pre-trained Mamba-2 model and explore methods for integrating 2D visual selective scanning mechanisms into multimodal learning while also trying various visual encoders and Mamba-2 model variants. Our extensive experiments in various multimodal benchmark tests demonstrate the competitive performance of ML-Mamba and highlight the potential of state space models in multimodal tasks. The experimental results show that: (1) we empirically explore how to effectively apply the 2D vision selective scan mechanism for multimodal learning and propose a novel multimodal connector called the Mamba-2 Scan Connector (MSC), which enhances representational capabilities; (2) ML-Mamba achieves performance comparable to state-of-the-art methods such as TinyLaVA and MobileVLM v2 through its linear sequential modeling while offering faster inference speed; (3) compared to multimodal models utilizing Mamba-1, the Mamba-2-based ML-Mamba exhibits superior inference performance and effectiveness.
+
+ comment: arXiv admin note: text overlap with arXiv:2403.13600, + arXiv:2406.07537 by other authors +
+
+
+
+
+ + ♻ ☆ SYM3D: Learning Symmetric Triplanes for Better 3D-Awareness of GANs + + +
+ Despite the growing success of 3D-aware GANs, which can be trained on 2D images to generate high-quality 3D assets, they still rely on multi-view images with camera annotations to synthesize sufficient details from all viewing directions. However, the scarce availability of calibrated multi-view image datasets, especially in comparison to single-view images, has limited the potential of 3D GANs. Moreover, while bypassing camera pose annotations with a camera distribution constraint reduces dependence on exact camera parameters, it still struggles to generate a consistent orientation of 3D assets. To this end, we propose SYM3D, a novel 3D-aware GAN designed to leverage the prevalent reflectional symmetry structure found in natural and man-made objects, alongside a proposed view-aware spatial attention mechanism in learning the 3D representation. We evaluate SYM3D on both synthetic (ShapeNet Chairs, Cars, and Airplanes) and real-world datasets (ABO-Chair), demonstrating its superior performance in capturing detailed geometry and texture, even when trained on only single-view images. Finally, we demonstrate the effectiveness of incorporating symmetry regularization in helping reduce artifacts in the modeling of 3D assets in the text-to-3D task. The project page is at https://jingyang2017.github.io/sym3d.github.io/
+
+ comment: 11 +
+
+
+
+
+ + ♻ ☆ Detecting Audio-Visual Deepfakes with Fine-Grained Inconsistencies BMVC 2024 + + +
+ Existing methods on audio-visual deepfake detection mainly focus on +high-level features for modeling inconsistencies between audio and visual data. +As a result, these approaches usually overlook finer audio-visual artifacts, +which are inherent to deepfakes. Herein, we propose the introduction of +fine-grained mechanisms for detecting subtle artifacts in both spatial and +temporal domains. First, we introduce a local audio-visual model capable of +capturing small spatial regions that are prone to inconsistencies with audio. +For that purpose, a fine-grained mechanism based on a spatially-local distance +coupled with an attention module is adopted. Second, we introduce a +temporally-local pseudo-fake augmentation to include samples incorporating +subtle temporal inconsistencies in our training set. Experiments on the DFDC +and the FakeAVCeleb datasets demonstrate the superiority of the proposed method +in terms of generalization as compared to the state-of-the-art under both +in-dataset and cross-dataset settings. + +
+
+ comment: Accepted in BMVC 2024 +
+
+
+
+
+ + ♻ ☆ Unraveling Instance Associations: A Closer Look for Audio-Visual + Segmentation + + +
+ Audio-visual segmentation (AVS) is a challenging task that involves +accurately segmenting sounding objects based on audio-visual cues. The +effectiveness of audio-visual learning critically depends on achieving accurate +cross-modal alignment between sound and visual objects. Successful audio-visual +learning requires two essential components: 1) a challenging dataset with +high-quality pixel-level multi-class annotated images associated with audio +files, and 2) a model that can establish strong links between audio information +and its corresponding visual object. However, these requirements are only +partially addressed by current methods, with training sets containing biased +audio-visual data, and models that generalise poorly beyond this biased +training set. In this work, we propose a new cost-effective strategy to build +challenging and relatively unbiased high-quality audio-visual segmentation +benchmarks. We also propose a new informative sample mining method for +audio-visual supervised contrastive learning to leverage discriminative +contrastive samples to enforce cross-modal understanding. We show empirical +results that demonstrate the effectiveness of our benchmark. Furthermore, +experiments conducted on existing AVS datasets and on our new benchmark show +that our method achieves state-of-the-art (SOTA) segmentation accuracy. + +
+
+ comment: Code is available at https://github.com/cyh-0/CAVP +
+
+
+
+
+ + ♻ ☆ Interaction as Explanation: A User Interaction-based Method for + Explaining Image Classification Models IJCAI 2024 + + +
+ In computer vision, explainable AI (xAI) methods seek to mitigate the +'black-box' problem by making the decision-making process of deep learning +models more interpretable and transparent. Traditional xAI methods concentrate +on visualizing input features that influence model predictions, providing +insights primarily suited for experts. In this work, we present an +interaction-based xAI method that enhances user comprehension of image +classification models through their interaction. Thus, we developed a web-based +prototype allowing users to modify images via painting and erasing, thereby +observing changes in classification results. Our approach enables users to +discern critical features influencing the model's decision-making process, +aligning their mental models with the model's logic. Experiments conducted with +five images demonstrate the potential of the method to reveal feature +importance through user interaction. Our work contributes a novel perspective +to xAI by centering on end-user engagement and understanding, paving the way +for more intuitive and accessible explainability in AI systems. + +
+
+ comment: IJCAI 2024 (International Joint Conference on Artificial Intelligence + 2024) Workshop on Explainable Artificial Intelligence (XAI) +
+
+
+
+
+ + ♻ ☆ AutoCLIP: Auto-tuning Zero-Shot Classifiers for Vision-Language Models + + +
+ Classifiers built upon vision-language models such as CLIP have shown remarkable zero-shot performance across a broad range of image classification tasks. Prior work has studied different ways of automatically creating descriptor sets for every class based on prompt templates, ranging from manually engineered templates, through templates obtained from a large language model, to templates built from random words and characters. Up until now, deriving zero-shot classifiers from the respective encoded class descriptors has remained nearly unchanged, i.e., classify to the class that maximizes the cosine similarity between its averaged encoded class descriptors and the image encoding. However, weighing all class descriptors equally can be suboptimal when certain descriptors match visual clues on a given image better than others. In this work, we propose AutoCLIP, a method for auto-tuning zero-shot classifiers. AutoCLIP tunes per-image weights to each prompt template at inference time, based on statistics of class descriptor-image similarities. AutoCLIP is fully unsupervised, has only a minor additional computation overhead, and can be easily implemented in a few lines of code. We show that AutoCLIP outperforms baselines across a broad range of vision-language models, datasets, and prompt templates consistently and by up to 3 percentage points in accuracy.
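A minimal sketch of per-image prompt-template weighting in the spirit described above: template weights are derived from descriptor-image similarities and used in place of uniform averaging. The softmax weighting and temperature below are assumptions; AutoCLIP's actual weight-tuning rule is only characterized in the abstract as being based on similarity statistics.

```python
import torch

def autotuned_zero_shot_logits(image_emb, class_descriptor_embs, tau=10.0):
    """Per-image weighting of prompt templates for zero-shot classification.

    image_emb:             (D,) normalized image embedding.
    class_descriptor_embs: (C, T, D) normalized embeddings of T templates per class.
    Baseline CLIP averages templates uniformly; here each template receives a
    per-image softmax weight derived from its similarity to the image. The exact
    weighting rule used by AutoCLIP may differ from this simple softmax.
    """
    sims = class_descriptor_embs @ image_emb          # (C, T) descriptor-image similarities
    w = torch.softmax(tau * sims.mean(dim=0), dim=0)  # (T,) per-image template weights
    weighted = (class_descriptor_embs * w[None, :, None]).sum(dim=1)  # (C, D)
    weighted = weighted / weighted.norm(dim=-1, keepdim=True)
    return weighted @ image_emb                       # (C,) class logits
```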
+
+ comment: accepted at TMLR, Camera Ready Version +
+
+
+
+
+ + ♻ ☆ A Survey on Graph Neural Networks and Graph Transformers in Computer + Vision: A Task-Oriented Perspective + + +
+ Graph Neural Networks (GNNs) have gained momentum in graph representation learning and boosted the state of the art in a variety of areas, such as data mining (e.g., social network analysis and recommender systems), computer vision (e.g., object detection and point cloud learning), and natural language processing (e.g., relation extraction and sequence learning), to name a few. With the emergence of Transformers in natural language processing and computer vision, graph Transformers embed a graph structure into the Transformer architecture to overcome the limitations of local neighborhood aggregation while avoiding strict structural inductive biases. In this paper, we present a comprehensive review of GNNs and graph Transformers in computer vision from a task-oriented perspective. Specifically, we divide their applications in computer vision into five categories according to the modality of input data, i.e., 2D natural images, videos, 3D data, vision + language, and medical images. In each category, we further divide the applications according to a set of vision tasks. Such a task-oriented taxonomy allows us to examine how each task is tackled by different GNN-based approaches and how well these approaches perform. Based on the necessary preliminaries, we provide the definitions and challenges of the tasks, in-depth coverage of the representative approaches, as well as discussions regarding insights, limitations, and future directions.
+
+ comment: Accepted by IEEE Transactions on Pattern Analysis and Machine + Intelligence (T-PAMI) +
+
+
+
+
+ + ♻ ☆ Bridging the Gap: Sketch-Aware Interpolation Network for High-Quality + Animation Sketch Inbetweening + + +
+ Hand-drawn 2D animation workflow is typically initiated with the creation of +sketch keyframes. Subsequent manual inbetweens are crafted for smoothness, +which is a labor-intensive process and the prospect of automatic animation +sketch interpolation has become highly appealing. Yet, common frame +interpolation methods are generally hindered by two key issues: 1) limited +texture and colour details in sketches, and 2) exaggerated alterations between +two sketch keyframes. To overcome these issues, we propose a novel deep +learning method - Sketch-Aware Interpolation Network (SAIN). This approach +incorporates multi-level guidance that formulates region-level correspondence, +stroke-level correspondence and pixel-level dynamics. A multi-stream +U-Transformer is then devised to characterize sketch inbetweening patterns +using these multi-level guides through the integration of self / +cross-attention mechanisms. Additionally, to facilitate future research on +animation sketch inbetweening, we constructed a large-scale dataset - STD-12K, +comprising 30 sketch animation series in diverse artistic styles. Comprehensive +experiments on this dataset convincingly show that our proposed SAIN surpasses +the state-of-the-art interpolation methods. + +
+
+
+
+
+ + ♻ ☆ SSL-SoilNet: A Hybrid Transformer-based Framework with Self-Supervised + Learning for Large-scale Soil Organic Carbon Prediction + + +
+ Soil Organic Carbon (SOC) constitutes a fundamental component of terrestrial ecosystem functionality, playing a pivotal role in nutrient cycling, hydrological balance, and erosion mitigation. Precise mapping of SOC distribution is imperative for the quantification of ecosystem services, notably carbon sequestration and soil fertility enhancement. Digital soil mapping (DSM) leverages statistical models and advanced technologies, including machine learning (ML), to accurately map soil properties, such as SOC, utilizing diverse data sources like satellite imagery, topography, remote sensing indices, and climate series. Within the domain of ML, self-supervised learning (SSL), which exploits unlabeled data, has gained prominence in recent years. This study introduces a novel approach that aims to learn the geographical link between multimodal features via self-supervised contrastive learning, employing pretrained Vision Transformers (ViT) for image inputs and Transformers for climate data, before fine-tuning the model with ground reference samples. The proposed approach has undergone rigorous testing on two distinct large-scale datasets, with results indicating its superiority over traditional supervised learning models, which depend solely on labeled data. Furthermore, through the utilization of various evaluation metrics (e.g., RMSE, MAE, CCC, etc.), the proposed model exhibits higher accuracy when compared to other conventional ML algorithms like random forest and gradient boosting. This model is a robust tool for predicting SOC and contributes to the advancement of DSM techniques, thereby facilitating land management and decision-making processes based on accurate information.
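A generic cross-modal contrastive objective of the kind the abstract alludes to is sketched below: co-located image and climate embeddings form positive pairs, and the rest of the batch serves as negatives. The symmetric InfoNCE form, temperature, and pairing convention are assumptions; the encoders (a ViT for imagery and a Transformer for climate series) are replaced by precomputed embeddings.

```python
import torch
import torch.nn.functional as F

def cross_modal_infonce(img_emb, clim_emb, temperature=0.07):
    """Symmetric InfoNCE between co-located image and climate embeddings.

    Row i of `img_emb` is assumed to be the geographic match of row i of
    `clim_emb`; all other rows in the batch act as negatives. This is a generic
    contrastive objective, not necessarily the paper's exact loss.
    """
    img = F.normalize(img_emb, dim=1)
    clim = F.normalize(clim_emb, dim=1)
    logits = img @ clim.t() / temperature              # (B, B) similarity matrix
    targets = torch.arange(img.size(0), device=img.device)
    return 0.5 * (F.cross_entropy(logits, targets) +
                  F.cross_entropy(logits.t(), targets))
```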
+
+ comment: Accepted for publication in IEEE Transactions on Geoscience and + Remote Sensing (TGRS) +
+
+
+
+
+ + ♻ ☆ MedAugment: Universal Automatic Data Augmentation Plug-in for Medical + Image Analysis + + +
+ Data augmentation (DA) has been widely leveraged in computer vision to alleviate the data shortage, whereas DA in medical image analysis (MIA) faces multiple challenges. The prevalent DA approaches in MIA encompass conventional DA, synthetic DA, and automatic DA. However, utilizing these approaches poses various challenges such as experience-driven design and intensive computation cost. Here, we propose an efficient and effective automatic DA method termed MedAugment. We propose a pixel augmentation space and a spatial augmentation space and exclude the operations that can break medical details and features, such as severe color distortions or structural alterations that can compromise image diagnostic value. Besides, we propose a novel sampling strategy by sampling a limited number of operations from the two spaces. Moreover, we present a hyperparameter mapping relationship to produce a rational augmentation level and make MedAugment fully controllable using a single hyperparameter. These configurations account for the differences between natural and medical images, such as the high sensitivity of medical images to certain attributes, for example brightness and posterization. Extensive experimental results on four classification and four segmentation datasets demonstrate the superiority of MedAugment. Compared with existing approaches, the proposed MedAugment serves as a more suitable yet general processing pipeline for medical images, producing no color distortions or structural alterations and involving negligible computational overhead. We emphasize that our method can serve as a plugin for arbitrary projects without any extra training stage, thereby holding the potential to make a valuable contribution to the medical field, particularly for medical experts without a solid foundation in deep learning. Code is available at https://github.com/NUS-Tim/MedAugment.
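The sampling strategy described above, drawing a limited number of operations from a pixel space and a spatial space at a strength controlled by one hyperparameter, can be sketched as follows. The operation pools, counts, and the linear level-to-magnitude mapping are illustrative stand-ins, not the pools or mapping actually defined by MedAugment.

```python
import random

# Illustrative operation pools; the paper's actual pools and exclusions differ.
PIXEL_OPS = ["brightness", "contrast", "sharpness", "gaussian_noise"]
SPATIAL_OPS = ["rotate", "translate_x", "translate_y", "scale"]

def sample_augment_policy(level=3, n_pixel=1, n_spatial=2, max_level=5):
    """Draw a few ops from the two spaces at a strength set by `level`.

    A single hyperparameter (`level`) is mapped to per-op magnitudes; the exact
    mapping used by MedAugment is not specified in the abstract, so a simple
    linear one is used here for illustration.
    """
    magnitude = level / max_level                      # hypothetical linear mapping
    ops = (random.sample(PIXEL_OPS, n_pixel) +
           random.sample(SPATIAL_OPS, n_spatial))
    return [(op, magnitude) for op in ops]

print(sample_augment_policy(level=3))
```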
+
+ comment: 29 pages, 8 figures +
+
+
+
+
+ + ♻ ☆ ViTime: A Visual Intelligence-Based Foundation Model for Time Series + Forecasting + + +
+ The success of large pretrained models in natural language processing (NLP) and computer vision (CV) has opened new avenues for constructing foundation models for time series forecasting (TSF). Traditional TSF foundation models rely heavily on numerical data fitting. In contrast, the human brain is inherently skilled at processing visual information and prefers to predict future trends by observing visualized sequences. From a biomimetic perspective, utilizing models to directly process numerical sequences might not be the most effective route to achieving Artificial General Intelligence (AGI). This paper proposes ViTime, a novel Visual Intelligence-based foundation model for TSF. ViTime overcomes the limitations of numerical time series data fitting by utilizing visual data processing paradigms and employs an innovative data synthesis method during training, called Real Time Series (RealTS). Experiments on a diverse set of previously unseen forecasting datasets demonstrate that ViTime achieves state-of-the-art zero-shot performance, even surpassing the best individually trained supervised models in some situations. These findings suggest that visual intelligence can significantly enhance time series analysis and forecasting, paving the way for more advanced and versatile models in the field. The code for our framework is accessible at https://github.com/IkeYang/ViTime.
+
+
+
+
+ + ♻ ☆ AROID: Improving Adversarial Robustness Through Online Instance-Wise + Data Augmentation + + +
+ Deep neural networks are vulnerable to adversarial examples. Adversarial +training (AT) is an effective defense against adversarial examples. However, AT +is prone to overfitting which degrades robustness substantially. Recently, data +augmentation (DA) was shown to be effective in mitigating robust overfitting if +appropriately designed and optimized for AT. This work proposes a new method to +automatically learn online, instance-wise, DA policies to improve robust +generalization for AT. This is the first automated DA method specific for +robustness. A novel policy learning objective, consisting of Vulnerability, +Affinity and Diversity, is proposed and shown to be sufficiently effective and +efficient to be practical for automatic DA generation during AT. Importantly, +our method dramatically reduces the cost of policy search from the 5000 hours +of AutoAugment and the 412 hours of IDBH to 9 hours, making automated DA more +practical to use for adversarial robustness. This allows our method to +efficiently explore a large search space for a more effective DA policy and +evolve the policy as training progresses. Empirically, our method is shown to +outperform all competitive DA methods across various model architectures and +datasets. Our DA policy reinforced vanilla AT to surpass several +state-of-the-art AT methods regarding both accuracy and robustness. It can also +be combined with those advanced AT methods to further boost robustness. Code +and pre-trained models are available at https://github.com/TreeLLi/AROID. + +
+
+ comment: published at the IJCV in press +
+
+
+
+
+ + ♻ ☆ ComKD-CLIP: Comprehensive Knowledge Distillation for Contrastive + Language-Image Pre-traning Model + + +
+ The Contrastive Language-Image Pre-training (CLIP) model excels in integrating semantic information between images and text through contrastive learning techniques. It has achieved remarkable performance in various multimodal tasks. However, the deployment of large CLIP models is hindered in resource-limited environments, while smaller models frequently fail to meet the performance benchmarks required for practical applications. In this paper, we propose a novel approach, ComKD-CLIP: Comprehensive Knowledge Distillation for Contrastive Language-Image Pre-training Model, which aims to comprehensively distill the knowledge from a large teacher CLIP model into a smaller student model, ensuring comparable performance with significantly reduced parameters. ComKD-CLIP is composed of two key mechanisms: Image Feature Alignment (IFAlign) and Educational Attention (EduAttention). IFAlign makes the image features extracted by the student model closely match those extracted by the teacher model, enabling the student to learn the teacher's knowledge of extracting image features. EduAttention explores the cross-relationships between text features extracted by the teacher model and image features extracted by the student model, enabling the student model to learn how the teacher model integrates text-image features. In addition, ComKD-CLIP can refine the knowledge distilled from IFAlign and EduAttention by leveraging the text-image feature fusion results of the teacher model, ensuring the student model accurately absorbs the teacher's knowledge. Extensive experiments conducted on 11 datasets have demonstrated the superiority of the proposed method.
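A minimal sketch of the IFAlign idea, pulling the student's image features toward the frozen teacher's, is given below using a normalized mean-squared error. The choice of distance is an assumption (the abstract does not state it), and the EduAttention branch and the fusion-based refinement are not shown.

```python
import torch
import torch.nn.functional as F

def ifalign_loss(student_img_feat, teacher_img_feat):
    """Image-feature alignment between student and teacher CLIP encoders.

    Features are L2-normalized and compared with MSE; the actual distance used
    by IFAlign, and the EduAttention term, are not detailed in the abstract.
    """
    s = F.normalize(student_img_feat, dim=-1)
    t = F.normalize(teacher_img_feat, dim=-1).detach()   # teacher stays frozen
    return F.mse_loss(s, t)
```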
+
+ comment: update +
+
+
+
+
+ + ♻ ☆ Read and Think: An Efficient Step-wise Multimodal Language Model for + Document Understanding and Reasoning + + +
+ Understanding the contents of multimodal documents is essential to accurately extract relevant evidence and use it for reasoning. Existing document understanding models tend to generate answers with a single word or phrase directly, ignoring the source document's evidence and lacking interpretability. In this work, we address the lack of step-wise capabilities through data augmentation and extension. Specifically, we use Multi-modal Large Language Models (MLLMs), which have strong visual understanding and reasoning abilities, as data generators to generate step-wise question-and-answer pairs for document images, and use a high-performance LLM as the error detector to filter out noisy data. This step-wise data generation pipeline is implemented using both template-based and few-shot methods. We then use the generated high-quality data to train a humanized document understanding and reasoning model, specifically designed to solve complex questions that require reasoning or multi-hop question answering, dubbed DocAssistant. Experimental results demonstrate the effectiveness and application value of step-wise generation, showing improvements of 5 on InfoVQA with complex layouts and 7 on ChartQA with complex reasoning, compared to directly generated answers. We hope our work highlights the potential of synthetic data and encourages further exploration of multi-modal document reasoning capabilities.
+
+
+
+
+ + ♻ ☆ SCP: Soft Conditional Prompt Learning for Aerial Video Action + Recognition IROS2024 + + +
+ We present a new learning approach, Soft Conditional Prompt Learning (SCP), +which leverages the strengths of prompt learning for aerial video action +recognition. Our approach is designed to predict the action of each agent by +helping the models focus on the descriptions or instructions associated with +actions in the input videos for aerial/robot visual perception. Our formulation +supports various prompts, including learnable prompts, auxiliary visual +information, and large vision models to improve the recognition performance. We +present a soft conditional prompt method that learns to dynamically generate +prompts from a pool of prompt experts under different video inputs. By sharing +the same objective with the task, our proposed SCP can optimize prompts that +guide the model's predictions while explicitly learning input-invariant (prompt +experts pool) and input-specific (data-dependent) prompt knowledge. In +practice, we observe a 3.17-10.2% accuracy improvement on the aerial video +datasets (Okutama, NECDrone), which consist of scenes with single-agent and +multi-agent actions. We further evaluate our approach on ground camera videos +to verify the effectiveness and generalization and achieve a 1.0-3.6% +improvement on dataset SSV2. We integrate our method into the ROS2 as well. + +
+
+ comment: IROS2024 +
+
+
+
+
+ + ♻ ☆ Learning to Learn without Forgetting using Attention + + +
+ Continual learning (CL) refers to the ability to continually learn over time +by accommodating new knowledge while retaining previously learned experience. +While this concept is inherent in human learning, current machine learning +methods are highly prone to overwrite previously learned patterns and thus +forget past experience. Instead, model parameters should be updated selectively +and carefully, avoiding unnecessary forgetting while optimally leveraging +previously learned patterns to accelerate future learning. Since hand-crafting +effective update mechanisms is difficult, we propose meta-learning a +transformer-based optimizer to enhance CL. This meta-learned optimizer uses +attention to learn the complex relationships between model parameters across a +stream of tasks, and is designed to generate effective weight updates for the +current task while preventing catastrophic forgetting on previously encountered +tasks. Evaluations on benchmark datasets like SplitMNIST, RotatedMNIST, and +SplitCIFAR-100 affirm the efficacy of the proposed approach in terms of both +forward and backward transfer, even on small sets of labeled data, highlighting +the advantages of integrating a meta-learned optimizer within the continual +learning framework. + +
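+ A toy sketch of the central idea above: a small transformer that looks at
+per-parameter features (here just the gradient and current value) and emits the
+weight update, in place of a hand-crafted rule such as SGD. The token features,
+depth, and update rule are illustrative assumptions, not the paper's
+meta-learned optimizer.
+
+import torch
+import torch.nn as nn
+
+class AttentionOptimizer(nn.Module):
+    def __init__(self, d_model=32):
+        super().__init__()
+        self.embed = nn.Linear(2, d_model)             # (grad, param) -> token
+        layer = nn.TransformerEncoderLayer(d_model, nhead=4, batch_first=True)
+        self.encoder = nn.TransformerEncoder(layer, num_layers=2)
+        self.head = nn.Linear(d_model, 1)              # token -> scalar update
+
+    def forward(self, grads, params):
+        tokens = self.embed(torch.stack([grads, params], dim=-1))  # (1, N, d)
+        return self.head(self.encoder(tokens)).squeeze(-1)         # (1, N)
+
+# Toy usage: produce updates for 10 parameters of some model being trained.
+opt_net = AttentionOptimizer()
+grads, params = torch.randn(1, 10), torch.randn(1, 10)
+updates = opt_net(grads, params)
+new_params = params - updates   # the meta-learned update replaces e.g. lr * grad
+print(new_params.shape)
+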
+
+ comment: Published at the 3rd Conference on Lifelong Learning Agents (CoLLAs), + 2024 +
+
+
+
+
+ + ♻ ☆ SRFormerV2: Taking a Closer Look at Permuted Self-Attention for Image + Super-Resolution ICCV2023 + + +
+ Previous works have shown that increasing the window size for +Transformer-based image super-resolution models (e.g., SwinIR) can +significantly improve the model performance. Still, the computation overhead is +also considerable when the window size gradually increases. In this paper, we +present SRFormer, a simple but novel method that can enjoy the benefit of large +window self-attention but introduces even less computational burden. The core +of our SRFormer is the permuted self-attention (PSA), which strikes an +appropriate balance between the channel and spatial information for +self-attention. Without any bells and whistles, we show that our SRFormer +achieves a 33.86dB PSNR score on the Urban100 dataset, which is 0.46dB higher +than that of SwinIR but uses fewer parameters and computations. In addition, we +also attempt to scale up the model by further enlarging the window size and +channel numbers to explore the potential of Transformer-based models. +Experiments show that our scaled model, named SRFormerV2, can further improve +the results and achieves state-of-the-art. We hope our simple and effective +approach could be useful for future research in super-resolution model design. +The homepage is https://z-yupeng.github.io/SRFormer/. + +
+
+ comment: Previous version has been accepted by ICCV2023 +
+
+
+
+
+ + ♻ ☆ Self-augmented Gaussian Splatting with Structure-aware Masks for + Sparse-view 3D Reconstruction + + +
+ Sparse-view 3D reconstruction stands as a formidable challenge in computer
+vision, aiming to build complete three-dimensional models from a limited array
+of viewing perspectives. This task confronts several difficulties: 1) the
+limited number of input images that lack consistent information; 2) dependence
+on the quality of input images; and 3) the substantial size of model
+parameters. To address these challenges, we propose a self-augmented
+coarse-to-fine Gaussian splatting paradigm, enhanced with a structure-aware
+mask, for sparse-view 3D reconstruction. In particular, our method initially
+employs a coarse Gaussian model to obtain a basic 3D representation from
+sparse-view inputs. Subsequently, we develop a fine Gaussian network to enhance
+consistent and detailed representation of the output with both 3D geometry
+augmentation and perceptual view augmentation. During training, we design a
+structure-aware masking strategy to further improve the model's robustness
+against sparse inputs and noise. Experimental results on the MipNeRF360 and
+OmniObject3D datasets demonstrate that the proposed method achieves
+state-of-the-art performance for sparse input views in both perceptual quality
+and efficiency.
+
+</p>
+
+
+
+
+ + ♻ ☆ Automatic Feature Recognition and Dimensional Attributes Extraction From + CAD Models for Hybrid Additive-Subtractive Manufacturing + + +
+ The integration of Computer-Aided Design (CAD), Computer-Aided Process
+Planning (CAPP), and Computer-Aided Manufacturing (CAM) plays a crucial role in
+modern manufacturing, facilitating seamless transitions from digital designs to
+physical products. However, a significant challenge within this integration is
+the Automatic Feature Recognition (AFR) of CAD models, especially in the
+context of hybrid manufacturing that combines subtractive and additive
+manufacturing processes. Traditional AFR methods, focused mainly on the
+identification of subtractive (machined) features including holes, fillets,
+chamfers, pockets, and slots, fail to recognize features pertinent to additive
+manufacturing. Furthermore, the traditional methods fall short in accurately
+extracting geometric dimensions and orientations, which are also key factors
+for effective manufacturing process planning. This paper presents a novel
+approach for creating a synthetic CAD dataset that encompasses features
+relevant to both additive and subtractive machining through Python Open
+Cascade. The Hierarchical Graph Convolutional Neural Network (HGCNN) model is
+implemented to accurately identify the composite additive-subtractive features
+within the synthetic CAD dataset. The key novelty and contribution of the
+proposed methodology lie in its ability to recognize a wide range of
+manufacturing features and to precisely extract their dimensions,
+orientations, and stock sizes. The proposed model demonstrates remarkable
+feature recognition accuracy exceeding 97% and a dimension extraction accuracy
+of 100% for identified features. Therefore, the proposed methodology enhances
+the integration of CAD, CAPP, and CAM within hybrid manufacturing by providing
+precise feature recognition and dimension extraction. It facilitates improved
+manufacturing process planning by enabling more informed decision-making.
+
+</p>
+
+ comment: 10 pages, 12 figures. This paper has been accepted for presentation + at the ASME IDETC-CIE 2024 conference +
+
+
+
+
+ + ♻ ☆ Cascaded Multi-path Shortcut Diffusion Model for Medical Image + Translation + + +
+ Image-to-image translation is a vital component in medical imaging +processing, with many uses in a wide range of imaging modalities and clinical +scenarios. Previous methods include Generative Adversarial Networks (GANs) and +Diffusion Models (DMs), which offer realism but suffer from instability and +lack uncertainty estimation. Even though both GAN and DM methods have +individually exhibited their capability in medical image translation tasks, the +potential of combining a GAN and DM to further improve translation performance +and to enable uncertainty estimation remains largely unexplored. In this work, +we address these challenges by proposing a Cascade Multi-path Shortcut +Diffusion Model (CMDM) for high-quality medical image translation and +uncertainty estimation. To reduce the required number of iterations and ensure +robust performance, our method first obtains a conditional GAN-generated prior +image that will be used for the efficient reverse translation with a DM in the +subsequent step. Additionally, a multi-path shortcut diffusion strategy is +employed to refine translation results and estimate uncertainty. A cascaded +pipeline further enhances translation quality, incorporating residual averaging +between cascades. We collected three different medical image datasets with two +sub-tasks for each dataset to test the generalizability of our approach. Our +experimental results found that CMDM can produce high-quality translations +comparable to state-of-the-art methods while providing reasonable uncertainty +estimations that correlate well with the translation error. + +
+
+ comment: Accepted at Medical Image Analysis Journal +
+
+
+
+
+ + ♻ ☆ A Semantic Space is Worth 256 Language Descriptions: Make Stronger + Segmentation Models with Descriptive Properties + + +
+ This paper introduces ProLab, a novel approach using property-level label +space for creating strong interpretable segmentation models. Instead of relying +solely on category-specific annotations, ProLab uses descriptive properties +grounded in common sense knowledge for supervising segmentation models. It is +based on two core designs. First, we employ Large Language Models (LLMs) and +carefully crafted prompts to generate descriptions of all involved categories +that carry meaningful common sense knowledge and follow a structured format. +Second, we introduce a description embedding model preserving semantic +correlation across descriptions and then cluster them into a set of descriptive +properties (e.g., 256) using K-Means. These properties are based on +interpretable common sense knowledge consistent with theories of human +recognition. We empirically show that our approach makes segmentation models +perform stronger on five classic benchmarks (e.g., ADE20K, COCO-Stuff, Pascal +Context, Cityscapes, and BDD). Our method also shows better scalability with +extended training steps than category-level supervision. Our interpretable +segmentation framework also emerges with the generalization ability to segment +out-of-domain or unknown categories using only in-domain descriptive +properties. Code is available at https://github.com/lambert-x/ProLab. + +
+
+ comment: Preprint. Code is available at https://github.com/lambert-x/ProLab +
+
+
+
+
+ + ♻ ☆ HAIFIT: Human-to-AI Fashion Image Translation + + +
+ In the realm of fashion design, sketches serve as the canvas for expressing +an artist's distinctive drawing style and creative vision, capturing intricate +details like stroke variations and texture nuances. The advent of +sketch-to-image cross-modal translation technology has notably aided designers. +However, existing methods often compromise these sketch details during image +generation, resulting in images that deviate from the designer's intended +concept. This limitation hampers the ability to offer designers a precise +preview of the final output. To overcome this challenge, we introduce HAIFIT, a +novel approach that transforms sketches into high-fidelity, lifelike clothing +images by integrating multi-scale features and capturing extensive feature map +dependencies from diverse perspectives. Through extensive qualitative and +quantitative evaluations conducted on our self-collected dataset, our method +demonstrates superior performance compared to existing methods in generating +photorealistic clothing images. Our method excels in preserving the distinctive +style and intricate details essential for fashion design applications. In +addition, our method also has obvious advantages in model training and +inference speed, contributing to reducing designers' time costs and improving +design efficiency. + +
+
+ comment: 10 pages, 8 figures
+</p>
+
+
+
+
+ + ♻ ☆ On the Hidden Mystery of OCR in Large Multimodal Models + + +
+ Large models have recently played a dominant role in natural language +processing and multimodal vision-language learning. However, their +effectiveness in text-related visual tasks remains relatively unexplored. In +this paper, we conducted a comprehensive evaluation of Large Multimodal Models, +such as GPT4V and Gemini, in various text-related visual tasks including Text +Recognition, Scene Text-Centric Visual Question Answering (VQA), +Document-Oriented VQA, Key Information Extraction (KIE), and Handwritten +Mathematical Expression Recognition (HMER). To facilitate the assessment of +Optical Character Recognition (OCR) capabilities in Large Multimodal Models, we +propose OCRBench, a comprehensive evaluation benchmark. OCRBench contains 29 +datasets, making it the most comprehensive OCR evaluation benchmark available. +Furthermore, our study reveals both the strengths and weaknesses of these +models, particularly in handling multilingual text, handwritten text, +non-semantic text, and mathematical expression recognition. Most importantly, +the baseline results presented in this study could provide a foundational +framework for the conception and assessment of innovative strategies targeted +at enhancing zero-shot multimodal techniques. The evaluation pipeline and +benchmark are available at https://github.com/Yuliang-Liu/MultimodalOCR. + +
+
+
+
+
+ + ♻ ☆ Patch-wise Auto-Encoder for Visual Anomaly Detection + + +
+ Anomaly detection without priors of the anomalies is challenging. In the
+field of unsupervised anomaly detection, the traditional auto-encoder (AE) tends
+to fail based on the assumption that by training only on normal images, the
+model will not be able to reconstruct abnormal images correctly. On the
+contrary, we propose a novel patch-wise auto-encoder (Patch AE) framework, which
+aims at enhancing the reconstruction ability of the AE to anomalies instead of
+weakening it. Each image patch is reconstructed from the corresponding spatially
+distributed feature vector of the learned feature representation, i.e.,
+patch-wise reconstruction, which ensures the anomaly sensitivity of the AE. Our
+method is simple and efficient. It advances the state-of-the-art performance on
+the MVTec AD benchmark, which proves the effectiveness of our model. It shows
+great potential in practical industrial application scenarios.
+
+</p>
+
+
+
+
+ + ♻ ☆ SC4D: Sparse-Controlled Video-to-4D Generation and Motion Transfer ECCV2024 + + +
+ Recent advances in 2D/3D generative models enable the generation of dynamic +3D objects from a single-view video. Existing approaches utilize score +distillation sampling to form the dynamic scene as dynamic NeRF or dense 3D +Gaussians. However, these methods struggle to strike a balance among reference +view alignment, spatio-temporal consistency, and motion fidelity under +single-view conditions due to the implicit nature of NeRF or the intricate +dense Gaussian motion prediction. To address these issues, this paper proposes +an efficient, sparse-controlled video-to-4D framework named SC4D, that +decouples motion and appearance to achieve superior video-to-4D generation. +Moreover, we introduce Adaptive Gaussian (AG) initialization and Gaussian +Alignment (GA) loss to mitigate shape degeneration issue, ensuring the fidelity +of the learned motion and shape. Comprehensive experimental results demonstrate +that our method surpasses existing methods in both quality and efficiency. In +addition, facilitated by the disentangled modeling of motion and appearance of +SC4D, we devise a novel application that seamlessly transfers the learned +motion onto a diverse array of 4D entities according to textual descriptions. + +
+
+ comment: Accepted by ECCV2024! Project Page: https://sc4d.github.io/ Code is + available at: https://github.com/JarrentWu1031/SC4D +
+
+
+
+
+ + ♻ ☆ Self-Supervised Scalable Deep Compressed Sensing + + +
+ Compressed sensing (CS) is a promising tool for reducing sampling costs. +Current deep neural network (NN)-based CS methods face the challenges of +collecting labeled measurement-ground truth (GT) data and generalizing to real +applications. This paper proposes a novel $\mathbf{S}$elf-supervised +s$\mathbf{C}$alable deep CS method, comprising a deep $\mathbf{L}$earning +scheme called $\mathbf{SCL}$ and a family of $\mathbf{Net}$works named +$\mathbf{SCNet}$, which does not require GT and can handle arbitrary sampling +ratios and matrices once trained on a partial measurement set. Our SCL contains +a dual-domain loss and a four-stage recovery strategy. The former encourages a +cross-consistency on two measurement parts and a sampling-reconstruction +cycle-consistency regarding arbitrary ratios and matrices to maximize +data/information utilization. The latter can progressively leverage common +signal prior in external measurements and internal characteristics of test +samples and learned NNs to improve accuracy. SCNet combines both the explicit +guidance from optimization algorithms with implicit regularization from +advanced NN blocks to learn a collaborative signal representation. Our +theoretical analyses and experiments on simulated and real captured data, +covering 1-/2-/3-D natural and scientific signals, demonstrate the +effectiveness, superior performance, flexibility, and generalization ability of +our method over existing self-supervised methods and its significant potential +in competing against state-of-the-art supervised methods. Code is available at +https://github.com/Guaishou74851/SCNet. + +
+
+ comment: Accepted by International Journal of Computer Vision
+</p>
+
+
+
+
+ + ♻ ☆ Harmonious Group Choreography with Trajectory-Controllable Diffusion + + +
+ Creating group choreography from music has gained attention in cultural +entertainment and virtual reality, aiming to coordinate visually cohesive and +diverse group movements. Despite increasing interest, recent works face +challenges in achieving aesthetically appealing choreography, primarily for two +key issues: multi-dancer collision and single-dancer foot slide. To address +these issues, we propose a Trajectory-Controllable Diffusion (TCDiff), a novel +approach that harnesses non-overlapping trajectories to facilitate coherent +dance movements. Specifically, to tackle dancer collisions, we introduce a +Dance-Beat Navigator capable of generating trajectories for multiple dancers +based on the music, complemented by a Distance-Consistency loss to maintain +appropriate spacing among trajectories within a reasonable threshold. To +mitigate foot sliding, we present a Footwork Adaptor that utilizes trajectory +displacement from adjacent frames to enable flexible footwork, coupled with a +Relative Forward-Kinematic loss to adjust the positioning of individual +dancers' root nodes and joints. Extensive experiments demonstrate that our +method achieves state-of-the-art results. + +
+
+
+
+
+ + ♻ ☆ Camera Perspective Transformation to Bird's Eye View via Spatial + Transformer Model for Road Intersection Monitoring + + +
+ Road intersection monitoring and control research often utilize bird's eye +view (BEV) simulators. In real traffic settings, achieving a BEV akin to that +in a simulator necessitates the deployment of drones or specific sensor +mounting, which is neither feasible nor practical. Consequently, traffic +intersection management remains confined to simulation environments given these +constraints. In this paper, we address the gap between simulated environments +and real-world implementation by introducing a novel deep-learning model that +converts a single camera's perspective of a road intersection into a BEV. We +created a simulation environment that closely resembles a real-world traffic +junction. The proposed model transforms the vehicles into BEV images, +facilitating road intersection monitoring and control model processing. +Inspired by image transformation techniques, we propose a Spatial-Transformer +Double Decoder-UNet (SDD-UNet) model that aims to eliminate the transformed +image distortions. In addition, the model accurately estimates the vehicle's +positions and enables the direct application of simulation-trained models in +real-world contexts. SDD-UNet model achieves an average dice similarity +coefficient (DSC) above 95% which is 40% better than the original UNet model. +The mean absolute error (MAE) is 0.102 and the centroid of the predicted mask +is 0.14 meters displaced, on average, indicating high accuracy. + +
+
+
+
+
+ + ♻ ☆ BiEquiFormer: Bi-Equivariant Representations for Global Point Cloud + Registration + + +
+ The goal of this paper is to address the problem of global point cloud
+registration (PCR), i.e., finding the optimal alignment between point clouds
+irrespective of the initial poses of the scans. This problem is notoriously
+challenging for classical optimization methods due to computational
+constraints. First, we show that state-of-the-art deep learning methods suffer
+from huge performance degradation when the point clouds are arbitrarily placed
+in space. We propose that equivariant deep learning should be utilized for
+solving this task and we characterize the specific type of bi-equivariance of
+PCR. Then, we design BiEquiFormer, a novel and scalable bi-equivariant pipeline,
+i.e., one equivariant to the independent transformations of the input point
+clouds. While a naive approach would process the point clouds independently, we
+design expressive bi-equivariant layers that fuse the information from both
+point clouds. This allows us to extract high-quality superpoint correspondences
+and in turn, robust point-cloud registration. Extensive comparisons against
+state-of-the-art methods show that our method achieves comparable performance
+in the canonical setting and superior performance in the robust setting in both
+the 3DMatch and the challenging low-overlap 3DLoMatch datasets.
+
+</p>
+
+
+
+
+ + ♻ ☆ Dynamic and Compressive Adaptation of Transformers From Images to Videos + + +
+ Recently, the remarkable success of pre-trained Vision Transformers (ViTs) +from image-text matching has sparked an interest in image-to-video adaptation. +However, most current approaches retain the full forward pass for each frame, +leading to a high computation overhead for processing entire videos. In this +paper, we present InTI, a novel approach for compressive image-to-video +adaptation using dynamic Inter-frame Token Interpolation. InTI aims to softly +preserve the informative tokens without disrupting their coherent +spatiotemporal structure. Specifically, each token pair at identical positions +within neighbor frames is linearly aggregated into a new token, where the +aggregation weights are generated by a multi-scale context-aware network. In +this way, the information of neighbor frames can be adaptively compressed in a +point-by-point manner, thereby effectively reducing the number of processed +frames by half each time. Importantly, InTI can be seamlessly integrated with +existing adaptation methods, achieving strong performance without extra-complex +design. On Kinetics-400, InTI reaches a top-1 accuracy of 87.1 with a +remarkable 37.5% reduction in GFLOPs compared to naive adaptation. When +combined with additional temporal modules, InTI achieves a top-1 accuracy of +87.6 with a 37% reduction in GFLOPs. Similar conclusions have been verified in +other common datasets. + +
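+ A minimal sketch of the inter-frame token interpolation described above:
+tokens at the same spatial position in two neighbouring frames are merged into
+one token with data-dependent weights, halving the number of frames. The simple
+MLP weight network here is a stand-in for the paper's multi-scale context-aware
+network.
+
+import torch
+import torch.nn as nn
+
+class TokenInterpolation(nn.Module):
+    def __init__(self, dim):
+        super().__init__()
+        self.weight_net = nn.Sequential(nn.Linear(2 * dim, dim), nn.GELU(),
+                                        nn.Linear(dim, 1), nn.Sigmoid())
+
+    def forward(self, tokens):                            # tokens: (B, T, N, D), T even
+        a, b = tokens[:, 0::2], tokens[:, 1::2]           # neighbouring frame pairs
+        w = self.weight_net(torch.cat([a, b], dim=-1))    # (B, T/2, N, 1) mixing weights
+        return w * a + (1 - w) * b                        # (B, T/2, N, D): half the frames
+
+x = torch.randn(2, 8, 196, 768)          # 2 clips, 8 frames, 196 patch tokens, ViT-B dim
+print(TokenInterpolation(768)(x).shape)  # torch.Size([2, 4, 196, 768])
+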
+
+
+
+
+ + ♻ ☆ RL-I2IT: Image-to-Image Translation with Deep Reinforcement Learning + + +
+ Most existing Image-to-Image Translation (I2IT) methods generate images in a
+single run of a deep learning (DL) model. However, designing such a single-step
+model is always challenging, requiring a huge number of parameters and easily
+falling into bad global minima and overfitting. In this work, we reformulate
+I2IT as a step-wise decision-making problem via deep reinforcement learning
+(DRL) and propose a novel framework that performs RL-based I2IT (RL-I2IT). The
+key feature in the RL-I2IT framework is to decompose a monolithic learning
+process into small steps with a lightweight model to progressively transform a
+source image into a target image. Considering that it is challenging
+to handle high dimensional continuous state and action spaces in the
+conventional RL framework, we introduce a meta policy with a new concept, Plan,
+into the standard Actor-Critic model, which is of a lower dimension than the
+original image and facilitates the actor in generating a tractable
+high-dimensional action. In the RL-I2IT framework, we also employ a task-specific
+auxiliary learning strategy to stabilize the training process and improve the
+performance of the corresponding task. Experiments on several I2IT tasks
+demonstrate the effectiveness and robustness of the proposed method when facing
+high-dimensional continuous action space problems. Our implementation of the
+RL-I2IT framework is available at
+https://github.com/Algolzw/SPAC-Deformable-Registration.
+
+</p>
+
+
+
+
+ + ♻ ☆ HawkI: Homography & Mutual Information Guidance for 3D-free Single Image + to Aerial View + + +
+ We present HawkI for synthesizing aerial-view images from text and an
+exemplar image, without any additional multi-view or 3D information for
+finetuning or at inference. HawkI uses techniques from classical computer
+vision and information theory. It seamlessly blends the visual features from
+the input image within a pretrained text-to-2D-image stable diffusion model with
+a test-time optimization process for a careful bias-variance trade-off, which
+uses an Inverse Perspective Mapping (IPM) homography transformation to provide
+subtle cues for aerial-view synthesis. At inference, HawkI employs a unique
+mutual information guidance formulation to steer the generated image towards
+faithfully replicating the semantic details of the input image, while
+maintaining a realistic aerial perspective. Mutual information guidance
+maximizes the semantic consistency between the generated image and the input
+image, without enforcing pixel-level correspondence between vastly different
+viewpoints. Through extensive qualitative and quantitative comparisons against
+text + exemplar-image based methods and 3D/multi-view based novel-view
+synthesis methods on proposed synthetic and real datasets, we demonstrate that
+our method achieves a significantly better bias-variance trade-off towards
+generating high fidelity aerial-view images. Code and data are available at
+https://github.com/divyakraman/HawkI2024.
+
+</p>
+
+
+
+
+ + ♻ ☆ Multi-graph Graph Matching for Coronary Artery Semantic Labeling + + +
+ Coronary artery disease (CAD) stands as the leading cause of death worldwide,
+and invasive coronary angiography (ICA) remains the gold standard for assessing
+vascular anatomical information. However, deep learning-based methods encounter
+challenges in generating semantic labels for arterial segments, primarily due
+to the morphological similarity between arterial branches and the varying
+anatomy of the arterial system across different projection view angles and
+patients. To address this challenge, we model the vascular tree as a graph and
+propose a multi-graph graph matching (MGM) algorithm for coronary artery
+semantic labeling. The MGM algorithm assesses the similarity between arteries in
+multiple vascular tree graphs, considering the cycle consistency between each
+pair of graphs. As a result, the unannotated arterial segments are
+appropriately labeled by matching them with annotated segments. Through the
+incorporation of anatomical graph structure, radiomics features, and semantic
+mapping, the proposed MGM model achieves an impressive accuracy of 0.9471 for
+coronary artery semantic labeling using our multi-site dataset with 718 ICAs.
+With the semantically labeled arteries, an overall accuracy of 0.9155 was
+achieved for stenosis detection. The proposed MGM presents a novel tool for
+coronary artery analysis using multiple ICA-derived graphs, offering valuable
+insights into vascular health and pathology.
+
+</p>
+
+
+
+
+ + ♻ ☆ Generalization Enhancement Strategies to Enable Cross-year Cropland + Mapping with Convolutional Neural Networks Trained Using Historical Samples + + +
+ The accuracy of mapping agricultural fields across large areas is steadily +improving with high-resolution satellite imagery and deep learning (DL) models, +even in regions where fields are small and geometrically irregular. However, +developing effective DL models often requires large, expensive label datasets, +typically available only for specific years or locations. This limits the +ability to create annual maps essential for agricultural monitoring, as domain +shifts occur between years and regions due to changes in farming practices and +environmental conditions. The challenge is to design a model flexible enough to +account for these shifts without needing yearly labels. While domain adaptation +techniques or semi-supervised training are common solutions, we explored +enhancing the model's generalization power. Our results indicate that a +holistic approach is essential, combining methods to improve generalization. +Specifically, using an area-based loss function, such as Tversky-focal loss +(TFL), significantly improved predictions across multiple years. The use of +different augmentation techniques helped to encode different types of +invariance, particularly photometric augmentations encoded invariance to +brightness changes, though they increased false positives. The combination of +photometric augmentation, TFL loss, and MC-dropout produced the best results, +although dropout alone led to more false negatives in subsequent year +predictions. Additionally, the choice of input normalization had a significant +impact, with the best results obtained when statistics were calculated either +locally or across the entire dataset over all bands (lab and gab). We developed +a workflow that enabled a U-Net model to generate effective multi-year crop +maps over large areas. Our code, available at: +https://github.com/agroimpacts/cnn-generalization-enhancement, will be +regularly updated with improvements. + +
+
+
+
+
+ + ♻ ☆ Virchow2: Scaling Self-Supervised Mixed Magnification Models in + Pathology + + +
+ Foundation models are rapidly being developed for computational pathology +applications. However, it remains an open question which factors are most +important for downstream performance with data scale and diversity, model size, +and training algorithm all playing a role. In this work, we propose algorithmic +modifications, tailored for pathology, and we present the result of scaling +both data and model size, surpassing previous studies in both dimensions. We +introduce two new models: Virchow2, a 632 million parameter vision transformer, +and Virchow2G, a 1.9 billion parameter vision transformer, each trained with +3.1 million histopathology whole slide images, with diverse tissues, +originating institutions, and stains. We achieve state of the art performance +on 12 tile-level tasks, as compared to the top performing competing models. Our +results suggest that data diversity and domain-specific methods can outperform +models that only scale in the number of parameters, but, on average, +performance benefits from the combination of domain-specific methods, data +scale, and model scale. + +
+
+
+
+
+ + ♻ ☆ WATonoBus: Field-Tested All-Weather Autonomous Shuttle Technology SC + + +
+ All-weather autonomous vehicle operation poses significant challenges,
+encompassing modules from perception and decision-making to path planning and
+control. The complexity arises from the need to address adverse weather
+conditions such as rain, snow, and fog across the autonomy stack. Conventional
+model-based single-module approaches often lack holistic integration with
+upstream or downstream tasks. We tackle this problem by proposing a
+multi-module and modular system architecture with considerations for adverse
+weather across the perception level, through features such as snow-covered curb
+detection, to decision-making and safety monitoring. Through daily weekday
+service on the WATonoBus platform for almost two years, we demonstrate that our
+proposed approach is capable of addressing adverse weather conditions and
+provides valuable insights from edge cases observed during operation.
+
+</p>
+
+ comment: 8 pages, 10 figures. This work has been submitted to the ITSC for + possible publication. Copyright may be transferred without notice, after + which this version may no longer be accessible +
+
+
+
+
+
+
+
+ + Information Retrieval 22 + +
+
+
+ + ☆ Exact Trajectory Similarity Search With N-tree: An Efficient Metric + Index for kNN and Range Queries + + +
+ Similarity search is the problem of finding in a collection of objects those +that are similar to a given query object. It is a fundamental problem in modern +applications and the objects considered may be as diverse as locations in +space, text documents, images, twitter messages, or trajectories of moving +objects. + In this paper we are motivated by the latter application. Trajectories are +recorded movements of mobile objects such as vehicles, animals, public +transportation, or parts of the human body. We propose a novel distance +function called DistanceAvg to capture the similarity of such movements. To be +practical, it is necessary to provide indexing for this distance measure. + Fortunately we do not need to start from scratch. A generic and unifying +approach is metric space, which organizes the set of objects solely by a +distance (similarity) function with certain natural properties. Our function +DistanceAvg is a metric. + Although metric indexes have been studied for decades and many such +structures are available, they do not offer the best performance with +trajectories. In this paper we propose a new design, which outperforms the best +existing indexes for kNN queries and is equally good for range queries. It is +especially suitable for expensive distance functions as they occur in +trajectory similarity search. In many applications, kNN queries are more +practical than range queries as it may be difficult to determine an appropriate +search radius. Our index provides exact result sets for the given distance +function. + +
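+ The abstract does not spell out the definition of DistanceAvg; as a hedged
+illustration of the kind of trajectory distance involved, the sketch below
+computes an average point-wise distance between two trajectories resampled to a
+common number of points. The paper's actual definition may differ.
+
+import numpy as np
+
+def resample(traj, n):
+    """Linearly resample a polyline of shape (k, dims) to n points by index fraction."""
+    t = np.linspace(0, 1, len(traj))
+    s = np.linspace(0, 1, n)
+    return np.stack([np.interp(s, t, traj[:, d]) for d in range(traj.shape[1])], axis=1)
+
+def distance_avg(a, b, n=64):
+    a = resample(np.asarray(a, dtype=float), n)
+    b = resample(np.asarray(b, dtype=float), n)
+    return float(np.mean(np.linalg.norm(a - b, axis=1)))   # mean distance over aligned points
+
+# Two parallel horizontal trajectories one unit apart -> average distance of 1.0.
+print(distance_avg([[0, 0], [1, 0], [2, 0]], [[0, 1], [1, 1], [2, 1]]))
+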
+
+ comment: 54 pages, 26 figures +
+
+
+
+
+ + ☆ Towards Fair and Rigorous Evaluations: Hyperparameter Optimization for + Top-N Recommendation Task with Implicit Feedback + + +
+ The widespread use of the internet has led to an overwhelming amount of data, +which has resulted in the problem of information overload. Recommender systems +have emerged as a solution to this problem by providing personalized +recommendations to users based on their preferences and historical data. +However, as recommendation models become increasingly complex, finding the best +hyperparameter combination for different models has become a challenge. The +high-dimensional hyperparameter search space poses numerous challenges for +researchers, and failure to disclose hyperparameter settings may impede the +reproducibility of research results. In this paper, we investigate the Top-N +implicit recommendation problem and focus on optimizing the benchmark +recommendation algorithm commonly used in comparative experiments using +hyperparameter optimization algorithms. We propose a research methodology that +follows the principles of a fair comparison, employing seven types of +hyperparameter search algorithms to fine-tune six common recommendation +algorithms on three datasets. We have identified the most suitable +hyperparameter search algorithms for various recommendation algorithms on +different types of datasets as a reference for later study. This study +contributes to algorithmic research in recommender systems based on +hyperparameter optimization, providing a fair basis for comparison. + +
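+ As a hedged illustration of the experimental setup described above, the
+sketch below tunes a stand-in implicit-feedback recommender with one possible
+search algorithm, TPE via Optuna. The library choice, search space, and toy
+objective are assumptions; the paper compares seven search algorithms across six
+recommenders and three datasets.
+
+import math
+import optuna
+
+def validation_recall(factors, lr, reg):
+    # Stand-in for "train the recommender, return Recall@N on a validation split".
+    # Replace with a real training/evaluation loop; this toy surface simply peaks
+    # around factors ~ 128, lr ~ 1e-2, reg ~ 1e-4 so the search has something to find.
+    return (1.0
+            - 0.10 * abs(math.log(factors / 128))
+            - 0.05 * abs(math.log10(lr / 1e-2))
+            - 0.05 * abs(math.log10(reg / 1e-4)))
+
+def objective(trial):
+    factors = trial.suggest_int("factors", 16, 256, log=True)
+    lr = trial.suggest_float("lr", 1e-4, 1e-1, log=True)
+    reg = trial.suggest_float("reg", 1e-6, 1e-1, log=True)
+    return validation_recall(factors, lr, reg)
+
+study = optuna.create_study(direction="maximize",
+                            sampler=optuna.samplers.TPESampler(seed=42))
+study.optimize(objective, n_trials=50)
+print(study.best_params)
+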
+
+
+
+
+ + ☆ WeKnow-RAG: An Adaptive Approach for Retrieval-Augmented Generation + Integrating Web Search and Knowledge Graphs KDD + + +
+ Large Language Models (LLMs) have greatly contributed to the development of +adaptive intelligent agents and are positioned as an important way to achieve +Artificial General Intelligence (AGI). However, LLMs are prone to produce +factually incorrect information and often produce "phantom" content that +undermines their reliability, which poses a serious challenge for their +deployment in real-world scenarios. Enhancing LLMs by combining external +databases and information retrieval mechanisms is an effective path. To address +the above challenges, we propose a new approach called WeKnow-RAG, which +integrates Web search and Knowledge Graphs into a "Retrieval-Augmented +Generation (RAG)" system. First, the accuracy and reliability of LLM responses +are improved by combining the structured representation of Knowledge Graphs +with the flexibility of dense vector retrieval. WeKnow-RAG then utilizes +domain-specific knowledge graphs to satisfy a variety of queries and domains, +thereby improving performance on factual information and complex reasoning +tasks by employing multi-stage web page retrieval techniques using both sparse +and dense retrieval methods. Our approach effectively balances the efficiency +and accuracy of information retrieval, thus improving the overall retrieval +process. Finally, we also integrate a self-assessment mechanism for the LLM to +evaluate the trustworthiness of the answers it generates. Our approach proves +its outstanding effectiveness in a wide range of offline experiments and online +submissions. + +
+
+ comment: 8 pages, 2 figures, technical report for 3rd place in Task 3 of Meta + KDD Cup 2024 CRAG Challenge +
+
+
+
+
+ + ☆ New Curriculum, New Chance -- Retrieval Augmented Generation for Lesson + Planning in Ugandan Secondary Schools. Prototype Quality Evaluation + + +
+ Introduction: Poor educational quality in Secondary Schools is still regarded
+as one of the major struggles in 21st century Uganda - especially in rural
+areas. Research identifies several problems, including low quality or absent
+teacher lesson planning. As the government pushes towards the implementation of
+a new curriculum, existing lesson plans become obsolete and the problem is
+worsened. Using a Retrieval Augmented Generation approach, we developed a
+prototype that generates customized lesson plans based on the
+government-accredited textbooks. This helps teachers create lesson plans more
+efficiently and with better quality, ensuring they are fully aligned with the
+new curriculum and the competence-based learning approach.
+ Methods: The prototype was created using a Cohere LLM, Sentence Embeddings,
+and the LangChain Framework - and thereafter made available on a public website.
+Vector stores were trained for three new curriculum textbooks (ICT,
+Mathematics, History), all at Secondary 1 Level. Twenty-four lesson plans were
+generated following a pseudo-random generation protocol, based on the suggested
+periods in the textbooks. The lesson plans were analyzed regarding their
+technical quality by three independent raters following the Lesson Plan
+Analysis Protocol (LPAP) by Ndihokubwayo et al. (2022) that is specifically
+designed for East Africa and competence-based curricula.
+ Results: Evaluation of 24 lesson plans using the LPAP resulted in an average
+quality of between 75 and 80%, corresponding to "very good lesson plan". None
+of the lesson plans scored below 65%, although one lesson plan could be argued
+to have been missing the topic. In conclusion, the quality of the generated
+lesson plans is at least comparable to, if not better than, those created by
+humans, as demonstrated in a study in Rwanda, whereby no lesson plan even
+reached the benchmark of 50%.
+
+</p>
+
+ comment: Presented at Ndejje University Second Annual Research Dissemination + Symposium 2024 +
+
+
+
+
+ + ☆ Beyond Inter-Item Relations: Dynamic Adaptive Mixture-of-Experts for + LLM-Based Sequential Recommendation + + +
+ A sequential recommender system (SRS) predicts the next items that users may
+prefer based on user historical interaction sequences. Inspired by the rise of
+large language models (LLMs) in various AI applications, there is a surge of
+work on LLM-based SRS. Despite their attractive performance, existing LLM-based
+SRS still exhibit some limitations, including neglecting intra-item relations,
+ignoring long-term collaborative knowledge and using inflexible architecture
+designs for adaptation. To alleviate these issues, we propose an LLM-based SRS
+named MixRec. Built on top of coarse-grained adaptation for capturing inter-item
+relations, MixRec is further enhanced with (1) context masking that models
+intra-item relations to help LLM better understand token and item semantics in
+the context of SRS, (2) collaborative knowledge injection that helps LLM
+incorporate long-term collaborative knowledge, and (3) a dynamic adaptive
+mixture-of-experts design that can flexibly choose expert architectures based
+on Bayesian optimization to better incorporate different sequential
+information. Extensive experiments demonstrate that MixRec can effectively
+handle sequential recommendation in a dynamic and adaptive manner.
+
+</p>
+
+ comment: 11 pages, 14 figures +
+
+
+
+
+ + ☆ GQE: Generalized Query Expansion for Enhanced Text-Video Retrieval + + +
+ In the rapidly expanding domain of web video content, the task of text-video +retrieval has become increasingly critical, bridging the semantic gap between +textual queries and video data. This paper introduces a novel data-centric +approach, Generalized Query Expansion (GQE), to address the inherent +information imbalance between text and video, enhancing the effectiveness of +text-video retrieval systems. Unlike traditional model-centric methods that +focus on designing intricate cross-modal interaction mechanisms, GQE aims to +expand the text queries associated with videos both during training and testing +phases. By adaptively segmenting videos into short clips and employing +zero-shot captioning, GQE enriches the training dataset with comprehensive +scene descriptions, effectively bridging the data imbalance gap. Furthermore, +during retrieval, GQE utilizes Large Language Models (LLM) to generate a +diverse set of queries and a query selection module to filter these queries +based on relevance and diversity, thus optimizing retrieval performance while +reducing computational overhead. Our contributions include a detailed +examination of the information imbalance challenge, a novel approach to query +expansion in video-text datasets, and the introduction of a query selection +strategy that enhances retrieval accuracy without increasing computational +costs. GQE achieves state-of-the-art performance on several benchmarks, +including MSR-VTT, MSVD, LSMDC, and VATEX, demonstrating the effectiveness of +addressing text-video retrieval from a data-centric perspective. + +
+
+ comment: 18 pages including appendix +
+
+
+
+
+ + ☆ SWaT: Statistical Modeling of Video Watch Time through User Behavior + Analysis + + +
+ The significance of estimating video watch time has been highlighted by the +rising importance of (short) video recommendation, which has become a core +product of mainstream social media platforms. Modeling video watch time, +however, has been challenged by the complexity of user-video interaction, such +as different user behavior modes in watching the recommended videos and varying +watching probabilities over the video horizon. Despite the importance and +challenges, existing literature on modeling video watch time mostly focuses on +relatively black-box mechanical enhancement of the classical +regression/classification losses, without factoring in user behavior in a +principled manner. In this paper, we for the first time take on a user-centric +perspective to model video watch time, from which we propose a white-box +statistical framework that directly translates various user behavior +assumptions in watching (short) videos into statistical watch time models. +These behavior assumptions are portrayed by our domain knowledge on users' +behavior modes in video watching. We further employ bucketization to cope with +user's non-stationary watching probability over the video horizon, which +additionally helps to respect the constraint of video length and facilitate the +practical compatibility between the continuous regression event of watch time +and other binary classification events. We test our models extensively on two +public datasets, a large-scale offline industrial dataset, and an online A/B +test on a short video platform with hundreds of millions of daily-active users. +On all experiments, our models perform competitively against strong relevant +baselines, demonstrating the efficacy of our user-centric perspective and +proposed framework. + +
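+ A hedged sketch of the bucketisation idea mentioned above: watch time is
+turned into per-bucket binary "watched past this point" labels, which respects
+the video-length constraint and lets watch time share a classification-style
+formulation. The bucket design and the expectation formula are illustrative
+assumptions, not the paper's exact statistical model.
+
+import numpy as np
+
+def bucketize(watch_time, video_length, n_buckets=10):
+    edges = np.linspace(0, 1, n_buckets + 1)[1:] * video_length  # bucket right edges
+    return (watch_time >= edges).astype(int)                     # one binary label per bucket
+
+def expected_watch_time(bucket_probs, video_length):
+    # Each bucket the user "survives" contributes its width to the expectation.
+    return float(np.sum(bucket_probs) * video_length / len(bucket_probs))
+
+print(bucketize(27.0, 60.0))   # [1 1 1 1 0 0 0 0 0 0]: watched past 4 of 10 bucket edges
+print(expected_watch_time(np.array([.9, .8, .6, .45, .3, .2, .1, .05, .02, .01]), 60.0))
+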
+
+
+
+
+ + ♻ ☆ RECE: Reduced Cross-Entropy Loss for Large-Catalogue Sequential + Recommenders CIKM'24 + + +
+ Scalability is a major challenge in modern recommender systems. In sequential +recommendations, full Cross-Entropy (CE) loss achieves state-of-the-art +recommendation quality but consumes excessive GPU memory with large item +catalogs, limiting its practicality. Using a GPU-efficient locality-sensitive +hashing-like algorithm for approximating large tensor of logits, this paper +introduces a novel RECE (REduced Cross-Entropy) loss. RECE significantly +reduces memory consumption while allowing one to enjoy the state-of-the-art +performance of full CE loss. Experimental results on various datasets show that +RECE cuts training peak memory usage by up to 12 times compared to existing +methods while retaining or exceeding performance metrics of CE loss. The +approach also opens up new possibilities for large-scale applications in other +domains. + +
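+ RECE itself approximates the logit tensor with a GPU-efficient LSH-like
+scheme; the simpler sampled-softmax stand-in below only illustrates why avoiding
+the full (batch x catalogue) logit matrix saves memory on large item catalogs.
+
+import torch
+import torch.nn.functional as F
+
+def full_ce(user_emb, item_emb, targets):
+    logits = user_emb @ item_emb.T            # (B, |catalogue|): the memory hog at scale
+    return F.cross_entropy(logits, targets)
+
+def sampled_ce(user_emb, item_emb, targets, n_neg=1024):
+    neg = torch.randint(0, item_emb.size(0), (n_neg,), device=user_emb.device)
+    cols = torch.cat([targets, neg])          # positives first, then shared negatives
+    logits = user_emb @ item_emb[cols].T      # only (B, B + n_neg) logits are materialized
+    labels = torch.arange(user_emb.size(0), device=user_emb.device)
+    # Collisions between positives and sampled negatives are ignored in this toy.
+    return F.cross_entropy(logits, labels)
+
+B, D, catalogue = 256, 64, 100_000
+users, items = torch.randn(B, D), torch.randn(catalogue, D)
+targets = torch.randint(0, catalogue, (B,))
+print(sampled_ce(users, items, targets).item())
+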
+
+ comment: 5 pages, accepted for CIKM'24 +
+
+
+
+
+ + ♻ ☆ CAPRI-FAIR: Integration of Multi-sided Fairness in Contextual POI + Recommendation Framework + + +
+ Point-of-interest (POI) recommendation considers spatio-temporal factors like +distance, peak hours, and user check-ins. Given their influence on both +consumer experience and POI business, it's crucial to consider fairness from +multiple perspectives. Unfortunately, these systems often provide less accurate +recommendations to inactive users and less exposure to unpopular POIs. This +paper develops a post-filter method that includes provider and consumer +fairness in existing models, aiming to balance fairness metrics like item +exposure with performance metrics such as precision and distance. Experiments +show that a linear scoring model for provider fairness in re-scoring items +offers the best balance between performance and long-tail exposure, sometimes +without much precision loss. Addressing consumer fairness by recommending more +popular POIs to inactive users increased precision in some models and datasets. +However, combinations that reached the Pareto front of consumer and provider +fairness resulted in the lowest precision values, highlighting that tradeoffs +depend greatly on the model and dataset. + +
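+ A hedged sketch of the post-filter idea above: the base model's score is
+linearly mixed with a provider-fairness term that boosts long-tail (less
+popular) POIs. The exact scoring model and weights in the paper may differ.
+
+import numpy as np
+
+def rescore(base_scores, item_popularity, alpha=0.3):
+    # Long-tail boost: 1 - normalised popularity, higher for rarely exposed POIs.
+    tail = 1.0 - item_popularity / item_popularity.max()
+    return (1 - alpha) * base_scores + alpha * tail
+
+scores = np.array([0.90, 0.88, 0.86, 0.84])
+pop = np.array([1000.0, 900.0, 20.0, 5.0])
+print(np.argsort(-rescore(scores, pop)))  # [2 3 1 0]: long-tail POIs move up vs. base order
+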
+
+
+
+
+ + ♻ ☆ Optimal Baseline Corrections for Off-Policy Contextual Bandits + + +
+ The off-policy learning paradigm allows for recommender systems and general +ranking applications to be framed as decision-making problems, where we aim to +learn decision policies that optimize an unbiased offline estimate of an online +reward metric. With unbiasedness comes potentially high variance, and prevalent +methods exist to reduce estimation variance. These methods typically make use +of control variates, either additive (i.e., baseline corrections or doubly +robust methods) or multiplicative (i.e., self-normalisation). Our work unifies +these approaches by proposing a single framework built on their equivalence in +learning scenarios. The foundation of our framework is the derivation of an +equivalent baseline correction for all of the existing control variates. +Consequently, our framework enables us to characterize the variance-optimal +unbiased estimator and provide a closed-form solution for it. This optimal +estimator brings significantly improved performance in both evaluation and +learning, and minimizes data requirements. Empirical observations corroborate +our theoretical findings. + +
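+ A small sketch of the additive control variate (baseline correction) discussed
+above, applied to an inverse propensity scoring (IPS) estimate of a target
+policy's value on a toy logged bandit dataset. The variance-optimal closed-form
+baseline from the paper is replaced here by a simple mean-reward baseline for
+illustration.
+
+import numpy as np
+
+def ips_value(rewards, prop_log, prop_tgt, baseline=0.0):
+    w = prop_tgt / prop_log                                # importance weights, E_log[w] = 1
+    return np.mean(w * (rewards - baseline)) + baseline    # unbiased for any constant baseline
+
+rng = np.random.default_rng(0)
+n = 100_000
+actions = rng.integers(0, 2, size=n)                 # logged actions from a uniform policy
+prop_log = np.full(n, 0.5)
+prop_tgt = np.where(actions == 1, 0.8, 0.2)          # target policy prefers action 1
+rewards = rng.binomial(1, np.where(actions == 1, 0.4, 0.2)).astype(float)
+
+# True value of the target policy: 0.8 * 0.4 + 0.2 * 0.2 = 0.36.
+print(ips_value(rewards, prop_log, prop_tgt))
+print(ips_value(rewards, prop_log, prop_tgt, baseline=rewards.mean()))  # lower variance
+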
+
+
+
+
+ + ♻ ☆ Iterative Improvement of an Additively Regularized Topic Model + + +
+ Topic modelling is fundamentally a soft clustering problem (of known objects +-- documents, over unknown clusters -- topics). That is, the task is +incorrectly posed. In particular, the topic models are unstable and incomplete. +All this leads to the fact that the process of finding a good topic model +(repeated hyperparameter selection, model training, and topic quality +assessment) can be particularly long and labor-intensive. We aim to simplify +the process, to make it more deterministic and provable. To this end, we +present a method for iterative training of a topic model. The essence of the +method is that a series of related topic models are trained so that each +subsequent model is at least as good as the previous one, i.e., that it retains +all the good topics found earlier. The connection between the models is +achieved by additive regularization. The result of this iterative training is +the last topic model in the series, which we call the iteratively updated +additively regularized topic model (ITAR). Experiments conducted on several +collections of natural language texts show that the proposed ITAR model +performs better than other popular topic models (LDA, ARTM, BERTopic), its +topics are diverse, and its perplexity (ability to "explain" the underlying +data) is moderate. + +
+
+ comment: Fix HTML view. That is, fix the heap (strikethrough) order of .tex + files using the auxiliary Arxiv Readme XXX +
+
+
+
+
+ + ♻ ☆ BioRAG: A RAG-LLM Framework for Biological Question Reasoning + + +
+ The question-answering system for Life science research, which is +characterized by the rapid pace of discovery, evolving insights, and complex +interactions among knowledge entities, presents unique challenges in +maintaining a comprehensive knowledge warehouse and accurate information +retrieval. To address these issues, we introduce BioRAG, a novel +Retrieval-Augmented Generation (RAG) with the Large Language Models (LLMs) +framework. Our approach starts with parsing, indexing, and segmenting an +extensive collection of 22 million scientific papers as the basic knowledge, +followed by training a specialized embedding model tailored to this domain. +Additionally, we enhance the vector retrieval process by incorporating a +domain-specific knowledge hierarchy, which aids in modeling the intricate +interrelationships among each query and context. For queries requiring the most +current information, BioRAG deconstructs the question and employs an iterative +retrieval process incorporated with the search engine for step-by-step +reasoning. Rigorous experiments have demonstrated that our model outperforms +fine-tuned LLM, LLM with search engines, and other scientific RAG frameworks +across multiple life science question-answering tasks. + +
+
+ comment: 12 pages, 7 figures +
+
+
+
+
+ + ♻ ☆ Watermarking Recommender Systems + + +
+ Recommender systems embody significant commercial value and represent crucial +intellectual property. However, the integrity of these systems is constantly +challenged by malicious actors seeking to steal their underlying models. +Safeguarding against such threats is paramount to upholding the rights and +interests of the model owner. While model watermarking has emerged as a potent +defense mechanism in various domains, its direct application to recommender +systems remains unexplored and non-trivial. In this paper, we address this gap +by introducing Autoregressive Out-of-distribution Watermarking (AOW), a novel +technique tailored specifically for recommender systems. Our approach entails +selecting an initial item and querying it through the oracle model, followed by +the selection of subsequent items with small prediction scores. This iterative +process generates a watermark sequence autoregressively, which is then +ingrained into the model's memory through training. To assess the efficacy of +the watermark, the model is tasked with predicting the subsequent item given a +truncated watermark sequence. Through extensive experimentation and analysis, +we demonstrate the superior performance and robust properties of AOW. Notably, +our watermarking technique exhibits high-confidence extraction capabilities and +maintains effectiveness even in the face of distillation and fine-tuning +processes. + +
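+ A hedged sketch of the autoregressive watermark construction described above:
+starting from a seed item, repeatedly query the (oracle) recommender and append
+an item with a low predicted score, yielding an out-of-distribution sequence the
+owner later trains the model to continue. The scoring model below is a random
+stand-in for the oracle.
+
+import numpy as np
+
+def build_watermark(score_fn, n_items, seed_item, length=8, pool=20, rng=None):
+    rng = rng or np.random.default_rng(0)
+    seq = [seed_item]
+    for _ in range(length - 1):
+        scores = score_fn(seq)                    # next-item scores from the oracle model
+        low = np.argsort(scores)[:pool]           # least likely continuations
+        low = [i for i in low if i not in seq]
+        seq.append(int(rng.choice(low)))
+    return seq
+
+n_items = 1000
+rng = np.random.default_rng(1)
+fake_scores = lambda seq: rng.random(n_items)     # stand-in for querying the oracle recommender
+print(build_watermark(fake_scores, n_items, seed_item=42))
+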
+
+
+
+
+ + ♻ ☆ MM-GEF: Multi-modal representation meet collaborative filtering + + +
+ In modern e-commerce, item content features in various modalities offer +accurate yet comprehensive information to recommender systems. The majority of +previous work either focuses on learning effective item representation during +modelling user-item interactions, or exploring item-item relationships by +analysing multi-modal features. Those methods, however, fail to incorporate the +collaborative item-user-item relationships into the multi-modal feature-based +item structure. In this work, we propose a graph-based item structure +enhancement method MM-GEF: Multi-Modal recommendation with Graph Early-Fusion, +which effectively combines the latent item structure underlying multi-modal +contents with the collaborative signals. Instead of processing the content +feature in different modalities separately, we show that the early-fusion of +multi-modal features provides significant improvement. MM-GEF learns refined +item representations by injecting structural information obtained from both +multi-modal and collaborative signals. Through extensive experiments on four +publicly available datasets, we demonstrate systematical improvements of our +method over state-of-the-art multi-modal recommendation methods. + +
+
+
+
+
+ + ♻ ☆ The Elephant in the Room: Rethinking the Usage of Pre-trained Language + Model in Sequential Recommendation RecSys 2024 + + +
+ Sequential recommendation (SR) has seen significant advancements with the +help of Pre-trained Language Models (PLMs). Some PLM-based SR models directly +use PLM to encode user historical behavior's text sequences to learn user +representations, while there is seldom an in-depth exploration of the +capability and suitability of PLM in behavior sequence modeling. In this work, +we first conduct extensive model analyses between PLMs and PLM-based SR models, +discovering great underutilization and parameter redundancy of PLMs in behavior +sequence modeling. Inspired by this, we explore different lightweight usages of +PLMs in SR, aiming to maximally stimulate the ability of PLMs for SR while +satisfying the efficiency and usability demands of practical systems. We +discover that adopting behavior-tuned PLMs for item initializations of +conventional ID-based SR models is the most economical framework of PLM-based +SR, which would not bring in any additional inference cost but could achieve a +dramatic performance boost compared with the original version. Extensive +experiments on five datasets show that our simple and universal framework leads +to significant improvement compared to classical SR and SOTA PLM-based SR +models without additional inference costs. Our code can be found in +https://github.com/777pomingzi/Rethinking-PLM-in-RS. + +
+
+ comment: Accepted at RecSys 2024 +
+
+
+
+
+ + ♻ ☆ Chain-of-Factors Paper-Reviewer Matching + + +
+ With the rapid increase in paper submissions to academic conferences, the +need for automated and accurate paper-reviewer matching is more critical than +ever. Previous efforts in this area have considered various factors to assess +the relevance of a reviewer's expertise to a paper, such as the semantic +similarity, shared topics, and citation connections between the paper and the +reviewer's previous works. However, most of these studies focus on only one +factor, resulting in an incomplete evaluation of the paper-reviewer relevance. +To address this issue, we propose a unified model for paper-reviewer matching +that jointly considers semantic, topic, and citation factors. To be specific, +during training, we instruction-tune a contextualized language model shared +across all factors to capture their commonalities and characteristics; during +inference, we chain the three factors to enable step-by-step, coarse-to-fine +search for qualified reviewers given a submission. Experiments on four datasets +(one of which is newly contributed by us) spanning various fields such as +machine learning, computer vision, information retrieval, and data mining +consistently demonstrate the effectiveness of our proposed Chain-of-Factors +model in comparison with state-of-the-art paper-reviewer matching methods and +scientific pre-trained language models. + +
+
+
+
+
+ + ♻ ☆ Read and Think: An Efficient Step-wise Multimodal Language Model for + Document Understanding and Reasoning + + +
+ Understanding the contents of multimodal documents is essential to accurately
+extract relevant evidence and use it for reasoning. Existing document
+understanding models tend to generate answers with a single word or phrase
+directly, ignoring the source document's evidence and lacking interpretability.
+In this work, we address the lack of step-wise capabilities through data
+augmentation and extension. Specifically, we use Multi-modal Large Language
+Models (MLLMs), which have strong visual understanding and reasoning abilities,
+as data generators to generate step-wise question-and-answer pairs for document
+images and use a high-performance LLM as the error detector to filter out noisy
+data. This step-wise data generation pipeline is implemented using both
+template-based and few-shot methods. We then use the generated high-quality
+data to train a humanized document understanding and reasoning model,
+specifically designed to solve complex questions that require reasoning or
+multi-hop question answering, dubbed DocAssistant. Experimental results
+demonstrate the effectiveness and application value of step-wise generation,
+showing a 5 improvement on InfoVQA with complex layouts and a 7 improvement on
+ChartQA with complex reasoning, compared to directly generated answers. We hope
+our work highlights the potential of synthetic data and encourages further
+exploration of multi-modal document reasoning capabilities.
+
+</p>
+
+
+
+
+ + ♻ ☆ BMX: Entropy-weighted Similarity and Semantic-enhanced Lexical Search + + +
+ BM25, a widely-used lexical search algorithm, remains crucial in information +retrieval despite the rise of pre-trained and large language models +(PLMs/LLMs). However, it neglects query-document similarity and lacks semantic +understanding, limiting its performance. We revisit BM25 and introduce BMX, a +novel extension of BM25 incorporating entropy-weighted similarity and semantic +enhancement techniques. Extensive experiments demonstrate that BMX consistently +outperforms traditional BM25 and surpasses PLM/LLM-based dense retrieval in +long-context and real-world retrieval benchmarks. This study bridges the gap +between classical lexical search and modern semantic approaches, offering a +promising direction for future information retrieval research. The reference +implementation of BMX can be found in Baguetter, which was created in the +context of this work. The code can be found here: +https://github.com/mixedbread-ai/baguetter. + +
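+ For context, a rough sketch of the starting point and the general idea: BM25
+scoring augmented by a per-term weight. The entropy-derived weight below is a
+placeholder assumption for illustration, not the BMX formula defined in the
+paper.
+
+# Rough illustration: BM25 scoring with a per-term entropy weight.
+# The entropy weighting shown here is a placeholder, not the BMX formula.
+import math
+from collections import Counter
+
+def bm25_entropy_scores(query, docs, k1=1.5, b=0.75):
+    docs_tok = [d.split() for d in docs]
+    N = len(docs_tok)
+    avgdl = sum(len(d) for d in docs_tok) / N
+    df = Counter(t for d in docs_tok for t in set(d))
+    scores = [0.0] * N
+    for term in query.split():
+        n_t = df.get(term, 0)
+        if n_t == 0:
+            continue
+        idf = math.log((N - n_t + 0.5) / (n_t + 0.5) + 1.0)
+        # Hypothetical entropy weight: a term spread evenly across documents
+        # carries less information and is down-weighted.
+        total = sum(d.count(term) for d in docs_tok)
+        p = [d.count(term) / total for d in docs_tok]
+        entropy = -sum(pi * math.log(pi) for pi in p if pi > 0)
+        w = 1.0 / (1.0 + entropy)
+        for i, d in enumerate(docs_tok):
+            tf = d.count(term)
+            denom = tf + k1 * (1 - b + b * len(d) / avgdl)
+            scores[i] += w * idf * tf * (k1 + 1) / denom
+    return scores
+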
+
+ comment: correct the affiliation order +
+
+
+
+
+ + ♻ ☆ Look into the Future: Deep Contextualized Sequential Recommendation + + +
+ Sequential recommendation aims to estimate how a user's interests evolve over +time via uncovering valuable patterns from user behavior history. Many previous +sequential models have solely relied on users' historical information to model +the evolution of their interests, neglecting the crucial role that future +information plays in accurately capturing these dynamics. However, effectively +incorporating future information in sequential modeling is non-trivial since it +is impossible to make the current-step prediction for any target user by +leveraging his future data. In this paper, we propose a novel framework of +sequential recommendation called Look into the Future (LIFT), which builds and +leverages the contexts of sequential recommendation. In LIFT, the context of a +target user's interaction is represented based on i) his own past behaviors and +ii) the past and future behaviors of the retrieved similar interactions from +other users. As such, the learned context will be more informative and +effective in predicting the target user's behaviors in sequential +recommendation without temporal data leakage. Furthermore, in order to exploit +the intrinsic information embedded within the context itself, we introduce an +innovative pretraining methodology incorporating behavior masking. In our +extensive experiments on five real-world datasets, LIFT achieves significant +performance improvement on click-through rate prediction and rating prediction +tasks in sequential recommendation over strong baselines, demonstrating that +retrieving and leveraging relevant contexts from the global user pool greatly +benefits sequential recommendation. The experiment code is provided at +https://anonymous.4open.science/r/LIFT-277C/Readme.md. + +
+
+ comment: arXiv admin note: text overlap with arXiv:2404.18304 by other authors +
+
+
+
+
+ + ♻ ☆ Contextual Distillation Model for Diversified Recommendation KDD 2024 + + +
+ Diversity in recommendation is as crucial as accuracy for improving the user
+experience. Existing studies, e.g., Determinantal Point Process (DPP) and
+Maximal Marginal Relevance (MMR), employ a greedy paradigm to iteratively
+select items that optimize both accuracy and diversity. However, these methods
+typically exhibit quadratic complexity, which restricts them to the re-ranking
+stage and makes them inapplicable to other recommendation stages with a larger
+pool of candidate items, such as the pre-ranking and ranking stages. In this
+paper, we propose the Contextual Distillation Model (CDM), an efficient
+recommendation model that addresses diversification and is suitable for
+deployment in all stages of industrial recommendation pipelines. Specifically,
+CDM utilizes the candidate items in the same user request as context to enhance
+the diversification of the results. We propose a contrastive context encoder
+that employs attention mechanisms to model both positive and negative contexts.
+To train CDM, we compare each target item with its context embedding and use a
+knowledge distillation framework to learn the win probability of each target
+item under the MMR algorithm, where the teacher is derived from MMR outputs.
+During inference, ranking is performed through a linear combination of the
+recommendation and student model scores, ensuring both diversity and
+efficiency. We perform offline evaluations on two industrial datasets and
+conduct an online A/B test of CDM on the short-video platform KuaiShou. The
+considerable improvements observed in both recommendation quality and
+diversity, as reflected in the metrics, provide strong evidence for the
+effectiveness of CDM.
+ 
+
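+ For context, the MMR teacher referenced above greedily trades off relevance
+against similarity to already-selected items. A generic sketch follows
+(illustrative only, not the CDM model), using cosine similarity and a
+caller-supplied relevance vector.
+
+# Generic Maximal Marginal Relevance (MMR) re-ranking sketch.
+import numpy as np
+
+def mmr_select(relevance, item_vecs, k, lam=0.7):
+    # relevance: (n,) accuracy scores; item_vecs: (n, d) item embeddings.
+    vecs = item_vecs / np.linalg.norm(item_vecs, axis=1, keepdims=True)
+    sim = vecs @ vecs.T                      # pairwise cosine similarity
+    selected, candidates = [], list(range(len(relevance)))
+    while candidates and len(selected) < k:
+        def mmr_score(i):
+            div = max(sim[i, j] for j in selected) if selected else 0.0
+            return lam * relevance[i] - (1 - lam) * div
+        best = max(candidates, key=mmr_score)
+        selected.append(best)
+        candidates.remove(best)
+    return selected
+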
+
+ comment: accepted by KDD 2024 v2 +
+
+
+
+
+ + ♻ ☆ Efficient Retrieval with Learned Similarities + + +
+ Retrieval plays a fundamental role in recommendation systems, search, and +natural language processing by efficiently finding relevant items from a large +corpus given a query. Dot products have been widely used as the similarity +function in such retrieval tasks, thanks to Maximum Inner Product Search (MIPS) +that enabled efficient retrieval based on dot products. However, +state-of-the-art retrieval algorithms have migrated to learned similarities. +Such algorithms vary in form; the queries can be represented with multiple +embeddings, complex neural networks can be deployed, the item ids can be +decoded directly from queries using beam search, and multiple approaches can be +combined in hybrid solutions. Unfortunately, we lack efficient solutions for +retrieval in these state-of-the-art setups. Our work investigates techniques +for approximate nearest neighbor search with learned similarity functions. We +first prove that Mixture-of-Logits (MoL) is a universal approximator, and can +express all learned similarity functions. We next propose techniques to +retrieve the approximate top K results using MoL with a tight bound. We finally +compare our techniques with existing approaches, showing that MoL sets new +state-of-the-art results on recommendation retrieval tasks, and our approximate +top-k retrieval with learned similarities outperforms baselines by up to two +orders of magnitude in latency, while achieving > .99 recall rate of exact +algorithms. + +
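+ A rough reading of the Mixture-of-Logits similarity described above, as a
+gated combination of component dot products; the gating inputs and dimensions
+are placeholders, not the paper's implementation.
+
+# Rough Mixture-of-Logits (MoL) similarity sketch: a gated combination of
+# several component dot products between query- and item-side embeddings.
+import numpy as np
+
+def softmax(x):
+    x = x - x.max(axis=-1, keepdims=True)
+    e = np.exp(x)
+    return e / e.sum(axis=-1, keepdims=True)
+
+def mol_similarity(q_comps, i_comps, gate_logits):
+    # q_comps, i_comps: (P, d) per-component query/item embeddings
+    # gate_logits: (P,) gating-network logits for this (query, item) pair
+    logits = np.einsum("pd,pd->p", q_comps, i_comps)   # P component dot products
+    gates = softmax(gate_logits)                       # mixture weights
+    return float(np.dot(gates, logits))
+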
+
+
+
+
+ + ♻ ☆ Enhancing Relevance of Embedding-based Retrieval at Walmart CIKM 2024 + + +
+ Embedding-based neural retrieval (EBR) is an effective search retrieval +method in product search for tackling the vocabulary gap between customer +search queries and products. The initial launch of our EBR system at Walmart +yielded significant gains in relevance and add-to-cart rates [1]. However, +despite EBR generally retrieving more relevant products for reranking, we have +observed numerous instances of relevance degradation. Enhancing retrieval +performance is crucial, as it directly influences product reranking and affects +the customer shopping experience. Factors contributing to these degradations +include false positives/negatives in the training data and the inability to +handle query misspellings. To address these issues, we present several +approaches to further strengthen the capabilities of our EBR model in terms of +retrieval relevance. We introduce a Relevance Reward Model (RRM) based on human +relevance feedback. We utilize RRM to remove noise from the training data and +distill it into our EBR model through a multi-objective loss. In addition, we +present the techniques to increase the performance of our EBR model, such as +typo-aware training, and semi-positive generation. The effectiveness of our EBR +is demonstrated through offline relevance evaluation, online AB tests, and +successful deployments to live production. + [1] Alessandro Magnani, Feng Liu, Suthee Chaidaroon, Sachin Yadav, Praveen +Reddy Suram, Ajit Puthenputhussery, Sijie Chen, Min Xie, Anirudh Kashi, Tony +Lee, et al. 2022. Semantic retrieval at walmart. In Proceedings of the 28th ACM +SIGKDD Conference on Knowledge Discovery and Data Mining. 3495-3503. + +
+
+ comment: 8 pages, 3 figures, CIKM 2024 +
+
+
+
+
+
+
+
+ + Machine Learning 152 + +
+
+
+ + ☆ End-to-end Semantic-centric Video-based Multimodal Affective Computing + + +
+ On the pathway toward Artificial General Intelligence (AGI), understanding
+human affect is essential to enhancing machines' cognitive abilities. To
+achieve more perceptive human-AI interaction, Multimodal Affective Computing
+(MAC) on human-spoken videos has attracted increasing attention. However,
+previous methods are mainly devoted to designing multimodal fusion algorithms
+and suffer from two issues: semantic imbalance caused by diverse pre-processing
+operations, and semantic mismatch arising from inconsistent affective content
+across modalities compared with the multimodal ground truth. Moreover, their
+reliance on manual feature extractors prevents them from building end-to-end
+pipelines for multiple MAC downstream tasks. To address these challenges, we
+propose a novel end-to-end framework named SemanticMAC to compute multimodal
+semantic-centric affect for human-spoken videos. We first employ a pre-trained
+Transformer model for multimodal data pre-processing and design an Affective
+Perceiver module to capture unimodal affective information. Moreover, we
+present a semantic-centric approach that unifies multimodal representation
+learning in three ways: gated feature interaction, multi-task pseudo-label
+generation, and intra-/inter-sample contrastive learning. Finally, SemanticMAC
+effectively learns specific and shared semantic representations under the
+guidance of semantic-centric labels. Extensive experimental results demonstrate
+that our approach surpasses state-of-the-art methods on 7 public datasets
+across four MAC downstream tasks.
+ 
+
+
+ comment: Under Review +
+
+
+
+
+ + ☆ A Spitting Image: Modular Superpixel Tokenization in Vision Transformers ECCV + + +
+ Vision Transformer (ViT) architectures traditionally employ a grid-based +approach to tokenization independent of the semantic content of an image. We +propose a modular superpixel tokenization strategy which decouples tokenization +and feature extraction; a shift from contemporary approaches where these are +treated as an undifferentiated whole. Using on-line content-aware tokenization +and scale- and shape-invariant positional embeddings, we perform experiments +and ablations that contrast our approach with patch-based tokenization and +randomized partitions as baselines. We show that our method significantly +improves the faithfulness of attributions, gives pixel-level granularity on +zero-shot unsupervised dense prediction tasks, while maintaining predictive +performance in classification tasks. Our approach provides a modular +tokenization framework commensurable with standard architectures, extending the +space of ViTs to a larger class of semantically-rich models. + +
+
+ comment: To appear in ECCV (MELEX) 2024 Workshop Proceedings +
+
+
+
+
+ + ☆ Deep Learning: a Heuristic Three-stage Mechanism for Grid Searches to + Optimize the Future Risk Prediction of Breast Cancer Metastasis Using + EHR-based Clinical Data + + +
+ A grid search, at the cost of training and testing a large number of models,
+is an effective way to optimize the prediction performance of deep learning
+models. A challenging aspect of grid search is time management: without a good
+time-management scheme, a grid search can easily become a mission that will
+not finish in our lifetime. In this study, we introduce a heuristic three-stage
+mechanism for managing the running time of low-budget grid searches, together
+with the sweet-spot grid search (SSGS) and randomized grid search (RGS)
+strategies for improving model prediction performance, applied to predicting
+the 5-year, 10-year, and 15-year risk of breast cancer metastasis. We develop
+deep feedforward neural network (DFNN) models and optimize them through grid
+searches. We conduct eight cycles of grid searches by applying our three-stage
+mechanism and the SSGS and RGS strategies. We conduct various SHAP analyses,
+including unique ones that interpret the importance of the DFNN-model
+hyperparameters. Our results show that grid search can greatly improve model
+prediction. The grid searches we conducted improved the risk prediction of
+5-year, 10-year, and 15-year breast cancer metastasis by 18.6%, 16.3%, and
+17.3% respectively, over the average performance of all corresponding models we
+trained. We not only demonstrate best model performance but also characterize
+grid searches from various aspects, such as their capability of discovering
+decent models and the unit grid search time. The three-stage mechanism worked
+effectively: it made our low-budget grid searches feasible and manageable, and
+at the same time helped improve model prediction performance. Our SHAP analyses
+identified both clinical risk factors important for predicting the future risk
+of breast cancer metastasis and DFNN-model hyperparameters important to the
+prediction of performance scores.
+ 
+
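+ Illustrative sketch of a time-budgeted, randomized-order hyperparameter
+search of the kind discussed above (not the paper's three-stage mechanism or
+the SSGS/RGS strategies themselves); the training callback is a placeholder.
+
+# Minimal time-budgeted hyperparameter search sketch (illustrative only).
+import itertools, random, time
+
+def budgeted_search(grid, train_and_score, budget_seconds):
+    # grid: dict of hyperparameter name -> list of candidate values
+    # train_and_score: callable mapping a config dict to a validation score
+    keys = list(grid)
+    combos = [dict(zip(keys, vals)) for vals in itertools.product(*grid.values())]
+    random.shuffle(combos)                   # randomized order, RGS-style
+    best_cfg, best_score, start = None, float("-inf"), time.time()
+    for cfg in combos:
+        if time.time() - start > budget_seconds:
+            break                            # stop when the time budget is spent
+        score = train_and_score(cfg)
+        if score > best_score:
+            best_cfg, best_score = cfg, score
+    return best_cfg, best_score
+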
+
+
+
+
+ + ☆ Model Merging in LLMs, MLLMs, and Beyond: Methods, Theories, + Applications and Opportunities + + +
+ Model merging is an efficient empowerment technique in the machine learning +community that does not require the collection of raw training data and does +not require expensive computation. As model merging becomes increasingly +prevalent across various fields, it is crucial to understand the available +model merging techniques comprehensively. However, there is a significant gap +in the literature regarding a systematic and thorough review of these +techniques. This survey provides a comprehensive overview of model merging +methods and theories, their applications in various domains and settings, and +future research directions. Specifically, we first propose a new taxonomic +approach that exhaustively discusses existing model merging methods. Secondly, +we discuss the application of model merging techniques in large language +models, multimodal large language models, and 10+ machine learning subfields, +including continual learning, multi-task learning, few-shot learning, etc. +Finally, we highlight the remaining challenges of model merging and discuss +future research directions. A comprehensive list of papers about model merging +is available at +\url{https://github.com/EnnengYang/Awesome-Model-Merging-Methods-Theories-Applications}. + +
+
+
+
+
+ + ☆ Interpretable Graph Neural Networks for Heterogeneous Tabular Data + + +
+ Many machine learning algorithms for tabular data produce black-box models, +which prevent users from understanding the rationale behind the model +predictions. In their unconstrained form, graph neural networks fall into this +category, and they have further limited abilities to handle heterogeneous data. +To overcome these limitations, an approach is proposed, called IGNH +(Interpretable Graph Neural Network for Heterogeneous tabular data), which +handles both categorical and numerical features, while constraining the +learning process to generate exact feature attributions together with the +predictions. A large-scale empirical investigation is presented, showing that +the feature attributions provided by IGNH align with Shapley values that are +computed post hoc. Furthermore, the results show that IGNH outperforms two +powerful machine learning algorithms for tabular data, Random Forests and +TabNet, while reaching a similar level of performance as XGBoost. + +
+
+ comment: Accepted at 27th International Conference on Discovery Science 2024 +
+
+
+
+
+ + ☆ Off-Policy Reinforcement Learning with High Dimensional Reward + + +
+ Conventional off-policy reinforcement learning (RL) focuses on maximizing the +expected return of scalar rewards. Distributional RL (DRL), in contrast, +studies the distribution of returns with the distributional Bellman operator in +a Euclidean space, leading to highly flexible choices for utility. This paper +establishes robust theoretical foundations for DRL. We prove the contraction +property of the Bellman operator even when the reward space is an +infinite-dimensional separable Banach space. Furthermore, we demonstrate that +the behavior of high- or infinite-dimensional returns can be effectively +approximated using a lower-dimensional Euclidean space. Leveraging these +theoretical insights, we propose a novel DRL algorithm that tackles problems +which have been previously intractable using conventional reinforcement +learning approaches. + +
+
+ comment: 24 pages, 12 figures +
+
+
+
+
+ + ☆ Graph Triple Attention Network: A Decoupled Perspective + + +
+ Graph Transformers (GTs) have recently achieved significant success in the +graph domain by effectively capturing both long-range dependencies and graph +inductive biases. However, these methods face two primary challenges: (1) +multi-view chaos, which results from coupling multi-view information +(positional, structural, attribute), thereby impeding flexible usage and the +interpretability of the propagation process. (2) local-global chaos, which +arises from coupling local message passing with global attention, leading to +issues of overfitting and over-globalizing. To address these challenges, we +propose a high-level decoupled perspective of GTs, breaking them down into +three components and two interaction levels: positional attention, structural +attention, and attribute attention, alongside local and global interaction. +Based on this decoupled perspective, we design a decoupled graph triple +attention network named DeGTA, which separately computes multi-view attentions +and adaptively integrates multi-view local and global information. This +approach offers three key advantages: enhanced interpretability, flexible +design, and adaptive integration of local and global information. Through +extensive experiments, DeGTA achieves state-of-the-art performance across +various datasets and tasks, including node classification and graph +classification. Comprehensive ablation studies demonstrate that decoupling is +essential for improving performance and enhancing interpretability. Our code is +available at: https://github.com/wangxiaotang0906/DeGTA + +
+
+
+
+
+ + ☆ Adaptive Behavioral AI: Reinforcement Learning to Enhance Pharmacy + Services KDD 2024 + + +
+ Pharmacies are critical in healthcare systems, particularly in low- and
+middle-income countries. Providing pharmacists with the right behavioral
+interventions or nudges can enhance their skills, public health awareness, and
+pharmacy inventory management, ensuring access to essential medicines that
+ultimately benefit their patients. We introduce a reinforcement learning
+operational system to deliver personalized behavioral interventions through
+mobile health applications. We illustrate its potential by discussing a series
+of initial experiments run with SwipeRx, an all-in-one app for pharmacists,
+including B2B e-commerce, in Indonesia. The proposed method has broader
+applications extending beyond pharmacy operations to optimize healthcare
+delivery.
+ 
+
+
+ comment: Presented at The First Workshop on AI Behavioral Science (AIBS'24) at + KDD 2024, August 25, Barcelona, Spain +
+
+
+
+
+ + ☆ SigmaRL: A Sample-Efficient and Generalizable Multi-Agent Reinforcement + Learning Framework for Motion Planning SC + + +
+ This paper introduces an open-source, decentralized framework named SigmaRL, +designed to enhance both sample efficiency and generalization of multi-agent +Reinforcement Learning (RL) for motion planning of connected and automated +vehicles. Most RL agents exhibit a limited capacity to generalize, often +focusing narrowly on specific scenarios, and are usually evaluated in similar +or even the same scenarios seen during training. Various methods have been +proposed to address these challenges, including experience replay and +regularization. However, how observation design in RL affects sample efficiency +and generalization remains an under-explored area. We address this gap by +proposing five strategies to design information-dense observations, focusing on +general features that are applicable to most traffic scenarios. We train our RL +agents using these strategies on an intersection and evaluate their +generalization through numerical experiments across completely unseen traffic +scenarios, including a new intersection, an on-ramp, and a roundabout. +Incorporating these information-dense observations reduces training times to +under one hour on a single CPU, and the evaluation results reveal that our RL +agents can effectively zero-shot generalize. Code: +github.com/cas-lab-munich/SigmaRL + +
+
+ comment: 8 pages, 5 figures, accepted for presentation at the IEEE + International Conference on Intelligent Transportation Systems (ITSC) 2024 +
+
+
+
+
+ + ☆ Drug Discovery SMILES-to-Pharmacokinetics Diffusion Models with Deep + Molecular Understanding + + +
+ Artificial intelligence (AI) is increasingly used in every stage of drug
+development. One challenge facing drug discovery AI is that drug
+pharmacokinetic (PK) datasets are often collected independently from each
+other, frequently with limited overlap, creating data overlap sparsity. Data
+sparsity makes data curation difficult for researchers looking to answer
+research questions in poly-pharmacy, drug combination research, and
+high-throughput screening. We propose Imagand, a novel
+SMILES-to-Pharmacokinetic (S2PK) diffusion model capable of generating an array
+of PK target properties conditioned on SMILES inputs. We show that
+Imagand-generated synthetic PK data closely resembles the univariate and
+bivariate distributions of real data and improves performance on downstream
+tasks. Imagand is a promising solution for data overlap sparsity and allows
+researchers to efficiently generate ligand PK data for drug discovery research.
+Code is available at \url{https://github.com/bing1100/Imagand}.
+ 
+
+
+ comment: 13 pages, 5 figures, 4 tables +
+
+
+
+
+ + ☆ Towards Fair and Rigorous Evaluations: Hyperparameter Optimization for + Top-N Recommendation Task with Implicit Feedback + + +
+ The widespread use of the internet has led to an overwhelming amount of data, +which has resulted in the problem of information overload. Recommender systems +have emerged as a solution to this problem by providing personalized +recommendations to users based on their preferences and historical data. +However, as recommendation models become increasingly complex, finding the best +hyperparameter combination for different models has become a challenge. The +high-dimensional hyperparameter search space poses numerous challenges for +researchers, and failure to disclose hyperparameter settings may impede the +reproducibility of research results. In this paper, we investigate the Top-N +implicit recommendation problem and focus on optimizing the benchmark +recommendation algorithm commonly used in comparative experiments using +hyperparameter optimization algorithms. We propose a research methodology that +follows the principles of a fair comparison, employing seven types of +hyperparameter search algorithms to fine-tune six common recommendation +algorithms on three datasets. We have identified the most suitable +hyperparameter search algorithms for various recommendation algorithms on +different types of datasets as a reference for later study. This study +contributes to algorithmic research in recommender systems based on +hyperparameter optimization, providing a fair basis for comparison. + +
+
+
+
+
+ + ☆ Optimizing HIV Patient Engagement with Reinforcement Learning in + Resource-Limited Settings KDD + + +
+ By providing evidence-based clinical decision support, digital tools and +electronic health records can revolutionize patient management, especially in +resource-poor settings where fewer health workers are available and often need +more training. When these tools are integrated with AI, they can offer +personalized support and adaptive interventions, effectively connecting +community health workers (CHWs) and healthcare facilities. The CHARM (Community +Health Access & Resource Management) app is an AI-native mobile app for CHWs. +Developed through a joint partnership of Causal Foundry (CF) and +mothers2mothers (m2m), CHARM empowers CHWs, mainly local women, by streamlining +case management, enhancing learning, and improving communication. This paper +details CHARM's development, integration, and upcoming reinforcement +learning-based adaptive interventions, all aimed at enhancing health worker +engagement, efficiency, and patient outcomes, thereby enhancing CHWs' +capabilities and community health. + +
+
+ comment: Presented at the 7th epiDAMIK ACM SIGKDD International Workshop on + Epidemiology meets Data Mining and Knowledge Discovery, August 26, 2024, + Barcelona, Spain +
+
+
+
+
+ + ☆ Battery GraphNets : Relational Learning for Lithium-ion Batteries(LiBs) + Life Estimation NeurIPS 2022 + + +
+ Battery life estimation is critical for optimizing battery performance and +guaranteeing minimal degradation for better efficiency and reliability of +battery-powered systems. The existing methods to predict the Remaining Useful +Life(RUL) of Lithium-ion Batteries (LiBs) neglect the relational dependencies +of the battery parameters to model the nonlinear degradation trajectories. We +present the Battery GraphNets framework that jointly learns to incorporate a +discrete dependency graph structure between battery parameters to capture the +complex interactions and the graph-learning algorithm to model the intrinsic +battery degradation for RUL prognosis. The proposed method outperforms several +popular methods by a significant margin on publicly available battery datasets +and achieves SOTA performance. We report the ablation studies to support the +efficacy of our approach. + +
+
+ comment: Accepted in Workshop on Graph Learning for Industrial Applications : + Finance, Crime Detection, Medicine, and Social Media (NeurIPS 2022) +
+
+
+
+
+ + ☆ Latent Anomaly Detection Through Density Matrices + + +
+ This paper introduces a novel anomaly detection framework that combines the
+robust statistical principles of density-estimation-based anomaly detection
+methods with the representation-learning capabilities of deep learning models.
+The method originating from this framework is presented in two versions: a
+shallow approach employing a density-estimation model based on adaptive Fourier
+features and density matrices, and a deep approach that integrates an
+autoencoder to learn a low-dimensional representation of the data. By
+estimating the density of new samples, both methods are able to compute
+normality scores. The methods can be seamlessly integrated into an end-to-end
+architecture and optimized using gradient-based optimization techniques. To
+evaluate their performance, extensive experiments were conducted on various
+benchmark datasets. The results demonstrate that both versions of the method
+can achieve comparable or superior performance when compared to other
+state-of-the-art methods. Notably, the shallow approach performs better on
+datasets with fewer dimensions, while the autoencoder-based approach shows
+improved performance on datasets with higher dimensions.
+ 
+
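+ A rough sketch of the shallow variant's scoring idea as we read it (random
+Fourier features plus a density matrix; not the authors' code): training
+points are mapped to unit-norm Fourier features, averaged into a density
+matrix, and a new sample's normality score is a quadratic form against that
+matrix.
+
+# Rough density-matrix normality score with random Fourier features (illustrative).
+import numpy as np
+
+class RFFDensityScorer:
+    def __init__(self, dim, n_features=256, gamma=1.0, seed=0):
+        rng = np.random.default_rng(seed)
+        self.W = rng.normal(scale=np.sqrt(2 * gamma), size=(dim, n_features))
+        self.b = rng.uniform(0, 2 * np.pi, size=n_features)
+
+    def _phi(self, X):
+        Z = np.sqrt(2.0 / self.W.shape[1]) * np.cos(X @ self.W + self.b)
+        return Z / np.linalg.norm(Z, axis=1, keepdims=True)   # unit-norm features
+
+    def fit(self, X_train):
+        Phi = self._phi(X_train)
+        self.rho = Phi.T @ Phi / len(X_train)                  # density matrix
+        return self
+
+    def score(self, X):
+        Phi = self._phi(X)
+        # Higher value = more "normal"; low scores flag anomalies.
+        return np.einsum("nd,df,nf->n", Phi, self.rho, Phi)
+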
+
+ comment: arXiv admin note: substantial text overlap with arXiv:2211.08525 +
+
+
+
+
+ + ☆ "How Big is Big Enough?" Adjusting Model Size in Continual Gaussian + Processes + + +
+ For many machine learning methods, creating a model requires setting a +parameter that controls the model's capacity before training, e.g.~number of +neurons in DNNs, or inducing points in GPs. Increasing capacity improves +performance until all the information from the dataset is captured. After this +point, computational cost keeps increasing, without improved performance. This +leads to the question ``How big is big enough?'' We investigate this problem +for Gaussian processes (single-layer neural networks) in continual learning. +Here, data becomes available incrementally, and the final dataset size will +therefore not be known before training, preventing the use of heuristics for +setting the model size. We provide a method that automatically adjusts this, +while maintaining near-optimal performance, and show that a single +hyperparameter setting for our method performs well across datasets with a wide +range of properties. + +
+
+ comment: 9 pages main, 19 pages total, 9 figures, 3 tables, preprint +
+
+
+
+
+ + ☆ FedQUIT: On-Device Federated Unlearning via a Quasi-Competent Virtual + Teacher AAAI + + +
+ Federated Learning (FL) promises better privacy guarantees for individuals'
+data when machine learning models are collaboratively trained. When an FL
+participant exercises its right to be forgotten, i.e., to detach from the FL
+framework it has participated in and to remove its past contributions to the
+global model, the FL solution should perform all the necessary steps to make
+this possible without sacrificing the overall performance of the global model,
+a capability that state-of-the-art solutions do not currently support. In this
+paper, we propose FedQUIT, a novel algorithm that uses knowledge distillation
+to scrub the contribution of the forgetting data from an FL global model while
+preserving its generalization ability. FedQUIT works directly on clients'
+devices and does not require sharing additional information compared with a
+regular FL process, nor does it assume the availability of publicly available
+proxy data. Our solution is efficient, effective, and applicable in both
+centralized and federated settings. Our experimental results show that, on
+average, FedQUIT requires less than 2.5% additional communication rounds to
+recover generalization performance after unlearning, obtaining a sanitized
+global model whose predictions are comparable to those of a global model that
+has never seen the data to be forgotten.
+ 
+
+
+ comment: Submitted to The 39th Annual AAAI Conference on Artificial + Intelligence (AAAI-25) +
+
+
+
+
+ + ☆ Theoretical and Practical Progress in Hyperspectral Pixel Unmixing with + Large Spectral Libraries from a Sparse Perspective + + +
+ Hyperspectral unmixing is the process of determining the presence of
+individual materials and their respective abundances from an observed pixel
+spectrum. Unmixing is a fundamental process in hyperspectral image analysis,
+and is growing in importance as increasingly large spectral libraries are
+created and used. Unmixing is typically done with ordinary least squares (OLS)
+regression. However, when unmixing with large spectral libraries in which the
+materials present in a pixel are not known a priori, solving for the OLS
+coefficients requires inverting a non-invertible matrix. A number of regression
+methods are available that can produce a numerical solution using
+regularization, but with considerably varied effectiveness. Also, simple
+methods that are unpopular in the statistics literature (e.g., step-wise
+regression) are used with some level of effectiveness in hyperspectral
+analysis. In this paper, we provide a thorough performance evaluation of the
+methods considered, assessing them based on how often they select the correct
+materials in the models. Investigated methods include ordinary least squares
+regression, non-negative least squares regression, ridge regression, lasso
+regression, step-wise regression, and Bayesian model averaging. We evaluated
+these unmixing approaches using multiple criteria: incorporation of
+non-negative abundances, model size, accurate mineral detection, and root mean
+squared error (RMSE). We provide a taxonomy of the regression methods, showing
+that most methods can be understood as Bayesian methods with specific priors.
+We conclude that methods derived with priors corresponding to the phenomenology
+of hyperspectral imagery outperform those with priors that are optimal for
+prediction performance under the assumptions of ordinary least squares linear
+regression.
+ 
+
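+ For concreteness, a small sketch of one baseline discussed above:
+non-negative least squares unmixing of a pixel spectrum against a spectral
+library (generic, not the paper's evaluation code; the abundance threshold is
+a placeholder).
+
+# Non-negative least squares (NNLS) unmixing of a pixel spectrum (illustrative).
+import numpy as np
+from scipy.optimize import nnls
+
+def unmix_pixel(library, pixel, min_abundance=1e-3):
+    # library: (n_bands, n_endmembers) endmember spectra; pixel: (n_bands,) spectrum
+    abundances, residual_norm = nnls(library, pixel)
+    selected = np.flatnonzero(abundances > min_abundance)   # materials deemed present
+    return abundances, selected, residual_norm
+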
+
+
+
+
+ + ☆ TabularBench: Benchmarking Adversarial Robustness for Tabular Deep + Learning in Real-world Use-cases + + +
+ While adversarial robustness in computer vision is a mature research field, +fewer researchers have tackled the evasion attacks against tabular deep +learning, and even fewer investigated robustification mechanisms and reliable +defenses. We hypothesize that this lag in the research on tabular adversarial +attacks is in part due to the lack of standardized benchmarks. To fill this +gap, we propose TabularBench, the first comprehensive benchmark of robustness +of tabular deep learning classification models. We evaluated adversarial +robustness with CAA, an ensemble of gradient and search attacks which was +recently demonstrated as the most effective attack against a tabular model. In +addition to our open benchmark (https://github.com/serval-uni-lu/tabularbench) +where we welcome submissions of new models and defenses, we implement 7 +robustification mechanisms inspired by state-of-the-art defenses in computer +vision and propose the largest benchmark of robust tabular deep learning over +200 models across five critical scenarios in finance, healthcare and security. +We curated real datasets for each use case, augmented with hundreds of +thousands of realistic synthetic inputs, and trained and assessed our models +with and without data augmentations. We open-source our library that provides +API access to all our pre-trained robust tabular models, and the largest +datasets of real and synthetic tabular inputs. Finally, we analyze the impact +of various defenses on the robustness and provide actionable insights to design +new defenses and robustification mechanisms. + +
+
+
+
+
+ + ☆ A Nested Graph Reinforcement Learning-based Decision-making Strategy for + Eco-platooning + + +
+ Platooning technology is renowned for its precise vehicle control, traffic +flow optimization, and energy efficiency enhancement. However, in large-scale +mixed platoons, vehicle heterogeneity and unpredictable traffic conditions lead +to virtual bottlenecks. These bottlenecks result in reduced traffic throughput +and increased energy consumption within the platoon. To address these +challenges, we introduce a decision-making strategy based on nested graph +reinforcement learning. This strategy improves collaborative decision-making, +ensuring energy efficiency and alleviating congestion. We propose a theory of +nested traffic graph representation that maps dynamic interactions between +vehicles and platoons in non-Euclidean spaces. By incorporating spatio-temporal +weighted graph into a multi-head attention mechanism, we further enhance the +model's capacity to process both local and global data. Additionally, we have +developed a nested graph reinforcement learning framework to enhance the +self-iterative learning capabilities of platooning. Using the I-24 dataset, we +designed and conducted comparative algorithm experiments, generalizability +testing, and permeability ablation experiments, thereby validating the proposed +strategy's effectiveness. Compared to the baseline, our strategy increases +throughput by 10% and decreases energy use by 9%. Specifically, increasing the +penetration rate of CAVs significantly enhances traffic throughput, though it +also increases energy consumption. + +
+
+ comment: 14 pages, 18 figures +
+
+
+
+
+ + ☆ Multi-task Heterogeneous Graph Learning on Electronic Health Records + + +
+ Learning electronic health records (EHRs) has received emerging attention +because of its capability to facilitate accurate medical diagnosis. Since the +EHRs contain enriched information specifying complex interactions between +entities, modeling EHRs with graphs is shown to be effective in practice. The +EHRs, however, present a great degree of heterogeneity, sparsity, and +complexity, which hamper the performance of most of the models applied to them. +Moreover, existing approaches modeling EHRs often focus on learning the +representations for a single task, overlooking the multi-task nature of EHR +analysis problems and resulting in limited generalizability across different +tasks. In view of these limitations, we propose a novel framework for EHR +modeling, namely MulT-EHR (Multi-Task EHR), which leverages a heterogeneous +graph to mine the complex relations and model the heterogeneity in the EHRs. To +mitigate the large degree of noise, we introduce a denoising module based on +the causal inference framework to adjust for severe confounding effects and +reduce noise in the EHR data. Additionally, since our model adopts a single +graph neural network for simultaneous multi-task prediction, we design a +multi-task learning module to leverage the inter-task knowledge to regularize +the training process. Extensive empirical studies on MIMIC-III and MIMIC-IV +datasets validate that the proposed method consistently outperforms the +state-of-the-art designs in four popular EHR analysis tasks -- drug +recommendation, and predictions of the length of stay, mortality, and +readmission. Thorough ablation studies demonstrate the robustness of our method +upon variations to key components and hyperparameters. + +
+
+ comment: Accepted by Neural Networks +
+
+
+
+
+ + ☆ Sonic: Fast and Transferable Data Poisoning on Clustering Algorithms + + +
+ Data poisoning attacks on clustering algorithms have received limited +attention, with existing methods struggling to scale efficiently as dataset +sizes and feature counts increase. These attacks typically require +re-clustering the entire dataset multiple times to generate predictions and +assess the attacker's objectives, significantly hindering their scalability. +This paper addresses these limitations by proposing Sonic, a novel genetic data +poisoning attack that leverages incremental and scalable clustering algorithms, +e.g., FISHDBC, as surrogates to accelerate poisoning attacks against +graph-based and density-based clustering methods, such as HDBSCAN. We +empirically demonstrate the effectiveness and efficiency of Sonic in poisoning +the target clustering algorithms. We then conduct a comprehensive analysis of +the factors affecting the scalability and transferability of poisoning attacks +against clustering algorithms, and we conclude by examining the robustness of +hyperparameters in our attack strategy Sonic. + +
+
+ comment: preprint paper +
+
+
+
+
+ + ☆ PolyCL: Contrastive Learning for Polymer Representation Learning via + Explicit and Implicit Augmentations + + +
+ Polymers play a crucial role in a wide array of applications due to their +diverse and tunable properties. Establishing the relationship between polymer +representations and their properties is crucial to the computational design and +screening of potential polymers via machine learning. The quality of the +representation significantly influences the effectiveness of these +computational methods. Here, we present a self-supervised contrastive learning +paradigm, PolyCL, for learning high-quality polymer representation without the +need for labels. Our model combines explicit and implicit augmentation +strategies for improved learning performance. The results demonstrate that our +model achieves either better, or highly competitive, performances on transfer +learning tasks as a feature extractor without an overcomplicated training +strategy or hyperparameter optimisation. Further enhancing the efficacy of our +model, we conducted extensive analyses on various augmentation combinations +used in contrastive learning. This led to identifying the most effective +combination to maximise PolyCL's performance. + +
+
+
+
+
+ + ☆ PeriodWave: Multi-Period Flow Matching for High-Fidelity Waveform + Generation + + +
+ Recently, universal waveform generation tasks have been investigated +conditioned on various out-of-distribution scenarios. Although GAN-based +methods have shown their strength in fast waveform generation, they are +vulnerable to train-inference mismatch scenarios such as two-stage +text-to-speech. Meanwhile, diffusion-based models have shown their powerful +generative performance in other domains; however, they stay out of the +limelight due to slow inference speed in waveform generation tasks. Above all, +there is no generator architecture that can explicitly disentangle the natural +periodic features of high-resolution waveform signals. In this paper, we +propose PeriodWave, a novel universal waveform generation model. First, we +introduce a period-aware flow matching estimator that can capture the periodic +features of the waveform signal when estimating the vector fields. +Additionally, we utilize a multi-period estimator that avoids overlaps to +capture different periodic features of waveform signals. Although increasing +the number of periods can improve the performance significantly, this requires +more computational costs. To reduce this issue, we also propose a single +period-conditional universal estimator that can feed-forward parallel by +period-wise batch inference. Additionally, we utilize discrete wavelet +transform to losslessly disentangle the frequency information of waveform +signals for high-frequency modeling, and introduce FreeU to reduce the +high-frequency noise for waveform generation. The experimental results +demonstrated that our model outperforms the previous models both in +Mel-spectrogram reconstruction and text-to-speech tasks. All source code will +be available at \url{https://github.com/sh-lee-prml/PeriodWave}. + +
+
+ comment: 24 pages, 16 tables, 4 figures +
+
+
+
+
+ + ☆ $χ$SPN: Characteristic Interventional Sum-Product Networks for Causal + Inference in Hybrid Domains UAI + + +
+ Causal inference in hybrid domains, characterized by a mixture of discrete
+and continuous variables, presents a formidable challenge. We take a step in
+this direction and propose the Characteristic Interventional Sum-Product
+Network ($\chi$SPN), which is capable of estimating interventional
+distributions in the presence of random variables drawn from mixed
+distributions. $\chi$SPN uses characteristic functions in the leaves of an
+interventional SPN (iSPN), thereby providing a unified view of discrete and
+continuous random variables through the Fourier-Stieltjes transform of the
+probability measures. A neural network is used to estimate the parameters of
+the learned iSPN using the intervened data. Our experiments on 3 synthetic
+heterogeneous datasets suggest that $\chi$SPN can effectively capture the
+interventional distributions for both discrete and continuous variables while
+being expressive and causally adequate. We also show that $\chi$SPN generalizes
+to multiple interventions while being trained only on single-intervention data.
+ 
+
+
+ comment: 17 pages, 11 figures. Accepted as poster at UAI (Uncertainty in + Artificial Intelligence) 2024 +
+
+
+
+
+ + ☆ New Curriculum, New Chance -- Retrieval Augmented Generation for Lesson + Planning in Ugandan Secondary Schools. Prototype Quality Evaluation + + +
+ Introduction: Poor educational quality in secondary schools is still regarded
+as one of the major struggles in 21st-century Uganda - especially in rural
+areas. Research identifies several problems, including low-quality or absent
+teacher lesson planning. As the government pushes towards the implementation of
+a new curriculum, existing lesson plans become obsolete and the problem is
+worsened. Using a Retrieval Augmented Generation approach, we developed a
+prototype that generates customized lesson plans based on the
+government-accredited textbooks. This helps teachers create lesson plans more
+efficiently and with better quality, ensuring they are fully aligned with the
+new curriculum and the competence-based learning approach.
+ Methods: The prototype was created using the Cohere LLM, Sentence Embeddings,
+and the LangChain Framework - and thereafter made available on a public
+website. Vector stores were trained for three new curriculum textbooks (ICT,
+Mathematics, History), all at Secondary 1 Level. Twenty-four lesson plans were
+generated following a pseudo-random generation protocol, based on the suggested
+periods in the textbooks. The lesson plans were analyzed regarding their
+technical quality by three independent raters following the Lesson Plan
+Analysis Protocol (LPAP) by Ndihokubwayo et al. (2022) that is specifically
+designed for East Africa and competence-based curricula.
+ Results: Evaluation of the 24 lesson plans using the LPAP resulted in an
+average quality of between 75 and 80%, corresponding to a "very good lesson
+plan". None of the lesson plans scored below 65%, although one lesson plan
+could be argued to have missed the topic. In conclusion, the quality of the
+generated lesson plans is at least comparable to, if not better than, that of
+lesson plans created by humans, as demonstrated in a study in Rwanda in which
+no lesson plan even reached the benchmark of 50%.
+ 
+
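+ Illustrative sketch of the retrieval-augmented generation pipeline described
+above, written with generic placeholder callables (`embed`, `generate`) rather
+than the Cohere/LangChain calls actually used, since those are not shown here.
+
+# Minimal RAG sketch with placeholder embedding/generation functions (illustrative).
+import numpy as np
+
+def retrieve(query, chunks, embed, top_k=4):
+    # embed: callable mapping a list of strings to a (n, d) array of embeddings
+    vecs = embed(chunks + [query])
+    vecs = vecs / np.linalg.norm(vecs, axis=1, keepdims=True)
+    sims = vecs[:-1] @ vecs[-1]                  # cosine similarity to the query
+    return [chunks[i] for i in np.argsort(-sims)[:top_k]]
+
+def generate_lesson_plan(topic, textbook_chunks, embed, generate):
+    context = "\n\n".join(retrieve(topic, textbook_chunks, embed))
+    prompt = (
+        "Using only the curriculum excerpts below, draft a competence-based "
+        f"lesson plan for the topic: {topic}\n\n{context}"
+    )
+    return generate(prompt)   # generate: callable wrapping the LLM of choice
+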
+
+ comment: Presented at Ndejje University Second Annual Research Dissemination + Symposium 2024 +
+
+
+
+
+ + ☆ Development of a Multi-Agent Clinical Decision Support System for Korean + Triage and Acuity Scale (KTAS)-Based Triage and Treatment Planning in + Emergency Departments + + +
+ Emergency department (ED) overcrowding and the complexity of rapid +decision-making in critical care settings pose significant challenges to +healthcare systems worldwide. While clinical decision support systems (CDSS) +have shown promise, the integration of large language models (LLMs) offers new +possibilities for enhancing triage accuracy and clinical decision-making. This +study presents an LLM-driven CDSS designed to assist ED physicians and nurses +in patient triage, treatment planning, and overall emergency care management. + We developed a multi-agent CDSS utilizing Llama-3-70b as the base LLM, +orchestrated by CrewAI and Langchain. The system comprises four AI agents +emulating key ED roles: Triage Nurse, Emergency Physician, Pharmacist, and ED +Coordinator. It incorporates the Korean Triage and Acuity Scale (KTAS) for +triage assessment and integrates with the RxNorm API for medication management. + The model was evaluated using the Asclepius dataset, with performance +assessed by a clinical emergency medicine specialist. The CDSS demonstrated +high accuracy in triage decision-making compared to the baseline of a +single-agent system. Furthermore, the system exhibited strong performance in +critical areas, including primary diagnosis, critical findings identification, +disposition decision-making, treatment planning, and resource allocation. + Our multi-agent CDSS demonstrates significant potential for supporting +comprehensive emergency care management. By leveraging state-of-the-art AI +technologies, this system offers a scalable and adaptable tool that could +enhance emergency medical care delivery, potentially alleviating ED +overcrowding and improving patient outcomes. This work contributes to the +growing field of AI applications in emergency medicine and offers a promising +direction for future research and clinical implementation. + +
+
+
+
+
+ + ☆ Learning-based Models for Vulnerability Detection: An Extensive Study + + +
+ Though many deep learning-based models have made great progress in
+vulnerability detection, we have no good understanding of these models, which
+limits the further advancement of model capability, understanding of the
+mechanism of model detection, and the efficiency and safety of practical
+applications of the models. In this paper, we extensively and comprehensively
+investigate two types of state-of-the-art learning-based approaches
+(sequence-based and graph-based) by conducting experiments on a recently built
+large-scale dataset. We investigate seven research questions along five
+dimensions, namely model capabilities, model interpretation, model stability,
+ease of use, and model economy. We experimentally demonstrate the superiority
+of sequence-based models and the limited abilities of both LLMs (ChatGPT) and
+graph-based models. We explore the types of vulnerabilities that learning-based
+models are skilled at detecting and reveal the instability of the models when
+inputs are subtly changed in a semantics-preserving way. We empirically explain
+what the models have learned. We summarize the pre-processing as well as the
+requirements for easily using the models. Finally, we distill the key
+information needed for the economical and safe practical use of these models.
+ 
+
+
+ comment: 13 pages, 5 figures +
+
+
+
+
+ + ☆ Optimising MFCC parameters for the automatic detection of respiratory + diseases + + +
+ Voice signals originating from the respiratory tract are utilized as valuable +acoustic biomarkers for the diagnosis and assessment of respiratory diseases. +Among the employed acoustic features, Mel Frequency Cepstral Coefficients +(MFCC) is widely used for automatic analysis, with MFCC extraction commonly +relying on default parameters. However, no comprehensive study has +systematically investigated the impact of MFCC extraction parameters on +respiratory disease diagnosis. In this study, we address this gap by examining +the effects of key parameters, namely the number of coefficients, frame length, +and hop length between frames, on respiratory condition examination. Our +investigation uses four datasets: the Cambridge COVID-19 Sound database, the +Coswara dataset, the Saarbrucken Voice Disorders (SVD) database, and a TACTICAS +dataset. The Support Vector Machine (SVM) is employed as the classifier, given +its widespread adoption and efficacy. Our findings indicate that the accuracy +of MFCC decreases as hop length increases, and the optimal number of +coefficients is observed to be approximately 30. The performance of MFCC varies +with frame length across the datasets: for the COVID-19 datasets (Cambridge +COVID-19 Sound database and Coswara dataset), performance declines with longer +frame lengths, while for the SVD dataset, performance improves with increasing +frame length (from 50 ms to 500 ms). Furthermore, we investigate the optimized +combination of these parameters and observe substantial enhancements in +accuracy. Compared to the worst combination, the SVM model achieves an accuracy +of 81.1%, 80.6%, and 71.7%, with improvements of 19.6%, 16.10%, and 14.90% for +the Cambridge COVID-19 Sound database, the Coswara dataset, and the SVD dataset +respectively. + +
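+ Illustrative sketch of the kind of MFCC parameter sweep described above,
+using librosa-style extraction and an SVM; the audio signals, labels, and the
+specific parameter values are assumed to be supplied by the caller.
+
+# Sketch of an MFCC parameter sweep with an SVM classifier (illustrative).
+import numpy as np
+import librosa
+from sklearn.svm import SVC
+from sklearn.model_selection import cross_val_score
+
+def mfcc_features(signals, sr, n_mfcc, frame_ms, hop_ms):
+    n_fft = int(sr * frame_ms / 1000)
+    hop = int(sr * hop_ms / 1000)
+    feats = []
+    for y in signals:
+        m = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc,
+                                 n_fft=n_fft, hop_length=hop)
+        feats.append(m.mean(axis=1))          # average coefficients over frames
+    return np.vstack(feats)
+
+def sweep(signals, labels, sr=16000):
+    results = {}
+    for n_mfcc in (13, 20, 30, 40):
+        for frame_ms in (25, 50, 100):
+            X = mfcc_features(signals, sr, n_mfcc, frame_ms, hop_ms=frame_ms // 2)
+            acc = cross_val_score(SVC(), X, labels, cv=5).mean()
+            results[(n_mfcc, frame_ms)] = acc
+    return results
+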
+
+
+
+
+ + ☆ Protected Test-Time Adaptation via Online Entropy Matching: A Betting + Approach + + +
+ We present a novel approach for test-time adaptation via online +self-training, consisting of two components. First, we introduce a statistical +framework that detects distribution shifts in the classifier's entropy values +obtained on a stream of unlabeled samples. Second, we devise an online +adaptation mechanism that utilizes the evidence of distribution shifts captured +by the detection tool to dynamically update the classifier's parameters. The +resulting adaptation process drives the distribution of test entropy values +obtained from the self-trained classifier to match those of the source domain, +building invariance to distribution shifts. This approach departs from the +conventional self-training method, which focuses on minimizing the classifier's +entropy. Our approach combines concepts in betting martingales and online +learning to form a detection tool capable of quickly reacting to distribution +shifts. We then reveal a tight relation between our adaptation scheme and +optimal transport, which forms the basis of our novel self-supervised loss. +Experimental results demonstrate that our approach improves test-time accuracy +under distribution shifts while maintaining accuracy and calibration in their +absence, outperforming leading entropy minimization methods across various +scenarios. + +
+
+
+
+
+ + ☆ Decoder ensembling for learned latent geometries + + +
+ Latent space geometry provides a rigorous and empirically valuable framework +for interacting with the latent variables of deep generative models. This +approach reinterprets Euclidean latent spaces as Riemannian through a pull-back +metric, allowing for a standard differential geometric analysis of the latent +space. Unfortunately, data manifolds are generally compact and easily +disconnected or filled with holes, suggesting a topological mismatch to the +Euclidean latent space. The most established solution to this mismatch is to +let uncertainty be a proxy for topology, but in neural network models, this is +often realized through crude heuristics that lack principle and generally do +not scale to high-dimensional representations. We propose using ensembles of +decoders to capture model uncertainty and show how to easily compute geodesics +on the associated expected manifold. Empirically, we find this simple and +reliable, thereby coming one step closer to easy-to-use latent geometries. + +
+
+ comment: International Conference on Machine Learning, ELLIS Workshop on + Geometry-grounded Representation Learning and Generative Modeling +
+
+
+
+
+ + ☆ Faster Stochastic Optimization with Arbitrary Delays via Asynchronous + Mini-Batching + + +
+ We consider the problem of asynchronous stochastic optimization, where an +optimization algorithm makes updates based on stale stochastic gradients of the +objective that are subject to an arbitrary (possibly adversarial) sequence of +delays. We present a procedure which, for any given $q \in (0,1]$, transforms +any standard stochastic first-order method to an asynchronous method with +convergence guarantee depending on the $q$-quantile delay of the sequence. This +approach leads to convergence rates of the form $O(\tau_q/qT+\sigma/\sqrt{qT})$ +for non-convex and $O(\tau_q^2/(q T)^2+\sigma/\sqrt{qT})$ for convex smooth +problems, where $\tau_q$ is the $q$-quantile delay, generalizing and improving +on existing results that depend on the average delay. We further show a method +that automatically adapts to all quantiles simultaneously, without any prior +knowledge of the delays, achieving convergence rates of the form $O(\inf_{q} +\tau_q/qT+\sigma/\sqrt{qT})$ for non-convex and $O(\inf_{q} \tau_q^2/(q +T)^2+\sigma/\sqrt{qT})$ for convex smooth problems. Our technique is based on +asynchronous mini-batching with a careful batch-size selection and filtering of +stale gradients. + +
+
+ comment: 18 pages +
+
+
+
+
+ + ☆ QirK: Question Answering via Intermediate Representation on Knowledge + Graphs + + +
+ We demonstrate QirK, a system for answering natural language questions on +Knowledge Graphs (KG). QirK can answer structurally complex questions that are +still beyond the reach of emerging Large Language Models (LLMs). It does so +using a unique combination of database technology, LLMs, and semantic search +over vector embeddings. The glue for these components is an intermediate +representation (IR). The input question is mapped to IR using LLMs, which is +then repaired into a valid relational database query with the aid of a semantic +search on vector embeddings. This allows a practical synthesis of LLM +capabilities and KG reliability. + A short video demonstrating QirK is available at +https://youtu.be/6c81BLmOZ0U. + +
+
+
+
+
+ + ☆ Adaptive Basis Function Selection for Computationally Efficient + Predictions + + +
+ Basis Function (BF) expansions are a cornerstone of any engineer's toolbox +for computational function approximation which shares connections with both +neural networks and Gaussian processes. Even though BF expansions are an +intuitive and straightforward model to use, they suffer from quadratic +computational complexity in the number of BFs if the predictive variance is to +be computed. We develop a method to automatically select the most important BFs +for prediction in a sub-domain of the model domain. This significantly reduces +the computational complexity of computing predictions while maintaining +predictive accuracy. The proposed method is demonstrated using two numerical +examples, where reductions up to 50-75% are possible without significantly +reducing the predictive accuracy. + +
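+ One possible reading of the idea above (not the paper's algorithm): for a
+given prediction sub-domain, keep only the basis functions whose weighted
+contribution over that sub-domain is largest, and predict with the reduced
+expansion. Gaussian basis functions and the selection rule are assumptions
+made for illustration.
+
+# Sketch: keep only locally important basis functions for prediction (illustrative).
+import numpy as np
+
+def select_basis(centers, lengthscale, weights, sub_domain, keep_frac=0.25):
+    # Gaussian basis functions evaluated on a grid covering the sub-domain.
+    grid = np.linspace(sub_domain[0], sub_domain[1], 50)[:, None]
+    Phi = np.exp(-0.5 * ((grid - centers[None, :]) / lengthscale) ** 2)
+    importance = np.abs(weights) * Phi.max(axis=0)   # per-BF contribution locally
+    k = max(1, int(keep_frac * len(centers)))
+    return np.argsort(-importance)[:k]
+
+def predict(x, centers, lengthscale, weights, active):
+    Phi = np.exp(-0.5 * ((x[:, None] - centers[None, active]) / lengthscale) ** 2)
+    return Phi @ weights[active]
+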
+
+ comment: 5 pages, accepted for publication in IEEE Signal Processing Letters +
+
+
+
+
+ + ☆ Unsupervised Blind Joint Dereverberation and Room Acoustics Estimation + with Diffusion Models + + +
+ This paper presents an unsupervised method for single-channel blind +dereverberation and room impulse response (RIR) estimation, called BUDDy. The +algorithm is rooted in Bayesian posterior sampling: it combines a likelihood +model enforcing fidelity to the reverberant measurement, and an anechoic speech +prior implemented by an unconditional diffusion model. We design a parametric +filter representing the RIR, with exponential decay for each frequency subband. +Room acoustics estimation and speech dereverberation are jointly carried out, +as the filter parameters are iteratively estimated and the speech utterance +refined along the reverse diffusion trajectory. In a blind scenario where the +room impulse response is unknown, BUDDy successfully performs speech +dereverberation in various acoustic scenarios, significantly outperforming +other blind unsupervised baselines. Unlike supervised methods, which often +struggle to generalize, BUDDy seamlessly adapts to different acoustic +conditions. This paper extends our previous work by offering new experimental +results and insights into the algorithm's performance and versatility. We first +investigate the robustness of informed dereverberation methods to RIR +estimation errors, to motivate the joint acoustic estimation and +dereverberation paradigm. Then, we demonstrate the adaptability of our method +to high-resolution singing voice dereverberation, study its performance in RIR +estimation, and conduct subjective evaluation experiments to validate the +perceptual quality of the results, among other contributions. Audio samples and +code can be found online. + +
+
+ comment: Submitted to IEEE/ACM Transactions on Audio, Speech and Language + Processing +
+
+
+
+
+ + ☆ Fact or Fiction? Improving Fact Verification with Knowledge Graphs + through Simplified Subgraph Retrievals + + +
+ Despite recent success in natural language processing (NLP), fact +verification remains a difficult task. Due to misinformation spreading +increasingly fast, attention has been directed towards automatically verifying +the correctness of claims. In the domain of NLP, this is usually done by +training supervised machine learning models to verify claims by utilizing +evidence from trustworthy corpora. We present efficient methods for verifying +claims on a dataset where the evidence is in the form of structured knowledge +graphs. We use the FactKG dataset, which is constructed from the DBpedia +knowledge graph extracted from Wikipedia. By simplifying the evidence retrieval +process from fine-tuned language models to simple logical retrievals, we are +able to construct models that both require fewer computational resources and +achieve better test-set accuracy. + +
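A minimal sketch of what a "simple logical retrieval" can look like — pulling the one-hop neighbourhood of the claim's entities from a triple store and passing those triples to a claim classifier — is given below; the toy triples and entities are illustrative and this is not the authors' pipeline.

```python
# Illustrative one-hop evidence retrieval from a tiny DBpedia-style triple store.
from collections import defaultdict

TRIPLES = [
    ("Alan_Bean", "occupation", "Test_pilot"),
    ("Alan_Bean", "mission", "Apollo_12"),
    ("Apollo_12", "operator", "NASA"),
]

index = defaultdict(list)
for head, rel, tail in TRIPLES:
    index[head].append((rel, tail))
    index[tail].append((f"~{rel}", head))   # inverse edge for undirected lookup

def one_hop_evidence(entities):
    """Collect all triples touching any claim entity (the one-hop subgraph)."""
    evidence = set()
    for e in entities:
        for rel, other in index[e]:
            evidence.add((e, rel, other))
    return sorted(evidence)

claim_entities = ["Alan_Bean", "Apollo_12"]
for triple in one_hop_evidence(claim_entities):
    print(triple)   # these triples would be verbalized and fed to the claim classifier
```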
+
+ comment: 10 pages, 3 figures, appendix +
+
+
+
+
+ + ☆ Achieving Data Efficient Neural Networks with Hybrid Concept-based + Models + + +
+ Most datasets used for supervised machine learning consist of a single label +per data point. However, in cases where more information than just the class +label is available, would it be possible to train models more efficiently? We +introduce two novel model architectures, which we call hybrid concept-based +models, that train using both class labels and additional information in the +dataset referred to as concepts. In order to thoroughly assess their +performance, we introduce ConceptShapes, an open and flexible class of datasets +with concept labels. We show that the hybrid concept-based models outperform +standard computer vision models and previously proposed concept-based models +with respect to accuracy, especially in sparse data settings. We also introduce +an algorithm for performing adversarial concept attacks, where an image is +perturbed in a way that does not change a concept-based model's concept +predictions, but changes the class prediction. The existence of such +adversarial examples raises questions about the interpretable qualities +promised by concept-based models. + +
+
+ comment: 11 pages, 8 figures, appendix +
+
+
+
+
+ + ☆ Real-world validation of safe reinforcement learning, model predictive + control and decision tree-based home energy management systems + + +
+ Recent advancements in machine learning based energy management approaches, +specifically reinforcement learning with a safety layer (OptLayerPolicy) and a +metaheuristic algorithm generating a decision tree control policy (TreeC), have +shown promise. However, their effectiveness has only been demonstrated in +computer simulations. This paper presents the real-world validation of these +methods, comparing them against model predictive control and a simple rule-based +control benchmark. The experiments were conducted on the electrical +installation of 4 reproductions of residential houses, which all have their own +battery, photovoltaic and dynamic load system emulating a non-controllable +electrical load and a controllable electric vehicle charger. The results show +that the simple rules, TreeC, and model predictive control-based methods +achieved similar costs, with a difference of only 0.6%. The reinforcement +learning based method, still in its training phase, obtained a cost 25.5% +higher than the other methods. Additional simulations show that the costs can be +further reduced by using a more representative training dataset for TreeC and +addressing errors in the model predictive control implementation caused by its +reliance on accurate data from various sources. The OptLayerPolicy safety layer +allows safe online training of a reinforcement learning agent in the +real world, given an accurate constraint function formulation. The proposed +safety layer method remains error-prone; nonetheless, it is found beneficial +for all investigated methods. The TreeC method, which does require building a +realistic simulation for training, exhibits the safest operational performance, +exceeding the grid limit by only 27.1 Wh compared to 593.9 Wh for reinforcement +learning. + +
+
+
+
+
+ + ☆ Sum-Product-Set Networks + + +
+ Daily internet communication relies heavily on tree-structured graphs, +embodied by popular data formats such as XML and JSON. However, many recent +generative (probabilistic) models utilize neural networks to learn a +probability distribution over undirected cyclic graphs. This assumption of a +generic graph structure brings various computational challenges, and, more +importantly, the presence of non-linearities in neural networks does not permit +tractable probabilistic inference. We address these problems by proposing +sum-product-set networks, an extension of probabilistic circuits from +unstructured tensor data to tree-structured graph data. To this end, we use +random finite sets to reflect a variable number of nodes and edges in the graph +and to allow for exact and efficient inference. We demonstrate that our +tractable model performs comparably to various intractable models based on +neural networks. + +
+
+
+
+
+ + ☆ DPSNN: Spiking Neural Network for Low-Latency Streaming Speech + Enhancement + + +
+ Speech enhancement (SE) improves communication in noisy environments, +affecting areas such as automatic speech recognition, hearing aids, and +telecommunications. With these domains typically being power-constrained and +event-based while requiring low latency, neuromorphic algorithms in the form of +spiking neural networks (SNNs) have great potential. Yet, current effective SNN +solutions require a contextual sampling window imposing substantial latency, +typically around 32ms, too long for many applications. Inspired by Dual-Path +Spiking Neural Networks (DPSNNs) in classical neural networks, we develop a +two-phase time-domain streaming SNN framework -- the Dual-Path Spiking Neural +Network (DPSNN). In the DPSNN, the first phase uses Spiking Convolutional +Neural Networks (SCNNs) to capture global contextual information, while the +second phase uses Spiking Recurrent Neural Networks (SRNNs) to focus on +frequency-related features. In addition, the regularizer suppresses activation +to further enhance energy efficiency of our DPSNNs. Evaluating on the VCTK and +Intel DNS Datasets, we demonstrate that our approach achieves the very low +latency (approximately 5ms) required for applications like hearing aids, while +demonstrating excellent signal-to-noise ratio (SNR), perceptual quality, and +energy efficiency. + +
+
+
+
+
+ + ☆ Fading memory and the convolution theorem + + +
+ Several topological and analytical notions of continuity and fading memory +for causal and time-invariant filters are introduced, and the relations between +them are analysed. A significant generalization of the convolution theorem that +establishes the equivalence between the fading memory property and the +availability of convolution representations of linear filters is proved. This +result extends a previous such characterization to a complete array of weighted +norms in the definition of the fading memory property. Additionally, the main +theorem shows that the availability of convolution representations can be +characterized, at least when the codomain is finite-dimensional, not only by +the fading memory property but also by the combination of two purely topological +notions, called minimal continuity and the minimal fading memory property. +Finally, when the input space and the codomain of a linear functional are +Hilbert spaces, it is shown that minimal continuity and the minimal fading +memory property guarantee the existence of interesting embeddings of the +associated reproducing kernel Hilbert spaces and approximation results for +solutions of kernel regressions in the presence of finite data sets. + +
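For orientation, the classical discrete-time form of the linear statement (which this paper generalizes to a full array of weighted norms and to the additional topological characterizations) can be sketched as follows; this is a simplified rendering under stated assumptions, not the paper's theorem.

```latex
% Sketch under simplifying assumptions (scalar inputs, one fixed weight).
% Let w: \mathbb{N} \to (0,1] be decreasing with w_t \to 0, and equip left-infinite
% inputs u = (u_t)_{t \le 0} with the weighted norm
%   \|u\|_w := \sup_{t \le 0} |u_t| \, w_{-t}.
% A causal, time-invariant linear filter U has the w-fading memory property iff it
% admits a convolution representation
\[
  U(u) \;=\; \sum_{j=0}^{\infty} h_j \, u_{-j},
\]
% for a suitable kernel (h_j)_{j \ge 0}; the paper extends this equivalence across
% weighted norms and adds the purely topological characterization via minimal
% continuity and the minimal fading memory property.
```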
+
+
+
+
+ + ☆ Posterior Covariance Structures in Gaussian Processes + + +
+ In this paper, we present a comprehensive analysis of the posterior +covariance field in Gaussian processes, with applications to the posterior +covariance matrix. The analysis is based on the Gaussian prior covariance, but +the approach also applies to other covariance kernels. Our geometric analysis +reveals how the Gaussian kernel's bandwidth parameter and the spatial +distribution of the observations influence the posterior covariance as well as +the corresponding covariance matrix, enabling straightforward identification of +areas with high or low covariance in magnitude. Drawing inspiration from a +posteriori error estimation techniques in adaptive finite element methods, we +also propose several estimators to efficiently measure the absolute posterior +covariance field, which can be used for efficient covariance matrix +approximation and preconditioning. We conduct a wide range of experiments to +illustrate our theoretical findings and their practical applications. + +
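The object under study is standard GP algebra; the sketch below shows how the posterior covariance depends on the kernel bandwidth and on where observations cluster. Values and sizes are arbitrary, and this is not the authors' estimator.

```python
# Posterior covariance of a GP with a Gaussian (RBF) kernel.
import numpy as np

def rbf(a, b, bandwidth=0.3):
    return np.exp(-0.5 * (a[:, None] - b[None, :]) ** 2 / bandwidth**2)

x_obs = np.array([0.1, 0.15, 0.8])          # clustered vs. isolated observations
x_grid = np.linspace(0, 1, 200)
noise = 1e-2

K = rbf(x_obs, x_obs) + noise * np.eye(len(x_obs))
K_sg = rbf(x_grid, x_obs)
# Posterior covariance: k(s, t) - k(s, X) K^{-1} k(X, t)
post_cov = rbf(x_grid, x_grid) - K_sg @ np.linalg.solve(K, K_sg.T)
post_var = np.diag(post_cov)                 # small near 0.1-0.15, larger far from data
```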
+
+ comment: 22 pages +
+
+
+
+
+ + ☆ An Adaptive Importance Sampling for Locally Stable Point Processes + + +
+ The problem of finding the expected value of a statistic of a locally stable +point process in a bounded region is addressed. We propose an adaptive +importance sampling scheme for solving the problem. In our proposal, we restrict the +importance point process to the family of homogeneous Poisson point processes, +which enables us to quickly generate independent samples of the importance +point process. The optimal intensity of the importance point process is found +by applying the cross-entropy minimization method. In the proposed scheme, the +expected value of the function and the optimal intensity are iteratively +estimated in an adaptive manner. We show that the proposed estimator converges +to the target value almost surely, and prove its asymptotic normality. We +explain how to apply the proposed scheme to the estimation of the intensity of +a stationary pairwise interaction point process. The performance of the +proposed scheme is compared numerically with Markov chain Monte Carlo +simulation and perfect sampling. + +
+
+
+
+
+ + ☆ Robust Active Learning (RoAL): Countering Dynamic Adversaries in Active + Learning with Elastic Weight Consolidation + + +
+ Despite significant advancements in active learning and adversarial attacks, +the intersection of these two fields remains underexplored, particularly in +developing robust active learning frameworks against dynamic adversarial +threats. The challenge of developing robust active learning frameworks under +dynamic adversarial attacks is critical, as these attacks can lead to +catastrophic forgetting within the active learning cycle. This paper introduces +Robust Active Learning (RoAL), a novel approach designed to address this issue +by integrating Elastic Weight Consolidation (EWC) into the active learning +process. Our contributions are threefold: First, we propose a new dynamic +adversarial attack that poses significant threats to active learning +frameworks. Second, we introduce a novel method that combines EWC with active +learning to mitigate catastrophic forgetting caused by dynamic adversarial +attacks. Finally, we conduct extensive experimental evaluations to demonstrate +the efficacy of our approach. The results show that RoAL not only effectively +counters dynamic adversarial threats but also significantly reduces the impact +of catastrophic forgetting, thereby enhancing the robustness and performance of +active learning systems in adversarial environments. + +
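For readers unfamiliar with EWC, the penalty being integrated into the active-learning loop looks roughly like the sketch below; this is generic Elastic Weight Consolidation, not the exact RoAL objective.

```python
# Generic EWC penalty: quadratically anchor parameters to their values from the
# previous round, weighted by a diagonal Fisher information estimate.
import torch

def ewc_penalty(model, old_params, fisher, lam=100.0):
    loss = torch.tensor(0.0)
    for name, p in model.named_parameters():
        loss = loss + (fisher[name] * (p - old_params[name]) ** 2).sum()
    return lam / 2.0 * loss

# In each active-learning round (pseudocode):
#   total_loss = task_loss + ewc_penalty(model, old_params, fisher)
# where `old_params` and `fisher` were recorded after the previous round,
# discouraging the attacker-induced drift from overwriting earlier knowledge.
```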
+
+
+
+
+ + ☆ BadMerging: Backdoor Attacks Against Model Merging CCS + + +
+ Fine-tuning pre-trained models for downstream tasks has led to a +proliferation of open-sourced task-specific models. Recently, Model Merging +(MM) has emerged as an effective approach to facilitate knowledge transfer +among these independently fine-tuned models. MM directly combines multiple +fine-tuned task-specific models into a merged model without additional +training, and the resulting model shows enhanced capabilities in multiple +tasks. Although MM provides great utility, it may come with security risks +because an adversary can exploit MM to affect multiple downstream tasks. +However, the security risks of MM have barely been studied. In this paper, we +first find that MM, as a new learning paradigm, introduces unique challenges +for existing backdoor attacks due to the merging process. To address these +challenges, we introduce BadMerging, the first backdoor attack specifically +designed for MM. Notably, BadMerging allows an adversary to compromise the +entire merged model by contributing as few as one backdoored task-specific +model. BadMerging comprises a two-stage attack mechanism and a novel +feature-interpolation-based loss to enhance the robustness of embedded +backdoors against the changes of different merging parameters. Considering that +a merged model may incorporate tasks from different domains, BadMerging can +jointly compromise the tasks provided by the adversary (on-task attack) and +other contributors (off-task attack) and solve the corresponding unique +challenges with novel attack designs. Extensive experiments show that +BadMerging achieves remarkable attacks against various MM algorithms. Our +ablation study demonstrates that the proposed attack designs can progressively +contribute to the attack performance. Finally, we show that prior defense +mechanisms fail to defend against our attacks, highlighting the need for more +advanced defense. + +
+
+ comment: To appear in ACM Conference on Computer and Communications Security + (CCS), 2024 +
+
+
+
+
+ + ☆ Towards Few-shot Self-explaining Graph Neural Networks + + +
+ Recent advancements in Graph Neural Networks (GNNs) have spurred an upsurge +of research dedicated to enhancing the explainability of GNNs, particularly in +critical domains such as medicine. A promising approach is the self-explaining +method, which outputs explanations along with predictions. However, existing +self-explaining models require a large amount of training data, rendering them +unavailable in few-shot scenarios. To address this challenge, in this paper, we +propose a Meta-learned Self-Explaining GNN (MSE-GNN), a novel framework that +generates explanations to support predictions in few-shot settings. MSE-GNN +adopts a two-stage self-explaining structure, consisting of an explainer and a +predictor. Specifically, the explainer first imitates the attention mechanism +of humans to select the explanation subgraph, whereby attention is naturally +paid to regions containing important characteristics. Subsequently, the +predictor mimics the decision-making process, which makes predictions based on +the generated explanation. Moreover, with a novel meta-training process and a +designed mechanism that exploits task information, MSE-GNN can achieve +remarkable performance on new few-shot tasks. Extensive experimental results on +four datasets demonstrate that MSE-GNN can achieve superior performance on +prediction tasks while generating high-quality explanations compared with +existing methods. The code is publicly available at +https://github.com/jypeng28/MSE-GNN. + +
+
+
+
+
+ + ☆ RSEA-MVGNN: Multi-View Graph Neural Network with Reliable Structural + Enhancement and Aggregation + + +
+ Graph Neural Networks (GNNs) have exhibited remarkable efficacy in learning +from multi-view graph data. In the framework of multi-view graph neural +networks, a critical challenge lies in effectively combining diverse views, +where each view has distinct graph structure features (GSFs). Existing +approaches to this challenge primarily focus on two aspects: 1) prioritizing +the most important GSFs, and 2) utilizing GNNs for feature aggregation. However, +prioritizing the most important GSFs can lead to limited feature diversity, and +existing GNN-based aggregation strategies treat each view equally without +considering view quality. To address these issues, we propose a novel +Multi-View Graph Neural Network with Reliable Structural Enhancement and +Aggregation (RSEA-MVGNN). Firstly, we estimate view-specific uncertainty +using subjective logic. Based on this uncertainty, we design reliable +structural enhancement via a feature de-correlation algorithm. This approach +enables each enhancement to focus on different GSFs, thereby achieving diverse +feature representation in the enhanced structure. Secondly, the model learns +view-specific beliefs and uncertainty as opinions, which are utilized to +evaluate view quality. Based on these opinions, the model enables high-quality +views to dominate GNN aggregation, thereby facilitating representation +learning. Experimental results conducted on five real-world datasets +demonstrate that RSEA-MVGNN outperforms several state-of-the-art GNN-based +methods. + +
+
+
+
+
+ + ☆ An Offline Meta Black-box Optimization Framework for Adaptive Design of + Urban Traffic Light Management Systems + + +
+ Complex urban road networks with high vehicle occupancy frequently face +severe traffic congestion. Designing an effective strategy for managing +multiple traffic lights plays a crucial role in managing congestion. However, +most current traffic light management systems rely on human-crafted decisions, +which may not adapt well to diverse traffic patterns. In this paper, we delve +into two pivotal design components of the traffic light management system that +can be dynamically adjusted to various traffic conditions: phase combination +and phase time allocation. While numerous studies have sought an efficient +strategy for managing traffic lights, most of these approaches consider a fixed +traffic pattern and are limited to relatively small road networks. To overcome +these limitations, we introduce a novel and practical framework to formulate +the optimization of such design components using an offline meta black-box +optimization. We then present a simple yet effective method to efficiently find +a solution for the aforementioned problem. In our framework, we first collect +an offline meta dataset consisting of pairs of design choices and corresponding +congestion measures from various traffic patterns. After collecting the +dataset, we employ the Attentive Neural Process (ANP) to predict the impact of +the proposed design on congestion across various traffic patterns with +well-calibrated uncertainty. Finally, Bayesian optimization, with ANP as a +surrogate model, is utilized to find an optimal design for unseen traffic +patterns through limited online simulations. Our experiment results show that +our method outperforms state-of-the-art baselines on complex road networks in +terms of the number of waiting vehicles. Surprisingly, the deployment of our +method into a real-world traffic system was able to improve traffic throughput +by 4.80\% compared to the original strategy. + +
+
+ comment: 12 pages, 7 figures, 10 tables +
+
+
+
+
+ + ☆ A systematic dataset generation technique applied to data-driven + automotive aerodynamics + + +
+ A novel strategy for generating datasets is developed within the context of +drag prediction for automotive geometries using neural networks. A primary +challenge in this space is constructing a training database of sufficient size +and diversity. Our method relies on a small number of starting data points, and +provides a recipe to interpolate systematically between them, generating an +arbitrary number of samples at the desired quality. We test this strategy using +a realistic automotive geometry, and demonstrate that convolutional neural +networks perform exceedingly well at predicting drag coefficients and surface +pressures. Promising results are obtained in testing extrapolation performance. +Our method can be applied to other problems of aerodynamic shape optimization. + +
+
+ comment: 26 pages, 28 figures +
+
+
+
+
+ + ☆ Kolmogorov-Arnold Networks (KAN) for Time Series Classification and + Robust Analysis + + +
+ Kolmogorov-Arnold Networks (KAN) have recently attracted significant attention +as a promising alternative to traditional Multi-Layer Perceptrons (MLP). +Despite their theoretical appeal, KAN require validation on large-scale +benchmark datasets. Time series data, which have become increasingly prevalent +in recent years (especially univariate time series), are naturally suited for +validating KAN. Therefore, we conducted a fair comparison among KAN, MLP, and +mixed structures. The results indicate that KAN can achieve performance +comparable to, or even slightly better than, MLP across 128 time series +datasets. We also performed an ablation study on KAN, revealing that the output +is primarily determined by the base component rather than the B-spline function. +Furthermore, we assessed the robustness of these models and found that KAN and +the hybrid structure MLP_KAN exhibit significant robustness advantages, +attributed to their lower Lipschitz constants. This suggests that KAN and KAN +layers hold strong potential to be robust models or to improve the adversarial +robustness of other models. + +
+
+ comment: 14 pages, 8 figs +
+
+
+
+
+ + ☆ Nonlocal Attention Operator: Materializing Hidden Knowledge Towards + Interpretable Physics Discovery + + +
+ Despite the recent popularity of attention-based neural architectures in core +AI fields like natural language processing (NLP) and computer vision (CV), +their potential in modeling complex physical systems remains under-explored. +Learning problems in physical systems are often characterized as discovering +operators that map between function spaces based on a few instances of function +pairs. This task frequently presents a severely ill-posed PDE inverse problem. +In this work, we propose a novel neural operator architecture based on the +attention mechanism, which we coin Nonlocal Attention Operator (NAO), and +explore its capability towards developing a foundation physical model. In +particular, we show that the attention mechanism is equivalent to a double +integral operator that enables nonlocal interactions among spatial tokens, with +a data-dependent kernel characterizing the inverse mapping from data to the +hidden parameter field of the underlying operator. As such, the attention +mechanism extracts global prior information from training data generated by +multiple systems, and suggests the exploratory space in the form of a nonlinear +kernel map. Consequently, NAO can address ill-posedness and rank deficiency in +inverse PDE problems by encoding regularization and achieving generalizability. +We empirically demonstrate the advantages of NAO over baseline neural models in +terms of generalizability to unseen data resolutions and system states. Our +work not only suggests a novel neural operator architecture for learning +interpretable foundation models of physical systems, but also offers a new +perspective towards understanding the attention mechanism. + +
+
+
+
+
+ + ☆ Learning Decisions Offline from Censored Observations with + ε-insensitive Operational Costs + + +
+ Many important managerial decisions are made based on censored observations. +Making decisions without adequately handling the censoring leads to inferior +outcomes. We investigate the data-driven decision-making problem with an +offline dataset containing the feature data and the censored historical data of +the variable of interest without the censoring indicators. Without assuming the +underlying distribution, we design and leverage ε-insensitive +operational costs to deal with the unobserved censoring in an offline +data-driven fashion. We demonstrate the customization of the +ε-insensitive operational costs for a newsvendor problem and use such +costs to train two representative ML models, including linear regression (LR) +models and neural networks (NNs). We derive tight generalization bounds for the +custom LR model without regularization (LR-εNVC) and with +regularization (LR-εNVC-R), and a high-probability generalization +bound for the custom NN (NN-εNVC) trained by stochastic gradient +descent. The theoretical results reveal the stability and learnability of +LR-εNVC, LR-εNVC-R and NN-εNVC. We conduct extensive +numerical experiments to compare LR-εNVC-R and NN-εNVC with +two existing approaches, estimate-as-solution (EAS) and integrated estimation +and optimization (IEO). The results show that LR-εNVC-R and +NN-εNVC outperform both EAS and IEO, with maximum cost savings up to +14.40% and 12.21% compared to the lowest cost generated by the two existing +approaches. In addition, LR-εNVC-R's and NN-εNVC's order +quantities are statistically significantly closer to the optimal solutions +should the underlying distribution be known. + +
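The paper's precise cost definition is not reproduced here. As a hedged illustration only, an ε-insensitive newsvendor-style cost could, by analogy with ε-insensitive losses in support vector regression, ignore deviations within an ε band of the observed (possibly censored) demand and charge underage or overage costs beyond it.

```python
# Hypothetical epsilon-insensitive newsvendor cost; not the authors' exact formula.
import numpy as np

def eps_insensitive_newsvendor_cost(q, d, cu=4.0, co=1.0, eps=0.5):
    """q: order quantity, d: observed demand, cu/co: underage/overage unit costs."""
    underage = np.maximum(d - q - eps, 0.0)   # demand exceeds order by more than eps
    overage = np.maximum(q - d - eps, 0.0)    # order exceeds demand by more than eps
    return cu * underage + co * overage

# A cost of this shape could serve as the training loss for an LR or NN model
# mapping features to order quantities, in the spirit of LR-εNVC / NN-εNVC above.
```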
+
+
+
+
+ + ☆ Enhancing Visual Question Answering through Ranking-Based Hybrid + Training and Multimodal Fusion + + +
+ Visual Question Answering (VQA) is a challenging task that requires systems +to provide accurate answers to questions based on image content. Current VQA +models struggle with complex questions due to limitations in capturing and +integrating multimodal information effectively. To address these challenges, we +propose the Rank VQA model, which leverages a ranking-inspired hybrid training +strategy to enhance VQA performance. The Rank VQA model integrates high-quality +visual features extracted using the Faster R-CNN model and rich semantic text +features obtained from a pre-trained BERT model. These features are fused +through a sophisticated multimodal fusion technique employing multi-head +self-attention mechanisms. Additionally, a ranking learning module is +incorporated to optimize the relative ranking of answers, thus improving answer +accuracy. The hybrid training strategy combines classification and ranking +losses, enhancing the model's generalization ability and robustness across +diverse datasets. Experimental results demonstrate the effectiveness of the +Rank VQA model. Our model significantly outperforms existing state-of-the-art +models on standard VQA datasets, including VQA v2.0 and COCO-QA, in terms of +both accuracy and Mean Reciprocal Rank (MRR). The superior performance of Rank +VQA is evident in its ability to handle complex questions that require +understanding nuanced details and making sophisticated inferences from the +image and text. This work highlights the effectiveness of a ranking-based +hybrid training strategy in improving VQA performance and lays the groundwork +for further research in multimodal learning methods. + +
+
+ comment: Visual Question Answering, Rank VQA, Faster R-CNN, BERT, Multimodal + Fusion, Ranking Learning, Hybrid Training Strategy +
+
+
+
+
+ + ☆ LiPCoT: Linear Predictive Coding based Tokenizer for Self-supervised + Learning of Time Series Data via Language Models + + +
+ Language models have achieved remarkable success in various natural language +processing tasks. However, their application to time series data, a crucial +component in many domains, remains limited. This paper proposes LiPCoT (Linear +Predictive Coding based Tokenizer for time series), a novel tokenizer that +encodes time series data into a sequence of tokens, enabling self-supervised +learning of time series using existing language model architectures such as +BERT. Unlike traditional time series tokenizers that rely heavily on CNN +encoders for time series feature generation, LiPCoT employs stochastic modeling +through linear predictive coding to create a latent space for time series, +providing a compact yet rich representation of the inherent stochastic nature +of the data. Furthermore, LiPCoT is computationally efficient and can +effectively handle time series data with varying sampling rates and lengths, +overcoming common limitations of existing time series tokenizers. In this +proof-of-concept work, we present the effectiveness of LiPCoT in classifying +Parkinson's disease (PD) using an EEG dataset from 46 participants. In +particular, we utilize LiPCoT to encode EEG data into a small vocabulary of +tokens and then use BERT for self-supervised learning and the downstream task +of PD classification. We benchmark our approach against several +state-of-the-art CNN-based deep learning architectures for PD detection. Our +results reveal that BERT models utilizing self-supervised learning outperformed +the best-performing existing method by 7.1% in precision, 2.3% in recall, 5.5% +in accuracy, 4% in AUC, and 5% in F1-score, highlighting the potential for +self-supervised learning even on small datasets. Our work will inform future +foundational models for time series, particularly for self-supervised learning. + +
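An illustrative sketch of LPC-based tokenization (not the released LiPCoT code): fit LPC coefficients per window via the autocorrelation method, then quantize the coefficient vectors into a small vocabulary with k-means. All window sizes, orders, and vocabulary sizes are arbitrary.

```python
# Window a signal, fit LPC coefficients per window, map each coefficient vector
# to a discrete token id that a BERT-style model could consume.
import numpy as np
from scipy.linalg import solve_toeplitz
from sklearn.cluster import KMeans

def lpc_coeffs(frame, order=8):
    """LPC via the autocorrelation method (Yule-Walker, Toeplitz solve)."""
    r = np.correlate(frame, frame, mode="full")[len(frame) - 1:]
    return solve_toeplitz((r[:order], r[:order]), r[1:order + 1])

def tokenize(signal, frame_len=256, order=8, vocab_size=64, seed=0):
    frames = [signal[i:i + frame_len]
              for i in range(0, len(signal) - frame_len + 1, frame_len)]
    feats = np.stack([lpc_coeffs(f, order) for f in frames])
    km = KMeans(n_clusters=vocab_size, n_init=10, random_state=seed).fit(feats)
    return km.labels_            # one token id per frame

rng = np.random.default_rng(0)
tokens = tokenize(rng.normal(size=32_768))   # stand-in for an EEG channel
```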
+
+ comment: 17 pages, 5 figures +
+
+
+
+
+ + ☆ Ensemble architecture in polyp segmentation + + +
+ In this research, we revisit the architecture of semantic segmentation and +evaluate the models excelling in polyp segmentation. We introduce an integrated +framework that harnesses the advantages of different models to attain an +optimal outcome. More specifically, we fuse the learned features from +convolutional and transformer models for prediction, and we view this approach +as an ensemble technique to enhance model performance. Our experiments on polyp +segmentation reveal that the proposed architecture surpasses other top models, +exhibiting improved learning capacity and resilience. The code is available at +https://github.com/HuangDLab/EnFormer. + +
+
+
+
+
+ + ☆ Learning Multi-Index Models with Neural Networks via Mean-Field Langevin + Dynamics + + +
+ We study the problem of learning multi-index models in high-dimensions using +a two-layer neural network trained with the mean-field Langevin algorithm. +Under mild distributional assumptions on the data, we characterize the +effective dimension $d_{\mathrm{eff}}$ that controls both sample and +computational complexity by utilizing the adaptivity of neural networks to +latent low-dimensional structures. When the data exhibit such a structure, +$d_{\mathrm{eff}}$ can be significantly smaller than the ambient dimension. We +prove that the sample complexity grows almost linearly with $d_{\mathrm{eff}}$, +bypassing the limitations of the information and generative exponents that +appeared in recent analyses of gradient-based feature learning. On the other +hand, the computational complexity may inevitably grow exponentially with +$d_{\mathrm{eff}}$ in the worst-case scenario. Motivated by improving +computational complexity, we take the first steps towards polynomial time +convergence of the mean-field Langevin algorithm by investigating a setting +where the weights are constrained to be on a compact manifold with positive +Ricci curvature, such as the hypersphere. There, we study assumptions under +which polynomial time convergence is achievable, whereas similar assumptions in +the Euclidean setting lead to exponential time complexity. + +
+
+ comment: 35 pages, 1 figure +
+
+
+
+
+ + ☆ All-around Neural Collapse for Imbalanced Classification + + +
+ Neural Collapse (NC) presents an elegant geometric structure that enables +individual activations (features), class means and classifier (weights) vectors +to reach optimal inter-class separability during the terminal phase of +training on a balanced dataset. Once shifted to imbalanced +classification, such an optimal structure of NC can be readily destroyed by the +notorious minority collapse, where the classifier vectors +corresponding to the minority classes are squeezed. In response, existing works +endeavor to recover NC typically by optimizing classifiers. However, we +discover that this squeezing phenomenon is not only confined to classifier +vectors but also occurs with class means. + Consequently, reconstructing NC solely at the classifier aspect may be +futile, as the feature means remain compressed, leading to the violation of +inherent self-duality in NC (i.e., class means and classifier +vectors converge mutually) and incidentally, resulting in an unsatisfactory +collapse of individual activations towards the corresponding class means. To +shake off these dilemmas, we present a unified All-around Neural Collapse +framework (AllNC), aiming to comprehensively +restore NC across multiple aspects including individual activations, class +means and classifier vectors. We thoroughly analyze its effectiveness and +verify on multiple benchmark datasets that it achieves state-of-the-art in both +balanced and imbalanced settings. + +
+
+
+
+
+ + ☆ BiLSTM and Attention-Based Modulation Classification of Realistic + Wireless Signals SP + + +
+ This work proposes a novel and efficient quadstream BiLSTM-Attention network, +abbreviated as QSLA network, for robust automatic modulation classification +(AMC) of wireless signals. The proposed model exploits multiple representations +of the wireless signal as inputs to the network and the feature extraction +process combines convolutional and BiLSTM layers for processing the spatial and +temporal features of the signal, respectively. An attention layer is used after +the BiLSTM layer to emphasize the important temporal features. The experimental +results on the recent and realistic RML22 dataset demonstrate the superior +performance of the proposed model with an accuracy up to around 99%. The model +is compared with other benchmark models in the literature in terms of +classification accuracy, computational complexity, memory usage, and training +time to show the effectiveness of our proposed approach. + +
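A rough single-stream PyTorch sketch of the Conv → BiLSTM → attention pipeline described above; the actual model is quad-stream and its layer sizes differ, so everything here is invented for illustration.

```python
# Convolutional feature extraction over I/Q samples, a BiLSTM over time, and a
# simple attention layer that reweights temporal features before classification.
import torch
import torch.nn as nn

class QSLASketch(nn.Module):
    def __init__(self, in_ch=2, n_classes=11, hidden=64):
        super().__init__()
        self.conv = nn.Sequential(
            nn.Conv1d(in_ch, 32, kernel_size=7, padding=3), nn.ReLU(),
            nn.Conv1d(32, 32, kernel_size=5, padding=2), nn.ReLU(),
        )
        self.bilstm = nn.LSTM(32, hidden, batch_first=True, bidirectional=True)
        self.attn = nn.Linear(2 * hidden, 1)
        self.head = nn.Linear(2 * hidden, n_classes)

    def forward(self, x):                    # x: (batch, in_ch, time)
        h = self.conv(x).transpose(1, 2)     # -> (batch, time, 32)
        h, _ = self.bilstm(h)                # -> (batch, time, 2 * hidden)
        w = torch.softmax(self.attn(h), dim=1)
        ctx = (w * h).sum(dim=1)             # attention-weighted temporal pooling
        return self.head(ctx)

logits = QSLASketch()(torch.randn(8, 2, 128))   # 8 signals, 2 (I/Q) channels
```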
+
+ comment: Accepted at the IEEE International Conference on Signal Processing + and Communications (SPCOM) 2024 +
+
+
+
+
+ + ☆ Seeing and Understanding: Bridging Vision with Chemical Knowledge Via + ChemVLM + + +
+ In this technical report, we propose ChemVLM, the first open-source +multimodal large language model dedicated to the field of chemistry, designed +to address the incompatibility between chemical image understanding and text +analysis. Built upon the VIT-MLP-LLM architecture, we leverage ChemLLM-20B as +the foundational large model, endowing our model with robust capabilities in +understanding and utilizing chemical text knowledge. Additionally, we employ +InternVIT-6B as a powerful image encoder. We have curated high-quality data +from the chemical domain, including molecules, reaction formulas, and chemistry +examination data, and compiled these into a bilingual multimodal +question-answering dataset. We test the performance of our model on multiple +open-source benchmarks and three custom evaluation sets. Experimental results +demonstrate that our model achieves excellent performance, securing +state-of-the-art results in five out of six involved tasks. Our model can be +found at https://huggingface.co/AI4Chem/ChemVLM-26B. + +
+
+ comment: Technical report +
+
+
+
+
+ + ☆ q-exponential family for policy optimization + + +
+ Policy optimization methods benefit from a simple and tractable policy +functional, usually the Gaussian for continuous action spaces. In this paper, +we consider a broader policy family that remains tractable: the $q$-exponential +family. This family of policies is flexible, allowing the specification of both +heavy-tailed policies ($q>1$) and light-tailed policies ($q<1$). This paper +examines the interplay between $q$-exponential policies and several +actor-critic algorithms on both online and offline problems. We find +that heavy-tailed policies are more effective in general and can consistently +improve on the Gaussian. In particular, we find the Student's t-distribution to be +more stable than the Gaussian across settings and that a heavy-tailed +$q$-Gaussian for Tsallis Advantage Weighted Actor-Critic consistently performs +well in offline benchmark problems. Our code is available at +https://github.com/lingweizhu/qexp. + +
+
+ comment: 27 pages, 12 pages main text, 15 pages appendix +
+
+
+
+
+ + ☆ Enhancing Autonomous Vehicle Perception in Adverse Weather through Image + Augmentation during Semantic Segmentation Training + + +
+ Robust perception is crucial in autonomous vehicle navigation and +localization. Visual processing tasks, like semantic segmentation, should work +in varying weather conditions and during different times of day. Semantic +segmentation is where each pixel is assigned a class, which is useful for +locating overall features (1). Training a segmentation model requires large +amounts of data, and the labeling process for segmentation data is especially +tedious. Additionally, many large datasets include only images taken in clear +weather. This is a problem because training a model exclusively on clear +weather data hinders performance in adverse weather conditions like fog or +rain. We hypothesize that given a dataset of only clear-day images, applying +image augmentation (such as random rain, fog, and brightness) during training +allows for domain adaptation to diverse weather conditions. We used CARLA, a +realistic 3D autonomous vehicle simulator, to collect 1200 images in clear weather +composed of 29 classes from 10 different towns (2). We also collected 1200 +images of random weather effects. We trained encoder-decoder UNet models to +perform semantic segmentation. Applying augmentations significantly improved +segmentation under weathered night conditions (p < 0.001). However, models +trained on weather data have significantly lower losses than those trained on +augmented data in all conditions except for clear days. This shows there is +room for improvement in the domain adaptation approach. Future work should test +more types of augmentations and also use real-life images instead of CARLA. +Ideally, the augmented model meets or exceeds the performance of the weather +model. + +
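Simple numpy stand-ins for the kind of augmentations described (a brightness shift and a synthetic fog haze); real pipelines would typically use an augmentation library, and all parameters here are arbitrary.

```python
# Appearance-only augmentations applied to the image; the segmentation mask is
# left untouched because pixel classes do not change.
import numpy as np

def random_brightness(img, rng, max_delta=0.3):
    return np.clip(img + rng.uniform(-max_delta, max_delta), 0.0, 1.0)

def random_fog(img, rng, max_strength=0.5):
    # Blend the image toward a light gray "haze"; strength drawn per image.
    strength = rng.uniform(0.0, max_strength)
    haze = np.full_like(img, 0.8)
    return (1.0 - strength) * img + strength * haze

rng = np.random.default_rng(0)
img = rng.uniform(size=(256, 256, 3))        # stand-in for a CARLA frame in [0, 1]
augmented = random_fog(random_brightness(img, rng), rng)
```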
+
+
+
+
+ + ☆ CON-FOLD -- Explainable Machine Learning with Confidence + + +
+ FOLD-RM is an explainable machine learning classification algorithm that uses +training data to create a set of classification rules. In this paper we +introduce CON-FOLD which extends FOLD-RM in several ways. CON-FOLD assigns +probability-based confidence scores to rules learned for a classification task. +This allows users to know how confident they should be in a prediction made by +the model. We present a confidence-based pruning algorithm that uses the unique +structure of FOLD-RM rules to efficiently prune rules and prevent overfitting. +Furthermore, CON-FOLD enables the user to provide pre-existing knowledge in the +form of logic program rules that are either (fixed) background knowledge or +(modifiable) initial rule candidates. The paper describes our method in detail +and reports on practical experiments. We demonstrate the performance of the +algorithm on benchmark datasets from the UCI Machine Learning Repository. For +that, we introduce a new metric, Inverse Brier Score, to evaluate the accuracy +of the produced confidence scores. Finally we apply this extension to a real +world example that requires explainability: marking of student responses to a +short answer question from the Australian Physics Olympiad. + +
+
+
+
+
+ + ☆ Training Language Models on the Knowledge Graph: Insights on + Hallucinations and Their Detectability + + +
+ While many capabilities of language models (LMs) improve with increased +training budget, the influence of scale on hallucinations is not yet fully +understood. Hallucinations come in many forms, and there is no universally +accepted definition. We thus focus on studying only those hallucinations where +a correct answer appears verbatim in the training set. To fully control the +training data content, we construct a knowledge graph (KG)-based dataset, and +use it to train a set of increasingly large LMs. We find that for a fixed +dataset, larger and longer-trained LMs hallucinate less. However, hallucinating +on $\leq5$% of the training data requires an order of magnitude larger model, +and thus an order of magnitude more compute, than Hoffmann et al. (2022) +reported was optimal. Given this costliness, we study how hallucination +detectors depend on scale. While we see detector size improves performance on +fixed LM's outputs, we find an inverse relationship between the scale of the LM +and the detectability of its hallucinations. + +
+
+ comment: Published at COLM 2024. 16 pages, 11 figures +
+
+
+
+
+ + ☆ Time-inversion of spatiotemporal beam dynamics using uncertainty-aware + latent evolution reversal + + +
+ Charged particle dynamics under the influence of electromagnetic fields is a +challenging spatiotemporal problem. Many high performance physics-based +simulators for predicting behavior in a charged particle beam are +computationally expensive, limiting their utility for solving inverse problems +online. The problem of estimating upstream six-dimensional phase space given +downstream measurements of charged particles in an accelerator is an inverse +problem of growing importance. This paper introduces a reverse Latent Evolution +Model (rLEM) designed for temporal inversion of forward beam dynamics. In this +two-step self-supervised deep learning framework, we utilize a Conditional +Variational Autoencoder (CVAE) to project 6D phase space projections of a +charged particle beam into a lower-dimensional latent distribution. +Subsequently, we autoregressively learn the inverse temporal dynamics in the +latent space using a Long Short-Term Memory (LSTM) network. The coupled +CVAE-LSTM framework can predict 6D phase space projections across all upstream +accelerating sections based on single or multiple downstream phase space +measurements as inputs. The proposed model also captures the aleatoric +uncertainty of the high-dimensional input data within the latent space. This +uncertainty, which reflects potential uncertain measurements at a given module, +is propagated through the LSTM to estimate uncertainty bounds for all upstream +predictions, demonstrating the robustness of the LSTM against in-distribution +variations in the input data. + +
+
+ comment: arXiv admin note: text overlap with arXiv:2403.13858 +
+
+
+
+
+ + ☆ Enhancing Equitable Access to AI in Housing and Homelessness System of + Care through Federated Learning AAAI + + +
+ The top priority of a Housing and Homelessness System of Care (HHSC) is to +connect people experiencing homelessness to supportive housing. An HHSC +typically consists of many agencies serving the same population. Information +technology platforms differ in type and quality between agencies, so their data +are usually isolated from one agency to another. Larger agencies may have +sufficient data to train and test artificial intelligence (AI) tools but +smaller agencies typically do not. To address this gap, we introduce a +Federated Learning (FL) approach enabling all agencies to train a predictive +model collaboratively without sharing their sensitive data. We demonstrate how +FL can be used within an HHSC to provide all agencies equitable access to +quality AI and further assist human decision-makers in the allocation of +resources within HHSC. This is achieved while preserving the privacy of the +people within the data by not sharing identifying information between agencies +without their consent. Our experimental results using real-world HHSC data from +Calgary, Alberta, demonstrate that our FL approach offers comparable +performance with the idealized scenario of training the predictive model with +data fully shared and linked between agencies. + +
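The core mechanism behind such cross-agency training is federated averaging: each agency trains locally and only model parameters, never client records, leave the agency and are aggregated, typically weighted by local data volume. A generic FedAvg sketch, not the authors' implementation, is shown below.

```python
# Weighted averaging of locally trained model parameters (FedAvg).
import numpy as np

def fedavg(local_weights, local_sizes):
    """local_weights: list of dicts {layer_name: ndarray}; local_sizes: samples per agency."""
    total = float(sum(local_sizes))
    keys = local_weights[0].keys()
    return {
        k: sum(w[k] * (n / total) for w, n in zip(local_weights, local_sizes))
        for k in keys
    }

# One communication round (pseudocode):
#   for each agency i: w_i = local_train(global_model, local_data_i)
#   global_model = fedavg([w_1, ..., w_K], [n_1, ..., n_K])
```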
+
+ comment: Accepted at the 2024 AAAI/ACM Conference on AI, Ethics, and Society + (AIES) +
+
+
+
+
+ + ☆ SustainDC -- Benchmarking for Sustainable Data Center Control NeurIPS 2024 + + +
+ Machine learning has driven an exponential increase in computational demand, +leading to massive data centers that consume significant amounts of energy and +contribute to climate change. This makes sustainable data center control a +priority. In this paper, we introduce SustainDC, a set of Python environments +for benchmarking multi-agent reinforcement learning (MARL) algorithms for data +centers (DC). SustainDC supports custom DC configurations and tasks such as +workload scheduling, cooling optimization, and auxiliary battery management, +with multiple agents managing these operations while accounting for the effects +of each other. We evaluate various MARL algorithms on SustainDC, showing their +performance across diverse DC designs, locations, weather conditions, grid +carbon intensity, and workload requirements. Our results highlight significant +opportunities for improvement of data center operations using MARL algorithms. +Given the increasing use of DC due to AI, SustainDC provides a crucial platform +for the development and benchmarking of advanced algorithms essential for +achieving sustainable computing and addressing other heterogeneous real-world +challenges. + +
+
+ comment: Under review at Advances in Neural Information Processing Systems + 2024 (NeurIPS 2024) +
+
+
+
+
+ + ☆ CarbonClipper: Optimal Algorithms for Carbon-Aware Spatiotemporal + Workload Management + + +
+ We study carbon-aware spatiotemporal workload management, which seeks to +address the growing environmental impact of data centers. We formalize this as +an online problem called spatiotemporal online allocation with deadline +constraints ($\mathsf{SOAD}$), in which an online player completes a workload +(e.g., a batch compute job) by moving and scheduling the workload across a +network subject to a deadline $T$. At each time step, a service cost function +is revealed, representing, e.g., the carbon intensity of servicing a workload +at each location, and the player must irrevocably decide the current +allocation. Furthermore, whenever the player moves the allocation, it incurs a +movement cost defined by a metric space $(X,d)$ that captures, e.g., the +overhead of migrating a compute job. $\mathsf{SOAD}$ formalizes the open +problem of combining general metrics and deadline constraints in the online +algorithms literature, unifying problems such as metrical task systems and +online search. We propose a competitive algorithm for $\mathsf{SOAD}$ along +with a matching lower bound that proves it is optimal. Our main algorithm, +CarbonClipper, is a learning-augmented +algorithm that takes advantage of predictions (e.g., carbon intensity +forecasts) and achieves an optimal consistency-robustness trade-off. We +evaluate our proposed algorithms for carbon-aware spatiotemporal workload +management on a simulated global data center network, showing that +CarbonClipper significantly improves performance +compared to baseline methods and delivers meaningful carbon reductions. + +
+
+ comment: 50 pages, 21 figures +
+
+
+
+
+ + ☆ Differentiating Policies for Non-Myopic Bayesian Optimization + + +
+ Bayesian optimization (BO) methods choose sample points by optimizing an +acquisition function derived from a statistical model of the objective. These +acquisition functions are chosen to balance sampling regions with predicted +good objective values against exploring regions where the objective is +uncertain. Standard acquisition functions are myopic, considering only the +impact of the next sample, but non-myopic acquisition functions may be more +effective. In principle, one could model the sampling by a Markov decision +process, and optimally choose the next sample by maximizing an expected reward +computed by dynamic programming; however, this is infeasibly expensive. More +practical approaches, such as rollout, consider a parametric family of sampling +policies. In this paper, we show how to efficiently estimate rollout +acquisition functions and their gradients, enabling stochastic gradient-based +optimization of sampling policies. + +
+
+
+
+
+ + ☆ Kraken: Inherently Parallel Transformers For Efficient Multi-Device + Inference + + +
+ Large Transformer networks are increasingly used in settings where low +inference latency can improve the end-user experience and enable new +applications. However, autoregressive inference is resource intensive and +requires parallelism for efficiency. Parallelism introduces collective +communication that is both expensive and represents a phase when hardware +resources are underutilized. Towards mitigating this, Kraken is an evolution of +the standard Transformer architecture that is designed to complement existing +tensor parallelism schemes for efficient inference on multi-device systems. By +introducing a fixed degree of intra-layer model parallelism, the architecture +allows collective operations to be overlapped with compute, decreasing latency +and increasing hardware utilization. When trained on OpenWebText, Kraken models +reach a similar perplexity as standard Transformers while also preserving their +language modeling capabilities when evaluated on the SuperGLUE benchmark. +Importantly, when tested on multi-GPU systems using TensorRT-LLM engines, +Kraken speeds up Time To First Token by a mean of 35.6% across a range of model +sizes, context lengths, and degrees of tensor parallelism. + +
+
+
+
+
+ + ☆ Ranking and Combining Latent Structured Predictive Scores without + Labeled Data + + +
+ Combining multiple predictors obtained from distributed data sources into an +accurate meta-learner is a promising way to achieve enhanced performance in many +prediction problems. As the accuracy of each predictor is usually unknown, +integrating the predictors to achieve better performance is challenging. +Conventional ensemble learning methods assess the accuracy of predictors based +on extensive labeled data. In practical applications, however, the acquisition +of such labeled data can prove to be an arduous task. Furthermore, the +predictors under consideration may exhibit high degrees of correlation, +particularly when similar data sources or machine learning algorithms were +employed during their model training. In response to these challenges, this +paper introduces a novel structured unsupervised ensemble learning model (SUEL) +to exploit the dependency between a set of predictors with continuous +predictive scores, rank the predictors without labeled data and combine them +into a weighted ensemble score. Two novel correlation-based decomposition +algorithms are further proposed to estimate the SUEL model, constrained +quadratic optimization (SUEL.CQO) and matrix-factorization-based (SUEL.MF) +approaches. The efficacy of the proposed methods is rigorously assessed through +both simulation studies and a real-world application to risk gene discovery. The +results compellingly demonstrate that the proposed methods can efficiently +integrate the dependent predictors into an ensemble model without the need for +ground-truth data. + +
+
+
+
+
+ + ☆ An Efficient and Explanatory Image and Text Clustering System with + Multimodal Autoencoder Architecture + + +
+ We demonstrate the efficiencies and explanatory abilities of extensions to +the common tools of Autoencoders and LLM interpreters, in the novel context of +comparing different cultural approaches to the same international news event. +We develop a new Convolutional-Recurrent Variational Autoencoder (CRVAE) model +that extends the modalities of previous CVAE models, by using fully-connected +latent layers to embed in parallel the CNN encodings of video frames, together +with the LSTM encodings of their related text derived from audio. We +incorporate the model within a larger system that includes frame-caption +alignment, latent space vector clustering, and a novel LLM-based cluster +interpreter. We measure, tune, and apply this system to the task of summarizing +a video into three to five thematic clusters, with each theme described by ten +LLM-produced phrases. We apply this system to two news topics, COVID-19 and the +Winter Olympics, and five other topics are in progress. + +
+
+
+
+
+ + ☆ Knowledge-based Neural Ordinary Differential Equations for Cosserat + Rod-based Soft Robots + + +
+ Soft robots have many advantages over rigid robots thanks to their compliant +and passive nature. However, it is generally challenging to model the dynamics +of soft robots due to their high spatial dimensionality, making it difficult to +use model-based methods to accurately control soft robots. It often requires +direct numerical simulation of partial differential equations to simulate soft +robots. This not only requires an accurate numerical model, but also makes soft +robot modeling slow and expensive. Deep learning algorithms have shown promises +in data-driven modeling of soft robots. However, these algorithms usually +require a large amount of data, which are difficult to obtain in either +simulation or real-world experiments of soft robots. In this work, we propose +KNODE-Cosserat, a framework that combines first-principle physics models and +neural ordinary differential equations. We leverage the best from both worlds +-- the generalization ability of physics-based models and the fast speed of +deep learning methods. We validate our framework in both simulation and +real-world experiments. In both cases, we show that the robot model +significantly improves over the baseline models under different metrics. + +
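Conceptually, a knowledge-based neural ODE sums a first-principles term and a learned residual and integrates the result. The sketch below uses a toy damped oscillator as the "physics" term, not a Cosserat rod model, and invents all sizes.

```python
# Hybrid dynamics: dx/dt = f_physics(x) + NN(x), rolled out with explicit Euler.
import torch
import torch.nn as nn

class HybridODE(nn.Module):
    def __init__(self, dim=2):
        super().__init__()
        self.nn_residual = nn.Sequential(nn.Linear(dim, 32), nn.Tanh(), nn.Linear(32, dim))

    def physics(self, x):
        pos, vel = x[..., 0:1], x[..., 1:2]
        return torch.cat([vel, -pos - 0.1 * vel], dim=-1)   # toy known dynamics

    def forward(self, x):
        return self.physics(x) + self.nn_residual(x)         # knowledge + learned residual

def rollout(model, x0, dt=0.01, steps=100):
    xs, x = [x0], x0
    for _ in range(steps):
        x = x + dt * model(x)        # a higher-order ODE solver could be swapped in
        xs.append(x)
    return torch.stack(xs)

traj = rollout(HybridODE(), torch.tensor([[1.0, 0.0]]))
# Training would fit nn_residual so that roll-outs match measured soft-robot states.
```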
+
+ comment: 8 pages, 11 figures, 4 tables +
+
+
+
+
+ + ☆ MedTsLLM: Leveraging LLMs for Multimodal Medical Time Series Analysis + + +
+ The complexity and heterogeneity of data in many real-world applications pose +significant challenges for traditional machine learning and signal processing +techniques. For instance, in medicine, effective analysis of diverse +physiological signals is crucial for patient monitoring and clinical +decision-making and yet highly challenging. We introduce MedTsLLM, a general +multimodal large language model (LLM) framework that effectively integrates +time series data and rich contextual information in the form of text to analyze +physiological signals, performing three tasks with clinical relevance: semantic +segmentation, boundary detection, and anomaly detection in time series. These +critical tasks enable deeper analysis of physiological signals and can provide +actionable insights for clinicians. We utilize a reprogramming layer to align +embeddings of time series patches with a pretrained LLM's embedding space and +make effective use of raw time series, in conjunction with textual context. +Given the multivariate nature of medical datasets, we develop methods to handle +multiple covariates. We additionally tailor the text prompt to include +patient-specific information. Our model outperforms state-of-the-art baselines, +including deep learning models, other LLMs, and clinical methods across +multiple medical domains, specifically electrocardiograms and respiratory +waveforms. MedTsLLM presents a promising step towards harnessing the power of +LLMs for medical time series analysis that can elevate data-driven tools for +clinicians and improve patient outcomes. + +
+
+ comment: published in Proceedings of Machine Learning Research, MLHC 2024 +
+
+
+
+
+ + ☆ Out-of-Distribution Learning with Human Feedback + + +
+ Out-of-distribution (OOD) learning often relies heavily on statistical +approaches or predefined assumptions about OOD data distributions, hindering +their efficacy in addressing multifaceted challenges of OOD generalization and +OOD detection in real-world deployment environments. This paper presents a +novel framework for OOD learning with human feedback, which can provide +invaluable insights into the nature of OOD shifts and guide effective model +adaptation. Our framework capitalizes on the freely available unlabeled data in +the wild that captures the environmental test-time OOD distributions under both +covariate and semantic shifts. To harness such data, our key idea is to +selectively provide human feedback and label a small number of informative +samples from the wild data distribution, which are then used to train a +multi-class classifier and an OOD detector. By exploiting human feedback, we +enhance the robustness and reliability of machine learning models, equipping +them with the capability to handle OOD scenarios with greater precision. We +provide theoretical insights on the generalization error bounds to justify our +algorithm. Extensive experiments show the superiority of our method, +outperforming the current state-of-the-art by a significant margin. + +
+
+
+
+
+ + ☆ Data Clustering and Visualization with Recursive Goemans-Williamson + MaxCut Algorithm SC + + +
+ In this article, we introduce a novel recursive modification to the classical
+Goemans-Williamson MaxCut algorithm, offering improved performance in
+vectorized data clustering tasks. Focusing on the clustering of medical
+publications, we employ recursive iterations in conjunction with a dimension
+relaxation method to significantly enhance the density of clustering results.
+Furthermore, we propose a unique vectorization technique for articles,
+leveraging conditional probabilities for more effective clustering. Our methods
+provide advantages in both computational efficiency and clustering accuracy,
+substantiated through comprehensive experiments.
+
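+ A compact sketch of one Goemans-Williamson round (SDP relaxation followed by
+random-hyperplane rounding) on a small weight matrix is shown below; it assumes
+cvxpy and numpy are available and omits the recursive iterations and the
+article-vectorization step described above.
+
+# Sketch of a single Goemans-Williamson MaxCut round: SDP relaxation followed
+# by random-hyperplane rounding. Not the paper's recursive implementation.
+import numpy as np
+import cvxpy as cp
+
+def gw_maxcut(W, seed=0):
+    n = W.shape[0]
+    X = cp.Variable((n, n), symmetric=True)
+    constraints = [X >> 0, cp.diag(X) == 1]
+    # maximize (1/4) * sum_ij W_ij (1 - X_ij)
+    objective = cp.Maximize(cp.sum(cp.multiply(W, 1 - X)) / 4)
+    cp.Problem(objective, constraints).solve()
+    # Factor X ~ V V^T, clipping tiny negative eigenvalues from the solver.
+    vals, vecs = np.linalg.eigh(X.value)
+    V = vecs @ np.diag(np.sqrt(np.clip(vals, 0, None)))
+    rng = np.random.default_rng(seed)
+    r = rng.standard_normal(n)
+    return (V @ r >= 0).astype(int)   # two clusters from the hyperplane sign
+
+W = np.array([[0, 1, 1, 0],
+              [1, 0, 0, 1],
+              [1, 0, 0, 1],
+              [0, 1, 1, 0]], dtype=float)
+print(gw_maxcut(W))
+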
+
+ comment: Published in the IEEE Conference, CSCI 2023 (Winter Session) +
+
+
+
+
+ + ♻ ☆ Semi-Supervised Laplace Learning on Stiefel Manifolds + + +
+ Motivated by the need to address the degeneracy of canonical Laplace learning
+algorithms at low label rates, we propose to reformulate graph-based
+semi-supervised learning as a nonconvex generalization of a \emph{Trust-Region
+Subproblem} (TRS). This reformulation is motivated by the well-posedness of
+Laplacian eigenvectors in the limit of infinite unlabeled data. To solve this
+problem, we first show that a first-order condition implies the solution of a
+manifold alignment problem and that solutions to the classical \emph{Orthogonal
+Procrustes} problem can be used to efficiently find good classifiers that are
+amenable to further refinement. To tackle refinement, we develop the framework
+of Sequential Subspace Optimization for graph-based SSL. Next, we address the
+criticality of selecting supervised samples at low label rates. We characterize
+informative samples with a novel measure of centrality derived from the
+principal eigenvectors of a certain submatrix of the graph Laplacian. We
+demonstrate that our framework achieves lower classification error compared to
+recent state-of-the-art and classical semi-supervised learning methods at
+extremely low, medium, and high label rates.
+
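+ The Orthogonal Procrustes sub-step mentioned above has a closed-form SVD
+solution; a small numpy/scipy illustration follows, with synthetic embeddings
+standing in for the paper's spectral representations.
+
+# Closed-form Orthogonal Procrustes: find an orthogonal R minimizing ||A R - B||_F.
+import numpy as np
+from scipy.linalg import orthogonal_procrustes
+
+rng = np.random.default_rng(0)
+A = rng.standard_normal((100, 5))                       # e.g. a spectral embedding
+true_R, _ = np.linalg.qr(rng.standard_normal((5, 5)))   # hidden orthogonal alignment
+B = A @ true_R + 0.01 * rng.standard_normal((100, 5))   # noisy aligned target
+
+R, scale = orthogonal_procrustes(A, B)   # argmin over orthogonal matrices
+print("max recovery error:", np.abs(R - true_R).max())
+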
+
+ comment: arXiv admin note: text overlap with arXiv:2006.11184 by other authors +
+
+
+
+
+ + ♻ ☆ Agent Instructs Large Language Models to be General Zero-Shot Reasoners ICML 2024 + + +
+ We introduce a method to improve the zero-shot reasoning abilities of large +language models on general language understanding tasks. Specifically, we build +an autonomous agent to instruct the reasoning process of large language models. +We show this approach further unleashes the zero-shot reasoning abilities of +large language models to more tasks. We study the performance of our method on +a wide set of datasets spanning generation, classification, and reasoning. We +show that our method generalizes to most tasks and obtains state-of-the-art +zero-shot performance on 20 of the 29 datasets that we evaluate. For instance, +our method boosts the performance of state-of-the-art large language models by +a large margin, including Vicuna-13b (13.3%), Llama-2-70b-chat (23.2%), and +GPT-3.5 Turbo (17.0%). Compared to zero-shot chain of thought, our improvement +in reasoning is striking, with an average increase of 10.5%. With our method, +Llama-2-70b-chat outperforms zero-shot GPT-3.5 Turbo by 10.2%. + +
+
+ comment: Accepted to ICML 2024 +
+
+
+
+
+ + ♻ ☆ Learning Optimal Signal Temporal Logic Decision Trees for + Classification: A Max-Flow MILP Formulation + + +
+ This paper presents a novel framework for inferring timed temporal logic +properties from data. The dataset comprises pairs of finite-time system traces +and corresponding labels, denoting whether the traces demonstrate specific +desired behaviors, e.g. whether the ship follows a safe route or not. Our +proposed approach leverages decision-tree-based methods to infer Signal +Temporal Logic classifiers using primitive formulae. We formulate the inference +process as a mixed integer linear programming optimization problem, recursively +generating constraints to determine both data classification and tree +structure. Applying a max-flow algorithm on the resultant tree transforms the +problem into a global optimization challenge, leading to improved +classification rates compared to prior methodologies. Moreover, we introduce a +technique to reduce the number of constraints by exploiting the symmetry +inherent in STL primitives, which enhances the algorithm's time performance and +interpretability. To assess our algorithm's effectiveness and classification +performance, we conduct three case studies involving two-class, multi-class, +and complex formula classification scenarios. + +
+
+
+
+
+ + ♻ ☆ Time Series Predictions in Unmonitored Sites: A Survey of Machine + Learning Techniques in Water Resources + + +
+ Prediction of dynamic environmental variables in unmonitored sites remains a
+long-standing challenge for water resources science. The majority of the
+world's freshwater resources have inadequate monitoring of critical
+environmental variables needed for management. Yet, the need to have widespread
+predictions of hydrological variables such as river flow and water quality has
+become increasingly urgent due to climate and land use change over the past
+decades, and their associated impacts on water resources. Modern machine
+learning methods increasingly outperform their process-based and empirical
+model counterparts for hydrologic time series prediction with their ability to
+extract information from large, diverse data sets. We review relevant
+state-of-the-art applications of machine learning for streamflow, water
+quality, and other water resources prediction and discuss opportunities to
+improve the use of machine learning with emerging methods for incorporating
+watershed characteristics into deep learning models, transfer learning, and
+incorporating process knowledge into machine learning models. The analysis here
+suggests most prior efforts have been focused on deep learning frameworks built
+on many sites for predictions at daily time scales in the United States, but
+that comparisons between different classes of machine learning methods are few
+and inadequate. We identify several open questions for time series predictions
+in unmonitored sites that include incorporating dynamic inputs and site
+characteristics, mechanistic understanding and spatial context, and explainable
+AI techniques in modern machine learning frameworks.
+
+
+ comment: 39 pages, 4 figures, 1 table, Accepted to Environmental Data Science +
+
+
+
+
+ + ♻ ☆ Implicit Causal Representation Learning via Switchable Mechanisms + + +
+ Learning causal representations from observational and interventional data in
+the absence of known ground-truth graph structures necessitates implicit latent
+causal representation learning. Implicit learning of causal mechanisms
+typically involves two categories of interventional data: hard and soft
+interventions. In real-world scenarios, soft interventions are often more
+realistic than hard interventions, as the latter require fully controlled
+environments. Unlike hard interventions, which directly force changes in a
+causal variable, soft interventions exert influence indirectly by affecting the
+causal mechanism. However, the subtlety of soft interventions imposes several
+challenges for learning causal models. One challenge is that the effects of a
+soft intervention are ambiguous, since parental relations remain intact. In
+this paper, we tackle the challenges of learning causal models using soft
+interventions while retaining implicit modelling. We propose ICLR-SM, which
+models the effects of soft interventions by employing a causal mechanism switch
+variable designed to toggle between different causal mechanisms. In our
+experiments, we consistently observe improved learning of identifiable, causal
+representations, compared to baseline approaches.
+
+
+
+
+
+ + ♻ ☆ Distilling the Knowledge in Data Pruning + + +
+ With the increasing size of datasets used for training neural networks, data
+pruning becomes an attractive field of research. However, most current data
+pruning algorithms are limited in their ability to preserve accuracy compared
+to models trained on the full data, especially in high pruning regimes. In this
+paper, we explore the application of data pruning while incorporating knowledge
+distillation (KD) when training on a pruned subset. That is, rather than
+relying solely on ground-truth labels, we also use the soft predictions from a
+teacher network pre-trained on the complete data. By integrating KD into
+training, we demonstrate significant improvements across datasets, pruning
+methods, and pruning fractions. We first establish a theoretical motivation for
+employing self-distillation to improve training on pruned data. Then, we
+empirically make a compelling and highly practical observation: using KD,
+simple random pruning is comparable to or better than sophisticated pruning
+methods across all pruning regimes. On ImageNet, for example, we achieve
+superior accuracy despite training on a random subset of only 50% of the data.
+Additionally, we demonstrate a crucial connection between the pruning factor
+and the optimal knowledge distillation weight. This helps mitigate the impact
+of samples with noisy labels and low-quality images retained by typical pruning
+algorithms. Finally, we make an intriguing observation: when using lower
+pruning fractions, larger teachers lead to accuracy degradation, while
+surprisingly, employing teachers with a smaller capacity than the student's may
+improve results. Our code will be made available.
+
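+ The training objective described above, ground-truth cross-entropy plus
+distillation from a teacher pre-trained on the full data, can be sketched as
+follows; the pruning step itself is omitted and the weighting and temperature
+are illustrative defaults, not the paper's tuned values.
+
+# Sketch of training on a pruned subset with knowledge distillation:
+# hard-label cross-entropy plus a KL term to a teacher trained on the full data.
+import torch
+import torch.nn.functional as F
+
+def kd_loss(student_logits, teacher_logits, labels, alpha=0.5, T=4.0):
+    ce = F.cross_entropy(student_logits, labels)
+    kl = F.kl_div(
+        F.log_softmax(student_logits / T, dim=-1),
+        F.log_softmax(teacher_logits / T, dim=-1),
+        log_target=True,
+        reduction="batchmean",
+    ) * (T * T)                      # standard temperature scaling of the KD term
+    return (1 - alpha) * ce + alpha * kl
+
+# Toy usage on random tensors; in practice the batch comes from the pruned subset.
+student_logits = torch.randn(8, 10, requires_grad=True)
+teacher_logits = torch.randn(8, 10)
+labels = torch.randint(0, 10, (8,))
+loss = kd_loss(student_logits, teacher_logits, labels)
+loss.backward()
+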
+
+
+
+
+ + ♻ ☆ Fair Enough? A map of the current limitations of the requirements to + have fair algorithms + + +
+ In recent years, the increase in the usage and efficiency of Artificial
+Intelligence and, more generally, of Automated Decision-Making systems has
+brought with it an increasing and welcome awareness of the risks associated
+with such systems. One such risk is that of perpetuating or even amplifying
+bias and unjust disparities present in the data from which many of these
+systems learn to adjust and optimise their decisions. This awareness has, on
+the one hand, encouraged several scientific communities to come up with more
+and more appropriate ways and methods to assess, quantify, and possibly
+mitigate such biases and disparities. On the other hand, it has prompted more
+and more layers of society, including policy makers, to call for fair
+algorithms. We believe that while much excellent and multidisciplinary research
+is currently being conducted, what is still fundamentally missing is the
+awareness that having fair algorithms is per se a nearly meaningless requirement
+that needs to be complemented with many additional social choices to become
+actionable. Namely, there is a hiatus between what society is demanding from
+Automated Decision-Making systems and what this demand actually means in
+real-world scenarios. In this work, we outline the key features of such a
+hiatus and pinpoint a set of crucial open points that we as a society must
+address in order to give concrete meaning to the increasing demand for fairness
+in Automated Decision-Making systems.
+
+
+ comment: 20 pages, 2 figures, 2 tables. V2: added reference, update info on AI + Act. V3: changed structure of open points, update info on AI Act and other + legislation +
+
+
+
+
+ + ♻ ☆ Massive Activations in Large Language Models + + +
+ We observe an empirical phenomenon in Large Language Models (LLMs) -- very +few activations exhibit significantly larger values than others (e.g., 100,000 +times larger). We call them massive activations. First, we demonstrate the +widespread existence of massive activations across various LLMs and +characterize their locations. Second, we find their values largely stay +constant regardless of the input, and they function as indispensable bias terms +in LLMs. Third, these massive activations lead to the concentration of +attention probabilities to their corresponding tokens, and further, implicit +bias terms in the self-attention output. Last, we also study massive +activations in Vision Transformers. Code is available at +https://github.com/locuslab/massive-activations. + +
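+ A quick way to probe for such outlier activations in any PyTorch model is to
+hook intermediate outputs and compare the largest magnitude against a typical
+one; the snippet below is an illustrative probe on a toy MLP, not the authors'
+analysis code.
+
+# Sketch: register forward hooks and flag layers whose largest hidden activation
+# dwarfs the typical magnitude (a crude "massive activation" probe).
+import torch
+import torch.nn as nn
+
+model = nn.Sequential(nn.Linear(16, 64), nn.GELU(), nn.Linear(64, 64), nn.GELU())
+stats = {}
+
+def make_hook(name):
+    def hook(module, inputs, output):
+        a = output.detach().abs()
+        stats[name] = (a.max().item(), a.median().item())
+    return hook
+
+for name, module in model.named_modules():
+    if isinstance(module, nn.Linear):
+        module.register_forward_hook(make_hook(name))
+
+model(torch.randn(4, 16))
+for name, (mx, med) in stats.items():
+    print(f"layer {name}: max={mx:.3f} median={med:.3f} ratio={mx / (med + 1e-12):.1f}")
+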
+
+ comment: First Conference on Language Modeling (COLM), 2024. Website at + https://eric-mingjie.github.io/massive-activations/index.html +
+
+
+
+
+ + ♻ ☆ Value-Based Rationales Improve Social Experience: A Multiagent + Simulation Study ECAI 2024 + + +
+ We propose Exanna, a framework to realize agents that incorporate values in
+decision making. An Exanna agent considers the values of itself and others when
+providing rationales for its actions and evaluating the rationales provided by
+others. Via multiagent simulation, we demonstrate that considering values in
+decision making and producing rationales, especially for norm-deviating
+actions, leads to (1) higher conflict resolution, (2) better social experience,
+(3) higher privacy, and (4) higher flexibility.
+
+
+ comment: 13 pages, 13 figures, 13 tables (and supplementary material with + reproducibility and additional results), accepted at ECAI 2024 +
+
+
+
+
+ + ♻ ☆ RECE: Reduced Cross-Entropy Loss for Large-Catalogue Sequential + Recommenders CIKM'24 + + +
+ Scalability is a major challenge in modern recommender systems. In sequential
+recommendations, full Cross-Entropy (CE) loss achieves state-of-the-art
+recommendation quality but consumes excessive GPU memory with large item
+catalogs, limiting its practicality. Using a GPU-efficient locality-sensitive
+hashing-like algorithm for approximating the large tensor of logits, this paper
+introduces a novel RECE (REduced Cross-Entropy) loss. RECE significantly
+reduces memory consumption while allowing one to enjoy the state-of-the-art
+performance of full CE loss. Experimental results on various datasets show that
+RECE cuts peak training memory usage by up to 12 times compared to existing
+methods while retaining or exceeding the performance metrics of CE loss. The
+approach also opens up new possibilities for large-scale applications in other
+domains.
+
+
+ comment: 5 pages, accepted for CIKM'24 +
+
+
+
+
+ + ♻ ☆ CLIP with Generative Latent Replay: a Strong Baseline for Incremental + Learning BMVC 2024 + + +
+ With the emergence of Transformers and Vision-Language Models (VLMs) such as +CLIP, fine-tuning large pre-trained models has recently become a prevalent +strategy in Continual Learning. This has led to the development of numerous +prompting strategies to adapt transformer-based models without incurring +catastrophic forgetting. However, these strategies often compromise the +original zero-shot capabilities of the pre-trained CLIP model and struggle to +adapt to domains that significantly deviate from the pre-training data. In this +work, we propose Continual Generative training for Incremental prompt-Learning, +a simple and novel approach to mitigate forgetting while adapting CLIP. +Briefly, we employ Variational Autoencoders (VAEs) to learn class-conditioned +distributions within the embedding space of the visual encoder. We then exploit +these distributions to sample new synthetic visual embeddings and train the +corresponding class-specific textual prompts during subsequent tasks. Through +extensive experiments on different domains, we show that such a generative +replay approach can adapt to new tasks while improving zero-shot capabilities, +evaluated using a novel metric tailored for CL scenarios. Notably, further +analysis reveals that our approach can bridge the gap with joint prompt tuning. +The codebase is available at https://github.com/aimagelab/mammoth. + +
+
+ comment: 15 pages, 1 figure. Accepted at the The 35th British Machine Vision + Conference 2024 (BMVC 2024), Glasgow, UK +
+
+
+
+
+ + ♻ ☆ Global Optimisation of Black-Box Functions with Generative Models in the + Wasserstein Space + + +
+ We propose a new uncertainty estimator for gradient-free optimisation of +black-box simulators using deep generative surrogate models. Optimisation of +these simulators is especially challenging for stochastic simulators and higher +dimensions. To address these issues, we utilise a deep generative surrogate +approach to model the black box response for the entire parameter space. We +then leverage this knowledge to estimate the proposed uncertainty based on the +Wasserstein distance - the Wasserstein uncertainty. This approach is employed +in a posterior agnostic gradient-free optimisation algorithm that minimises +regret over the entire parameter space. A series of tests were conducted to +demonstrate that our method is more robust to the shape of both the black box +function and the stochastic response of the black box than state-of-the-art +methods, such as efficient global optimisation with a deep Gaussian process +surrogate. + +
+
+ comment: European Conference on Artificial Intelligence 2024 Main Track +
+
+
+
+
+ + ♻ ☆ Optimal Baseline Corrections for Off-Policy Contextual Bandits + + +
+ The off-policy learning paradigm allows for recommender systems and general +ranking applications to be framed as decision-making problems, where we aim to +learn decision policies that optimize an unbiased offline estimate of an online +reward metric. With unbiasedness comes potentially high variance, and prevalent +methods exist to reduce estimation variance. These methods typically make use +of control variates, either additive (i.e., baseline corrections or doubly +robust methods) or multiplicative (i.e., self-normalisation). Our work unifies +these approaches by proposing a single framework built on their equivalence in +learning scenarios. The foundation of our framework is the derivation of an +equivalent baseline correction for all of the existing control variates. +Consequently, our framework enables us to characterize the variance-optimal +unbiased estimator and provide a closed-form solution for it. This optimal +estimator brings significantly improved performance in both evaluation and +learning, and minimizes data requirements. Empirical observations corroborate +our theoretical findings. + +
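+ The additive control variate discussed above can be sketched on synthetic
+logged bandit data: subtract a baseline from the reward inside the
+importance-weighted average and add it back. The constant baseline used below
+is only for illustration; the paper derives the variance-optimal choice in
+closed form.
+
+# Sketch: inverse-propensity scoring with an additive baseline correction
+# (a control variate) on synthetic logged bandit feedback.
+import numpy as np
+
+rng = np.random.default_rng(0)
+n, k = 10_000, 5
+logging_probs = np.full((n, k), 1.0 / k)                    # uniform logging policy
+actions = rng.integers(0, k, size=n)
+rewards = rng.binomial(1, 0.2 + 0.1 * actions / (k - 1))    # better arms pay more
+
+target_probs = np.zeros((n, k))
+target_probs[:, k - 1] = 1.0                                # target: always best arm
+
+w = target_probs[np.arange(n), actions] / logging_probs[np.arange(n), actions]
+
+ips = np.mean(w * rewards)
+baseline = rewards.mean()                                   # simple constant baseline
+ips_corrected = np.mean(w * (rewards - baseline)) + baseline
+
+print(f"vanilla IPS        : {ips:.3f}")
+print(f"baseline-corrected : {ips_corrected:.3f}  (true value = 0.300)")
+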
+
+
+
+
+ + ♻ ☆ Disentangled Representation Learning with Transmitted Information + Bottleneck + + +
+ Encoding only the task-related information from the raw data, \ie,
+disentangled representation learning, can greatly contribute to the robustness
+and generalizability of models. Although significant advances have been made by
+regularizing the information in representations with information theory, two
+major challenges remain: 1) the representation compression inevitably leads to
+performance drops; 2) the disentanglement constraints on representations
+involve complicated optimization. To address these issues, we introduce
+Bayesian networks with transmitted information to formulate the interaction
+between inputs and representations during disentanglement. Building upon this
+framework, we propose \textbf{DisTIB} (\textbf{T}ransmitted \textbf{I}nformation
+\textbf{B}ottleneck for \textbf{Dis}entangled representation learning), a novel
+objective that navigates the balance between information compression and
+preservation. We employ variational inference to derive a tractable estimation
+for DisTIB. This estimation can be simply optimized via standard gradient
+descent with a reparameterization trick. Moreover, we theoretically prove that
+DisTIB can achieve optimal disentanglement, underscoring its superior efficacy.
+To solidify our claims, we conduct extensive experiments on various downstream
+tasks to demonstrate the appealing efficacy of DisTIB and validate our
+theoretical analyses.
+
+
+
+
+
+ + ♻ ☆ LLM Voting: Human Choices and AI Collective Decision Making AAAI + + +
+ This paper investigates the voting behaviors of Large Language Models (LLMs), +specifically GPT-4 and LLaMA-2, their biases, and how they align with human +voting patterns. Our methodology involved using a dataset from a human voting +experiment to establish a baseline for human preferences and conducting a +corresponding experiment with LLM agents. We observed that the choice of voting +methods and the presentation order influenced LLM voting outcomes. We found +that varying the persona can reduce some of these biases and enhance alignment +with human choices. While the Chain-of-Thought approach did not improve +prediction accuracy, it has potential for AI explainability in the voting +process. We also identified a trade-off between preference diversity and +alignment accuracy in LLMs, influenced by different temperature settings. Our +findings indicate that LLMs may lead to less diverse collective outcomes and +biased assumptions when used in voting scenarios, emphasizing the need for +cautious integration of LLMs into democratic processes. + +
+
+ comment: Accepted in AAAI Conference on AI, Ethics, and Society (AIES) +
+
+
+
+
+ + ♻ ☆ A Probabilistic Approach to Learning the Degree of Equivariance in + Steerable CNNs ICML 2024 + + +
+ Steerable convolutional neural networks (SCNNs) enhance task performance by +modelling geometric symmetries through equivariance constraints on weights. +Yet, unknown or varying symmetries can lead to overconstrained weights and +decreased performance. To address this, this paper introduces a probabilistic +method to learn the degree of equivariance in SCNNs. We parameterise the degree +of equivariance as a likelihood distribution over the transformation group +using Fourier coefficients, offering the option to model layer-wise and shared +equivariance. These likelihood distributions are regularised to ensure an +interpretable degree of equivariance across the network. Advantages include the +applicability to many types of equivariant networks through the flexible +framework of SCNNs and the ability to learn equivariance with respect to any +subgroup of any compact group without requiring additional layers. Our +experiments reveal competitive performance on datasets with mixed symmetries, +with learnt likelihood distributions that are representative of the underlying +degree of equivariance. + +
+
+ comment: 9 pages, published at ICML 2024 as main conference paper +
+
+
+
+
+ + ♻ ☆ A Data-Driven Defense against Edge-case Model Poisoning Attacks on + Federated Learning + + +
+ Federated Learning systems are increasingly subjected to a multitude of model +poisoning attacks from clients. Among these, edge-case attacks that target a +small fraction of the input space are nearly impossible to detect using +existing defenses, leading to a high attack success rate. We propose an +effective defense using an external defense dataset, which provides information +about the attack target. The defense dataset contains a mix of poisoned and +clean examples, with only a few known to be clean. The proposed method, +DataDefense, uses this dataset to learn a poisoned data detector model which +marks each example in the defense dataset as poisoned or clean. It also learns +a client importance model that estimates the probability of a client update +being malicious. The global model is then updated as a weighted average of the +client models' updates. The poisoned data detector and the client importance +model parameters are updated using an alternating minimization strategy over +the Federated Learning rounds. Extensive experiments on standard attack +scenarios demonstrate that DataDefense can defend against model poisoning +attacks where other state-of-the-art defenses fail. In particular, DataDefense +is able to reduce the attack success rate by at least ~ 40% on standard attack +setups and by more than 80% on some setups. Furthermore, DataDefense requires +very few defense examples (as few as five) to achieve a near-optimal reduction +in attack success rate. + +
+
+
+
+
+ + ♻ ☆ On the Utility of 3D Hand Poses for Action Recognition ECCV 2024 + + +
+ 3D hand pose is an underexplored modality for action recognition. Poses are +compact yet informative and can greatly benefit applications with limited +compute budgets. However, poses alone offer an incomplete understanding of +actions, as they cannot fully capture objects and environments with which +humans interact. We propose HandFormer, a novel multimodal transformer, to +efficiently model hand-object interactions. HandFormer combines 3D hand poses +at a high temporal resolution for fine-grained motion modeling with sparsely +sampled RGB frames for encoding scene semantics. Observing the unique +characteristics of hand poses, we temporally factorize hand modeling and +represent each joint by its short-term trajectories. This factorized pose +representation combined with sparse RGB samples is remarkably efficient and +highly accurate. Unimodal HandFormer with only hand poses outperforms existing +skeleton-based methods at 5x fewer FLOPs. With RGB, we achieve new +state-of-the-art performance on Assembly101 and H2O with significant +improvements in egocentric action recognition. + +
+
+ comment: ECCV 2024; https://s-shamil.github.io/HandFormer/ +
+
+
+
+
+ + ♻ ☆ Causal modelling without introducing counterfactuals or abstract + distributions ICML 2024 + + +
+ The most common approach to causal modelling is the potential outcomes +framework due to Neyman and Rubin. In this framework, outcomes of +counterfactual treatments are assumed to be well-defined. This metaphysical +assumption is often thought to be problematic yet indispensable. The +conventional approach relies not only on counterfactuals but also on abstract +notions of distributions and assumptions of independence that are not directly +testable. In this paper, we construe causal inference as treatment-wise +predictions for finite populations where all assumptions are testable; this +means that one can not only test predictions themselves (without any +fundamental problem) but also investigate sources of error when they fail. The +new framework highlights the model-dependence of causal claims as well as the +difference between statistical and scientific inference. + +
+
+ comment: Presented at the Humans, Algorithmic Decision-Making and Society + Workshop at ICML 2024 +
+
+
+
+
+ + ♻ ☆ Tree Attention: Topology-aware Decoding for Long-Context Attention on + GPU clusters + + +
+ Self-attention is the core mathematical operation of modern transformer
+architectures and is also a significant computational bottleneck due to its
+quadratic complexity in the sequence length. In this work, we derive the scalar
+energy function whose gradient computes the self-attention block, thus
+elucidating the theoretical underpinnings of self-attention, providing a
+Bayesian interpretation of the operation and linking it closely with
+energy-based models such as Hopfield Networks. Our formulation reveals that the
+reduction across the sequence axis can be efficiently computed in parallel
+through a tree reduction. Our algorithm for parallelizing attention computation
+across multiple GPUs enables cross-device decoding to be performed
+asymptotically faster (up to 8x faster in our experiments) than alternative
+approaches such as Ring Attention, while also requiring significantly less
+communication volume and incurring 2x less peak memory. Our code is publicly
+available here: \url{https://github.com/Zyphra/tree_attention}.
+
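+ The enabling observation is that softmax attention can be computed
+chunk-by-chunk and merged with an associative operator (running max,
+normalizer, weighted values), which is exactly the kind of operation a tree
+reduction needs. A single-process numpy sketch of that combine step follows; no
+multi-GPU communication is shown and this is not the repository's code.
+
+# Sketch: attention for one query computed per chunk and merged with an
+# associative (max, normalizer, value) combine, the operation a tree reduction
+# would apply across devices. Single-process numpy illustration only.
+import numpy as np
+
+def partial_attention(q, K, V):
+    s = K @ q                       # scores for this chunk
+    m = s.max()
+    p = np.exp(s - m)
+    return m, p.sum(), p @ V        # running max, normalizer, unnormalized output
+
+def combine(a, b):
+    (ma, sa, oa), (mb, sb, ob) = a, b
+    m = max(ma, mb)
+    s = sa * np.exp(ma - m) + sb * np.exp(mb - m)
+    o = oa * np.exp(ma - m) + ob * np.exp(mb - m)
+    return m, s, o
+
+rng = np.random.default_rng(0)
+d, n = 8, 1024
+q = rng.standard_normal(d)
+K, V = rng.standard_normal((n, d)), rng.standard_normal((n, d))
+
+# Split the sequence into 4 chunks (stand-ins for 4 devices) and reduce as a tree.
+chunks = [partial_attention(q, K[i::4], V[i::4]) for i in range(4)]
+m, s, o = combine(combine(chunks[0], chunks[1]), combine(chunks[2], chunks[3]))
+tree_out = o / s
+
+p = np.exp(K @ q - (K @ q).max())
+print(np.allclose(tree_out, (p / p.sum()) @ V))   # matches the monolithic result
+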
+
+
+
+
+ + ♻ ☆ VIRUS-NeRF -- Vision, InfraRed and UltraSonic based Neural Radiance + Fields + + +
+ Autonomous mobile robots are an increasingly integral part of modern factory +and warehouse operations. Obstacle detection, avoidance and path planning are +critical safety-relevant tasks, which are often solved using expensive LiDAR +sensors and depth cameras. We propose to use cost-effective low-resolution +ranging sensors, such as ultrasonic and infrared time-of-flight sensors by +developing VIRUS-NeRF - Vision, InfraRed, and UltraSonic based Neural Radiance +Fields. Building upon Instant Neural Graphics Primitives with a Multiresolution +Hash Encoding (Instant-NGP), VIRUS-NeRF incorporates depth measurements from +ultrasonic and infrared sensors and utilizes them to update the occupancy grid +used for ray marching. Experimental evaluation in 2D demonstrates that +VIRUS-NeRF achieves comparable mapping performance to LiDAR point clouds +regarding coverage. Notably, in small environments, its accuracy aligns with +that of LiDAR measurements, while in larger ones, it is bounded by the utilized +ultrasonic sensors. An in-depth ablation study reveals that adding ultrasonic +and infrared sensors is highly effective when dealing with sparse data and low +view variation. Further, the proposed occupancy grid of VIRUS-NeRF improves the +mapping capabilities and increases the training speed by 46% compared to +Instant-NGP. Overall, VIRUS-NeRF presents a promising approach for +cost-effective local mapping in mobile robotics, with potential applications in +safety and navigation tasks. The code can be found at +https://github.com/ethz-asl/virus nerf. + +
+
+
+
+
+ + ♻ ☆ Bayesian Learning in a Nonlinear Multiscale State-Space Model + + +
+ The ubiquity of multiscale interactions in complex systems is +well-recognized, with development and heredity serving as a prime example of +how processes at different temporal scales influence one another. This work +introduces a novel multiscale state-space model to explore the dynamic +interplay between systems interacting across different time scales, with +feedback between each scale. We propose a Bayesian learning framework to +estimate unknown states by learning the unknown process noise covariances +within this multiscale model. We develop a Particle Gibbs with Ancestor +Sampling (PGAS) algorithm for inference and demonstrate through simulations the +efficacy of our approach. + +
+
+ comment: Corrected a typo +
+
+
+
+
+ + ♻ ☆ Why we should not (always) assume data generating distributions in + Machine Learning ICML 2024 + + +
+ Machine Learning research, as most of Statistics, heavily relies on the +concept of a data-generating probability distribution. As data points are +thought to be sampled from such a distribution, we can learn from observed data +about this distribution and, thus, predict future data points drawn from it +(with some probability of success). Drawing on scholarship across disciplines, +we here argue that this framework is not always a good model. Not only do such +true probability distributions not exist; the framework can also be misleading +and obscure both the choices made and the goals pursued in machine learning +practice. We suggest an alternative framework that focuses on finite +populations rather than abstract distributions; while classical learning theory +can be left almost unchanged, it opens new opportunities, especially to model +sampling. We compile these considerations into five reasons for modelling +machine learning -- in some settings -- with finite distributions rather than +generative distributions, both to be more faithful to practice and to provide +novel theoretical insights. + +
+
+ comment: Presented at the Humans, Algorithmic Decision-Making and Society + Workshop at ICML 2024 +
+
+
+
+
+ + ♻ ☆ Measuring User Understanding in Dialogue-based XAI Systems ECAI 2024 + + +
+ The field of eXplainable Artificial Intelligence (XAI) is increasingly
+recognizing the need to personalize and/or interactively adapt the explanation
+to better reflect users' explanation needs. While dialogue-based approaches to
+XAI have been proposed recently, the state-of-the-art in XAI is still
+characterized by what we call one-shot, non-personalized and one-way
+explanations. In contrast, dialogue-based systems that can adapt explanations
+through interaction with a user promise to be superior to GUI-based or
+dashboard explanations as they offer a more intuitive way of requesting
+information. In general, while interactive XAI systems are often evaluated in
+terms of user satisfaction, there are few studies that assess users' objective
+model understanding. This is particularly the case for dialogue-based XAI
+approaches. In this paper, we close this gap by carrying out controlled
+experiments within a dialogue framework in which we measure users'
+understanding in three phases by asking them to simulate the predictions of the
+model they are learning about. By this, we can quantify the level of (improved)
+understanding w.r.t. how the model works, comparing the state prior to and
+after the interaction. We further analyze the data to reveal patterns of how
+the interactions of groups with high vs. low understanding gain differ.
+Overall, our work thus contributes to our understanding of the effectiveness of
+XAI approaches.
+
+
+ comment: Accepted at the ECAI 2024 main conference - final version and code + coming soon. 9 pages, 5 figures +
+
+
+
+
+ + ♻ ☆ Many learning agents interacting with an agent-based market model + + +
+ We consider the dynamics and the interactions of multiple reinforcement
+learning optimal execution trading agents interacting with a reactive
+Agent-Based Model (ABM) of a financial market in event time. The model
+represents a market ecology with three trophic levels represented by: optimal
+execution learning agents, minimally intelligent liquidity takers, and fast
+electronic liquidity providers. The optimal execution agent classes include
+buying and selling agents that can either use a combination of limit orders and
+market orders, or only trade using market orders. The reward function
+explicitly balances trade execution slippage against the penalty of not
+executing the order timeously. This work demonstrates how multiple competing
+learning agents impact a minimally intelligent market simulation as functions
+of the number of agents, the size of agents' initial orders, and the state
+spaces used for learning. We use phase space plots to examine the dynamics of
+the ABM when various specifications of learning agents are included. Further,
+we examine whether the inclusion of optimal execution agents that can learn is
+able to produce dynamics with the same complexity as empirical data. We find
+that the inclusion of optimal execution agents changes the stylised facts
+produced by the ABM to conform more closely with empirical data, and that such
+agents are a necessary inclusion for ABMs investigating market micro-structure.
+However, including execution agents in chartist-fundamentalist-noise ABMs is
+insufficient to recover the complexity observed in empirical data.
+
+
+ comment: 16 pages, 8 figures, 5 tables, enhanced discussion and figures +
+
+
+
+
+ + ♻ ☆ Using Explainable AI for EEG-based Reduced Montage Neonatal Seizure + Detection + + +
+ The neonatal period is the most vulnerable time for the development of
+seizures. Seizures in the immature brain lead to detrimental consequences and
+therefore require early diagnosis. The gold standard for neonatal seizure
+detection currently relies on continuous video-EEG monitoring, which involves
+recording multi-channel electroencephalogram (EEG) alongside real-time video
+monitoring within a neonatal intensive care unit (NICU). However, video-EEG
+monitoring technology requires clinical expertise and is often limited to
+technologically advanced and resourceful settings. Cost-effective new
+techniques could help the medical fraternity make an accurate diagnosis and
+advocate treatment without delay. In this work, a novel explainable deep
+learning model is proposed to automate the neonatal seizure detection process
+with a reduced EEG montage, employing convolutional nets, graph attention
+layers, and fully connected layers. Beyond its ability to detect seizures in
+real time with a reduced montage, this model offers the unique advantage of
+real-time interpretability. Evaluated on the Zenodo dataset with 10-fold
+cross-validation, the presented model achieves an absolute improvement of 8.31%
+and 42.86% in area under the curve (AUC) and recall, respectively.
+
+
+ comment: Paper is accepted to IEEE International Conference on Systems, Man, + and Cybernetics (SMC) 2024. Final Version +
+
+
+
+
+ + ♻ ☆ Ramsey Theorems for Trees and a General 'Private Learning Implies Online + Learning' Theorem + + +
+ This work continues to investigate the link between differentially private +(DP) and online learning. Alon, Livni, Malliaris, and Moran (2019) showed that +for binary concept classes, DP learnability of a given class implies that it +has a finite Littlestone dimension (equivalently, that it is online learnable). +Their proof relies on a model-theoretic result by Hodges (1997), which +demonstrates that any binary concept class with a large Littlestone dimension +contains a large subclass of thresholds. In a follow-up work, Jung, Kim, and +Tewari (2020) extended this proof to multiclass PAC learning with a bounded +number of labels. Unfortunately, Hodges's result does not apply in other +natural settings such as multiclass PAC learning with an unbounded label space, +and PAC learning of partial concept classes. + This naturally raises the question of whether DP learnability continues to +imply online learnability in more general scenarios: indeed, Alon, Hanneke, +Holzman, and Moran (2021) explicitly leave it as an open question in the +context of partial concept classes, and the same question is open in the +general multiclass setting. In this work, we give a positive answer to these +questions showing that for general classification tasks, DP learnability +implies online learnability. Our proof reasons directly about Littlestone +trees, without relying on thresholds. We achieve this by establishing several +Ramsey-type theorems for trees, which might be of independent interest. + +
+
+
+
+
+ + ♻ ☆ MetMamba: Regional Weather Forecasting with Spatial-Temporal Mamba Model + + +
+ Deep Learning based Weather Prediction (DLWP) models have been improving
+rapidly over the last few years, surpassing state-of-the-art numerical weather
+forecasts by significant margins. While much of the optimization effort is
+focused on training curricula to extend forecast range in the global context,
+two aspects remain less explored: limited area modeling and better backbones
+for weather forecasting. We show in this paper that MetMamba, a DLWP model
+built on a state-of-the-art state-space model, Mamba, offers notable
+performance gains and unique advantages over other popular backbones using
+traditional attention mechanisms and neural operators. We also demonstrate the
+feasibility of deep learning based limited area modeling via coupled training
+with a global host model.
+
+
+ comment: Typo and grammar; Minor elaboration and clarifications; Use full + organization name in the author section +
+
+
+
+
+ + ♻ ☆ Interaction as Explanation: A User Interaction-based Method for + Explaining Image Classification Models IJCAI 2024 + + +
+ In computer vision, explainable AI (xAI) methods seek to mitigate the +'black-box' problem by making the decision-making process of deep learning +models more interpretable and transparent. Traditional xAI methods concentrate +on visualizing input features that influence model predictions, providing +insights primarily suited for experts. In this work, we present an +interaction-based xAI method that enhances user comprehension of image +classification models through their interaction. Thus, we developed a web-based +prototype allowing users to modify images via painting and erasing, thereby +observing changes in classification results. Our approach enables users to +discern critical features influencing the model's decision-making process, +aligning their mental models with the model's logic. Experiments conducted with +five images demonstrate the potential of the method to reveal feature +importance through user interaction. Our work contributes a novel perspective +to xAI by centering on end-user engagement and understanding, paving the way +for more intuitive and accessible explainability in AI systems. + +
+
+ comment: IJCAI 2024 (International Joint Conference on Artificial Intelligence + 2024) Workshop on Explainable Artificial Intelligence (XAI) +
+
+
+
+
+ + ♻ ☆ ADEdgeDrop: Adversarial Edge Dropping for Robust Graph Neural Networks + + +
+ Although Graph Neural Networks (GNNs) have exhibited the powerful ability to +gather graph-structured information from neighborhood nodes via various +message-passing mechanisms, the performance of GNNs is limited by poor +generalization and fragile robustness caused by noisy and redundant graph data. +As a prominent solution, Graph Augmentation Learning (GAL) has recently +received increasing attention. Among prior GAL approaches, edge-dropping +methods that randomly remove edges from a graph during training are effective +techniques to improve the robustness of GNNs. However, randomly dropping edges +often results in bypassing critical edges, consequently weakening the +effectiveness of message passing. In this paper, we propose a novel adversarial +edge-dropping method (ADEdgeDrop) that leverages an adversarial edge predictor +guiding the removal of edges, which can be flexibly incorporated into diverse +GNN backbones. Employing an adversarial training framework, the edge predictor +utilizes the line graph transformed from the original graph to estimate the +edges to be dropped, which improves the interpretability of the edge-dropping +method. The proposed ADEdgeDrop is optimized alternately by stochastic gradient +descent and projected gradient descent. Comprehensive experiments on six graph +benchmark datasets demonstrate that the proposed ADEdgeDrop outperforms +state-of-the-art baselines across various GNN backbones, demonstrating improved +generalization and robustness. + +
+
+
+
+
+ + ♻ ☆ AutoCLIP: Auto-tuning Zero-Shot Classifiers for Vision-Language Models + + +
+ Classifiers built upon vision-language models such as CLIP have shown
+remarkable zero-shot performance across a broad range of image classification
+tasks. Prior work has studied different ways of automatically creating
+descriptor sets for every class based on prompt templates, ranging from
+manually engineered templates over templates obtained from a large language
+model to templates built from random words and characters. Up until now,
+deriving zero-shot classifiers from the respective encoded class descriptors
+has remained nearly unchanged, i.e., classify to the class that maximizes
+cosine similarity between its averaged encoded class descriptors and the image
+encoding. However, weighing all class descriptors equally can be suboptimal
+when certain descriptors match visual clues on a given image better than
+others. In this work, we propose AutoCLIP, a method for auto-tuning zero-shot
+classifiers. AutoCLIP tunes per-image weights for each prompt template at
+inference time, based on statistics of class descriptor-image similarities.
+AutoCLIP is fully unsupervised, has only a minor additional computation
+overhead, and can be easily implemented in a few lines of code. We show that
+AutoCLIP outperforms baselines across a broad range of vision-language models,
+datasets, and prompt templates consistently and by up to 3 percentage points in
+accuracy.
+
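+ The core idea, replacing the uniform average over prompt templates with
+image-dependent weights derived from descriptor-image similarities, can be
+sketched without CLIP itself; the softmax weighting and temperature below are
+illustrative rather than the paper's exact auto-tuning rule.
+
+# Sketch: zero-shot scoring where each prompt template gets an image-dependent
+# weight (softmax over its similarity) instead of a uniform 1/T average.
+import numpy as np
+
+def autoweighted_scores(image_emb, class_template_embs, temperature=0.1):
+    # class_template_embs: (num_classes, num_templates, dim), all L2-normalized.
+    sims = class_template_embs @ image_emb            # (C, T) cosine similarities
+    w = np.exp(sims / temperature)
+    w = w / w.sum(axis=1, keepdims=True)              # per-image template weights
+    return (w * sims).sum(axis=1)                     # weighted class scores
+
+rng = np.random.default_rng(0)
+C, T, D = 10, 7, 64
+templates = rng.standard_normal((C, T, D))
+templates /= np.linalg.norm(templates, axis=-1, keepdims=True)
+image = rng.standard_normal(D)
+image /= np.linalg.norm(image)
+
+print("uniform-average class:", (templates @ image).mean(axis=1).argmax())
+print("auto-weighted class  :", autoweighted_scores(image, templates).argmax())
+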
+
+ comment: accepted at TMLR, Camera Ready Version +
+
+
+
+
+ + ♻ ☆ A Survey on Graph Neural Networks and Graph Transformers in Computer + Vision: A Task-Oriented Perspective + + +
+ Graph Neural Networks (GNNs) have gained momentum in graph representation +learning and boosted the state of the art in a variety of areas, such as data +mining (\emph{e.g.,} social network analysis and recommender systems), computer +vision (\emph{e.g.,} object detection and point cloud learning), and natural +language processing (\emph{e.g.,} relation extraction and sequence learning), +to name a few. With the emergence of Transformers in natural language +processing and computer vision, graph Transformers embed a graph structure into +the Transformer architecture to overcome the limitations of local neighborhood +aggregation while avoiding strict structural inductive biases. In this paper, +we present a comprehensive review of GNNs and graph Transformers in computer +vision from a task-oriented perspective. Specifically, we divide their +applications in computer vision into five categories according to the modality +of input data, \emph{i.e.,} 2D natural images, videos, 3D data, vision + +language, and medical images. In each category, we further divide the +applications according to a set of vision tasks. Such a task-oriented taxonomy +allows us to examine how each task is tackled by different GNN-based approaches +and how well these approaches perform. Based on the necessary preliminaries, we +provide the definitions and challenges of the tasks, in-depth coverage of the +representative approaches, as well as discussions regarding insights, +limitations, and future directions. + +
+
+ comment: Accepted by IEEE Transactions on Pattern Analysis and Machine + Intelligence (T-PAMI) +
+
+
+
+
+ + ♻ ☆ GHQ: Grouped Hybrid Q Learning for Heterogeneous Cooperative Multi-agent + Reinforcement Learning + + +
+ Previous deep multi-agent reinforcement learning (MARL) algorithms have
+achieved impressive results, typically in homogeneous scenarios. However,
+heterogeneous scenarios are also very common and usually harder to solve. In
+this paper, we mainly discuss cooperative heterogeneous MARL problems in the
+StarCraft Multi-Agent Challenge (SMAC) environment. We first define and
+describe the heterogeneous problems in SMAC. In order to comprehensively reveal
+and study the problem, we create new maps in addition to the original SMAC
+maps. We find that baseline algorithms fail to perform well on those
+heterogeneous maps. To address this issue, we propose the Grouped
+Individual-Global-Max Consistency (GIGM) condition and a novel MARL algorithm,
+Grouped Hybrid Q Learning (GHQ). GHQ separates agents into several groups and
+keeps individual parameters for each group, along with a novel hybrid structure
+for factorization. To enhance coordination between groups, we maximize the
+Inter-group Mutual Information (IGMI) between groups' trajectories. Experiments
+on the original and new heterogeneous maps show the superior performance of GHQ
+compared to other state-of-the-art algorithms.
+
+
+
+
+
+ + ♻ ☆ Watermarking Recommender Systems + + +
+ Recommender systems embody significant commercial value and represent crucial +intellectual property. However, the integrity of these systems is constantly +challenged by malicious actors seeking to steal their underlying models. +Safeguarding against such threats is paramount to upholding the rights and +interests of the model owner. While model watermarking has emerged as a potent +defense mechanism in various domains, its direct application to recommender +systems remains unexplored and non-trivial. In this paper, we address this gap +by introducing Autoregressive Out-of-distribution Watermarking (AOW), a novel +technique tailored specifically for recommender systems. Our approach entails +selecting an initial item and querying it through the oracle model, followed by +the selection of subsequent items with small prediction scores. This iterative +process generates a watermark sequence autoregressively, which is then +ingrained into the model's memory through training. To assess the efficacy of +the watermark, the model is tasked with predicting the subsequent item given a +truncated watermark sequence. Through extensive experimentation and analysis, +we demonstrate the superior performance and robust properties of AOW. Notably, +our watermarking technique exhibits high-confidence extraction capabilities and +maintains effectiveness even in the face of distillation and fine-tuning +processes. + +
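+ The autoregressive construction described above can be sketched with a
+stubbed scoring oracle: start from a seed item and repeatedly append an item
+the model ranks near the bottom. The scorer and parameters below are
+hypothetical placeholders, not the paper's implementation.
+
+# Sketch: build an out-of-distribution watermark sequence autoregressively by
+# repeatedly picking a low-scoring next item. The scorer below is a stub.
+import numpy as np
+
+rng = np.random.default_rng(0)
+NUM_ITEMS = 1000
+
+def oracle_scores(prefix):
+    # Stand-in for the recommender's next-item scores given a prefix sequence.
+    rng_local = np.random.default_rng(hash(tuple(prefix)) % (2**32))
+    return rng_local.random(NUM_ITEMS)
+
+def build_watermark(seed_item, length=10, pool=20):
+    seq = [seed_item]
+    for _ in range(length - 1):
+        scores = oracle_scores(seq)
+        scores[seq] = np.inf                      # never repeat items already used
+        candidates = np.argsort(scores)[:pool]    # lowest-scoring items
+        seq.append(int(rng.choice(candidates)))   # sample one unlikely continuation
+    return seq
+
+watermark = build_watermark(seed_item=42)
+print(watermark)
+# The owner later checks whether a suspect model predicts watermark[i+1] from
+# watermark[:i+1] far better than chance, which an honest model should not.
+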
+
+
+
+
+ + ♻ ☆ A distribution-guided Mapper algorithm + + +
+ Motivation: The Mapper algorithm is an essential tool for exploring the shape
+of data in topological data analysis. With a dataset as input, the Mapper
+algorithm outputs a graph representing the topological features of the whole
+dataset. This graph is often regarded as an approximation of a Reeb graph of
+the data. The classic Mapper algorithm uses fixed interval lengths and
+overlapping ratios, which might fail to reveal subtle features of data,
+especially when the underlying structure is complex.
+ Results: In this work, we introduce a distribution-guided Mapper algorithm
+named D-Mapper, which utilizes properties of the probability model and the
+intrinsic characteristics of the data to generate density-guided covers and
+provide enhanced topological features. Our proposed algorithm is a
+probabilistic model-based approach, which can serve as an alternative to
+non-probabilistic ones. Moreover, we introduce a metric accounting for both the
+quality of overlap clustering and extended persistence homology to measure the
+performance of Mapper-type algorithms. Our numerical experiments indicate that
+D-Mapper outperforms the classical Mapper algorithm in various scenarios. We
+also apply D-Mapper to a SARS-CoV-2 coronavirus RNA sequence dataset to explore
+the topological structure of different virus variants. The results indicate
+that the D-Mapper algorithm can reveal both vertical and horizontal evolution
+processes of the viruses.
+ Availability: Our package is available at
+https://github.com/ShufeiGe/D-Mapper.
+
+
+
+
+
+ + ♻ ☆ ViTime: A Visual Intelligence-Based Foundation Model for Time Series + Forecasting + + +
+ The success of large pretrained models in natural language processing (NLP)
+and computer vision (CV) has opened new avenues for constructing foundation
+models for time series forecasting (TSF). Traditional TSF foundation models
+rely heavily on numerical data fitting. In contrast, the human brain is
+inherently skilled at processing visual information and prefers to predict
+future trends by observing visualized sequences. From a biomimetic perspective,
+utilizing models to directly process numerical sequences might not be the most
+effective route to achieving Artificial General Intelligence (AGI). This paper
+proposes ViTime, a novel Visual Intelligence-based foundation model for TSF.
+ViTime overcomes the limitations of numerical time series data fitting by
+utilizing visual data processing paradigms and employs an innovative data
+synthesis method during training, called Real Time Series (RealTS). Experiments
+on a diverse set of previously unseen forecasting datasets demonstrate that
+ViTime achieves state-of-the-art zero-shot performance, even surpassing the
+best individually trained supervised models in some situations. These findings
+suggest that visual intelligence can significantly enhance time series analysis
+and forecasting, paving the way for more advanced and versatile models in the
+field. The code for our framework is accessible at
+https://github.com/IkeYang/ViTime.
+
+
+
+
+
+ + ♻ ☆ Graph Neural Networks in EEG-based Emotion Recognition: A Survey + + +
+ Compared to other modalities, EEG-based emotion recognition can intuitively
+respond to the emotional patterns in the human brain and, therefore, has become
+one of the tasks of greatest interest in the brain-computer interface field.
+Since dependencies among brain regions are closely related to emotion, a
+significant trend is to develop Graph Neural Networks (GNNs) for EEG-based
+emotion recognition. However, brain region dependencies in emotional EEG have
+physiological bases that distinguish GNNs in this field from those in other
+time series fields. Moreover, there is neither a comprehensive review nor
+guidance for constructing GNNs in EEG-based emotion recognition. In this
+survey, our categorization reveals the commonalities and differences of
+existing approaches under a unified framework of graph construction. We analyze
+and categorize methods from three stages in the framework to provide clear
+guidance on constructing GNNs in EEG-based emotion recognition. In addition, we
+discuss several open challenges and future directions, such as temporal
+fully-connected graphs and graph condensation.
+
+
+
+
+
+ + ♻ ☆ Interpolation of mountain weather forecasts by machine learning + + +
+ Recent advances in numerical simulation methods based on physical models and
+their combination with machine learning have improved the accuracy of weather
+forecasts. However, the accuracy decreases in complex terrains such as
+mountainous regions because these methods usually use grids several kilometers
+square and simple machine learning models. While deep learning has also made
+significant progress in recent years, applying it directly makes it difficult
+to utilize the physical knowledge embedded in the simulations. This paper
+proposes a method that uses machine learning to interpolate future weather in
+mountainous regions using forecast data from surrounding plains and past
+observed data, thereby improving weather forecasts in mountainous regions. We
+focus on mountainous regions in Japan and predict temperature and precipitation
+mainly using LightGBM as the machine learning model. Despite the use of a small
+dataset, through feature engineering and model tuning, our method achieves
+partial improvements in RMSE with significantly less training time.
+
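+ A minimal sketch of the setup implied above, gradient-boosted trees mapping
+surrounding-plain forecasts and past observations to a mountain-site target,
+is given below with entirely synthetic feature names and data.
+
+# Sketch: LightGBM regression from plain-site forecasts and past observations
+# to a mountain-station temperature. Features and data are synthetic placeholders.
+import numpy as np
+import pandas as pd
+from lightgbm import LGBMRegressor
+from sklearn.model_selection import train_test_split
+from sklearn.metrics import mean_squared_error
+
+rng = np.random.default_rng(0)
+n = 5000
+X = pd.DataFrame({
+    "plain_forecast_temp": rng.normal(15, 8, n),    # forecast at a nearby plain site
+    "plain_forecast_precip": rng.gamma(1.0, 2.0, n),
+    "past_obs_temp_24h": rng.normal(10, 7, n),      # yesterday's mountain observation
+    "elevation_diff_m": rng.uniform(500, 2000, n),
+    "month": rng.integers(1, 13, n),
+})
+# Synthetic target: cooler with altitude, tied to the plain forecast and persistence.
+y = (X["plain_forecast_temp"] - 0.0065 * X["elevation_diff_m"]
+     + 0.3 * X["past_obs_temp_24h"] + rng.normal(0, 1.5, n))
+
+X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2, random_state=0)
+model = LGBMRegressor(n_estimators=300, learning_rate=0.05)
+model.fit(X_tr, y_tr)
+rmse = mean_squared_error(y_te, model.predict(X_te)) ** 0.5
+print(f"test RMSE: {rmse:.2f} degC")
+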
+
+ comment: 9 pages +
+
+
+
+
+ + ♻ ☆ AROID: Improving Adversarial Robustness Through Online Instance-Wise + Data Augmentation + + +
+ Deep neural networks are vulnerable to adversarial examples. Adversarial +training (AT) is an effective defense against adversarial examples. However, AT +is prone to overfitting which degrades robustness substantially. Recently, data +augmentation (DA) was shown to be effective in mitigating robust overfitting if +appropriately designed and optimized for AT. This work proposes a new method to +automatically learn online, instance-wise, DA policies to improve robust +generalization for AT. This is the first automated DA method specific for +robustness. A novel policy learning objective, consisting of Vulnerability, +Affinity and Diversity, is proposed and shown to be sufficiently effective and +efficient to be practical for automatic DA generation during AT. Importantly, +our method dramatically reduces the cost of policy search from the 5000 hours +of AutoAugment and the 412 hours of IDBH to 9 hours, making automated DA more +practical to use for adversarial robustness. This allows our method to +efficiently explore a large search space for a more effective DA policy and +evolve the policy as training progresses. Empirically, our method is shown to +outperform all competitive DA methods across various model architectures and +datasets. Our DA policy reinforced vanilla AT to surpass several +state-of-the-art AT methods regarding both accuracy and robustness. It can also +be combined with those advanced AT methods to further boost robustness. Code +and pre-trained models are available at https://github.com/TreeLLi/AROID. + +
+
+ comment: published at the IJCV in press +
+
+
+
+
+ + ♻ ☆ Chain-of-Factors Paper-Reviewer Matching + + +
+ With the rapid increase in paper submissions to academic conferences, the +need for automated and accurate paper-reviewer matching is more critical than +ever. Previous efforts in this area have considered various factors to assess +the relevance of a reviewer's expertise to a paper, such as the semantic +similarity, shared topics, and citation connections between the paper and the +reviewer's previous works. However, most of these studies focus on only one +factor, resulting in an incomplete evaluation of the paper-reviewer relevance. +To address this issue, we propose a unified model for paper-reviewer matching +that jointly considers semantic, topic, and citation factors. To be specific, +during training, we instruction-tune a contextualized language model shared +across all factors to capture their commonalities and characteristics; during +inference, we chain the three factors to enable step-by-step, coarse-to-fine +search for qualified reviewers given a submission. Experiments on four datasets +(one of which is newly contributed by us) spanning various fields such as +machine learning, computer vision, information retrieval, and data mining +consistently demonstrate the effectiveness of our proposed Chain-of-Factors +model in comparison with state-of-the-art paper-reviewer matching methods and +scientific pre-trained language models. + +
+
+
+
+
+ + ♻ ☆ PsyDI: Towards a Personalized and Progressively In-depth Chatbot for + Psychological Measurements + + +
+ In the field of psychology, traditional assessment methods, such as +standardized scales, are frequently critiqued for their static nature, lack of +personalization, and reduced participant engagement, while comprehensive +counseling evaluations are often inaccessible. The complexity of quantifying +psychological traits further limits these methods. Despite advances with large +language models (LLMs), many still depend on single-round Question-and-Answer +interactions. To bridge this gap, we introduce PsyDI, a personalized and +progressively in-depth chatbot designed for psychological measurements, +exemplified by its application in the Myers-Briggs Type Indicator (MBTI) +framework. PsyDI leverages user-related multi-modal information and engages in +customized, multi-turn interactions to provide personalized, easily accessible +measurements, while ensuring precise MBTI type determination. To address the +challenge of unquantifiable psychological traits, we introduce a novel training +paradigm that involves learning the ranking of proxy variables associated with +these traits, culminating in a robust score model for MBTI measurements. The +score model enables PsyDI to conduct comprehensive and precise measurements +through multi-turn interactions within a unified estimation context. Through +various experiments, we validate the efficacy of both the score model and the +PsyDI pipeline, demonstrating its potential to serve as a general framework for +psychological measurements. Furthermore, the online deployment of PsyDI has +garnered substantial user engagement, with over 3,000 visits, resulting in the +collection of numerous multi-turn dialogues annotated with MBTI types, which +facilitates further research. + +
+
+ comment: 29 pages, 15 figures +
+
+
+
+
+ + ♻ ☆ Distributional Reinforcement Learning with Dual Expectile-Quantile + Regression + + +
+ Distributional reinforcement learning (RL) has proven useful in multiple +benchmarks as it enables approximating the full distribution of returns and +makes a better use of environment samples. The commonly used quantile +regression approach to distributional RL -- based on asymmetric $L_1$ losses -- +provides a flexible and effective way of learning arbitrary return +distributions. In practice, it is often improved by using a more efficient, +hybrid asymmetric $L_1$-$L_2$ Huber loss for quantile regression. However, by +doing so, distributional estimation guarantees vanish, and we empirically +observe that the estimated distribution rapidly collapses to its mean. Indeed, +asymmetric $L_2$ losses, corresponding to expectile regression, cannot be +readily used for distributional temporal difference learning. Motivated by the +efficiency of $L_2$-based learning, we propose to jointly learn expectiles and +quantiles of the return distribution in a way that allows efficient learning +while keeping an estimate of the full distribution of returns. We prove that +our approach approximately learns the correct return distribution, and we +benchmark a practical implementation on a toy example and at scale. On the +Atari benchmark, our approach matches the performance of the Huber-based IQN-1 +baseline after $200$M training frames but avoids distributional collapse and +keeps estimates of the full distribution of returns. + +
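+
+ As a rough, illustrative sketch of the two asymmetric losses contrasted in
+this abstract (PyTorch assumed; tensor shapes and the TD-error computation are
+placeholders, not the paper's implementation):
+
+import torch
+
+def quantile_loss(delta: torch.Tensor, tau: torch.Tensor) -> torch.Tensor:
+    # Asymmetric L1 loss: |tau - 1{delta < 0}| * |delta|, averaged over the
+    # batch; `delta` holds TD errors and `tau` the quantile levels in (0, 1).
+    weight = torch.abs(tau - (delta.detach() < 0).float())
+    return (weight * delta.abs()).mean()
+
+def expectile_loss(delta: torch.Tensor, tau: torch.Tensor) -> torch.Tensor:
+    # Asymmetric L2 loss: |tau - 1{delta < 0}| * delta^2, i.e. expectile
+    # regression, the L2-based counterpart discussed above.
+    weight = torch.abs(tau - (delta.detach() < 0).float())
+    return (weight * delta.pow(2)).mean()
+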
+
+
+
+
+ + ♻ ☆ Adversarially robust clustering with optimality guarantees + + +
+ We consider the problem of clustering data points coming from sub-Gaussian +mixtures. Existing methods that provably achieve the optimal mislabeling error, +such as the Lloyd algorithm, are usually vulnerable to outliers. In contrast, +clustering methods seemingly robust to adversarial perturbations are not known +to satisfy the optimal statistical guarantees. We propose a simple robust +algorithm based on the coordinatewise median that obtains the optimal +mislabeling rate even when we allow adversarial outliers to be present. Our +algorithm achieves the optimal error rate in constant iterations when a weak +initialization condition is satisfied. In the absence of outliers, in fixed +dimensions, our theoretical guarantees are similar to that of the Lloyd +algorithm. Extensive experiments on various simulated and public datasets are +conducted to support the theoretical guarantees of our method. + +
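+
+ A minimal sketch of the kind of procedure described above: a Lloyd-style loop
+whose center update uses the coordinatewise median instead of the mean
+(initialization and stopping rule here are simple placeholders):
+
+import numpy as np
+
+def robust_lloyd(X, k, iters=10, seed=0):
+    X = np.asarray(X, dtype=float)
+    rng = np.random.default_rng(seed)
+    centers = X[rng.choice(len(X), size=k, replace=False)]
+    for _ in range(iters):
+        # Assign each point to its nearest center.
+        d2 = ((X[:, None, :] - centers[None, :, :]) ** 2).sum(-1)
+        labels = d2.argmin(axis=1)
+        # Robust update: coordinatewise median of each cluster.
+        for j in range(k):
+            if np.any(labels == j):
+                centers[j] = np.median(X[labels == j], axis=0)
+    return labels, centers
+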
+
+ comment: 38 pages, 9 figures. Updated with remarks, real data analysis, and + typo corrections +
+
+
+
+
+ + ♻ ☆ Reset & Distill: A Recipe for Overcoming Negative Transfer in Continual + Reinforcement Learning + + +
+ We argue that the negative transfer problem that occurs when a new task to
+learn arrives is an important issue that should not be overlooked when
+developing effective Continual Reinforcement Learning (CRL) algorithms. Through
+comprehensive experimental validation, we demonstrate that this issue
+frequently arises in CRL and cannot be effectively addressed by several recent
+works on mitigating the plasticity loss of RL agents. To that end, we develop
+Reset & Distill (R&D), a simple yet highly effective method for overcoming the
+negative transfer problem in CRL. R&D combines a strategy of resetting the
+agent's online actor and critic networks to learn a new task with an offline
+learning step that distills knowledge from the online actor and the previous
+expert's action probabilities. We carried out extensive experiments on long
+sequences of Meta World tasks and show that our method consistently outperforms
+recent baselines, achieving significantly higher success rates across a range
+of tasks. Our findings highlight the importance of considering negative
+transfer in CRL and emphasize the need for robust strategies like R&D to
+mitigate its detrimental effects.
+
</div>
+
+
+
+
+ + ♻ ☆ Improving SMOTE via Fusing Conditional VAE for Data-adaptive Noise + Filtering + + +
+ Recent advances in generative neural network models have extended the
+development of data augmentation methods. However, augmentation methods based
+on modern generative models fail to achieve notable performance on
+class-imbalanced data compared to the conventional method, the Synthetic
+Minority Oversampling Technique (SMOTE). We investigate the problem of using
+generative models for imbalanced classification and introduce a framework that
+enhances the SMOTE algorithm using Variational Autoencoders (VAE). Our approach
+systematically quantifies the density of data points in a low-dimensional
+latent space using the VAE, while simultaneously incorporating information on
+class labels and classification difficulty. Data points that would potentially
+degrade the augmentation are then systematically excluded, and the neighboring
+observations are directly augmented in the data space. Empirical studies on
+several imbalanced datasets show that this simple procedure improves the
+conventional SMOTE algorithm and outperforms the deep learning based models.
+Consequently, we conclude that the selection of minority data and interpolation
+in the data space are beneficial for imbalanced classification problems with a
+relatively small number of data points.
+
</div>
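+
+ A hedged sketch of the two-step idea above: score minority points by density
+in a learned latent space, drop the lowest-density ones, then interpolate
+between remaining neighbours in the original data space, SMOTE-style. The
+`encode` argument stands in for a trained VAE encoder; thresholds and
+neighbourhood sizes are illustrative only.
+
+import numpy as np
+from sklearn.neighbors import NearestNeighbors
+
+def filtered_smote(X_min, encode, keep_frac=0.8, n_samples=100, k=5, seed=0):
+    rng = np.random.default_rng(seed)
+    Z = encode(X_min)                              # latent codes of minority data
+    dist, _ = NearestNeighbors(n_neighbors=k).fit(Z).kneighbors(Z)
+    density = -dist.mean(axis=1)                   # crude latent-density proxy
+    keep = np.argsort(density)[-int(keep_frac * len(X_min)):]
+    X_keep = np.asarray(X_min)[keep]
+    _, idx = NearestNeighbors(n_neighbors=k).fit(X_keep).kneighbors(X_keep)
+    samples = []
+    for _ in range(n_samples):
+        i = rng.integers(len(X_keep))
+        j = idx[i, rng.integers(1, k)]             # a neighbour other than itself
+        lam = rng.random()
+        samples.append(X_keep[i] + lam * (X_keep[j] - X_keep[i]))
+    return np.array(samples)
+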
+
+
+
+
+ + ♻ ☆ Learning to Learn without Forgetting using Attention + + +
+ Continual learning (CL) refers to the ability to continually learn over time +by accommodating new knowledge while retaining previously learned experience. +While this concept is inherent in human learning, current machine learning +methods are highly prone to overwrite previously learned patterns and thus +forget past experience. Instead, model parameters should be updated selectively +and carefully, avoiding unnecessary forgetting while optimally leveraging +previously learned patterns to accelerate future learning. Since hand-crafting +effective update mechanisms is difficult, we propose meta-learning a +transformer-based optimizer to enhance CL. This meta-learned optimizer uses +attention to learn the complex relationships between model parameters across a +stream of tasks, and is designed to generate effective weight updates for the +current task while preventing catastrophic forgetting on previously encountered +tasks. Evaluations on benchmark datasets like SplitMNIST, RotatedMNIST, and +SplitCIFAR-100 affirm the efficacy of the proposed approach in terms of both +forward and backward transfer, even on small sets of labeled data, highlighting +the advantages of integrating a meta-learned optimizer within the continual +learning framework. + +
+
+ comment: Published at the 3rd Conference on Lifelong Learning Agents (CoLLAs), + 2024 +
+
+
+
+
+ + ♻ ☆ Breaking the $T^{2/3}$ Barrier for Sequential Calibration + + +
+ A set of probabilistic forecasts is calibrated if each prediction of the +forecaster closely approximates the empirical distribution of outcomes on the +subset of timesteps where that prediction was made. We study the fundamental +problem of online calibrated forecasting of binary sequences, which was +initially studied by Foster & Vohra (1998). They derived an algorithm with +$O(T^{2/3})$ calibration error after $T$ time steps, and showed a lower bound +of $\Omega(T^{1/2})$. These bounds remained stagnant for two decades, until +Qiao & Valiant (2021) improved the lower bound to $\Omega(T^{0.528})$ by +introducing a combinatorial game called sign preservation and showing that +lower bounds for this game imply lower bounds for calibration. + In this paper, we give the first improvement to the $O(T^{2/3})$ upper bound +on calibration error of Foster & Vohra. We do this by introducing a variant of +Qiao & Valiant's game that we call sign preservation with reuse (SPR). We prove +that the relationship between SPR and calibrated forecasting is bidirectional: +not only do lower bounds for SPR translate into lower bounds for calibration, +but algorithms for SPR also translate into new algorithms for calibrated +forecasting. We then give an improved \emph{upper bound} for the SPR game, +which implies, via our equivalence, a forecasting algorithm with calibration +error $O(T^{2/3 - \varepsilon})$ for some $\varepsilon > 0$, improving Foster & +Vohra's upper bound for the first time. Using similar ideas, we then prove a +slightly stronger lower bound than that of Qiao & Valiant, namely +$\Omega(T^{0.54389})$. Our lower bound is obtained by an oblivious adversary, +marking the first $\omega(T^{1/2})$ calibration lower bound for oblivious +adversaries. + +
+
+
+
+
+ + ♻ ☆ Wilsonian Renormalization of Neural Network Gaussian Processes + + +
+ Separating relevant and irrelevant information is key to any modeling process +or scientific inquiry. Theoretical physics offers a powerful tool for achieving +this in the form of the renormalization group (RG). Here we demonstrate a +practical approach to performing Wilsonian RG in the context of Gaussian +Process (GP) Regression. We systematically integrate out the unlearnable modes +of the GP kernel, thereby obtaining an RG flow of the GP in which the data sets +the IR scale. In simple cases, this results in a universal flow of the ridge +parameter, which becomes input-dependent in the richer scenario in which +non-Gaussianities are included. In addition to being analytically tractable, +this approach goes beyond structural analogies between RG and neural networks +by providing a natural connection between RG flow and learnable vs. unlearnable +modes. Studying such flows may improve our understanding of feature learning in +deep neural networks, and enable us to identify potential universality classes +in these models. + +
+
+ comment: 17 pages, 1 figure; rewrote introduction, added references, section + IIIA, section IVA, and appendix C +
+
+
+
+
+ + ♻ ☆ Online Uniform Allocation:Randomized Learning-Augmented Approximation + Algorithms with Application to Digital Health + + +
+ Motivated by applications in digital health, this work studies the novel +problem of online uniform allocation (OUA), where the goal is to distribute a +budget uniformly across unknown decision times. In the OUA problem, the +algorithm is given a budget $b$ and a time horizon $T$, and an adversary then +chooses a value $\tau^* \in [b,T]$, which is revealed to the algorithm online. +At each decision time $i \in [\tau^*]$, the algorithm must determine a +probability that maximizes the budget spent throughout the horizon, respecting +budget constraint $b$, while achieving as uniform a distribution as possible +over $\tau^*$. We present the first randomized algorithm designed for this +problem and subsequently extend it to incorporate learning augmentation. We +provide worst-case approximation guarantees for both algorithms, and illustrate +the utility of the algorithms through both synthetic experiments and a +real-world case study involving the HeartSteps mobile application. Our +numerical results show strong empirical average performance of our proposed +randomized algorithms against previously proposed heuristic solutions. + +
+
+
+
+
+ + ♻ ☆ Learning Payment-Free Resource Allocation Mechanisms + + +
+ We consider the design of mechanisms that allocate limited resources among
+self-interested agents using neural networks. Unlike recent works that leverage
+machine learning for revenue maximization in auctions, we consider welfare
+maximization as the key objective in the payment-free setting. Without payment
+exchange, it is unclear how we can align agents' incentives to achieve the
+desired objectives of truthfulness and social welfare simultaneously, without
+resorting to approximations. Our work makes novel contributions by designing an
+approximate mechanism that desirably trades off social welfare with
+truthfulness. Specifically, (i) we contribute a new end-to-end neural network
+architecture, ExS-Net, that accommodates the idea of "money-burning" for
+mechanism design without payments; (ii) we provide a generalization bound that
+guarantees the mechanism's performance when trained on finite samples; and
+(iii) we provide an experimental demonstration of the merits of the proposed
+mechanism.
+
</div>
+
+
+
+
+ + ♻ ☆ On a Scale-Invariant Approach to Bundle Recommendations in Candy Crush + Saga + + +
+ A good understanding of player preferences is crucial for increasing content +relevancy, especially in mobile games. This paper illustrates the use of +attentive models for producing item recommendations in a mobile game scenario. +The methodology comprises a combination of supervised and unsupervised +approaches to create user-level recommendations while introducing a novel +scale-invariant approach to the prediction. The methodology is subsequently +applied to a bundle recommendation in Candy Crush Saga. The strategy of +deployment, maintenance, and monitoring of ML models that are scaled up to +serve millions of users is presented, along with the best practices and design +patterns adopted to minimize technical debt typical of ML systems. The +recommendation approach is evaluated both offline and online, with a focus on +understanding the increase in engagement, click- and take rates, novelty +effects, recommendation diversity, and the impact of degenerate feedback loops. +We have demonstrated that the recommendation enhances user engagement by 30% +concerning click rate and by more than 40% concerning take rate. In addition, +we empirically quantify the diminishing effects of recommendation accuracy on +user engagement. + +
+
+
+
+
+ + ♻ ☆ Automatic Feature Recognition and Dimensional Attributes Extraction From + CAD Models for Hybrid Additive-Subtractive Manufacturing + + +
+ The integration of Computer-Aided Design (CAD), Computer-Aided Process +Planning (CAPP), and Computer-Aided Manufacturing (CAM) plays a crucial role in +modern manufacturing, facilitating seamless transitions from digital designs to +physical products. However, a significant challenge within this integration is +the Automatic Feature Recognition (AFR) of CAD models, especially in the +context of hybrid manufacturing that combines subtractive and additive +manufacturing processes. Traditional AFR methods, focused mainly on the +identification of subtractive (machined) features including holes, fillets, +chamfers, pockets, and slots, fail to recognize features pertinent to additive +manufacturing. Furthermore, the traditional methods fall short in accurately +extracting geometric dimensions and orientations, which are also key factors +for effective manufacturing process planning. This paper presents a novel +approach for creating a synthetic CAD dataset that encompasses features +relevant to both additive and subtractive machining through Python Open +Cascade. The Hierarchical Graph Convolutional Neural Network (HGCNN) model is +implemented to accurately identify the composite additive-subtractive features +within the synthetic CAD dataset. The key novelty and contribution of the +proposed methodology lie in its ability to recognize a wide range of +manufacturing features, and precisely extracting their dimensions, +orientations, and stock sizes. The proposed model demonstrates remarkable +feature recognition accuracy exceeding 97% and a dimension extraction accuracy +of 100% for identified features. Therefore, the proposed methodology enhances +the integration of CAD, CAPP, and CAM within hybrid manufacturing by providing +precise feature recognition and dimension extraction. It facilitates improved +manufacturing process planning, by enabling more informed decision-making. + +
+
+ comment: 10 pages, 12 figures. This paper has been accepted for presentation + at the ASME IDETC-CIE 2024 conference +
+
+
+
+
+ + ♻ ☆ Research on Autonomous Robots Navigation based on Reinforcement Learning + + +
+ Reinforcement learning continuously optimizes decision-making based on
+real-time feedback reward signals through continuous interaction with the
+environment, demonstrating strong adaptive and self-learning capabilities. In
+recent years, it has become one of the key methods for achieving autonomous
+robot navigation. In this work, an autonomous robot navigation method based on
+reinforcement learning is introduced. We use the Deep Q Network (DQN) and
+Proximal Policy Optimization (PPO) models to optimize path planning and
+decision-making through continuous interaction between the robot and the
+environment, guided by real-time feedback reward signals. By combining the
+Q-value function with a deep neural network, the deep Q network can handle
+high-dimensional state spaces and thereby realize path planning in complex
+environments. Proximal policy optimization is a policy gradient-based method
+that enables robots to explore and exploit environmental information more
+efficiently by optimizing the policy function. These methods not only improve
+the robot's navigation ability in unknown environments, but also enhance its
+adaptive and self-learning capabilities. Through multiple training and
+simulation experiments, we have verified the effectiveness and robustness of
+these models in various complex scenarios.
+
</div>
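+
+ For concreteness, a minimal Q-network and epsilon-greedy action selection of
+the kind such a DQN-based navigation agent would use (layer sizes and the
+state/action dimensions are placeholders, not values from the paper):
+
+import torch
+import torch.nn as nn
+
+class QNetwork(nn.Module):
+    def __init__(self, state_dim, n_actions, hidden=128):
+        super().__init__()
+        self.net = nn.Sequential(
+            nn.Linear(state_dim, hidden), nn.ReLU(),
+            nn.Linear(hidden, hidden), nn.ReLU(),
+            nn.Linear(hidden, n_actions))
+
+    def forward(self, state):
+        return self.net(state)          # Q-value for each discrete action
+
+def epsilon_greedy(q_net, state, epsilon, n_actions):
+    if torch.rand(()) < epsilon:        # explore
+        return int(torch.randint(n_actions, ()))
+    with torch.no_grad():               # exploit the learned Q-values
+        return int(q_net(state.unsqueeze(0)).argmax(dim=1))
+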
+
+
+
+
+ + ♻ ☆ Community Detection Guarantees Using Embeddings Learned by Node2Vec + + +
+ Embedding the nodes of a large network into a Euclidean space is a common
+objective in modern machine learning, with a variety of tools available. These
+embeddings can then be used as features for tasks such as community
+detection/node clustering or link prediction, where they achieve
+state-of-the-art performance. With the exception of spectral clustering
+methods, there is little theoretical understanding of commonly used approaches
+to learning embeddings. In this work we examine the theoretical properties of
+the embeddings learned by node2vec. Our main result shows that the use of
+$k$-means clustering on the embedding vectors produced by node2vec gives weakly
+consistent community recovery for the nodes in (degree corrected) stochastic
+block models. We also discuss the use of these embeddings for node and link
+prediction tasks. We demonstrate this result empirically, and examine how it
+relates to other embedding tools for network data.
+
</div>
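+
+ The pipeline behind this result can be sketched as follows; the embeddings
+are assumed to come from any node2vec implementation (the node-to-vector
+mapping is passed in, so no particular library API is implied):
+
+import numpy as np
+from sklearn.cluster import KMeans
+
+def communities_from_embeddings(nodes, embeddings, k):
+    # `embeddings` maps each node to its node2vec vector.
+    Z = np.stack([embeddings[v] for v in nodes])
+    labels = KMeans(n_clusters=k, n_init=10, random_state=0).fit_predict(Z)
+    return dict(zip(nodes, labels))     # node -> estimated community
+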
+
+
+
+
+ + ♻ ☆ Graph Agent Network: Empowering Nodes with Inference Capabilities for + Adversarial Resilience + + +
+ End-to-end training with global optimization has popularized graph neural
+networks (GNNs) for node classification, yet it has inadvertently introduced
+vulnerabilities to adversarial edge-perturbing attacks. Adversaries can exploit
+the inherently open interfaces of GNNs' inputs and outputs, perturbing critical
+edges and thus manipulating the classification results. Current defenses, due
+to their persistent utilization of global-optimization-based end-to-end
+training schemes, inherently encapsulate the vulnerabilities of GNNs. This is
+specifically evidenced by their inability to defend against targeted secondary
+attacks. In this paper, we propose the Graph Agent Network (GAgN) to address
+the aforementioned vulnerabilities of GNNs. GAgN is a graph-structured agent
+network in which each node is designed as a 1-hop-view agent. Through
+decentralized interactions between agents, they learn to infer global
+perceptions and to perform tasks including inferring embeddings, degrees and
+neighbor relationships for given nodes. This empowers nodes to filter
+adversarial edges while carrying out classification tasks. Furthermore, the
+agents' limited view prevents malicious messages from propagating globally in
+GAgN, thereby resisting global-optimization-based secondary attacks. We prove
+that single-hidden-layer multilayer perceptrons (MLPs) are theoretically
+sufficient to achieve these functionalities. Experimental results show that
+GAgN effectively implements all its intended capabilities and, compared to
+state-of-the-art defenses, achieves optimal classification accuracy on the
+perturbed datasets.
+
</div>
+
+
+
+
+ + ♻ ☆ A Semantic Space is Worth 256 Language Descriptions: Make Stronger + Segmentation Models with Descriptive Properties + + +
+ This paper introduces ProLab, a novel approach using property-level label +space for creating strong interpretable segmentation models. Instead of relying +solely on category-specific annotations, ProLab uses descriptive properties +grounded in common sense knowledge for supervising segmentation models. It is +based on two core designs. First, we employ Large Language Models (LLMs) and +carefully crafted prompts to generate descriptions of all involved categories +that carry meaningful common sense knowledge and follow a structured format. +Second, we introduce a description embedding model preserving semantic +correlation across descriptions and then cluster them into a set of descriptive +properties (e.g., 256) using K-Means. These properties are based on +interpretable common sense knowledge consistent with theories of human +recognition. We empirically show that our approach makes segmentation models +perform stronger on five classic benchmarks (e.g., ADE20K, COCO-Stuff, Pascal +Context, Cityscapes, and BDD). Our method also shows better scalability with +extended training steps than category-level supervision. Our interpretable +segmentation framework also emerges with the generalization ability to segment +out-of-domain or unknown categories using only in-domain descriptive +properties. Code is available at https://github.com/lambert-x/ProLab. + +
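+
+ The second design above reduces, at its core, to clustering description
+embeddings into a fixed number of properties; a hedged sketch (the description
+embedding model itself is abstracted away and not part of the paper's API):
+
+import numpy as np
+from sklearn.cluster import KMeans
+
+def build_property_space(description_embeddings, n_properties=256):
+    # description_embeddings: [num_descriptions, dim] array of embedded
+    # LLM-generated category descriptions.
+    km = KMeans(n_clusters=n_properties, n_init=10, random_state=0)
+    property_ids = km.fit_predict(description_embeddings)
+    return property_ids, km.cluster_centers_   # property-level label space
+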
+
+ comment: Preprint. Code is available at https://github.com/lambert-x/ProLab +
+
+
+
+
+ + ♻ ☆ Perseus: Reducing Energy Bloat in Large Model Training SOSP 24 + + +
+ Training large AI models on numerous GPUs consumes a massive amount of +energy, making power delivery one of the largest limiting factors in building +and operating datacenters for AI workloads. However, we observe that not all +energy consumed during training directly contributes to end-to-end throughput, +and a significant portion can be removed without slowing down training, which +we call energy bloat. In this work, we identify two independent sources of +energy bloat in large model training and propose Perseus, a training system +that mitigates both. To do this, Perseus obtains the "iteration time-energy" +Pareto frontier of any large model training job using an efficient graph +cut-based algorithm and schedules the energy consumption of computations across +time to remove both types of energy bloat. Evaluation on large models including +GPT-3 and Bloom shows that Perseus reduces the energy consumption of large +model training by up to 30% without any throughput loss or hardware +modification, enabling energy reduction -- and therefore cost savings -- +otherwise unattainable before. + +
+
+ comment: SOSP 24 | Open-source part of Zeus at + https://ml.energy/zeus/research_overview/perseus/ +
+
+
+
+
+ + ♻ ☆ SymbolNet: Neural Symbolic Regression with Adaptive Dynamic Pruning + + +
+ Contrary to genetic programming, the neural network approach to symbolic +regression can efficiently handle high-dimensional inputs and leverage gradient +methods for faster equation searching. Common ways of constraining expression +complexity often involve multistage pruning with fine-tuning, which can result +in significant performance loss. In this work, we propose $\tt{SymbolNet}$, a +neural network approach to symbolic regression in a novel framework that allows +dynamic pruning of model weights, input features, and mathematical operators in +a single training process, where both training loss and expression complexity +are optimized simultaneously. We introduce a sparsity regularization term for +each pruning type, which can adaptively adjust its strength, leading to +convergence at a target sparsity ratio. Unlike most existing symbolic +regression methods that struggle with datasets containing more than +$\mathcal{O}(10)$ inputs, we demonstrate the effectiveness of our model on the +LHC jet tagging task (16 inputs), MNIST (784 inputs), and SVHN (3072 inputs). +Our approach enables symbolic regression to achieve fast inference with +nanosecond-scale latency on FPGAs for high-dimensional datasets in environments +with stringent computational resource constraints, such as the high-energy +physics experiments at the LHC. + +
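+
+ One way to read the adaptive sparsity regularization described above is as an
+L1-style penalty whose strength is adjusted toward a target sparsity ratio; the
+sketch below illustrates that mechanism only and is not the paper's exact rule:
+
+import torch
+
+def adaptive_l1(params, current_sparsity, target_sparsity, strength, rate=1e-3):
+    # Raise the penalty while the model is still too dense, relax it once the
+    # target sparsity ratio has been reached or exceeded.
+    strength = max(0.0, strength + rate * (target_sparsity - current_sparsity))
+    penalty = strength * sum(p.abs().sum() for p in params)
+    return penalty, strength
+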
+
+ comment: 24 pages. Minor fixes and formatting, under review +
+
+
+
+
+ + ♻ ☆ AI-guided inverse design and discovery of recyclable vitrimeric polymers + + +
+ Vitrimer is a new, exciting class of sustainable polymers with the ability to +heal due to their dynamic covalent adaptive network that can go through +associative rearrangement reactions. However, a limited choice of constituent +molecules restricts their property space, prohibiting full realization of their +potential applications. To overcome this challenge, we couple molecular +dynamics (MD) simulations and a novel graph variational autoencoder (VAE) +machine learning model for inverse design of vitrimer chemistries with desired +glass transition temperature (Tg) and synthesize a novel vitrimer polymer. We +build the first vitrimer dataset of one million chemistries and calculate Tg on +8,424 of them by high-throughput MD simulations calibrated by a Gaussian +process model. The proposed novel VAE employs dual graph encoders and a latent +dimension overlapping scheme which allows for individual representation of +multi-component vitrimers. By constructing a continuous latent space containing +necessary information of vitrimers, we demonstrate high accuracy and efficiency +of our framework in discovering novel vitrimers with desirable Tg beyond the +training regime. To validate the effectiveness of our framework in experiments, +we generate novel vitrimer chemistries with a target Tg = 323 K. By +incorporating chemical intuition, we synthesize a vitrimer with Tg of 311-317 +K, and experimentally demonstrate healability and flowability. The proposed +framework offers an exciting tool for polymer chemists to design and synthesize +novel, sustainable vitrimer polymers for a facet of applications. + +
+
+
+
+
+ + ♻ ☆ Self-Supervised Scalable Deep Compressed Sensing + + +
+ Compressed sensing (CS) is a promising tool for reducing sampling costs. +Current deep neural network (NN)-based CS methods face the challenges of +collecting labeled measurement-ground truth (GT) data and generalizing to real +applications. This paper proposes a novel $\mathbf{S}$elf-supervised +s$\mathbf{C}$alable deep CS method, comprising a deep $\mathbf{L}$earning +scheme called $\mathbf{SCL}$ and a family of $\mathbf{Net}$works named +$\mathbf{SCNet}$, which does not require GT and can handle arbitrary sampling +ratios and matrices once trained on a partial measurement set. Our SCL contains +a dual-domain loss and a four-stage recovery strategy. The former encourages a +cross-consistency on two measurement parts and a sampling-reconstruction +cycle-consistency regarding arbitrary ratios and matrices to maximize +data/information utilization. The latter can progressively leverage common +signal prior in external measurements and internal characteristics of test +samples and learned NNs to improve accuracy. SCNet combines both the explicit +guidance from optimization algorithms with implicit regularization from +advanced NN blocks to learn a collaborative signal representation. Our +theoretical analyses and experiments on simulated and real captured data, +covering 1-/2-/3-D natural and scientific signals, demonstrate the +effectiveness, superior performance, flexibility, and generalization ability of +our method over existing self-supervised methods and its significant potential +in competing against state-of-the-art supervised methods. Code is available at +https://github.com/Guaishou74851/SCNet. + +
+
+ comment: Accepted by International Journal of Computer Vision
+
</div>
+
+
+
+
+ + ♻ ☆ V-STaR: Training Verifiers for Self-Taught Reasoners + + +
+ Common self-improvement approaches for large language models (LLMs), such as +STaR, iteratively fine-tune LLMs on self-generated solutions to improve their +problem-solving ability. However, these approaches discard the large amounts of +incorrect solutions generated during this process, potentially neglecting +valuable information in such solutions. To address this shortcoming, we propose +V-STaR that utilizes both the correct and incorrect solutions generated during +the self-improvement process to train a verifier using DPO that judges +correctness of model-generated solutions. This verifier is used at inference +time to select one solution among many candidate solutions. Running V-STaR for +multiple iterations results in progressively better reasoners and verifiers, +delivering a 4% to 17% test accuracy improvement over existing self-improvement +and verification approaches on common code generation and math reasoning +benchmarks with LLaMA2 models. + +
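+
+ At inference time the verifier is used to pick one solution among many
+candidates; a minimal sketch with the generator and verifier left as opaque
+callables (their interfaces here are assumptions, not the released API):
+
+def best_of_n(problem, generate, verifier_score, n=16):
+    candidates = [generate(problem) for _ in range(n)]
+    scores = [verifier_score(problem, c) for c in candidates]
+    best = max(range(n), key=lambda i: scores[i])
+    return candidates[best]            # highest-scoring candidate solution
+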
+
+
+
+
+ + ♻ ☆ BiEquiFormer: Bi-Equivariant Representations for Global Point Cloud + Registration + + +
+ The goal of this paper is to address the problem of global point cloud +registration (PCR) i.e., finding the optimal alignment between point clouds +irrespective of the initial poses of the scans. This problem is notoriously +challenging for classical optimization methods due to computational +constraints. First, we show that state-of-the-art deep learning methods suffer +from huge performance degradation when the point clouds are arbitrarily placed +in space. We propose that equivariant deep learning should be utilized for +solving this task and we characterize the specific type of bi-equivariance of +PCR. Then, we design BiEquiformer a novel and scalable bi-equivariant pipeline +i.e. equivariant to the independent transformations of the input point clouds. +While a naive approach would process the point clouds independently we design +expressive bi-equivariant layers that fuse the information from both point +clouds. This allows us to extract high-quality superpoint correspondences and +in turn, robust point-cloud registration. Extensive comparisons against +state-of-the-art methods show that our method achieves comparable performance +in the canonical setting and superior performance in the robust setting in both +the 3DMatch and the challenging low-overlap 3DLoMatch dataset. + +
+
+
+
+
+ + ♻ ☆ Efficient Retrieval with Learned Similarities + + +
+ Retrieval plays a fundamental role in recommendation systems, search, and +natural language processing by efficiently finding relevant items from a large +corpus given a query. Dot products have been widely used as the similarity +function in such retrieval tasks, thanks to Maximum Inner Product Search (MIPS) +that enabled efficient retrieval based on dot products. However, +state-of-the-art retrieval algorithms have migrated to learned similarities. +Such algorithms vary in form; the queries can be represented with multiple +embeddings, complex neural networks can be deployed, the item ids can be +decoded directly from queries using beam search, and multiple approaches can be +combined in hybrid solutions. Unfortunately, we lack efficient solutions for +retrieval in these state-of-the-art setups. Our work investigates techniques +for approximate nearest neighbor search with learned similarity functions. We +first prove that Mixture-of-Logits (MoL) is a universal approximator, and can +express all learned similarity functions. We next propose techniques to +retrieve the approximate top K results using MoL with a tight bound. We finally +compare our techniques with existing approaches, showing that MoL sets new +state-of-the-art results on recommendation retrieval tasks, and our approximate +top-k retrieval with learned similarities outperforms baselines by up to two +orders of magnitude in latency, while achieving > .99 recall rate of exact +algorithms. + +
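+
+ A hedged sketch of a Mixture-of-Logits style similarity, in which several
+component dot products are combined with adaptive softmax gates; the exact
+parameterization in the paper may differ, this only illustrates the functional
+form being referred to:
+
+import torch
+import torch.nn.functional as F
+
+def mol_similarity(q_comp, x_comp, gate_logits):
+    # q_comp, x_comp: [batch, P, d] component embeddings; gate_logits: [batch, P].
+    component_logits = (q_comp * x_comp).sum(-1)   # per-component dot products
+    gates = F.softmax(gate_logits, dim=-1)         # adaptive mixture weights
+    return (gates * component_logits).sum(-1)      # learned similarity score
+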
+
+
+
+
+ + ♻ ☆ Non-Determinism and the Lawlessness of Machine Learning Code + + +
+ Legal literature on machine learning (ML) tends to focus on harms, and thus +tends to reason about individual model outcomes and summary error rates. This +focus has masked important aspects of ML that are rooted in its reliance on +randomness -- namely, stochasticity and non-determinism. While some recent work +has begun to reason about the relationship between stochasticity and +arbitrariness in legal contexts, the role of non-determinism more broadly +remains unexamined. In this paper, we clarify the overlap and differences +between these two concepts, and show that the effects of non-determinism, and +consequently its implications for the law, become clearer from the perspective +of reasoning about ML outputs as distributions over possible outcomes. This +distributional viewpoint accounts for randomness by emphasizing the possible +outcomes of ML. Importantly, this type of reasoning is not exclusive with +current legal reasoning; it complements (and in fact can strengthen) analyses +concerning individual, concrete outcomes for specific automated decisions. By +illuminating the important role of non-determinism, we demonstrate that ML code +falls outside of the cyberlaw frame of treating ``code as law,'' as this frame +assumes that code is deterministic. We conclude with a brief discussion of what +work ML can do to constrain the potentially harm-inducing effects of +non-determinism, and we indicate where the law must do work to bridge the gap +between its current individual-outcome focus and the distributional approach +that we recommend. + +
+
+ comment: Proceedings of the 2022 Symposium on Computer Science and Law (CSLAW + '22) +
+
+
+
+
+ + ♻ ☆ Universality of periodic points in bounded discrete time series + + +
+ We consider arbitrary bounded discrete time series originating from a
+dynamical system. Without any use of the Fourier transform, we find periodic
+points that suitably characterize (i.e., independently of the Lyapunov
+exponent) the corresponding time series. In particular, a bounded discrete time
+series generated by the autoregressive model (without the white noise) is
+equivalent to a quasi-periodic function.
+
</div>
+
+
+
+
+ + ♻ ☆ Depth Degeneracy in Neural Networks: Vanishing Angles in Fully Connected + ReLU Networks on Initialization + + +
+ Despite remarkable performance on a variety of tasks, many properties of deep
+neural networks are not yet theoretically understood. One such mystery is the
+depth degeneracy phenomenon: the deeper you make your network, the closer your
+network is to a constant function on initialization. In this paper, we examine
+the evolution of the angle between two inputs to a ReLU neural network as a
+function of the number of layers. By using combinatorial expansions, we find
+precise formulas for how fast this angle goes to zero as depth increases. These
+formulas capture microscopic fluctuations that are not visible in the popular
+framework of infinite width limits, and lead to qualitatively different
+predictions. We validate our theoretical results with Monte Carlo experiments
+and show that our results accurately approximate finite network behaviour. We
+also empirically investigate how the depth degeneracy phenomenon can negatively
+impact training of real networks. The formulas are given in terms of the mixed
+moments of correlated Gaussians passed through the ReLU function. We also find
+a surprising combinatorial connection between these mixed moments and the
+Bessel numbers that allows us to explicitly evaluate these moments.
+
</div>
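+
+ The Monte Carlo experiments mentioned above can be reproduced in miniature:
+push two fixed inputs through a randomly initialized fully connected ReLU
+network and track the angle between their hidden representations as depth
+grows (the width, depth, and He-style initialization below are illustrative
+choices, not the paper's exact setup):
+
+import numpy as np
+
+def angle(u, v):
+    c = u @ v / (np.linalg.norm(u) * np.linalg.norm(v))
+    return np.arccos(np.clip(c, -1.0, 1.0))
+
+def angles_by_depth(x1, x2, width=512, depth=50, seed=0):
+    rng = np.random.default_rng(seed)
+    h1, h2, angles = np.asarray(x1, float), np.asarray(x2, float), []
+    for _ in range(depth):
+        W = rng.normal(0.0, np.sqrt(2.0 / len(h1)), size=(width, len(h1)))
+        h1, h2 = np.maximum(W @ h1, 0.0), np.maximum(W @ h2, 0.0)
+        angles.append(angle(h1, h2))    # typically decays toward 0 with depth
+    return angles
+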
+
+ comment: Minor updates and exposition improved. Added a section with more + numerical experiments. 45 pages, comments welcome. To appear in Journal of + Machine Learning research +
+
+
+
+
+ + ♻ ☆ The Model Openness Framework: Promoting Completeness and Openness for + Reproducibility, Transparency, and Usability in Artificial Intelligence + + +
+ Generative AI (GAI) offers unprecedented opportunities for research and +innovation, but its commercialization has raised concerns about transparency, +reproducibility, and safety. Many open GAI models lack the necessary components +for full understanding and reproducibility, and some use restrictive licenses +whilst claiming to be ``open-source''. To address these concerns, we propose +the Model Openness Framework (MOF), a ranked classification system that rates +machine learning models based on their completeness and openness, following +principles of open science, open source, open data, and open access. The MOF +requires specific components of the model development lifecycle to be included +and released under appropriate open licenses. This framework aims to prevent +misrepresentation of models claiming to be open, guide researchers and +developers in providing all model components under permissive licenses, and +help individuals and organizations identify models that can be safely adopted +without restrictions. By promoting transparency and reproducibility, the MOF +combats ``openwashing'' practices and establishes completeness and openness as +primary criteria alongside the core tenets of responsible AI. Wide adoption of +the MOF will foster a more open AI ecosystem, benefiting research, innovation, +and adoption of state-of-the-art models. + +
+
+ comment: 28 pages +
+
+
+
+
+ + ♻ ☆ Optimal Scalarizations for Sublinear Hypervolume Regret + + +
+ Scalarization is a general, parallelizable technique that can be deployed in
+any multiobjective setting to reduce multiple objectives into one, yet some
+have dismissed this versatile approach because linear scalarizations cannot
+explore concave regions of the Pareto frontier. To that end, we aim to find
+simple non-linear scalarizations that provably explore a diverse set of $k$
+objectives on the Pareto frontier, as measured by the dominated hypervolume. We
+show that hypervolume scalarizations with uniformly random weights achieve an
+optimal sublinear hypervolume regret bound of $O(T^{-1/k})$, with matching
+lower bounds that preclude any algorithm from doing better asymptotically. For
+the setting of multiobjective stochastic linear bandits, we utilize properties
+of hypervolume scalarizations to derive a novel non-Euclidean analysis that
+yields regret bounds of $\tilde{O}( d T^{-1/2} + T^{-1/k})$, removing
+unnecessary $\text{poly}(k)$ dependencies. We support our theory with the
+strong empirical performance of non-linear scalarizations, which outperform
+both their linear counterparts and other standard multiobjective algorithms in
+a variety of natural settings.
+
</div>
+
+ comment: NeurIPS 2024
+
</div>
+
+
+
+
+ + ♻ ☆ A Survey of Meta-Reinforcement Learning + + +
+ While deep reinforcement learning (RL) has fueled multiple high-profile +successes in machine learning, it is held back from more widespread adoption by +its often poor data efficiency and the limited generality of the policies it +produces. A promising approach for alleviating these limitations is to cast the +development of better RL algorithms as a machine learning problem itself in a +process called meta-RL. Meta-RL is most commonly studied in a problem setting +where, given a distribution of tasks, the goal is to learn a policy that is +capable of adapting to any new task from the task distribution with as little +data as possible. In this survey, we describe the meta-RL problem setting in +detail as well as its major variations. We discuss how, at a high level, +meta-RL research can be clustered based on the presence of a task distribution +and the learning budget available for each individual task. Using these +clusters, we then survey meta-RL algorithms and applications. We conclude by +presenting the open problems on the path to making meta-RL part of the standard +toolbox for a deep RL practitioner. + +
+
+
+
+
+ + ♻ ☆ CoTFormer: A Chain-of-Thought Driven Architecture with Budget-Adaptive + Computation Cost at Inference + + +
+ Scaling language models to larger and deeper sizes has led to significant +boosts in performance. Even though the size of these models limits their +application in compute-constrained environments, the race to continually +develop ever larger and deeper foundational models is underway. At the same +time -- regardless of the model size -- task-specific techniques continue to +play a pivotal role in achieving optimal downstream performance. One of these +techniques, called Chain-of-Thought (CoT), is particularly interesting since, +as we point out in this work, it resembles employing a deeper transformer +through re-applying the model multiple times. However, a key subtlety in +computing the attention of past tokens differentiates CoT from simply applying +the model several times. Based on this insight, we propose CoTFormer, a novel +architecture which closely mimics CoT at the token level, allowing us to obtain +significantly improved accuracies close to much larger models. While applying +CoT introduces additional computation costs, we compensate for it by leveraging +CoTFormer's special compatibility with token-wise variable depth. Through a +compute adaptive model -- which automatically allocates the compute to tokens +that need it most -- we show that it is possible to reduce the computation cost +significantly without any reduction in accuracy, and with further compute cost +reductions possible while maintaining a competitive accuracy. + +
+
+
+
+
+ + ♻ ☆ Safe Imitation Learning of Nonlinear Model Predictive Control for + Flexible Robots IROS 2024 + + +
+ Flexible robots may overcome some of the industry's major challenges, such as +enabling intrinsically safe human-robot collaboration and achieving a higher +payload-to-mass ratio. However, controlling flexible robots is complicated due +to their complex dynamics, which include oscillatory behavior and a +high-dimensional state space. Nonlinear model predictive control (NMPC) offers +an effective means to control such robots, but its significant computational +demand often limits its application in real-time scenarios. To enable fast +control of flexible robots, we propose a framework for a safe approximation of +NMPC using imitation learning and a predictive safety filter. Our framework +significantly reduces computation time while incurring a slight loss in +performance. Compared to NMPC, our framework shows more than an eightfold +improvement in computation time when controlling a three-dimensional flexible +robot arm in simulation, all while guaranteeing safety constraints. Notably, +our approach outperforms state-of-the-art reinforcement learning methods. The +development of fast and safe approximate NMPC holds the potential to accelerate +the adoption of flexible robots in industry. The project code is available at: +tinyurl.com/anmpc4fr + +
+
+ comment: Accepted to IROS 2024 +
+
+
+
+
+ + ♻ ☆ Trained Random Forests Completely Reveal your Dataset + + +
+ We introduce an optimization-based reconstruction attack capable of +completely or near-completely reconstructing a dataset utilized for training a +random forest. Notably, our approach relies solely on information readily +available in commonly used libraries such as scikit-learn. To achieve this, we +formulate the reconstruction problem as a combinatorial problem under a maximum +likelihood objective. We demonstrate that this problem is NP-hard, though +solvable at scale using constraint programming -- an approach rooted in +constraint propagation and solution-domain reduction. Through an extensive +computational investigation, we demonstrate that random forests trained without +bootstrap aggregation but with feature randomization are susceptible to a +complete reconstruction. This holds true even with a small number of trees. +Even with bootstrap aggregation, the majority of the data can also be +reconstructed. These findings underscore a critical vulnerability inherent in +widely adopted ensemble methods, warranting attention and mitigation. Although +the potential for such reconstruction attacks has been discussed in privacy +research, our study provides clear empirical evidence of their practicability. + +
+
+
+
+
+ + ♻ ☆ Bottleneck Structure in Learned Features: Low-Dimension vs Regularity + Tradeoff + + +
+ Previous work has shown that DNNs with large depth $L$ and +$L_{2}$-regularization are biased towards learning low-dimensional +representations of the inputs, which can be interpreted as minimizing a notion +of rank $R^{(0)}(f)$ of the learned function $f$, conjectured to be the +Bottleneck rank. We compute finite depth corrections to this result, revealing +a measure $R^{(1)}$ of regularity which bounds the pseudo-determinant of the +Jacobian $\left|Jf(x)\right|_{+}$ and is subadditive under composition and +addition. This formalizes a balance between learning low-dimensional +representations and minimizing complexity/irregularity in the feature maps, +allowing the network to learn the `right' inner dimension. Finally, we prove +the conjectured bottleneck structure in the learned features as $L\to\infty$: +for large depths, almost all hidden representations are approximately +$R^{(0)}(f)$-dimensional, and almost all weight matrices $W_{\ell}$ have +$R^{(0)}(f)$ singular values close to 1 while the others are +$O(L^{-\frac{1}{2}})$. Interestingly, the use of large learning rates is +required to guarantee an order $O(L)$ NTK which in turns guarantees infinite +depth convergence of the representations of almost all layers. + +
+
+
+
+
+ + ♻ ☆ Can LLMs Replace Economic Choice Prediction Labs? The Case of + Language-based Persuasion Games + + +
+ Human choice prediction in economic contexts is crucial for applications in +marketing, finance, public policy, and more. This task, however, is often +constrained by the difficulties in acquiring human choice data. With most +experimental economics studies focusing on simple choice settings, the AI +community has explored whether LLMs can substitute for humans in these +predictions and examined more complex experimental economics settings. However, +a key question remains: can LLMs generate training data for human choice +prediction? We explore this in language-based persuasion games, a complex +economic setting involving natural language in strategic interactions. Our +experiments show that models trained on LLM-generated data can effectively +predict human behavior in these games and even outperform models trained on +actual human data. + +
+
+
+
+
+ + ♻ ☆ Just Cluster It: An Approach for Exploration in High-Dimensions using + Clustering and Pre-Trained Representations ICML + + +
+ In this paper we adopt a representation-centric perspective on exploration in
+reinforcement learning, viewing exploration fundamentally as a density
+estimation problem. We investigate the effectiveness of clustering
+representations for exploration in 3-D environments, based on the observation
+that the importance of pixel changes between transitions is less pronounced in
+3-D environments compared to 2-D environments, where pixel changes between
+transitions are typically distinct and significant. We propose a method that
+performs episodic and global clustering on random representations and on
+pre-trained DINO representations to count states, i.e., estimate pseudo-counts.
+Surprisingly, even random features can be clustered effectively to count states
+in 3-D environments; however, when these environments become visually more
+complex, pre-trained DINO representations are more effective thanks to the
+pre-trained inductive biases in the representations. Overall, this presents a
+pathway for integrating pre-trained biases into exploration. We evaluate our
+approach on the VizDoom and Habitat environments, demonstrating that our method
+surpasses other well-known exploration methods in these settings.
+
</div>
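+
+ The pseudo-count bookkeeping behind such a method can be sketched as follows;
+the clustering of (random or DINO) features is abstracted into a single
+`assign_cluster` callable, and the 1/sqrt(N) bonus form is a standard
+count-based choice rather than the paper's exact formula:
+
+import math
+from collections import defaultdict
+
+class PseudoCountBonus:
+    def __init__(self, assign_cluster, scale=1.0):
+        self.assign_cluster = assign_cluster   # feature vector -> cluster id
+        self.counts = defaultdict(int)
+        self.scale = scale
+
+    def bonus(self, feature):
+        c = self.assign_cluster(feature)
+        self.counts[c] += 1                    # pseudo-count for this state
+        return self.scale / math.sqrt(self.counts[c])
+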
+
+ comment: Accepted at the International Conference On Machine Learning (ICML) + 2024 +
+
+
+
+
+ + ♻ ☆ Voice-Driven Mortality Prediction in Hospitalized Heart Failure + Patients: A Machine Learning Approach Enhanced with Diagnostic Biomarkers + + +
+ Addressing heart failure (HF) as a prevalent global health concern poses +difficulties in implementing innovative approaches for enhanced patient care. +Predicting mortality rates in HF patients, in particular, is difficult yet +critical, necessitating individualized care, proactive management, and enabling +educated decision-making to enhance outcomes. Recently, the significance of +voice biomarkers coupled with Machine Learning (ML) has surged, demonstrating +remarkable efficacy, particularly in predicting heart failure. The synergy of +voice analysis and ML algorithms provides a non-invasive and easily accessible +means to evaluate patients' health. However, there is a lack of voice +biomarkers for predicting mortality rates among heart failure patients with +standardized speech protocols. Here, we demonstrate a powerful and effective ML +model for predicting mortality rates in hospitalized HF patients through the +utilization of voice biomarkers. By seamlessly integrating voice biomarkers +into routine patient monitoring, this strategy has the potential to improve +patient outcomes, optimize resource allocation, and advance patient-centered HF +management. In this study, a Machine Learning system, specifically a logistic +regression model, is trained to predict patients' 5-year mortality rates using +their speech as input. The model performs admirably and consistently, as +demonstrated by cross-validation and statistical approaches (p-value < 0.001). +Furthermore, integrating NT-proBNP, a diagnostic biomarker in HF, improves the +model's predictive accuracy substantially. + +
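+
+ The modeling recipe described above is, at its core, a regularized logistic
+regression evaluated with cross-validation; a hedged sketch (the feature
+layout, the choice of AUC as the metric, and the 5-fold split are illustrative
+assumptions, not the study's protocol):
+
+import numpy as np
+from sklearn.linear_model import LogisticRegression
+from sklearn.model_selection import cross_val_score
+from sklearn.pipeline import make_pipeline
+from sklearn.preprocessing import StandardScaler
+
+def evaluate(voice_features, ntprobnp, y):
+    model = make_pipeline(StandardScaler(), LogisticRegression(max_iter=1000))
+    combined = np.hstack([voice_features, np.asarray(ntprobnp).reshape(-1, 1)])
+    auc_voice = cross_val_score(model, voice_features, y, cv=5, scoring="roc_auc")
+    auc_combined = cross_val_score(model, combined, y, cv=5, scoring="roc_auc")
+    return auc_voice.mean(), auc_combined.mean()   # voice-only vs. +NT-proBNP
+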
+
+ comment: 11 pages, 6 figures, 5 tables. The first 2 authors have contributed + equally +
+
+
+
+
+ + ♻ ☆ Formally Verifying Deep Reinforcement Learning Controllers with Lyapunov + Barrier Certificates + + +
+ Deep reinforcement learning (DRL) is a powerful machine learning paradigm for +generating agents that control autonomous systems. However, the ``black box'' +nature of DRL agents limits their deployment in real-world safety-critical +applications. A promising approach for providing strong guarantees on an +agent's behavior is to use Neural Lyapunov Barrier (NLB) certificates, which +are learned functions over the system whose properties indirectly imply that an +agent behaves as desired. However, NLB-based certificates are typically +difficult to learn and even more difficult to verify, especially for complex +systems. In this work, we present a novel method for training and verifying +NLB-based certificates for discrete-time systems. Specifically, we introduce a +technique for certificate composition, which simplifies the verification of +highly-complex systems by strategically designing a sequence of certificates. +When jointly verified with neural network verification engines, these +certificates provide a formal guarantee that a DRL agent both achieves its +goals and avoids unsafe behavior. Furthermore, we introduce a technique for +certificate filtering, which significantly simplifies the process of producing +formally verified certificates. We demonstrate the merits of our approach with +a case study on providing safety and liveness guarantees for a DRL-controlled +spacecraft. + +
+
+ comment: To appear in FMCAD 2024 +
+
+
+
+
+ + ♻ ☆ Symmetry-enforcing neural networks with applications to constitutive + modeling + + +
+ The use of machine learning techniques to homogenize the effective behavior
+of arbitrary microstructures has been shown to be not only efficient but also
+accurate. In a recent work, we demonstrated how to combine state-of-the-art
+micromechanical modeling and advanced machine learning techniques to homogenize
+complex microstructures exhibiting non-linear and history dependent behaviors
+(Logarzo et al., 2021). The resulting homogenized model, termed smart
+constitutive law (SCL), enables the adoption of microstructurally informed
+constitutive laws into finite element solvers at a fraction of the
+computational cost required by traditional concurrent multiscale approaches. In
+this work, the capabilities of SCLs are expanded via the introduction of a
+novel methodology that enforces material symmetries at the neuron level,
+applicable across various neural network architectures. This approach utilizes
+tensor-based features in neural networks, facilitating the concise and accurate
+representation of symmetry-preserving operations, and is general enough to be
+extended to problems beyond constitutive modeling. Details on the construction
+of these tensor-based neural networks and their application in learning
+constitutive laws are presented for both elastic and inelastic materials. The
+superiority of this approach over traditional neural networks is demonstrated
+in scenarios with limited data and strong symmetries, through comprehensive
+testing on various materials, including isotropic neo-Hookean materials and
+tensegrity lattice metamaterials. This work concludes with a discussion on the
+potential of this methodology to discover symmetry bases in materials and with
+an outline of future research directions.
+
</div>
+
+
+
+
+
+
+
+ + Multimedia 9 + +
+
+
+ + ☆ End-to-end Semantic-centric Video-based Multimodal Affective Computing + + +
+ On the pathway toward Artificial General Intelligence (AGI), understanding
+human affect is essential to enhancing machines' cognitive abilities. To
+achieve more natural human-AI interaction, Multimodal Affective Computing (MAC)
+on human-spoken videos has attracted increasing attention. However, previous
+methods are mainly devoted to designing multimodal fusion algorithms and suffer
+from two issues: semantic imbalance caused by diverse pre-processing
+operations, and semantic mismatch arising from affective content that is
+inconsistent across modalities and with the multimodal ground truth. In
+addition, the use of hand-crafted feature extractors prevents these methods
+from forming an end-to-end pipeline for multiple MAC downstream tasks. To
+address the above challenges, we propose a novel end-to-end framework named
+SemanticMAC to compute multimodal semantic-centric affect for human-spoken
+videos. We first employ a pre-trained Transformer model in multimodal data
+pre-processing and design an Affective Perceiver module to capture unimodal
+affective information. Moreover, we present a semantic-centric approach to
+unify multimodal representation learning in three ways, including gated feature
+interaction, multi-task pseudo label generation, and intra-/inter-sample
+contrastive learning. Finally, SemanticMAC effectively learns specific- and
+shared-semantic representations under the guidance of semantic-centric labels.
+Extensive experimental results demonstrate that our approach surpasses
+state-of-the-art methods on 7 public datasets across four MAC downstream tasks.
+
</div>
+
+ comment: Under Review +
+
+
+
+
+ + ☆ 3D Gaussian Editing with A Single Image + + +
+ The modeling and manipulation of 3D scenes captured from the real world are +pivotal in various applications, attracting growing research interest. While +previous works on editing have achieved interesting results through +manipulating 3D meshes, they often require accurately reconstructed meshes to +perform editing, which limits their application in 3D content generation. To +address this gap, we introduce a novel single-image-driven 3D scene editing +approach based on 3D Gaussian Splatting, enabling intuitive manipulation via +directly editing the content on a 2D image plane. Our method learns to optimize +the 3D Gaussians to align with an edited version of the image rendered from a +user-specified viewpoint of the original scene. To capture long-range object +deformation, we introduce positional loss into the optimization process of 3D +Gaussian Splatting and enable gradient propagation through reparameterization. +To handle occluded 3D Gaussians when rendering from the specified viewpoint, we +build an anchor-based structure and employ a coarse-to-fine optimization +strategy capable of handling long-range deformation while maintaining +structural stability. Furthermore, we design a novel masking strategy to +adaptively identify non-rigid deformation regions for fine-scale modeling. +Extensive experiments show the effectiveness of our method in handling +geometric details, long-range, and non-rigid deformation, demonstrating +superior editing flexibility and quality compared to previous approaches. + +
+
+ comment: 10 pages, 12 figures +
+
+
+
+
+ + ☆ Automated Retinal Image Analysis and Medical Report Generation through + Deep Learning + + +
+ The increasing prevalence of retinal diseases poses a significant challenge +to the healthcare system, as the demand for ophthalmologists surpasses the +available workforce. This imbalance creates a bottleneck in diagnosis and +treatment, potentially delaying critical care. Traditional methods of +generating medical reports from retinal images rely on manual interpretation, +which is time-consuming and prone to errors, further straining +ophthalmologists' limited resources. This thesis investigates the potential of +Artificial Intelligence (AI) to automate medical report generation for retinal +images. AI can quickly analyze large volumes of image data, identifying subtle +patterns essential for accurate diagnosis. By automating this process, AI +systems can greatly enhance the efficiency of retinal disease diagnosis, +reducing doctors' workloads and enabling them to focus on more complex cases. +The proposed AI-based methods address key challenges in automated report +generation: (1) Improved methods for medical keyword representation enhance the +system's ability to capture nuances in medical terminology; (2) A multi-modal +deep learning approach captures interactions between textual keywords and +retinal images, resulting in more comprehensive medical reports; (3) Techniques +to enhance the interpretability of the AI-based report generation system, +fostering trust and acceptance in clinical practice. These methods are +rigorously evaluated using various metrics and achieve state-of-the-art +performance. This thesis demonstrates AI's potential to revolutionize retinal +disease diagnosis by automating medical report generation, ultimately improving +clinical efficiency, diagnostic accuracy, and patient care. +[https://github.com/Jhhuangkay/DeepOpht-Medical-Report-Generation-for-Retinal-Images-via-Deep-Models-and-Visual-Explanation] + +
+
+ comment: Ph.D. thesis, 124 pages +
+
+
+
+
+ + ☆ An Efficient and Explanatory Image and Text Clustering System with + Multimodal Autoencoder Architecture + + +
+ We demonstrate the efficiencies and explanatory abilities of extensions to +the common tools of Autoencoders and LLM interpreters, in the novel context of +comparing different cultural approaches to the same international news event. +We develop a new Convolutional-Recurrent Variational Autoencoder (CRVAE) model +that extends the modalities of previous CVAE models, by using fully-connected +latent layers to embed in parallel the CNN encodings of video frames, together +with the LSTM encodings of their related text derived from audio. We +incorporate the model within a larger system that includes frame-caption +alignment, latent space vector clustering, and a novel LLM-based cluster +interpreter. We measure, tune, and apply this system to the task of summarizing +a video into three to five thematic clusters, with each theme described by ten +LLM-produced phrases. We apply this system to two news topics, COVID-19 and the +Winter Olympics, and five other topics are in progress. + +
+
+
+
+
+ + ♻ ☆ Lost in Overlap: Exploring Watermark Collision in LLMs + + +
+ The proliferation of large language models (LLMs) in generating content +raises concerns about text copyright. Watermarking methods, particularly +logit-based approaches, embed imperceptible identifiers into text to address +these challenges. However, the widespread usage of watermarking across diverse +LLMs has led to an inevitable issue known as watermark collision during common +tasks, such as paraphrasing or translation. In this paper, we introduce +watermark collision as a novel and general philosophy for watermark attacks, +aimed at enhancing attack performance on top of any other attacking methods. We +also provide a comprehensive demonstration that watermark collision poses a +threat to all logit-based watermark algorithms, impacting not only specific +attack scenarios but also downstream applications. + +
+
+ comment: Long Paper, 7 pages +
+
+
+
+
+ + ♻ ☆ Detecting Audio-Visual Deepfakes with Fine-Grained Inconsistencies BMVC 2024 + + +
+ Existing methods on audio-visual deepfake detection mainly focus on +high-level features for modeling inconsistencies between audio and visual data. +As a result, these approaches usually overlook finer audio-visual artifacts, +which are inherent to deepfakes. Herein, we propose the introduction of +fine-grained mechanisms for detecting subtle artifacts in both spatial and +temporal domains. First, we introduce a local audio-visual model capable of +capturing small spatial regions that are prone to inconsistencies with audio. +For that purpose, a fine-grained mechanism based on a spatially-local distance +coupled with an attention module is adopted. Second, we introduce a +temporally-local pseudo-fake augmentation to include samples incorporating +subtle temporal inconsistencies in our training set. Experiments on the DFDC +and the FakeAVCeleb datasets demonstrate the superiority of the proposed method +in terms of generalization as compared to the state-of-the-art under both +in-dataset and cross-dataset settings. + +
+
+ comment: Accepted in BMVC 2024 +
+
+
+
+
+ + ♻ ☆ Unraveling Instance Associations: A Closer Look for Audio-Visual + Segmentation + + +
+ Audio-visual segmentation (AVS) is a challenging task that involves +accurately segmenting sounding objects based on audio-visual cues. The +effectiveness of audio-visual learning critically depends on achieving accurate +cross-modal alignment between sound and visual objects. Successful audio-visual +learning requires two essential components: 1) a challenging dataset with +high-quality pixel-level multi-class annotated images associated with audio +files, and 2) a model that can establish strong links between audio information +and its corresponding visual object. However, these requirements are only +partially addressed by current methods, with training sets containing biased +audio-visual data, and models that generalise poorly beyond this biased +training set. In this work, we propose a new cost-effective strategy to build +challenging and relatively unbiased high-quality audio-visual segmentation +benchmarks. We also propose a new informative sample mining method for +audio-visual supervised contrastive learning to leverage discriminative +contrastive samples to enforce cross-modal understanding. We show empirical +results that demonstrate the effectiveness of our benchmark. Furthermore, +experiments conducted on existing AVS datasets and on our new benchmark show +that our method achieves state-of-the-art (SOTA) segmentation accuracy. + +
+
+ comment: Code is available at https://github.com/cyh-0/CAVP +
+
+
+
+
+ + ♻ ☆ Bridging the Gap: Sketch-Aware Interpolation Network for High-Quality + Animation Sketch Inbetweening + + +
+ The hand-drawn 2D animation workflow typically begins with the creation of sketch keyframes. Crafting the subsequent inbetweens for smoothness is a labor-intensive manual process, which has made the prospect of automatic animation sketch interpolation highly appealing. Yet, common frame interpolation methods are generally hindered by two key issues: 1) limited texture and colour details in sketches, and 2) exaggerated alterations between two sketch keyframes. To overcome these issues, we propose a novel deep learning method - Sketch-Aware Interpolation Network (SAIN). This approach incorporates multi-level guidance that formulates region-level correspondence, stroke-level correspondence and pixel-level dynamics. A multi-stream U-Transformer is then devised to characterize sketch inbetweening patterns using these multi-level guides through the integration of self / cross-attention mechanisms. Additionally, to facilitate future research on animation sketch inbetweening, we constructed a large-scale dataset - STD-12K, comprising 30 sketch animation series in diverse artistic styles. Comprehensive experiments on this dataset convincingly show that our proposed SAIN surpasses the state-of-the-art interpolation methods.
+
+
+
+
+ + ♻ ☆ Sound-VECaps: Improving Audio Generation with Visual Enhanced Captions + + +
+ Generative models have shown significant achievements in audio generation tasks. However, existing models struggle with complex and detailed prompts, leading to potential performance degradation. We hypothesize that this problem stems from the simplicity and scarcity of the training data. This work aims to create a large-scale audio dataset with rich captions for improving audio generation models. We first develop an automated pipeline to generate detailed captions by transforming predicted visual captions, audio captions, and tagging labels into comprehensive descriptions using a Large Language Model (LLM). The resulting dataset, Sound-VECaps, comprises 1.66M high-quality audio-caption pairs with enriched details, including the order of audio events, the places where they occur, and environment information. We then demonstrate that training text-to-audio generation models with Sound-VECaps significantly improves performance on complex prompts. Furthermore, we conduct ablation studies of the models on several downstream audio-language tasks, showing the potential of Sound-VECaps in advancing audio-text representation learning. Our dataset and models are available online.
+
+ comment: 5 pages with 1 appendix +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computation and Language 90 + +
+
+
+ + ☆ Fingerspelling within Sign Language Translation + + +
+ Fingerspelling poses challenges for sign language processing due to its +high-frequency motion and use for open-vocabulary terms. While prior work has +studied fingerspelling recognition, there has been little attention to +evaluating how well sign language translation models understand fingerspelling +in the context of entire sentences -- and improving this capability. We +manually annotate instances of fingerspelling within FLEURS-ASL and use them to +evaluate the effect of two simple measures to improve fingerspelling +recognition within American Sign Language to English translation: 1) use a +model family (ByT5) with character- rather than subword-level tokenization, and +2) mix fingerspelling recognition data into the translation training mixture. +We find that 1) substantially improves understanding of fingerspelling (and +therefore translation quality overall), but the effect of 2) is mixed. + +
+
+
+
+
+ + ☆ Diversity Empowers Intelligence: Integrating Expertise of Software + Engineering Agents + + +
+ Large language model (LLM) agents have shown great potential in solving +real-world software engineering (SWE) problems. The most advanced open-source +SWE agent can resolve over 27% of real GitHub issues in SWE-Bench Lite. +However, these sophisticated agent frameworks exhibit varying strengths, +excelling in certain tasks while underperforming in others. To fully harness +the diversity of these agents, we propose DEI (Diversity Empowered +Intelligence), a framework that leverages their unique expertise. DEI functions +as a meta-module atop existing SWE agent frameworks, managing agent collectives +for enhanced problem-solving. Experimental results show that a DEI-guided +committee of agents is able to surpass the best individual agent's performance +by a large margin. For instance, a group of open-source SWE agents, with a +maximum individual resolve rate of 27.3% on SWE-Bench Lite, can achieve a 34.3% +resolve rate with DEI, making a 25% improvement and beating most closed-source +solutions. Our best-performing group excels with a 55% resolve rate, securing +the highest ranking on SWE-Bench Lite. Our findings contribute to the growing +body of research on collaborative AI systems and their potential to solve +complex software engineering challenges. + +
+
+
+
+
+ + ☆ A Survey on Model MoErging: Recycling and Routing Among Specialized + Experts for Collaborative Learning + + +
+ The availability of performant pre-trained models has led to a proliferation +of fine-tuned expert models that are specialized to a particular domain or +task. Model MoErging methods aim to recycle expert models to create an +aggregate system with improved performance or generalization. A key component +of MoErging methods is the creation of a router that decides which expert +model(s) to use for a particular input or application. The promise, +effectiveness, and large design space of MoErging has spurred the development +of many new methods over the past few years. This rapid pace of development has +made it challenging to compare different MoErging methods, which are rarely +compared to one another and are often validated in different experimental +setups. To remedy such gaps, we present a comprehensive survey of MoErging +methods that includes a novel taxonomy for cataloging key design choices and +clarifying suitable applications for each method. Apart from surveying MoErging +research, we inventory software tools and applications that make use of +MoErging. We additionally discuss related fields of study such as model +merging, multitask learning, and mixture-of-experts models. Taken as a whole, +our survey provides a unified overview of existing MoErging methods and creates +a solid foundation for future work in this burgeoning field. + +
+
+ comment: 26 pages +
+
+
+
+
+ + ☆ LongWriter: Unleashing 10,000+ Word Generation from Long Context LLMs + + +
+ Current long context large language models (LLMs) can process inputs up to 100,000 tokens, yet struggle to generate outputs exceeding even a modest length of 2,000 words. Through controlled experiments, we find that the model's effective generation length is inherently bounded by the samples it has seen during supervised fine-tuning (SFT). In other words, this output limitation stems from the scarcity of long-output examples in existing SFT datasets. To address this, we introduce AgentWrite, an agent-based pipeline that decomposes ultra-long generation tasks into subtasks, enabling off-the-shelf LLMs to generate coherent outputs exceeding 20,000 words. Leveraging AgentWrite, we construct LongWriter-6k, a dataset containing 6,000 SFT examples with output lengths ranging from 2k to 32k words. By incorporating this dataset into model training, we successfully scale the output length of existing models to over 10,000 words while maintaining output quality. We also develop LongBench-Write, a comprehensive benchmark for evaluating ultra-long generation capabilities. Our 9B parameter model, further improved through DPO, achieves state-of-the-art performance on this benchmark, surpassing even much larger proprietary models. In general, our work demonstrates that existing long context LLMs already possess the potential for a larger output window -- all you need is data with extended output during model alignment to unlock this capability. Our code & models are at: https://github.com/THUDM/LongWriter.
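+ A minimal sketch of the plan-then-write decomposition this abstract describes; the `call_llm` helper and the prompt wording are illustrative assumptions, not the released AgentWrite prompts:

```python
# Sketch of an AgentWrite-style plan-then-write loop.
# `call_llm` is a hypothetical helper wrapping any chat-completion API.

def call_llm(prompt: str) -> str:
    raise NotImplementedError("wrap your preferred chat-completion API here")

def agent_write(instruction: str) -> str:
    # Step 1: ask the model for a numbered outline with per-section word budgets.
    plan = call_llm(
        "Break the following writing task into numbered sections, "
        f"each with a target word count:\n{instruction}"
    )
    sections = [line for line in plan.splitlines() if line.strip()]

    # Step 2: write each section in turn, conditioning on what was written so far
    # so the concatenated draft stays coherent across tens of thousands of words.
    draft = []
    for section in sections:
        text = call_llm(
            f"Task: {instruction}\nOutline:\n{plan}\n"
            f"Text written so far (tail):\n{''.join(draft)[-4000:]}\n"
            f"Now write only this section: {section}"
        )
        draft.append(text + "\n\n")
    return "".join(draft)
```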
+
+
+
+
+ + ☆ The News Comment Gap and Algorithmic Agenda Setting in Online Forums + + +
+ The disparity between news stories valued by journalists and those preferred +by readers, known as the "News Gap", is well-documented. However, the +difference in expectations regarding news related user-generated content is +less studied. Comment sections, hosted by news websites, are popular venues for +reader engagement, yet still subject to editorial decisions. It is thus +important to understand journalist vs reader comment preferences and how these +are served by various comment ranking algorithms that represent discussions +differently. We analyse 1.2 million comments from Austrian newspaper Der +Standard to understand the "News Comment Gap" and the effects of different +ranking algorithms. We find that journalists prefer positive, timely, complex, +direct responses, while readers favour comments similar to article content from +elite authors. We introduce the versatile Feature-Oriented Ranking Utility +Metric (FORUM) to assess the impact of different ranking algorithms and find +dramatic differences in how they prioritise the display of comments by +sentiment, topical relevance, lexical diversity, and readability. Journalists +can exert substantial influence over the discourse through both curatorial and +algorithmic means. Understanding these choices' implications is vital in +fostering engaging and civil discussions while aligning with journalistic +objectives, especially given the increasing legal scrutiny and societal +importance of online discourse. + +
+
+
+
+
+ + ☆ TableGuard -- Securing Structured & Unstructured Data + + +
+ With the increasing demand for data sharing across platforms and organizations, ensuring the privacy and security of sensitive information has become a critical challenge. This paper introduces "TableGuard", an innovative approach to data obfuscation tailored for relational databases. Building on the principles and techniques developed in prior work on context-sensitive obfuscation, TableGuard applies these methods to ensure that API calls return only obfuscated data, thereby safeguarding privacy when sharing data with third parties. TableGuard leverages advanced context-sensitive obfuscation techniques to replace sensitive data elements with contextually appropriate alternatives. By maintaining the relational integrity and coherence of the data, our approach mitigates the risks of cognitive dissonance and data leakage. We demonstrate the implementation of TableGuard using a BERT-based transformer model, which identifies and obfuscates sensitive entities within relational tables. Our evaluation shows that TableGuard effectively balances privacy protection with data utility, minimizing information loss while ensuring that the obfuscated data remains functionally useful for downstream applications. The results highlight the importance of domain-specific obfuscation strategies and the role of context length in preserving data integrity. The implications of this research are significant for organizations that need to share data securely with external parties. TableGuard offers a robust framework for implementing privacy-preserving data sharing mechanisms, thereby contributing to the broader field of data privacy and security.
+
+ comment: 7 pages, 3 tables, 1 figure +
+
+
+
+
+ + ☆ Generative AI for automatic topic labelling + + +
+ Topic Modeling has become a prominent tool for the study of scientific fields, as it allows for a large-scale interpretation of research trends. Nevertheless, the output of these models is structured as a list of keywords which requires manual interpretation for labelling. This paper proposes to assess the reliability of three LLMs, namely flan, GPT-4o, and GPT-4 mini, for topic labelling. Drawing on previous research leveraging BERTopic, we generate topics from a dataset of all the scientific articles (n=34,797) authored by all biology professors in Switzerland (n=465) between 2008 and 2020, as recorded in the Web of Science database. We assess the output of the three models both quantitatively and qualitatively and find that, first, both GPT models are capable of accurately and precisely labelling topics from the models' output keywords. Second, 3-word labels are preferable to grasp the complexity of research topics.
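+ A hedged sketch of how BERTopic keyword lists could be turned into short labels with a chat model; the model name, prompt, and client usage are illustrative assumptions, not the paper's exact setup:

```python
# Sketch: turn a BERTopic keyword list into a short human-readable label.
# Uses the OpenAI Python client; prompt and model choice are illustrative.
from openai import OpenAI

client = OpenAI()  # assumes OPENAI_API_KEY is set in the environment

def label_topic(keywords: list[str], n_words: int = 3) -> str:
    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[{
            "role": "user",
            "content": (
                f"These keywords describe one research topic: {', '.join(keywords)}. "
                f"Return a descriptive label of at most {n_words} words."
            ),
        }],
    )
    return response.choices[0].message.content.strip()

print(label_topic(["mitochondria", "oxidative", "phosphorylation", "membrane", "atp"]))
```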
+
+ comment: 10 pages, 1 figure +
+
+
+
+
+ + ☆ The advantages of context specific language models: the case of the + Erasmian Language Model + + +
+ The current trend to improve language model performance seems to be based on scaling up the number of parameters (e.g. the state-of-the-art GPT4 model has approximately 1.7 trillion parameters) or the amount of training data fed into the model. However this comes at significant costs in terms of computational resources and energy that compromise the sustainability of AI solutions, as well as risks relating to privacy and misuse. In this paper we present the Erasmian Language Model (ELM), a small, context-specific, 900 million parameter model, pre-trained and fine-tuned by and for Erasmus University Rotterdam. We show how the model performs adequately in a classroom context for essay writing, and how it achieves superior performance in subjects that are part of its context. This has implications for a wide range of institutions and organizations, showing that context-specific language models may be a viable alternative for resource-constrained, privacy-sensitive use cases.
+
+ comment: 12 pages, 3 figures, 1 table +
+
+
+
+
+ + ☆ Diagnosis extraction from unstructured Dutch echocardiogram reports + using span- and document-level characteristic classification + + +
+ Clinical machine learning research and AI-driven clinical decision support models rely on clinically accurate labels. Manually extracting these labels with the help of clinical specialists is often time-consuming and expensive. This study tests the feasibility of automatic span- and document-level diagnosis extraction from unstructured Dutch echocardiogram reports.
+ We included 115,692 unstructured echocardiogram reports from the UMCU, a large university hospital in the Netherlands. A randomly selected subset was manually annotated for the occurrence and severity of eleven commonly described cardiac characteristics. We developed and tested several automatic labelling techniques at both span and document levels, using weighted and macro F1-score, precision, and recall for performance evaluation. We compared the performance of span labelling against document labelling methods, which included both direct document classifiers and indirect document classifiers that rely on span classification results.
+ The SpanCategorizer and MedRoBERTa.nl models outperformed all other span and document classifiers, respectively. The weighted F1-score varied between characteristics, ranging from 0.60 to 0.93 for SpanCategorizer and 0.96 to 0.98 for MedRoBERTa.nl. Direct document classification was superior to indirect document classification using span classifiers. SetFit achieved competitive document classification performance using only 10% of the training data. Utilizing a reduced label set yielded near-perfect document classification results.
+ We recommend using our published SpanCategorizer and MedRoBERTa.nl models for span- and document-level diagnosis extraction from Dutch echocardiography reports. For settings with limited training data, SetFit may be a promising alternative for document classification.
+
+ comment: 28 pages, 5 figures +
+
+
+
+
+ + ☆ Evaluating Cultural Adaptability of a Large Language Model via + Simulation of Synthetic Personas + + +
+ The success of Large Language Models (LLMs) in multicultural environments hinges on their ability to understand users' diverse cultural backgrounds. We measure this capability by having an LLM simulate human profiles representing various nationalities within the scope of a questionnaire-style psychological experiment. Specifically, we employ GPT-3.5 to reproduce reactions to persuasive news articles of 7,286 participants from 15 countries, comparing the results with a dataset of real participants sharing the same demographic traits. Our analysis shows that specifying a person's country of residence improves GPT-3.5's alignment with their responses. In contrast, using native language prompting introduces shifts that significantly reduce overall alignment, with some languages particularly impairing performance. These findings suggest that while direct nationality information enhances the model's cultural adaptability, native language cues do not reliably improve simulation fidelity and can detract from the model's effectiveness.
+
+ comment: 18 pages, 8 figures, Published as a conference paper at COLM 2024 +
+
+
+
+
+ + ☆ Re-TASK: Revisiting LLM Tasks from Capability, Skill, and Knowledge + Perspectives + + +
+ As large language models (LLMs) continue to scale, their enhanced performance +often proves insufficient for solving domain-specific tasks. Systematically +analyzing their failures and effectively enhancing their performance remain +significant challenges. This paper introduces the Re-TASK framework, a novel +theoretical model that Revisits LLM Tasks from cApability, Skill, Knowledge +perspectives, guided by the principles of Bloom's Taxonomy and Knowledge Space +Theory. The Re-TASK framework provides a systematic methodology to deepen our +understanding, evaluation, and enhancement of LLMs for domain-specific tasks. +It explores the interplay among an LLM's capabilities, the knowledge it +processes, and the skills it applies, elucidating how these elements are +interconnected and impact task performance. Our application of the Re-TASK +framework reveals that many failures in domain-specific tasks can be attributed +to insufficient knowledge or inadequate skill adaptation. With this insight, we +propose structured strategies for enhancing LLMs through targeted knowledge +injection and skill adaptation. Specifically, we identify key capability items +associated with tasks and employ a deliberately designed prompting strategy to +enhance task performance, thereby reducing the need for extensive fine-tuning. +Alternatively, we fine-tune the LLM using capability-specific instructions, +further validating the efficacy of our framework. Experimental results confirm +the framework's effectiveness, demonstrating substantial improvements in both +the performance and applicability of LLMs. + +
+
+ comment: Work in Progress +
+
+
+
+
+ + ☆ Leveraging Language Models for Emotion and Behavior Analysis in + Education + + +
+ The analysis of students' emotions and behaviors is crucial for enhancing +learning outcomes and personalizing educational experiences. Traditional +methods often rely on intrusive visual and physiological data collection, +posing privacy concerns and scalability issues. This paper proposes a novel +method leveraging large language models (LLMs) and prompt engineering to +analyze textual data from students. Our approach utilizes tailored prompts to +guide LLMs in detecting emotional and engagement states, providing a +non-intrusive and scalable solution. We conducted experiments using Qwen, +ChatGPT, Claude2, and GPT-4, comparing our method against baseline models and +chain-of-thought (CoT) prompting. Results demonstrate that our method +significantly outperforms the baselines in both accuracy and contextual +understanding. This study highlights the potential of LLMs combined with prompt +engineering to offer practical and effective tools for educational emotion and +behavior analysis. + +
+
+ comment: 8 pages +
+
+
+
+
+ + ☆ LoRA$^2$ : Multi-Scale Low-Rank Approximations for Fine-Tuning Large + Language Models + + +
+ Fine-tuning large language models (LLMs) with high parameter efficiency for downstream tasks has become a new paradigm. Low-Rank Adaptation (LoRA) significantly reduces the number of trainable parameters for fine-tuning. Although it has demonstrated commendable performance, updating parameters within a single scale may not be the optimal choice for complex downstream tasks. In this paper, we extend LoRA to multiple scales, dubbed LoRA$^2$. We first combine orthogonal projection theory to train a set of LoRAs in two mutually orthogonal planes. We then improve the importance score algorithm, which reduces parameter sensitivity score calculations by approximately 98.5%. Pruning singular values with lower importance scores then enhances adaptability to various downstream tasks. Extensive experiments are conducted on two widely used pre-trained models to validate the effectiveness of LoRA$^2$. Results show that it significantly reduces the number of trainable parameters to just 0.72% compared to full fine-tuning, while still delivering highly impressive performance. Even when the parameters are further reduced to 0.17M, it still achieves comparable results to the baseline with 8 times more parameters. Our code is available here: https://anonymous.4open.science/r/LoRA-2-5B4C
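+ A rough PyTorch sketch of the core idea, two parallel low-rank branches with an orthogonality penalty between their projection planes; the ranks, scaling, and the importance-based pruning step are simplified assumptions rather than the paper's implementation:

```python
import torch
import torch.nn as nn

class DualPlaneLoRALinear(nn.Module):
    """Frozen linear layer with two parallel low-rank updates (a LoRA^2-style sketch)."""

    def __init__(self, base: nn.Linear, rank: int = 4, alpha: float = 8.0):
        super().__init__()
        self.base = base
        for p in self.base.parameters():
            p.requires_grad = False
        d_in, d_out = base.in_features, base.out_features
        # Two low-rank branches intended to span (approximately) orthogonal planes.
        self.A1 = nn.Parameter(torch.randn(rank, d_in) * 0.01)
        self.B1 = nn.Parameter(torch.zeros(d_out, rank))
        self.A2 = nn.Parameter(torch.randn(rank, d_in) * 0.01)
        self.B2 = nn.Parameter(torch.zeros(d_out, rank))
        self.scale = alpha / rank

    def forward(self, x):
        delta = (x @ self.A1.T) @ self.B1.T + (x @ self.A2.T) @ self.B2.T
        return self.base(x) + self.scale * delta

    def orthogonality_penalty(self):
        # Encourage the two down-projection planes to be mutually orthogonal;
        # add this term to the task loss during fine-tuning.
        return (self.A1 @ self.A2.T).pow(2).sum()

layer = DualPlaneLoRALinear(nn.Linear(768, 768))
out = layer(torch.randn(2, 16, 768))
```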
+
+
+
+
+ + ☆ Causal Agent based on Large Language Model + + +
+ Large language models (LLMs) have achieved significant success across various +domains. However, the inherent complexity of causal problems and causal theory +poses challenges in accurately describing them in natural language, making it +difficult for LLMs to comprehend and use them effectively. Causal methods are +not easily conveyed through natural language, which hinders LLMs' ability to +apply them accurately. Additionally, causal datasets are typically tabular, +while LLMs excel in handling natural language data, creating a structural +mismatch that impedes effective reasoning with tabular data. This lack of +causal reasoning capability limits the development of LLMs. To address these +challenges, we have equipped the LLM with causal tools within an agent +framework, named the Causal Agent, enabling it to tackle causal problems. The +causal agent comprises tools, memory, and reasoning modules. In the tools +module, the causal agent applies causal methods to align tabular data with +natural language. In the reasoning module, the causal agent employs the ReAct +framework to perform reasoning through multiple iterations with the tools. In +the memory module, the causal agent maintains a dictionary instance where the +keys are unique names and the values are causal graphs. To verify the causal +ability of the causal agent, we established a benchmark consisting of four +levels of causal problems: variable level, edge level, causal graph level, and +causal effect level. We generated a test dataset of 1.3K using ChatGPT-3.5 for +these four levels of issues and tested the causal agent on the datasets. Our +methodology demonstrates remarkable efficacy on the four-level causal problems, +with accuracy rates all above 80%. For further insights and implementation +details, our code is accessible via the GitHub repository +https://github.com/Kairong-Han/Causal_Agent. + +
+
+
+
+
+ + ☆ MAQA: Evaluating Uncertainty Quantification in LLMs Regarding Data + Uncertainty + + +
+ Although large language models (LLMs) are capable of performing various tasks, they still suffer from producing plausible but incorrect responses. To improve the reliability of LLMs, recent research has focused on uncertainty quantification to predict whether a response is correct or not. However, most uncertainty quantification methods have been evaluated on questions requiring a single clear answer, ignoring the existence of data uncertainty that arises from irreducible randomness. Instead, these methods only consider model uncertainty, which arises from a lack of knowledge. In this paper, we investigate previous uncertainty quantification methods under the presence of data uncertainty. Our contributions are two-fold: 1) proposing a new Multi-Answer Question Answering dataset, MAQA, consisting of world knowledge, mathematical reasoning, and commonsense reasoning tasks to evaluate uncertainty quantification regarding data uncertainty, and 2) assessing 5 uncertainty quantification methods of diverse white- and black-box LLMs. Our findings show that entropy and consistency-based methods estimate the model uncertainty well even under data uncertainty, while other methods for white- and black-box LLMs struggle depending on the tasks. Additionally, methods designed for white-box LLMs suffer from overconfidence in reasoning tasks compared to simple knowledge queries. We believe our observations will pave the way for future work on uncertainty quantification in realistic settings.
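+ For intuition, a small sketch of the entropy- and consistency-style estimators the abstract refers to, computed from repeated samples of the same question; the answer normalisation and exact definitions are illustrative assumptions:

```python
from collections import Counter
import math

def sample_based_uncertainty(answers: list[str]) -> dict:
    """Estimate uncertainty from N answers sampled from an LLM for one question."""
    counts = Counter(a.strip().lower() for a in answers)   # crude answer normalisation
    n = len(answers)
    probs = [c / n for c in counts.values()]
    entropy = -sum(p * math.log(p) for p in probs)          # higher = more uncertain
    consistency = counts.most_common(1)[0][1] / n           # agreement with the modal answer
    return {"entropy": entropy, "consistency": consistency}

print(sample_based_uncertainty(["Paris", "paris", "Lyon", "Paris"]))
```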
+
+
+
+
+ + ☆ Layerwise Recurrent Router for Mixture-of-Experts + + +
+ The scaling of large language models (LLMs) has revolutionized their capabilities in various tasks, yet this growth must be matched with efficient computational strategies. The Mixture-of-Experts (MoE) architecture stands out for its ability to scale model size without significantly increasing training costs. Despite their advantages, current MoE models often display parameter inefficiency. For instance, a pre-trained MoE-based LLM with 52 billion parameters might perform comparably to a standard model with 6.7 billion parameters. As a crucial part of MoE, the routers in different layers currently assign tokens independently, without leveraging historical routing information, potentially leading to suboptimal token-expert combinations and the parameter inefficiency problem. To alleviate this issue, we introduce the Layerwise Recurrent Router for Mixture-of-Experts (RMoE). RMoE leverages a Gated Recurrent Unit (GRU) to establish dependencies between routing decisions across consecutive layers. Such layerwise recurrence can be computed efficiently in parallel across input tokens and introduces negligible cost. Our extensive empirical evaluations demonstrate that RMoE-based language models consistently outperform a spectrum of baseline models. Furthermore, RMoE integrates a novel computation stage orthogonal to existing methods, allowing seamless compatibility with other MoE architectures. Our analyses attribute RMoE's gains to its effective cross-layer information sharing, which also improves expert selection and diversity. Our code is at https://github.com/qiuzh20/RMoE
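+ A schematic PyTorch sketch of a layerwise recurrent router in the spirit described above; the dimensions, top-1 routing, and sharing a single GRU cell across layers are illustrative simplifications, not the released implementation:

```python
import torch
import torch.nn as nn

class RecurrentRouter(nn.Module):
    """MoE router whose state is carried across layers by a GRU cell (sketch)."""

    def __init__(self, d_model: int, d_router: int, n_experts: int):
        super().__init__()
        self.proj = nn.Linear(d_model, d_router)
        self.gru = nn.GRUCell(d_router, d_router)
        self.to_logits = nn.Linear(d_router, n_experts)

    def forward(self, hidden_states, router_state):
        # hidden_states: (tokens, d_model) at the current layer
        # router_state:  (tokens, d_router) carried over from the previous layer's routing
        router_state = self.gru(self.proj(hidden_states), router_state)
        logits = self.to_logits(router_state)
        top1 = logits.softmax(dim=-1).argmax(dim=-1)   # chosen expert per token (top-1 routing)
        return top1, router_state

router = RecurrentRouter(d_model=512, d_router=64, n_experts=8)
state = torch.zeros(10, 64)
for layer_hidden in [torch.randn(10, 512) for _ in range(4)]:   # 4 stacked MoE layers
    experts, state = router(layer_hidden, state)
```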
+
+
+
+
+ + ☆ Unlock the Power of Frozen LLMs in Knowledge Graph Completion + + +
+ Classical knowledge graph completion (KGC) methods rely solely on structural information, struggling with the inherent sparsity of knowledge graphs (KGs). Large Language Models (LLMs) learn extensive knowledge from large corpora with powerful context modeling, which is ideal for mitigating the limitations of previous methods. Directly fine-tuning LLMs offers great capability but comes at the cost of huge time and memory consumption, while utilizing frozen LLMs yields suboptimal results. In this work, we aim to leverage LLMs for KGC effectively and efficiently. We capture the context-aware hidden states of knowledge triples by employing prompts to stimulate the intermediate layers of LLMs. We then train a data-efficient classifier on these hidden states to harness the inherent capabilities of frozen LLMs in KGC. We also generate entity descriptions with subgraph sampling on KGs, reducing the ambiguity of triplets and enriching the knowledge representation. Extensive experiments on standard benchmarks showcase the efficiency and effectiveness of our approach. We outperform classical KGC methods on most datasets and match the performance of fine-tuned LLMs. Additionally, compared to fine-tuned LLMs, we boost GPU memory efficiency by 188x and speed up training and inference by 13.48x.
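+ A hedged sketch of the probing recipe, extracting intermediate hidden states for a prompted triple and training a small classifier on top; the model, prompt template, layer index, and pooling are illustrative assumptions:

```python
import torch
import torch.nn as nn
from transformers import AutoModelForCausalLM, AutoTokenizer

# Probe a frozen LLM's intermediate hidden states for triple plausibility (sketch).
name = "gpt2"   # stand-in for a larger frozen LLM
tok = AutoTokenizer.from_pretrained(name)
llm = AutoModelForCausalLM.from_pretrained(name, output_hidden_states=True).eval()

def triple_features(head: str, relation: str, tail: str, layer: int = 6) -> torch.Tensor:
    prompt = f"Is this statement true? {head} {relation} {tail}."
    inputs = tok(prompt, return_tensors="pt")
    with torch.no_grad():
        out = llm(**inputs)
    # Mean-pool one intermediate layer as the triple representation.
    return out.hidden_states[layer].mean(dim=1).squeeze(0)

classifier = nn.Sequential(nn.Linear(llm.config.hidden_size, 128), nn.ReLU(), nn.Linear(128, 2))
feats = triple_features("Paris", "capital of", "France")
logits = classifier(feats)   # train the classifier with cross-entropy on labelled triples
```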
+
+
+
+
+ + ☆ Fast-and-Frugal Text-Graph Transformers are Effective Link Predictors + + +
+ Link prediction models can benefit from incorporating textual descriptions of +entities and relations, enabling fully inductive learning and flexibility in +dynamic graphs. We address the challenge of also capturing rich structured +information about the local neighbourhood of entities and their relations, by +introducing a Transformer-based approach that effectively integrates textual +descriptions with graph structure, reducing the reliance on resource-intensive +text encoders. Our experiments on three challenging datasets show that our +Fast-and-Frugal Text-Graph (FnF-TG) Transformers achieve superior performance +compared to the previous state-of-the-art methods, while maintaining efficiency +and scalability. + +
+
+
+
+
+ + ☆ Sumotosima: A Framework and Dataset for Classifying and Summarizing + Otoscopic Images + + +
+ Otoscopy is a diagnostic procedure to examine the ear canal and eardrum using an otoscope. It identifies conditions like infections, foreign bodies, eardrum perforations and ear abnormalities. We propose a novel resource-efficient deep learning and transformer-based framework, Sumotosima (Summarizer for otoscopic images), an end-to-end pipeline for classification followed by summarization. Our framework works on a combination of triplet and cross-entropy losses. Additionally, we use Knowledge Enhanced Multimodal BART whose input is fused textual and image embedding. The objective is to provide summaries that are well-suited for patients, ensuring clarity and efficiency in understanding otoscopic images. Given the lack of existing datasets, we have curated our own OCASD (Otoscopic Classification And Summary Dataset), which includes 500 images with 5 unique categories annotated with their class and summaries by Otolaryngologists. Sumotosima achieved a result of 98.03%, which is 7.00%, 3.10%, and 3.01% higher than K-Nearest Neighbors, Random Forest and Support Vector Machines, respectively, in classification tasks. For summarization, Sumotosima outperformed GPT-4o and LLaVA by 88.53% and 107.57% in ROUGE scores, respectively. We have made our code and dataset publicly available at https://github.com/anas2908/Sumotosima
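+ A minimal sketch of a combined triplet plus cross-entropy objective of the kind mentioned above; the encoder, embeddings, and loss weighting are placeholder assumptions:

```python
import torch
import torch.nn as nn

# Joint objective: cross-entropy for classification plus a triplet margin loss
# on image embeddings (anchor/positive share a class, negative does not).
ce_loss = nn.CrossEntropyLoss()
triplet_loss = nn.TripletMarginLoss(margin=1.0)

def combined_loss(logits, labels, anchor_emb, positive_emb, negative_emb, w: float = 0.5):
    return ce_loss(logits, labels) + w * triplet_loss(anchor_emb, positive_emb, negative_emb)

logits = torch.randn(8, 5)                 # 5 otoscopic categories
labels = torch.randint(0, 5, (8,))
emb = lambda: torch.randn(8, 256)          # stand-in encoder outputs
loss = combined_loss(logits, labels, emb(), emb(), emb())
print(loss.item())
```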
+
+ comment: Work in Progress +
+
+
+
+
+ + ☆ Multilingual Models for Check-Worthy Social Media Posts Detection + + +
+ This work presents an extensive study of transformer-based NLP models for the detection of social media posts that contain verifiable factual claims and harmful claims. The study covers various activities, including dataset collection, dataset pre-processing, architecture selection, setup of settings, model training (fine-tuning), model testing, and implementation. The study includes a comprehensive analysis of different models, with a special focus on multilingual models where the same model is capable of processing social media posts in both English and low-resource languages such as Arabic, Bulgarian, Dutch, Polish, Czech, and Slovak. The results obtained from the study were validated against state-of-the-art models, and the comparison demonstrated the robustness of the proposed models. The novelty of this work lies in the development of multi-label multilingual classification models that can simultaneously detect harmful posts and posts that contain verifiable factual claims in an efficient way.
+
+
+
+
+ + ☆ Exploring the anatomy of articulation rate in spontaneous English + speech: relationships between utterance length effects and social factors + + +
+ Speech rate has been shown to vary across social categories such as gender, +age, and dialect, while also being conditioned by properties of speech +planning. The effect of utterance length, where speech rate is faster and less +variable for longer utterances, has also been shown to reduce the role of +social factors once it has been accounted for, leaving unclear the relationship +between social factors and speech production in conditioning speech rate. +Through modelling of speech rate across 13 English speech corpora, it is found +that utterance length has the largest effect on speech rate, though this effect +itself varies little across corpora and speakers. While age and gender also +modulate speech rate, their effects are much smaller in magnitude. These +findings suggest utterance length effects may be conditioned by articulatory +and perceptual constraints, and that social influences on speech rate should be +interpreted in the broader context of how speech rate variation is structured. + +
+
+ comment: Proceedings of Interspeech 2024. 5 pages, 4 figures +
+
+
+
+
+ + ☆ Large language models can consistently generate high-quality content for + election disinformation operations + + +
+ Advances in large language models have raised concerns about their potential use in generating compelling election disinformation at scale. This study presents a two-part investigation into the capabilities of LLMs to automate stages of an election disinformation operation. First, we introduce DisElect, a novel evaluation dataset designed to measure LLM compliance with instructions to generate content for an election disinformation operation in a localised UK context, containing 2,200 malicious prompts and 50 benign prompts. Using DisElect, we test 13 LLMs and find that most models broadly comply with these requests; we also find that the few models which refuse malicious prompts also refuse benign election-related prompts, and are more likely to refuse to generate content from a right-wing perspective. Second, we conduct a series of experiments (N=2,340) to assess the "humanness" of LLMs: the extent to which disinformation operation content generated by an LLM is able to pass as human-written. Our experiments suggest that almost all LLMs tested that were released since 2022 produce election disinformation operation content indiscernible by human evaluators over 50% of the time. Notably, we observe that multiple models achieve above-human levels of humanness. Taken together, these findings suggest that current LLMs can be used to generate high-quality content for election disinformation operations, even in hyperlocalised scenarios, at far lower costs than traditional methods, and offer researchers and policymakers an empirical benchmark for the measurement and evaluation of these capabilities in current and future models.
+
+
+
+
+ + ☆ Enhancing Visual Dialog State Tracking through Iterative Object-Entity + Alignment in Multi-Round Conversations + + +
+ Visual Dialog (VD) is a task where an agent answers a series of image-related questions based on a multi-round dialog history. However, previous VD methods often treat the entire dialog history as a simple text input, disregarding the inherent conversational information flows at the round level. In this paper, we introduce the Multi-round Dialogue State Tracking model (MDST), a framework that addresses this limitation by leveraging the dialogue state learned from dialog history to answer questions. MDST captures each round of dialog history, constructing internal dialogue state representations defined as 2-tuples of vision-language representations. These representations effectively ground the current question, enabling the generation of accurate answers. Experimental results on the VisDial v1.0 dataset demonstrate that MDST achieves new state-of-the-art performance in the generative setting. Furthermore, through a series of human studies, we validate the effectiveness of MDST in generating long, consistent, and human-like answers while consistently answering a series of questions correctly.
+
+ comment: This article has been accepted in CAAI Transactions on Intelligence + Technology! Article ID: CIT2_12370, Article DOI: 10.1049/cit2.12370 +
+
+
+
+
+ + ☆ Latin Treebanks in Review: An Evaluation of Morphological Tagging Across + Time + + +
+ Existing Latin treebanks draw from Latin's long written tradition, spanning +17 centuries and a variety of cultures. Recent efforts have begun to harmonize +these treebanks' annotations to better train and evaluate morphological +taggers. However, the heterogeneity of these treebanks must be carefully +considered to build effective and reliable data. In this work, we review +existing Latin treebanks to identify the texts they draw from, identify their +overlap, and document their coverage across time and genre. We additionally +design automated conversions of their morphological feature annotations into +the conventions of standard Latin grammar. From this, we build new time-period +data splits that draw from the existing treebanks which we use to perform a +broad cross-time analysis for POS and morphological feature tagging. We find +that BERT-based taggers outperform existing taggers while also being more +robust to cross-domain shifts. + +
+
+
+
+
+ + ☆ Pragmatic inference of scalar implicature by LLMs + + +
+ This study investigates how Large Language Models (LLMs), particularly BERT (Devlin et al., 2019) and GPT-2 (Radford et al., 2019), engage in pragmatic inference of scalar implicature, such as some. Two sets of experiments were conducted using cosine similarity and next sentence/token prediction as experimental methods. The results of experiment 1 showed that both models interpret some as the pragmatic implicature 'not all' in the absence of context, aligning with human language processing. In experiment 2, in which Question Under Discussion (QUD) was presented as a contextual cue, BERT showed consistent performance regardless of the type of QUD, while GPT-2 encountered processing difficulties since a certain type of QUD required pragmatic inference for implicature. The findings revealed that, in terms of theoretical approaches, BERT inherently incorporates the pragmatic implicature 'not all' within the term some, adhering to the Default model (Levinson, 2000). In contrast, GPT-2 seems to encounter processing difficulties in inferring pragmatic implicature within context, consistent with the Context-driven model (Sperber and Wilson, 2002).
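+ A small sketch of a cosine-similarity probe of this kind, comparing a 'some' sentence with 'not all' and 'all' paraphrases in BERT's embedding space; the pooling choice and example sentences are illustrative assumptions, not the study's materials:

```python
import torch
from transformers import AutoTokenizer, AutoModel

tok = AutoTokenizer.from_pretrained("bert-base-uncased")
bert = AutoModel.from_pretrained("bert-base-uncased").eval()

def embed(sentence: str) -> torch.Tensor:
    inputs = tok(sentence, return_tensors="pt")
    with torch.no_grad():
        out = bert(**inputs)
    return out.last_hidden_state.mean(dim=1).squeeze(0)   # mean-pooled sentence vector

target = embed("Some of the students passed the exam.")
for paraphrase in ["Not all of the students passed the exam.",
                   "All of the students passed the exam."]:
    sim = torch.cosine_similarity(target, embed(paraphrase), dim=0)
    print(paraphrase, round(sim.item(), 3))
```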
+
+ comment: This research was presented at the Association for Computational + Linguistics conference, held on August 11-16 +
+
+
+
+
+ + ☆ Amuro & Char: Analyzing the Relationship between Pre-Training and + Fine-Tuning of Large Language Models + + +
+ The development of large language models leads to the formation of a pre-train-then-align paradigm, in which the model is typically pre-trained on a large text corpus and undergoes a tuning stage to align the model with human preference or downstream tasks. In this work, we investigate the relationship between pre-training and fine-tuning by fine-tuning multiple intermediate pre-trained model checkpoints. Our results on 18 datasets suggest that i) continual pre-training improves the model in a latent way that is only revealed after fine-tuning; ii) with extra fine-tuning, the datasets on which the model does not demonstrate capability during pre-training gain much more than those on which the model already performs well; iii) although the model benefits significantly from supervised fine-tuning, it may forget previously known domain knowledge and tasks that are not seen during fine-tuning; iv) the model exhibits high sensitivity to evaluation prompts after supervised fine-tuning, but this sensitivity can be alleviated by more pre-training.
+
+
+
+
+ + ☆ Harnessing Earnings Reports for Stock Predictions: A QLoRA-Enhanced LLM + Approach + + +
+ Accurate stock market predictions following earnings reports are crucial for +investors. Traditional methods, particularly classical machine learning models, +struggle with these predictions because they cannot effectively process and +interpret extensive textual data contained in earnings reports and often +overlook nuances that influence market movements. This paper introduces an +advanced approach by employing Large Language Models (LLMs) instruction +fine-tuned with a novel combination of instruction-based techniques and +quantized low-rank adaptation (QLoRA) compression. Our methodology integrates +'base factors', such as financial metric growth and earnings transcripts, with +'external factors', including recent market indices performances and analyst +grades, to create a rich, supervised dataset. This comprehensive dataset +enables our models to achieve superior predictive performance in terms of +accuracy, weighted F1, and Matthews correlation coefficient (MCC), especially +evident in the comparison with benchmarks such as GPT-4. We specifically +highlight the efficacy of the llama-3-8b-Instruct-4bit model, which showcases +significant improvements over baseline models. The paper also discusses the +potential of expanding the output capabilities to include a 'Hold' option and +extending the prediction horizon, aiming to accommodate various investment +styles and time frames. This study not only demonstrates the power of +integrating cutting-edge AI with fine-tuned financial data but also paves the +way for future research in enhancing AI-driven financial analysis tools. + +
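+ A hedged sketch of a QLoRA setup of the kind the abstract describes, using Hugging Face transformers and peft; the base checkpoint, target modules, and hyperparameters are illustrative assumptions rather than the paper's exact configuration:

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model

# Load the base model in 4-bit NF4 quantization (QLoRA-style).
bnb = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)
model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Meta-Llama-3-8B-Instruct", quantization_config=bnb, device_map="auto"
)
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct")

# Attach low-rank adapters; only these are trained while the quantized base stays frozen.
lora = LoraConfig(r=16, lora_alpha=32, lora_dropout=0.05,
                  target_modules=["q_proj", "v_proj"], task_type="CAUSAL_LM")
model = get_peft_model(model, lora)
model.print_trainable_parameters()
# Instruction-style earnings examples would then be fed to a standard trainer
# (e.g. transformers.Trainer or trl.SFTTrainer) for supervised fine-tuning.
```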
+
+ comment: Accepted by 2024 6th International Conference on Data-driven + Optimization of Complex Systems +
+
+
+
+
+ + ☆ EditScribe: Non-Visual Image Editing with Natural Language Verification + Loops + + +
+ Image editing is an iterative process that requires precise visual evaluation +and manipulation for the output to match the editing intent. However, current +image editing tools do not provide accessible interaction nor sufficient +feedback for blind and low vision individuals to achieve this level of control. +To address this, we developed EditScribe, a prototype system that makes image +editing accessible using natural language verification loops powered by large +multimodal models. Using EditScribe, the user first comprehends the image +content through initial general and object descriptions, then specifies edit +actions using open-ended natural language prompts. EditScribe performs the +image edit, and provides four types of verification feedback for the user to +verify the performed edit, including a summary of visual changes, AI judgement, +and updated general and object descriptions. The user can ask follow-up +questions to clarify and probe into the edits or verification feedback, before +performing another edit. In a study with ten blind or low-vision users, we +found that EditScribe supported participants to perform and verify image edit +actions non-visually. We observed different prompting strategies from +participants, and their perceptions on the various types of verification +feedback. Finally, we discuss the implications of leveraging natural language +verification loops to make visual authoring non-visually accessible. + +
+
+ comment: ASSETS 2024 +
+
+
+
+
+ + ☆ IFShip: A Large Vision-Language Model for Interpretable Fine-grained + Ship Classification via Domain Knowledge-Enhanced Instruction Tuning + + +
+ End-to-end interpretation is currently the prevailing paradigm for remote +sensing fine-grained ship classification (RS-FGSC) task. However, its inference +process is uninterpretable, leading to criticism as a black box model. To +address this issue, we propose a large vision-language model (LVLM) named +IFShip for interpretable fine-grained ship classification. Unlike traditional +methods, IFShip excels in interpretability by accurately conveying the +reasoning process of FGSC in natural language. Specifically, we first design a +domain knowledge-enhanced Chain-of-Thought (COT) prompt generation mechanism. +This mechanism is used to semi-automatically construct a task-specific +instruction-following dataset named TITANIC-FGS, which emulates human-like +logical decision-making. We then train the IFShip model using task instructions +tuned with the TITANIC-FGS dataset. Building on IFShip, we develop an FGSC +visual chatbot that redefines the FGSC problem as a step-by-step reasoning task +and conveys the reasoning process in natural language. Experimental results +reveal that the proposed method surpasses state-of-the-art FGSC algorithms in +both classification interpretability and accuracy. Moreover, compared to LVLMs +like LLaVA and MiniGPT-4, our approach demonstrates superior expertise in the +FGSC task. It provides an accurate chain of reasoning when fine-grained ship +types are recognizable to the human eye and offers interpretable explanations +when they are not. + +
+
+
+
+
+ + ☆ WorldScribe: Towards Context-Aware Live Visual Descriptions + + +
+ Automated live visual descriptions can aid blind people in understanding +their surroundings with autonomy and independence. However, providing +descriptions that are rich, contextual, and just-in-time has been a +long-standing challenge in accessibility. In this work, we develop WorldScribe, +a system that generates automated live real-world visual descriptions that are +customizable and adaptive to users' contexts: (i) WorldScribe's descriptions +are tailored to users' intents and prioritized based on semantic relevance. +(ii) WorldScribe is adaptive to visual contexts, e.g., providing consecutively +succinct descriptions for dynamic scenes, while presenting longer and detailed +ones for stable settings. (iii) WorldScribe is adaptive to sound contexts, +e.g., increasing volume in noisy environments, or pausing when conversations +start. Powered by a suite of vision, language, and sound recognition models, +WorldScribe introduces a description generation pipeline that balances the +tradeoffs between their richness and latency to support real-time use. The +design of WorldScribe is informed by prior work on providing visual +descriptions and a formative study with blind participants. Our user study and +subsequent pipeline evaluation show that WorldScribe can provide real-time and +fairly accurate visual descriptions to facilitate environment understanding +that is adaptive and customized to users' contexts. Finally, we discuss the +implications and further steps toward making live visual descriptions more +context-aware and humanized. + +
+
+ comment: UIST 2024 +
+
+
+
+
+ + ☆ Towards Robust and Cost-Efficient Knowledge Unlearning for Large + Language Models + + +
+ Large Language Models (LLMs) have demonstrated strong reasoning and +memorization capabilities via pretraining on massive textual corpora. However, +training LLMs on human-written text entails significant risk of privacy and +copyright violations, which demands an efficient machine unlearning framework +to remove knowledge of sensitive data without retraining the model from +scratch. While Gradient Ascent (GA) is widely used for unlearning by reducing +the likelihood of generating unwanted information, the unboundedness of +increasing the cross-entropy loss causes not only unstable optimization, but +also catastrophic forgetting of knowledge that needs to be retained. We also +discover its joint application under low-rank adaptation results in +significantly suboptimal computational cost vs. generative performance +trade-offs. In light of this limitation, we propose two novel techniques for +robust and cost-efficient unlearning on LLMs. We first design an Inverted Hinge +loss that suppresses unwanted tokens by increasing the probability of the next +most likely token, thereby retaining fluency and structure in language +generation. We also propose to initialize low-rank adapter weights based on +Fisher-weighted low-rank approximation, which induces faster unlearning and +better knowledge retention by allowing model updates to be focused on +parameters that are important in generating textual data we wish to remove. + +
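+ One plausible reading of the Inverted Hinge idea, sketched in PyTorch: lower the probability of the token to be unlearned while raising the next most likely alternative, avoiding the unbounded growth of plain gradient ascent on cross-entropy. The exact formulation in the paper may differ:

```python
import torch
import torch.nn.functional as F

def inverted_hinge_loss(logits: torch.Tensor, targets: torch.Tensor) -> torch.Tensor:
    """Sketch: minimise 1 + p(target) - max_{v != target} p(v) per position.

    logits: (batch, seq, vocab); targets: (batch, seq) tokens to be unlearned.
    """
    probs = F.softmax(logits, dim=-1)
    p_target = probs.gather(-1, targets.unsqueeze(-1)).squeeze(-1)
    masked = probs.scatter(-1, targets.unsqueeze(-1), 0.0)   # zero out the target token
    p_best_other = masked.max(dim=-1).values
    return (1.0 + p_target - p_best_other).mean()

logits = torch.randn(2, 5, 100, requires_grad=True)
targets = torch.randint(0, 100, (2, 5))
inverted_hinge_loss(logits, targets).backward()
```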
+
+ comment: Preprint +
+
+
+
+
+ + ☆ Generalized knowledge-enhanced framework for biomedical entity and + relation extraction + + +
+ In recent years, there has been an increasing number of frameworks developed for biomedical entity and relation extraction. This research effort aims to address the accelerating growth in biomedical publications and the intricate nature of biomedical texts, which are written mainly for domain experts. To handle these challenges, we develop a novel framework that utilizes external knowledge to construct a task-independent and reusable background knowledge graph for biomedical entity and relation extraction. The design of our model is inspired by how humans learn domain-specific topics. In particular, humans often first acquire the most basic and common knowledge regarding a field to build the foundational knowledge and then use that as a basis for extending to various specialized topics. Our framework employs such a common-knowledge-sharing mechanism to build a general neural-network knowledge graph whose learning is effectively transferable to different domain-specific biomedical texts. Experimental evaluations demonstrate that our model, equipped with this generalized and cross-transferable knowledge base, achieves competitive performance on benchmarks including BioRelEx for binding interaction detection and ADE for Adverse Drug Effect identification.
+
+
+
+
+ + ☆ CROME: Cross-Modal Adapters for Efficient Multimodal LLM + + +
+ Multimodal Large Language Models (MLLMs) demonstrate remarkable +image-language capabilities, but their widespread use faces challenges in +cost-effective training and adaptation. Existing approaches often necessitate +expensive language model retraining and limited adaptability. Additionally, the +current focus on zero-shot performance improvements offers insufficient +guidance for task-specific tuning. We propose CROME, an efficient +vision-language instruction tuning framework. It features a novel gated +cross-modal adapter that effectively combines visual and textual +representations prior to input into a frozen LLM. This lightweight adapter, +trained with minimal parameters, enables efficient cross-modal understanding. +Notably, CROME demonstrates superior zero-shot performance on standard visual +question answering and instruction-following benchmarks. Moreover, it yields +fine-tuning with exceptional parameter efficiency, competing with task-specific +specialist state-of-the-art methods. CROME demonstrates the potential of pre-LM +alignment for building scalable, adaptable, and parameter-efficient multimodal +models. + +
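+ A schematic sketch of a gated cross-modal adapter fusing visual and textual features before they reach a frozen LLM; the dimensions and gating form are illustrative assumptions, not the paper's design:

```python
import torch
import torch.nn as nn

class GatedCrossModalAdapter(nn.Module):
    """Lightweight adapter that mixes projected vision and text features with a learned gate."""

    def __init__(self, d_vision: int, d_text: int, d_llm: int):
        super().__init__()
        self.v_proj = nn.Linear(d_vision, d_llm)
        self.t_proj = nn.Linear(d_text, d_llm)
        self.gate = nn.Sequential(nn.Linear(2 * d_llm, d_llm), nn.Sigmoid())

    def forward(self, vision_feats, text_feats):
        v = self.v_proj(vision_feats)            # (batch, n_patches, d_llm)
        t = self.t_proj(text_feats)              # (batch, n_tokens, d_llm)
        t_ctx = t.mean(dim=1, keepdim=True).expand_as(v)
        g = self.gate(torch.cat([v, t_ctx], dim=-1))
        fused = g * v + (1.0 - g) * t_ctx        # gated mix, fed to the frozen LLM as soft tokens
        return torch.cat([fused, t], dim=1)

adapter = GatedCrossModalAdapter(d_vision=1024, d_text=768, d_llm=4096)
out = adapter(torch.randn(2, 256, 1024), torch.randn(2, 32, 768))
```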
+
+
+
+
+ + ☆ A Perspective on Large Language Models, Intelligent Machines, and + Knowledge Acquisition + + +
+ Large Language Models (LLMs) are known for their remarkable ability to generate synthesized 'knowledge', such as text documents, music, images, etc. However, there is a huge gap between LLMs' and human capabilities for understanding abstract concepts and reasoning. We discuss these issues in a larger philosophical context of human knowledge acquisition and the Turing test. In addition, we illustrate the limitations of LLMs by analyzing GPT-4 responses to questions ranging from science and math to common sense reasoning. These examples show that GPT-4 can often imitate human reasoning, even though it lacks understanding. However, LLM responses are synthesized by a large model trained on all available data. In contrast, human understanding is based on a small number of abstract concepts. Based on this distinction, we discuss the impact of LLMs on the acquisition of human knowledge and education.
+
+
+
+
+ + ☆ Biomedical Event Extraction via Structure-aware Generation + + +
+ Biomedical Event Extraction (BEE) is a critical task that involves modeling +complex relationships between fine-grained entities in biomedical text data. +However, most existing BEE models rely on classification methods that neglect +the label semantics and argument dependency structure within the data. To +address these limitations, we propose GenBEE, a generative model enhanced with +a structure-aware prefix for biomedical event extraction. GenBEE constructs +event prompts that leverage knowledge distilled from large language models +(LLMs), thereby incorporating both label semantics and argument dependency +relationships. Additionally, GenBEE introduces a structural prefix learning +module that generates structure-aware prefixes with structural prompts, +enriching the generation process with structural features. Extensive +experiments on three benchmark datasets demonstrate the effectiveness of GenBEE +and it achieves state-of-the-art performance on the MLEE and GE11 datasets. +Furthermore, our analysis shows that the structural prefixes effectively bridge +the gap between structural prompts and the representation space of generative +models, enabling better integration of event structural information. + +
+
+ comment: 8 pages, 4 figures, 6 tables +
+
+
+
+
+ + ☆ OpenEP: Open-Ended Future Event Prediction + + +
+ Future event prediction (FEP) is a long-standing and crucial task, as
+understanding the evolution of events enables early risk identification,
+informed decision-making, and strategic planning. Existing work typically
+treats event prediction as a classification task and confines the outcomes of
+future events to a fixed scope, such as yes/no questions, candidate sets, and
+taxonomies, making it difficult to cover all possible outcomes of future
+events. In this paper, we introduce OpenEP (an Open-Ended Future Event
+Prediction task), which generates flexible and diverse predictions aligned with
+real-world scenarios. This is mainly reflected in two aspects: firstly, the
+predictive questions are diverse, covering different stages of event
+development and perspectives; secondly, the outcomes are flexible, without
+constraints on scope or format. To facilitate the study of this task, we
+construct OpenEPBench, an open-ended future event prediction dataset. For
+question construction, we pose questions from seven perspectives, including
+location, time, event development, event outcome, event impact, event response,
+and others, to facilitate an in-depth analysis and understanding of the
+comprehensive evolution of events. For outcome construction, we collect
+free-form text containing the outcomes as ground truth to provide semantically
+complete and detail-enriched outcomes. Furthermore, we propose StkFEP, a
+stakeholder-enhanced future event prediction framework that incorporates event
+characteristics for open-ended settings. Our method extracts stakeholders
+involved in events to extend questions and gather diverse information. We also
+collect historical events that are relevant and similar to the question to
+reveal potential evolutionary patterns. Experimental results indicate that
+accurately predicting future events in open-ended settings is challenging for
+existing LLMs.
+
+
+
+
+
+
+ + ☆ CTISum: A New Benchmark Dataset For Cyber Threat Intelligence + Summarization + + +
+ The Cyber Threat Intelligence (CTI) summarization task requires a system to
+generate concise and accurate highlights from raw intelligence data, which
+plays an important role in providing decision-makers with crucial information
+to quickly detect and respond to cyber threats in the cybersecurity domain.
+However, efficient techniques for summarizing CTI reports, including facts,
+analytical insights, attack processes, etc., have largely been unexplored,
+primarily due to the lack of available datasets. To this end, we present
+CTISum, a new benchmark for the CTI summarization task. Considering the
+importance of the attack process, a novel fine-grained subtask of attack
+process summarization is proposed to enable defenders to assess risk and
+identify security gaps, vulnerabilities, and so on. Specifically, we first
+design a multi-stage annotation pipeline to gather and annotate the CTI data,
+and then benchmark CTISum with a collection of extractive and abstractive
+summarization methods. Experimental results show that current state-of-the-art
+models exhibit limitations when applied to CTISum, underscoring the fact that
+automatically producing concise summaries of CTI reports remains an open
+research challenge.
+
+
+
+
+
+
+ + ☆ SparkRA: A Retrieval-Augmented Knowledge Service System Based on Spark + Large Language Model + + +
+ Large language models (LLMs) have shown remarkable achievements across
+various language tasks. To enhance the performance of LLMs in scientific
+literature services, we developed the scientific literature LLM (SciLit-LLM)
+through pre-training and supervised fine-tuning on scientific literature,
+building upon the iFLYTEK Spark LLM. Furthermore, we present a knowledge
+service system Spark Research Assistant (SparkRA) based on our SciLit-LLM.
+SparkRA is accessible online and provides three primary functions: literature
+investigation, paper reading, and academic writing. As of July 30, 2024,
+SparkRA has garnered over 50,000 registered users, with a total usage count
+exceeding 1.3 million.
+
+
+
+
+
+
+ + ☆ Social Debiasing for Fair Multi-modal LLMs + + +
+ Multi-modal Large Language Models (MLLMs) have advanced significantly, +offering powerful vision-language understanding capabilities. However, these +models often inherit severe social biases from their training datasets, leading +to unfair predictions based on attributes like race and gender. This paper +addresses the issue of social biases in MLLMs by i) Introducing a comprehensive +Counterfactual dataset with Multiple Social Concepts (CMSC), which provides a +more diverse and extensive training set compared to existing datasets. ii) +Proposing an Anti-Stereotype Debiasing strategy (ASD). Our method works by +revisiting the MLLM training process, rescaling the autoregressive loss +function, and improving data sampling methods to counteract biases. Through +extensive experiments on various MLLMs, our CMSC dataset and ASD method +demonstrate a significant reduction in social biases while maintaining the +models' original performance. + +
+
+
+
+
+ + ☆ AquilaMoE: Efficient Training for MoE Models with Scale-Up and Scale-Out + Strategies + + +
+ In recent years, with the rapid application of large language models across
+various fields, the scale of these models has gradually increased, and the
+resources required for their pre-training have grown exponentially. Training an
+LLM from scratch requires substantial computational resources, whereas scaling
+up from a smaller model is more efficient and has thus attracted significant
+attention. In this paper, we present AquilaMoE, a cutting-edge bilingual 8*16B
+Mixture of Experts (MoE) language model that has 8 experts with 16 billion
+parameters each and is developed using an innovative training methodology
+called EfficientScale. This approach optimizes performance while minimizing
+data requirements through a two-stage process. The first stage, termed
+Scale-Up, initializes the larger model with weights from a pre-trained smaller
+model, enabling substantial knowledge transfer and continuous pretraining with
+significantly less data. The second stage, Scale-Out, uses a pre-trained dense
+model to initialize the MoE experts, further enhancing knowledge transfer and
+performance. Extensive validation experiments on 1.8B and 7B models compared
+various initialization schemes, yielding models that maintain and further
+reduce loss during continuous pretraining. Utilizing the optimal scheme, we
+successfully trained a 16B model and subsequently the 8*16B AquilaMoE model,
+demonstrating significant improvements in performance and training efficiency.
+
+
+
+
+
+
+ + ☆ Introducing the NewsPaLM MBR and QE Dataset: LLM-Generated High-Quality + Parallel Data Outperforms Traditional Web-Crawled Data + + +
+ Recent research in neural machine translation (NMT) has shown that training
+on high-quality machine-generated data can outperform training on
+human-generated data. This work accompanies the first-ever release of an
+LLM-generated, MBR-decoded and QE-reranked dataset with both sentence-level and
+multi-sentence examples. We perform extensive experiments to demonstrate the
+quality of our dataset in terms of its downstream impact on NMT model
+performance. We find that training from scratch on our (machine-generated)
+dataset outperforms training on the (web-crawled) WMT'23 training dataset
+(which is 300 times larger), and also outperforms training on the top-quality
+subset of the WMT'23 training dataset. We also find that performing
+self-distillation by finetuning the LLM which generated this dataset
+outperforms the LLM's strong few-shot baseline. These findings corroborate the
+quality of our dataset, and demonstrate the value of high-quality
+machine-generated data in improving performance of NMT models.
+
+
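+ MBR decoding itself is not described in the abstract; as a rough,
+self-contained illustration of the general idea, the toy sketch below selects
+the candidate with the highest average utility against the other samples. The
+utility function (a token-overlap F1) and the candidate list are placeholders,
+not the paper's setup.
+
+ # Toy Minimum Bayes Risk (MBR) selection over sampled candidates.
+ def token_f1(hyp, ref):
+     h, r = hyp.split(), ref.split()
+     common = len(set(h) & set(r))
+     if common == 0:
+         return 0.0
+     p, q = common / len(h), common / len(r)
+     return 2 * p * q / (p + q)
+
+ def mbr_select(candidates):
+     # Pick the candidate with the highest expected utility against the rest.
+     def expected_utility(c):
+         others = [o for o in candidates if o is not c]
+         return sum(token_f1(c, o) for o in others) / max(len(others), 1)
+     return max(candidates, key=expected_utility)
+
+ samples = ["the cat sat on the mat", "a cat sits on the mat", "the cat is on a mat"]
+ print(mbr_select(samples))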
+
+
+
+
+ + ☆ Using Advanced LLMs to Enhance Smaller LLMs: An Interpretable Knowledge + Distillation Approach + + +
+ Advanced Large language models (LLMs) like GPT-4 or Llama 3 provide superior
+performance in complex human-like interactions. However, they are costly,
+often too large for edge devices such as smartphones, and harder to self-host,
+leading to security and privacy concerns. This paper introduces a novel
+interpretable knowledge distillation approach to enhance the performance of
+smaller, more economical LLMs that firms can self-host. We study this problem
+in the context of building a customer service agent aimed at achieving high
+customer satisfaction through goal-oriented dialogues. Unlike traditional
+knowledge distillation, where the "student" model learns directly from the
+"teacher" model's responses via fine-tuning, our interpretable "strategy"
+teaching approach involves the teacher providing strategies to improve the
+student's performance in various scenarios. This method alternates between a
+"scenario generation" step and a "strategies for improvement" step, creating a
+customized library of scenarios and optimized strategies for automated
+prompting. The method requires only black-box access to both student and
+teacher models; hence it can be used without manipulating model parameters. In
+our customer service application, the method improves performance, and the
+learned strategies are transferable to other LLMs and scenarios beyond the
+training set. The method's interpretability helps safeguard against potential
+harms through human audit.
+
+
+
+
+
+
+ + ☆ Neural embedding of beliefs reveals the role of relative dissonance in + human decision-making + + +
+ Beliefs serve as the foundation for human cognition and decision-making. They +guide individuals in deriving meaning from their lives, shaping their +behaviors, and forming social connections. Therefore, a model that encapsulates +beliefs and their interrelationships is crucial for quantitatively studying the +influence of beliefs on our actions. Despite its importance, research on the +interplay between human beliefs has often been limited to a small set of +beliefs pertaining to specific issues, with a heavy reliance on surveys or +experiments. Here, we propose a method for extracting nuanced relations between +thousands of beliefs by leveraging large-scale user participation data from an +online debate platform and mapping these beliefs to an embedding space using a +fine-tuned large language model (LLM). This belief embedding space effectively +encapsulates the interconnectedness of diverse beliefs as well as polarization +across various social issues. We discover that the positions within this belief +space predict new beliefs of individuals. Furthermore, we find that the +relative distance between one's existing beliefs and new beliefs can serve as a +quantitative estimate of cognitive dissonance, allowing us to predict new +beliefs. Our study highlights how modern LLMs, when combined with collective +online records of human beliefs, can offer insights into the fundamental +principles that govern human belief formation and decision-making processes. + +
+
+ comment: 26 pages, 6 figures, SI +
+
+
+
+
+ + ☆ BERT's Conceptual Cartography: Mapping the Landscapes of Meaning + + +
+ Conceptual Engineers want to make words better. However, they often +underestimate how varied our usage of words is. In this paper, we take the +first steps in exploring the contextual nuances of words by creating conceptual +landscapes -- 2D surfaces representing the pragmatic usage of words -- that +conceptual engineers can use to inform their projects. We use the spoken +component of the British National Corpus and BERT to create contextualised word +embeddings, and use Gaussian Mixture Models, a selection of metrics, and +qualitative analysis to visualise and numerically represent lexical landscapes. +Such an approach has not yet been used in the conceptual engineering literature +and provides a detailed examination of how different words manifest in various +contexts that is potentially useful to conceptual engineering projects. Our +findings highlight the inherent complexity of conceptual engineering, revealing +that each word exhibits a unique and intricate landscape. Conceptual Engineers +cannot, therefore, use a one-size-fits-all approach when improving words -- a +task that may be practically intractable at scale. + +
+
+
+
+
+ + ☆ Unlocking Efficiency: Adaptive Masking for Gene Transformer Models ECAI 2024 + + +
+ Gene transformer models such as Nucleotide Transformer, DNABert, and LOGO are
+trained to learn optimal gene sequence representations by using the Masked
+Language Modeling (MLM) training objective over the complete Human Reference
+Genome. However, the typical tokenization methods employ a basic sliding window
+of tokens, such as k-mers, that fail to utilize gene-centric semantics. This
+could result in the (trivial) masking of easily predictable sequences, leading
+to inefficient MLM training. Time-variant training strategies are known to
+improve pretraining efficiency in both language and vision tasks. In this work,
+we focus on using curriculum masking, where we systematically increase the
+difficulty of the masked token prediction task by using a Pointwise Mutual
+Information-based difficulty criterion, as gene sequences lack well-defined
+semantic units similar to the words or sentences of the NLP domain. Our
+proposed Curriculum Masking-based Gene Masking Strategy (CM-GEMS) demonstrates
+superior representation learning capabilities compared to baseline masking
+approaches when evaluated on downstream gene sequence classification tasks. We
+perform extensive evaluation in both few-shot (five datasets) and full dataset
+settings (the Genomic Understanding Evaluation benchmark consisting of 27
+tasks). Our findings reveal that CM-GEMS outperforms state-of-the-art models
+(DNABert-2, Nucleotide Transformer, DNABert) trained at 120K steps, achieving
+similar results in just 10K and 1K steps. We also demonstrate that
+Curriculum-Learned LOGO (a 2-layer DNABert-like model) can achieve nearly 90%
+of the state-of-the-art model performance of 120K steps. We will make the
+models and codes publicly available at
+https://github.com/roysoumya/curriculum-GeneMask.
+
+
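+ The abstract does not give the exact PMI criterion; the toy sketch below
+scores k-mer tokens by the PMI of their two halves as one plausible difficulty
+proxy, so a curriculum could schedule masking from more to less predictable
+tokens. The corpus, the value of k, and the scoring granularity are
+illustrative assumptions.
+
+ # Toy PMI-based difficulty score for k-mer tokens (illustrative only).
+ import math
+ from collections import Counter
+
+ def kmers(seq, k=6):
+     return [seq[i:i + k] for i in range(len(seq) - k + 1)]
+
+ def pmi_scores(sequences, k=6):
+     halves, pairs = Counter(), Counter()
+     for s in sequences:
+         for t in kmers(s, k):
+             left, right = t[:k // 2], t[k // 2:]
+             halves[left] += 1
+             halves[right] += 1
+             pairs[(left, right)] += 1
+     n_pairs, n_halves = sum(pairs.values()), sum(halves.values())
+     scores = {}
+     for (left, right), c in pairs.items():
+         p_joint = c / n_pairs
+         p_l, p_r = halves[left] / n_halves, halves[right] / n_halves
+         scores[left + right] = math.log(p_joint / (p_l * p_r))
+     # Higher PMI = halves co-occur more than chance, i.e. more predictable;
+     # a curriculum can order masked spans from easier to harder accordingly.
+     return scores
+
+ print(sorted(pmi_scores(["ACGTACGTAC", "TTGACCATGA", "ACGTTTGACC"]).items(),
+              key=lambda kv: kv[1])[:3])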
+
+ comment: 10 pages, 5 figures. Accepted for publication at the 27th European + Conference on Artificial Intelligence (ECAI 2024) +
+
+
+
+
+ + ☆ Self-folding Self-replication + + +
+ Inspired by protein folding, we explored the construction of
+three-dimensional structures and machines from one-dimensional chains of simple
+building blocks. This approach not only allows us to recreate the
+self-replication mechanism introduced earlier, but also significantly
+simplifies the process. We introduced a new set of folding blocks that
+facilitate the formation of secondary structures such as α-helices and
+β-sheets, as well as more advanced tertiary and quaternary structures,
+including self-replicating machines. The introduction of rotational degrees of
+freedom leads to a reduced variety of blocks and, most importantly, reduces the
+overall size of the machines by a factor of five. In addition, we present a
+universal copier-constructor, a highly efficient self-replicating mechanism
+composed of approximately 40 blocks, including the restrictions posed on it.
+The paper also addresses evolutionary considerations, outlining several steps
+on the evolutionary ladder towards more sophisticated self-replicating systems.
+Finally, this study offers a clear rationale for nature's preference for
+one-dimensional chains in constructing three-dimensional structures.
+
+
+
+
+
+
+ + ☆ Language Models as Models of Language + + +
+ This chapter critically examines the potential contributions of modern +language models to theoretical linguistics. Despite their focus on engineering +goals, these models' ability to acquire sophisticated linguistic knowledge from +mere exposure to data warrants a careful reassessment of their relevance to +linguistic theory. I review a growing body of empirical evidence suggesting +that language models can learn hierarchical syntactic structure and exhibit +sensitivity to various linguistic phenomena, even when trained on +developmentally plausible amounts of data. While the competence/performance +distinction has been invoked to dismiss the relevance of such models to +linguistic theory, I argue that this assessment may be premature. By carefully +controlling learning conditions and making use of causal intervention methods, +experiments with language models can potentially constrain hypotheses about +language acquisition and competence. I conclude that closer collaboration +between theoretical linguists and computational researchers could yield +valuable insights, particularly in advancing debates about linguistic nativism. + +
+
+ comment: Forthcoming in Nefdt, R., Dupre, G., & Stanton, K. (eds.), The
+ Oxford Handbook of the Philosophy of Linguistics. Oxford University Press
+
+
+
+
+
+ + ☆ ELLA: Empowering LLMs for Interpretable, Accurate and Informative Legal + Advice + + +
+ Despite remarkable performance in legal consultation exhibited by legal Large
+Language Models (LLMs) combined with legal article retrieval components, there
+are still cases when the advice given is incorrect or baseless. To alleviate
+these problems, we propose ELLA, a tool for Empowering LLMs for interpretable,
+accurate, and informative Legal Advice. ELLA visually presents the correlation
+between legal articles and the LLM's response by calculating their
+similarities, providing users with an intuitive legal basis for the responses.
+Besides, based on the users' queries, ELLA retrieves relevant legal articles
+and displays them to users. Users can interactively select legal articles for
+the LLM to generate more accurate responses. ELLA also retrieves relevant legal
+cases for user reference. Our user study shows that presenting the legal basis
+for the response helps users understand the responses better. The accuracy of
+the LLM's responses also improves when users intervene in selecting legal
+articles for the LLM. Providing relevant legal cases also aids individuals in
+obtaining comprehensive information.
+
+
+
+
+
+
+ + ♻ ☆ NEO-BENCH: Evaluating Robustness of Large Language Models with + Neologisms ACL 2024 + + +
+ The performance of Large Language Models (LLMs) degrades from the temporal +drift between data used for model training and newer text seen during +inference. One understudied avenue of language change causing data drift is the +emergence of neologisms -- new word forms -- over time. We create a diverse +resource of recent English neologisms by using several popular collection +methods. We analyze temporal drift using neologisms by comparing sentences +containing new words with near-identical sentences that replace neologisms with +existing substitute words. Model performance is nearly halved in machine +translation when a single neologism is introduced in a sentence. Motivated by +these results, we construct a benchmark to evaluate LLMs' ability to generalize +to neologisms with various natural language understanding tasks and model +perplexity. Models with later knowledge cutoff dates yield lower perplexities +and perform better in downstream tasks. LLMs are also affected differently +based on the linguistic origins of words, indicating that neologisms are +complex for static LLMs to address. We will release our benchmark and code for +reproducing our experiments. + +
+
+ comment: accepted to ACL 2024 main conference, 9 pages +
+
+
+
+
+ + ♻ ☆ FuxiTranyu: A Multilingual Large Language Model Trained with Balanced + Data + + +
+ Large language models (LLMs) have demonstrated prowess in a wide range of +tasks. However, many LLMs exhibit significant performance discrepancies between +high- and low-resource languages. To mitigate this challenge, we present +FuxiTranyu, an open-source multilingual LLM, which is designed to satisfy the +need of the research community for balanced and high-performing multilingual +capabilities. FuxiTranyu-8B, the base model with 8 billion parameters, is +trained from scratch on a meticulously balanced multilingual data repository +that contains 600 billion tokens covering 43 natural languages and 16 +programming languages. In addition to the base model, we also develop two +instruction-tuned models: FuxiTranyu-8B-SFT that is fine-tuned on a diverse +multilingual instruction dataset, and FuxiTranyu-8B-DPO that is further refined +with DPO on a preference dataset for enhanced alignment ability. Extensive +experiments on a wide range of multilingual benchmarks demonstrate the +competitive performance of FuxiTranyu against existing multilingual LLMs, e.g., +BLOOM-7B, PolyLM-13B, Llama-2-Chat-7B and Mistral-7B-Instruct. Interpretability +analyses at both the neuron and representation level suggest that FuxiTranyu is +able to learn consistent multilingual representations across different +languages. To promote further research into multilingual LLMs and their working +mechanisms, we release both the base and instruction-tuned FuxiTranyu models +together with 58 pretraining checkpoints at HuggingFace and Github. + +
+
+
+
+
+ + ♻ ☆ A Novel Cartography-Based Curriculum Learning Method Applied on RoNLI: + The First Romanian Natural Language Inference Corpus ACL 2024 + + +
+ Natural language inference (NLI), the task of recognizing the entailment +relationship in sentence pairs, is an actively studied topic serving as a proxy +for natural language understanding. Despite the relevance of the task in +building conversational agents and improving text classification, machine +translation and other NLP tasks, to the best of our knowledge, there is no +publicly available NLI corpus for the Romanian language. To this end, we +introduce the first Romanian NLI corpus (RoNLI) comprising 58K training +sentence pairs, which are obtained via distant supervision, and 6K validation +and test sentence pairs, which are manually annotated with the correct labels. +We conduct experiments with multiple machine learning methods based on distant +learning, ranging from shallow models based on word embeddings to +transformer-based neural networks, to establish a set of competitive baselines. +Furthermore, we improve on the best model by employing a new curriculum +learning strategy based on data cartography. Our dataset and code to reproduce +the baselines are available at https://github.com/Eduard6421/RONLI. + +
+
+ comment: Accepted at ACL 2024 (Main) +
+
+
+
+
+ + ♻ ☆ Who's asking? User personas and the mechanics of latent misalignment + + +
+ Despite investments in improving model safety, studies show that misaligned +capabilities remain latent in safety-tuned models. In this work, we shed light +on the mechanics of this phenomenon. First, we show that even when model +generations are safe, harmful content can persist in hidden representations and +can be extracted by decoding from earlier layers. Then, we show that whether +the model divulges such content depends significantly on its perception of who +it is talking to, which we refer to as user persona. In fact, we find +manipulating user persona to be even more effective for eliciting harmful +content than direct attempts to control model refusal. We study both natural +language prompting and activation steering as control methods and show that +activation steering is significantly more effective at bypassing safety +filters. We investigate why certain personas break model safeguards and find +that they enable the model to form more charitable interpretations of otherwise +dangerous queries. Finally, we show we can predict a persona's effect on +refusal given only the geometry of its steering vector. + +
+
+
+
+
+ + ♻ ☆ Figure it Out: Analyzing-based Jailbreak Attack on Large Language Models + + +
+ The rapid development of Large Language Models (LLMs) has brought remarkable
+generative capabilities across diverse tasks. However, despite the impressive
+achievements, these LLMs still have numerous inherent vulnerabilities,
+particularly when faced with jailbreak attacks. By investigating jailbreak
+attacks, we can uncover hidden weaknesses in LLMs and inform the development of
+more robust defense mechanisms to fortify their security. In this paper, we
+further explore the boundary of jailbreak attacks on LLMs and propose
+Analyzing-based Jailbreak (ABJ). This effective jailbreak attack method takes
+advantage of LLMs' growing analyzing and reasoning capability and reveals their
+underlying vulnerabilities when facing analyzing-based tasks. We conduct a
+detailed evaluation of ABJ across various open-source and closed-source LLMs,
+which achieves a 94.8% attack success rate (ASR) and 1.06 attack efficiency
+(AE) on GPT-4-turbo-0409, demonstrating state-of-the-art attack effectiveness
+and efficiency. Our research highlights the importance of prioritizing and
+enhancing the safety of LLMs to mitigate the risks of misuse. The code is
+publicly available at https://github.com/theshi-1128/ABJ-Attack. Warning: This
+paper contains examples of LLM outputs that might be offensive or harmful.
+
+
+
+
+
+
+ + ♻ ☆ WRDScore: New Metric for Evaluation of Natural Language Generation + Models + + +
+ Evaluating natural language generation models, particularly for method name +prediction, poses significant challenges. A robust metric must account for the +versatility of method naming, considering both semantic and syntactic +variations. Traditional overlap-based metrics, such as ROUGE, fail to capture +these nuances. Existing embedding-based metrics often suffer from imbalanced +precision and recall, lack normalized scores, or make unrealistic assumptions +about sequences. To address these limitations, we leverage the theory of +optimal transport and construct WRDScore, a novel metric that strikes a balance +between simplicity and effectiveness. In the WRDScore framework, we define +precision as the maximum degree to which the predicted sequence's tokens are +included in the reference sequence, token by token. Recall is calculated as the +total cost of the optimal transport plan that maps the reference sequence to +the predicted one. Finally, WRDScore is computed as the harmonic mean of +precision and recall, balancing these two complementary metrics. Our metric is +lightweight, normalized, and precision-recall-oriented, avoiding unrealistic +assumptions while aligning well with human judgments. Experiments on a +human-curated dataset confirm the superiority of WRDScore over other available +text metrics. + +
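+ As a loose, self-contained illustration of the precision/recall/harmonic-mean
+structure described above, the sketch below scores two token-embedding
+sequences; the embeddings are random and the optimal transport step is
+simplified to a linear assignment, so this is not the paper's exact WRDScore
+formulation.
+
+ # Loose precision/recall/harmonic-mean sketch over token embeddings.
+ import numpy as np
+ from scipy.optimize import linear_sum_assignment
+
+ def cosine(a, b):
+     return a @ b / (np.linalg.norm(a) * np.linalg.norm(b) + 1e-9)
+
+ def wrd_like_score(pred_vecs, ref_vecs):
+     sim = np.array([[cosine(p, r) for r in ref_vecs] for p in pred_vecs])
+     precision = sim.max(axis=1).mean()        # best reference match per predicted token
+     rows, cols = linear_sum_assignment(-sim)  # crude stand-in for an optimal transport plan
+     recall = sim[rows, cols].mean()
+     return 2 * precision * recall / (precision + recall + 1e-9)
+
+ rng = np.random.default_rng(0)
+ print(wrd_like_score(rng.random((3, 8)), rng.random((4, 8))))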
+
+ comment: Accepted to IEEE Xplore +
+
+
+
+
+ + ♻ ☆ A Novel Computational and Modeling Foundation for Automatic Coherence + Assessment + + +
+ Coherence is an essential property of well-written texts, referring to the
+way textual units relate to one another. In the era of generative AI, coherence
+assessment is essential for many NLP tasks: summarization, generation,
+long-form question-answering, and more. However, in NLP, coherence is an
+ill-defined notion, lacking a formal definition or evaluation metrics that
+would allow for large-scale automatic and systematic coherence assessment. To
+bridge this gap, in this work we employ the formal linguistic definition of
+Reinhart (1980) of what makes a discourse coherent, consisting of three
+conditions -- cohesion, consistency and relevance -- and formalize these
+conditions as respective computational tasks. We hypothesize that (i) a model
+trained on all of these tasks will learn the features required for coherence
+detection, and that (ii) a joint model for all tasks will exceed the
+performance of models trained on each task individually. On two benchmarks for
+coherence scoring rated by humans, one containing 500 automatically-generated
+short stories and another containing 4k real-world texts, our experiments
+confirm that jointly training on the proposed tasks leads to better performance
+on each task compared with task-specific models, and to better performance on
+assessing coherence overall, compared with strong baselines. We conclude that
+the formal and computational setup of coherence as proposed here provides a
+solid foundation for advanced methods of large-scale automatic assessment of
+coherence.
+
+
+
+
+
+
+ + ♻ ☆ Generative AI for Immersive Communication: The Next Frontier in + Internet-of-Senses Through 6G + + +
+ Over the past two decades, the Internet-of-Things (IoT) has become a
+transformative concept, and as we approach 2030, a new paradigm known as the
+Internet of Senses (IoS) is emerging. Unlike conventional Virtual Reality (VR),
+IoS seeks to provide multi-sensory experiences, acknowledging that in our
+physical reality, our perception extends far beyond just sight and sound; it
+encompasses a range of senses. This article explores the existing technologies
+driving immersive multi-sensory media, delving into their capabilities and
+potential applications. This exploration includes a comparative analysis
+between conventional immersive media streaming and a proposed use case that
+leverages semantic communication empowered by generative Artificial
+Intelligence (AI). The focal point of this analysis is the substantial
+reduction in bandwidth consumption by 99.93% in the proposed scheme. Through
+this comparison, we aim to underscore the practical applications of generative
+AI for immersive media. We concurrently address major challenges in this field,
+such as temporal synchronization of multiple media, ensuring high throughput,
+minimizing End-to-End (E2E) latency, and robustness to low bandwidth, while
+outlining future trajectories.
+
+
+
+
+
+
+ + ♻ ☆ SUBLLM: A Novel Efficient Architecture with Token Sequence Subsampling + for LLM ECAI 2024 + + +
+ While Large Language Models (LLMs) have achieved remarkable success in +various fields, the efficiency of training and inference remains a major +challenge. To address this issue, we propose SUBLLM, short for +Subsampling-Upsampling-Bypass Large Language Model, an innovative architecture +that extends the core decoder-only framework by incorporating subsampling, +upsampling, and bypass modules. The subsampling modules are responsible for +shortening the sequence, while the upsampling modules restore the sequence +length, and the bypass modules enhance convergence. In comparison to LLaMA, the +proposed SUBLLM exhibits significant enhancements in both training and +inference speeds as well as memory usage, while maintaining competitive +few-shot performance. During training, SUBLLM increases speeds by 26% and cuts +memory by 10GB per GPU. In inference, it boosts speeds by up to 37% and reduces +memory by 1GB per GPU. The training and inference speeds can be enhanced by 34% +and 52% respectively when the context window is expanded to 8192. Our code is +available at https://github.com/XiaoMi/subllm. + +
+
+ comment: 9 pages, 3 figures, accepted by ECAI 2024 +
+
+
+
+
+ + ♻ ☆ DSLR: Document Refinement with Sentence-Level Re-ranking and + Reconstruction to Enhance Retrieval-Augmented Generation + + +
+ Recent advancements in Large Language Models (LLMs) have significantly
+improved their performance across various Natural Language Processing (NLP)
+tasks. However, LLMs still struggle with generating non-factual responses due
+to limitations in their parametric memory. Retrieval-Augmented Generation (RAG)
+systems address this issue by incorporating external knowledge with a retrieval
+module. Despite their successes, however, current RAG systems face challenges
+with retrieval failures and the limited ability of LLMs to filter out
+irrelevant information. Therefore, in this work, we propose DSLR (Document
+Refinement with Sentence-Level Re-ranking and Reconstruction), an unsupervised
+framework that decomposes retrieved documents into sentences, filters out
+irrelevant sentences, and reconstructs them into coherent passages. We
+experimentally validate DSLR on multiple open-domain QA datasets and the
+results demonstrate that DSLR significantly enhances RAG performance over the
+conventional use of fixed-size passages. Furthermore, our DSLR enhances
+performance in specific, yet realistic scenarios without the need for
+additional training, providing an effective and efficient solution for refining
+retrieved documents in RAG systems.
+
+
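+ The decompose, re-rank, and reconstruct pipeline can be pictured with the toy
+sketch below; the sentence splitter and the word-overlap scorer are
+placeholders for the (unspecified here) re-ranker used by the framework.
+
+ # Toy decompose -> re-rank -> reconstruct refinement of a retrieved document.
+ import re
+
+ def refine_document(query, document, keep=3):
+     sentences = [s.strip() for s in re.split(r"(?<=[.!?])\s+", document) if s.strip()]
+     def overlap(s):  # placeholder relevance scorer
+         return len(set(s.lower().split()) & set(query.lower().split()))
+     ranked = sorted(range(len(sentences)), key=lambda i: overlap(sentences[i]), reverse=True)
+     kept = sorted(ranked[:keep])  # keep top sentences, restore original order
+     return " ".join(sentences[i] for i in kept)
+
+ doc = ("The Eiffel Tower is in Paris. It was completed in 1889. "
+        "Paris is also known for its museums. Bread is sold nearby.")
+ print(refine_document("When was the Eiffel Tower completed?", doc, keep=2))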
+
+ comment: 20 pages +
+
+
+
+
+ + ♻ ☆ MICL: Improving In-Context Learning through Multiple-Label Words in + Demonstration + + +
+ In-context learning (ICL) enables large language models (LLMs) to perform new
+tasks by using sample-label pairs as demonstrations. However, variations in
+demonstrations can lead to significantly different performances. Current
+research mainly focuses on selecting demonstration samples, presuming the
+class name to be the label word when creating sample-label pairs. However, the
+choice of label words is crucial for ICL performance. Moreover, we observe that
+using a single class name in a demonstration may not yield optimal results,
+while using multiple label words in one sample-label pair can enhance ICL
+performance. In this paper, we propose a comprehensive approach that organizes
+both samples and labels in demonstrations based on LLMs' output space
+distribution. This approach uses multiple label words in one sample-label pair
+to enhance label instruction. Evaluation results from seven classification
+datasets show that this demonstration organization method, which incorporates
+multiple label words to provide diverse label information, improves ICL
+performance.
+
+
+
+ comment: 19 pages, 11 figures +
+
+
+
+
+ + ♻ ☆ Review-driven Personalized Preference Reasoning with Large Language + Models for Recommendation + + +
+ Recent advancements in Large Language Models (LLMs) have demonstrated
+exceptional performance across a wide range of tasks, generating significant
+interest in their application to recommendation systems. However, existing
+methods have not fully capitalized on the potential of LLMs, often constrained
+by limited input information or failing to fully utilize their advanced
+reasoning capabilities. To address these limitations, we introduce EXP3RT, a
+novel LLM-based recommender designed to leverage rich preference information
+contained in user and item reviews. EXP3RT is fine-tuned through distillation
+from a teacher LLM to perform three key tasks in order: EXP3RT first extracts
+and encapsulates essential subjective preferences from raw reviews, then
+aggregates and summarizes them according to specific criteria to create user
+and item profiles. It then generates detailed step-by-step reasoning followed
+by a predicted rating, i.e., reasoning-enhanced rating prediction, by
+considering both subjective and objective information from user/item profiles
+and item descriptions. This personalized preference reasoning from EXP3RT
+enhances rating prediction accuracy and also provides faithful and reasonable
+explanations for recommendation. Extensive experiments show that EXP3RT
+outperforms existing methods on both rating prediction and candidate item
+reranking for top-k recommendation, while significantly enhancing the
+explainability of recommendation systems.
+
+
+
+
+
+
+ + ♻ ☆ SAFETY-J: Evaluating Safety with Critique + + +
+ The deployment of Large Language Models (LLMs) in content generation raises +significant safety concerns, particularly regarding the transparency and +interpretability of content evaluations. Current methods, primarily focused on +binary safety classifications, lack mechanisms for detailed critique, limiting +their utility for model improvement and user trust. To address these +limitations, we introduce SAFETY-J, a bilingual generative safety evaluator for +English and Chinese with critique-based judgment. SAFETY-J utilizes a robust +training dataset that includes diverse dialogues and augmented query-response +pairs to assess safety across various scenarios comprehensively. We establish +an automated meta-evaluation benchmark that objectively assesses the quality of +critiques with minimal human intervention, facilitating scalable and continuous +improvement. Additionally, SAFETY-J employs an iterative preference learning +technique to dynamically refine safety assessments based on meta-evaluations +and critiques. Our evaluations demonstrate that SAFETY-J provides more nuanced +and accurate safety evaluations, thereby enhancing both critique quality and +predictive reliability in complex content scenarios. To facilitate further +research and application, we open-source SAFETY-J's training protocols, +datasets, and code at https://github.com/GAIR-NLP/Safety-J. + +
+
+
+
+
+ + ♻ ☆ EXAONE 3.0 7.8B Instruction Tuned Language Model + + +
+ We introduce EXAONE 3.0 instruction-tuned language model, the first open +model in the family of Large Language Models (LLMs) developed by LG AI +Research. Among different model sizes, we publicly release the 7.8B +instruction-tuned model to promote open research and innovations. Through +extensive evaluations across a wide range of public and in-house benchmarks, +EXAONE 3.0 demonstrates highly competitive real-world performance with +instruction-following capability against other state-of-the-art open models of +similar size. Our comparative analysis shows that EXAONE 3.0 excels +particularly in Korean, while achieving compelling performance across general +tasks and complex reasoning. With its strong real-world effectiveness and +bilingual proficiency, we hope that EXAONE keeps contributing to advancements +in Expert AI. Our EXAONE 3.0 instruction-tuned model is available at +https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct + +
+
+
+
+
+
+ ♻ ☆ Keep the Cost Down: A Review on Methods to Optimize LLM's KV-Cache
+ Consumption
+
+
+
+ Large Language Models (LLMs), epitomized by ChatGPT's release in late 2022,
+have revolutionized various industries with their advanced language
+comprehension. However, their efficiency is challenged by the Transformer
+architecture's struggle with handling long texts. KV-Cache has emerged as a
+pivotal solution to this issue, converting the time complexity of token
+generation from quadratic to linear, albeit with increased GPU memory overhead
+proportional to conversation length. With the development of the LLM community
+and academia, various KV-Cache compression methods have been proposed. In this
+review, we dissect the various properties of KV-Cache and elaborate on various
+methods currently used to optimize the KV-Cache space usage of LLMs. These
+methods span the pre-training phase, deployment phase, and inference phase, and
+we summarize the commonalities and differences among these methods.
+Additionally, we list some metrics for evaluating the long-text capabilities of
+large language models, from both efficiency and capability perspectives. Our
+review thus sheds light on the evolving landscape of LLM optimization, offering
+insights into future advancements in this dynamic field.
+
+
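+ To make the quadratic-to-linear point concrete, here is a minimal,
+illustrative single-head decoder attention step with a growing key/value
+cache; the shapes and the numpy softmax are assumptions for the sketch, not
+any particular LLM's implementation.
+
+ # Toy single-head attention step that reuses cached keys/values, so each new
+ # token costs O(context length) instead of recomputing all past projections.
+ import numpy as np
+
+ class KVCacheAttention:
+     def __init__(self, dim):
+         self.dim = dim
+         self.keys, self.values = [], []      # the KV cache, one entry per past token
+
+     def step(self, q, k, v):
+         self.keys.append(k)
+         self.values.append(v)
+         K, V = np.stack(self.keys), np.stack(self.values)   # (t, dim)
+         scores = K @ q / np.sqrt(self.dim)                  # attend over cached positions
+         weights = np.exp(scores - scores.max())
+         weights /= weights.sum()
+         return weights @ V
+
+ attn = KVCacheAttention(dim=8)
+ rng = np.random.default_rng(0)
+ for _ in range(4):                           # simulate generating four tokens
+     q, k, v = rng.normal(size=8), rng.normal(size=8), rng.normal(size=8)
+     out = attn.step(q, k, v)
+ print(out.shape, len(attn.keys))             # (8,) 4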
+
+ comment: to be published in CoLM 2024 +
+
+
+
+
+ + ♻ ☆ InterCLIP-MEP: Interactive CLIP and Memory-Enhanced Predictor for + Multi-modal Sarcasm Detection + + +
+ The prevalence of sarcasm in social media, conveyed through text-image +combinations, presents significant challenges for sentiment analysis and +intention mining. Existing multi-modal sarcasm detection methods have been +proven to overestimate performance, as they struggle to effectively capture the +intricate sarcastic cues that arise from the interaction between an image and +text. To address these issues, we propose InterCLIP-MEP, a novel framework for +multi-modal sarcasm detection. Specifically, we introduce an Interactive CLIP +(InterCLIP) as the backbone to extract text-image representations, enhancing +them by embedding cross-modality information directly within each encoder, +thereby improving the representations to capture text-image interactions +better. Furthermore, an efficient training strategy is designed to adapt +InterCLIP for our proposed Memory-Enhanced Predictor (MEP). MEP uses a dynamic, +fixed-length dual-channel memory to store historical knowledge of valuable test +samples during inference. It then leverages this memory as a non-parametric +classifier to derive the final prediction, offering a more robust recognition +of multi-modal sarcasm. Experiments demonstrate that InterCLIP-MEP achieves +state-of-the-art performance on the MMSD2.0 benchmark, with an accuracy +improvement of 1.08% and an F1 score improvement of 1.51% over the previous +best method. + +
+
+ comment: 9 pages, 6 figures, 3 tables; Code and data are available at + https://github.com/CoderChen01/InterCLIP-MEP +
+
+
+
+
+ + ♻ ☆ Women Are Beautiful, Men Are Leaders: Gender Stereotypes in Machine + Translation and Language Modeling + + +
+ We present GEST -- a new dataset for measuring gender-stereotypical reasoning +in masked LMs and English-to-X machine translation systems. GEST contains +samples that are compatible with 9 Slavic languages and English for 16 gender +stereotypes about men and women (e.g., Women are beautiful, Men are leaders). +The definition of said stereotypes was informed by gender experts. We used GEST +to evaluate 11 masked LMs and 4 machine translation systems. We discovered +significant and consistent amounts of stereotypical reasoning in almost all the +evaluated models and languages. + +
+
+
+
+
+
+ ♻ ☆ SWIFT: A Scalable lightWeight Infrastructure for Fine-Tuning
+
+
+
+ Recent developments in Large Language Models (LLMs) and Multi-modal Large
+Language Models (MLLMs) have leveraged Attention-based Transformer
+architectures and achieved superior performance and generalization
+capabilities. They have since covered extensive areas of traditional learning
+tasks. For instance, text-based tasks such as text-classification and
+sequence-labeling, as well as multi-modal tasks like Visual Question Answering
+(VQA) and Optical Character Recognition (OCR), which were previously addressed
+using different models, can now be tackled based on one foundation model.
+Consequently, the training and lightweight fine-tuning of LLMs and MLLMs,
+especially those based on the Transformer architecture, has become particularly
+important. In recognition of these overwhelming needs, we develop SWIFT, a
+customizable one-stop infrastructure for large models. With support for over
+300 LLMs and 50 MLLMs, SWIFT stands as the open-source framework that provides
+the most comprehensive support for fine-tuning large models. In particular, it
+is the first training framework that provides systematic support for MLLMs. In
+addition to the core functionalities of fine-tuning, SWIFT also integrates
+post-training processes such as inference, evaluation, and model quantization,
+to facilitate fast adoption of large models in various application scenarios.
+With a systematic integration of various training techniques, SWIFT offers
+helpful utilities such as benchmark comparisons among different training
+techniques for large models. For fine-tuning models specialized for agent
+frameworks, we show that notable improvements on the ToolBench leader-board can
+be achieved by training with a customized dataset on SWIFT, with an increase of
+5.2%-21.8% in the Act.EM metric over various baseline models, a reduction in
+hallucination by 1.6%-14.1%, and an average performance improvement of 8%-17%.
+
+
+
+
+
+
+ + ♻ ☆ Navigating Cultural Chasms: Exploring and Unlocking the Cultural POV of + Text-To-Image Models + + +
+ Text-To-Image (TTI) models, such as DALL-E and StableDiffusion, have +demonstrated remarkable prompt-based image generation capabilities. +Multilingual encoders may have a substantial impact on the cultural agency of +these models, as language is a conduit of culture. In this study, we explore +the cultural perception embedded in TTI models by characterizing culture across +three hierarchical tiers: cultural dimensions, cultural domains, and cultural +concepts. Based on this ontology, we derive prompt templates to unlock the +cultural knowledge in TTI models, and propose a comprehensive suite of +evaluation techniques, including intrinsic evaluations using the CLIP space, +extrinsic evaluations with a Visual-Question-Answer (VQA) model and human +assessments, to evaluate the cultural content of TTI-generated images. To +bolster our research, we introduce the CulText2I dataset, derived from six +diverse TTI models and spanning ten languages. Our experiments provide insights +regarding Do, What, Which and How research questions about the nature of +cultural encoding in TTI models, paving the way for cross-cultural applications +of these models. + +
+
+ comment: Project page: https://venturamor.github.io/CulText2IWeb/ +
+
+
+
+
+ + ♻ ☆ mPLUG-Owl3: Towards Long Image-Sequence Understanding in Multi-Modal + Large Language Models + + +
+ Multi-modal Large Language Models (MLLMs) have demonstrated remarkable +capabilities in executing instructions for a variety of single-image tasks. +Despite this progress, significant challenges remain in modeling long image +sequences. In this work, we introduce the versatile multi-modal large language +model, mPLUG-Owl3, which enhances the capability for long image-sequence +understanding in scenarios that incorporate retrieved image-text knowledge, +interleaved image-text, and lengthy videos. Specifically, we propose novel +hyper attention blocks to efficiently integrate vision and language into a +common language-guided semantic space, thereby facilitating the processing of +extended multi-image scenarios. Extensive experimental results suggest that +mPLUG-Owl3 achieves state-of-the-art performance among models with a similar +size on single-image, multi-image, and video benchmarks. Moreover, we propose a +challenging long visual sequence evaluation named Distractor Resistance to +assess the ability of models to maintain focus amidst distractions. Finally, +with the proposed architecture, mPLUG-Owl3 demonstrates outstanding performance +on ultra-long visual sequence inputs. We hope that mPLUG-Owl3 can contribute to +the development of more efficient and powerful multimodal large language +models. + +
+
+
+
+
+ + ♻ ☆ CEval: A Benchmark for Evaluating Counterfactual Text Generation + + +
+ Counterfactual text generation aims to minimally change a text, such that it +is classified differently. Judging advancements in method development for +counterfactual text generation is hindered by a non-uniform usage of data sets +and metrics in related work. We propose CEval, a benchmark for comparing +counterfactual text generation methods. CEval unifies counterfactual and text +quality metrics, includes common counterfactual datasets with human +annotations, standard baselines (MICE, GDBA, CREST) and the open-source +language model LLAMA-2. Our experiments found no perfect method for generating +counterfactual text. Methods that excel at counterfactual metrics often produce +lower-quality text while LLMs with simple prompts generate high-quality text +but struggle with counterfactual criteria. By making CEval available as an +open-source Python library, we encourage the community to contribute more +methods and maintain consistent evaluation in future work. + +
+
+
+
+
+ + ♻ ☆ Clinical information extraction for Low-resource languages with Few-shot + learning using Pre-trained language models and Prompting + + +
+ Automatic extraction of medical information from clinical documents poses
+several challenges: high costs of required clinical expertise, limited
+interpretability of model predictions, restricted computational resources and
+privacy regulations. Recent advances in domain-adaptation and prompting methods
+showed promising results with minimal training data using lightweight masked
+language models, which are suited for well-established interpretability
+methods. We are the first to present a systematic evaluation of these methods
+in a low-resource setting, by performing multi-class section classification on
+German doctor's letters. We conduct extensive class-wise evaluations supported
+by Shapley values, to validate the quality of our small training data set and
+to ensure the interpretability of model predictions. We demonstrate that a
+lightweight, domain-adapted pretrained model, prompted with just 20 shots,
+outperforms a traditional classification model by 30.5% accuracy. Our results
+serve as a process-oriented guideline for clinical information extraction
+projects working in low-resource settings.
+
+
+
+ comment: Paper accepted for publication in the journal: Natural Language + Engineering (Cambridge Core) +
+
+
+
+
+ + ♻ ☆ Can Deception Detection Go Deeper? Dataset, Evaluation, and Benchmark + for Deception Reasoning + + +
+ Deception detection has attracted increasing attention due to its importance +in real-world scenarios. Its main goal is to detect deceptive behaviors from +multimodal clues such as gestures, facial expressions, prosody, etc. However, +these bases are usually subjective and related to personal habits. Therefore, +we extend deception detection to deception reasoning, further providing +objective evidence to support subjective judgment. Specifically, we provide +potential lies and basic facts and then analyze why this sentence may be a lie +by combining factual inconsistencies and intent behind them. Compared with +deception detection, this task is more applicable to real-world scenarios. For +example, in interrogation, the police should judge whether a person is lying +based on solid evidence. This paper presents our initial attempts at this task, +including constructing a dataset and defining evaluation metrics. Meanwhile, +this task can serve as a benchmark for evaluating the complex reasoning +capability of large language models. Our code and data are provided in the +supplementary material. + +
+
+
+
+
+ + ♻ ☆ Dual-Space Knowledge Distillation for Large Language Models + + +
+ Knowledge distillation (KD) is known as a promising solution to compress +large language models (LLMs) via transferring their knowledge to smaller +models. During this process, white-box KD methods usually minimize the distance +between the output distributions of the two models so that more knowledge can +be transferred. However, in the current white-box KD framework, the output +distributions are from the respective output spaces of the two models, using +their own prediction heads. We argue that the space discrepancy will lead to +low similarity between the teacher model and the student model on both +representation and distribution levels. Furthermore, this discrepancy also +hinders the KD process between models with different vocabularies, which is +common for current LLMs. To address these issues, we propose a dual-space +knowledge distillation (DSKD) framework that unifies the output spaces of the +two models for KD. On the basis of DSKD, we further develop a cross-model +attention mechanism, which can automatically align the representations of the +two models with different vocabularies. Thus, our framework is not only +compatible with various distance functions for KD (e.g., KL divergence) like +the current framework, but also supports KD between any two LLMs regardless of +their vocabularies. Experiments on task-agnostic instruction-following +benchmarks show that DSKD significantly outperforms the current white-box KD +framework with various distance functions, and also surpasses existing KD +methods for LLMs with different vocabularies. + +
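+ For context, the standard white-box KD objective that DSKD builds on
+(minimizing the divergence between the two models' output distributions) can
+be sketched as below; the temperature, tensor shapes, and loss scaling are
+illustrative, and the dual-space unification and cross-model attention of DSKD
+are not reproduced here.
+
+ # Toy white-box KD loss: KL divergence between teacher and student outputs.
+ import torch
+ import torch.nn.functional as F
+
+ def kd_kl_loss(student_logits, teacher_logits, temperature=2.0):
+     t = temperature
+     log_p_student = F.log_softmax(student_logits / t, dim=-1)
+     p_teacher = F.softmax(teacher_logits / t, dim=-1)
+     # batchmean KL, scaled by t^2 to keep gradient magnitudes comparable
+     return F.kl_div(log_p_student, p_teacher, reduction="batchmean") * t * t
+
+ student = torch.randn(4, 32000, requires_grad=True)   # (batch, vocab) logits
+ teacher = torch.randn(4, 32000)
+ loss = kd_kl_loss(student, teacher)
+ loss.backward()
+ print(float(loss))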
+
+ comment: 17 pages, 11 figures, code available at: + https://github.com/songmzhang/DSKD +
+
+
+
+
+ + ♻ ☆ Advancing Post-OCR Correction: A Comparative Study of Synthetic Data ACL 2024 + + +
+ This paper explores the application of synthetic data in the post-OCR domain +on multiple fronts by conducting experiments to assess the impact of data +volume, augmentation, and synthetic data generation methods on model +performance. Furthermore, we introduce a novel algorithm that leverages +computer vision feature detection algorithms to calculate glyph similarity for +constructing post-OCR synthetic data. Through experiments conducted across a +variety of languages, including several low-resource ones, we demonstrate that +models like ByT5 can significantly reduce Character Error Rates (CER) without +the need for manually annotated data, and our proposed synthetic data +generation method shows advantages over traditional methods, particularly in +low-resource languages. + +
+
+ comment: ACL 2024 findings +
+
+
+
+
+ + ♻ ☆ Negative Object Presence Evaluation (NOPE) to Measure Object + Hallucination in Vision-Language Models ACL 2024 + + +
+ Object hallucination poses a significant challenge in vision-language (VL)
+models, often leading to the generation of nonsensical or unfaithful responses
+with non-existent objects. However, the absence of a general measurement for
+evaluating object hallucination in VL models has hindered our understanding and
+ability to mitigate this issue. In this work, we present NOPE (Negative Object
+Presence Evaluation), a novel benchmark designed to assess object hallucination
+in VL models through visual question answering (VQA). We propose a
+cost-effective and scalable approach utilizing large language models to
+generate 29.5k synthetic negative pronoun (NegP) data of high quality for NOPE.
+We extensively investigate the performance of 10 state-of-the-art VL models in
+discerning the non-existence of objects in visual questions, where the ground
+truth answers are denoted as NegP (e.g., "none"). Additionally, we evaluate
+their standard performance on visual questions on 9 other VQA datasets. Through
+our experiments, we demonstrate that no VL model is immune to the vulnerability
+of object hallucination, as all models achieve accuracy below 10% on NegP.
+Furthermore, we uncover that lexically diverse visual questions, question types
+with large scopes, and scene-relevant objects increase the risk of object
+hallucination in VL models.
+
+
+
+ comment: Published in ALVR Workshop at ACL 2024 +
+
+
+
+
+ + ♻ ☆ Application of LLM Agents in Recruitment: A Novel Framework for Resume + Screening + + +
+ The automation of resume screening is a crucial aspect of the recruitment
+process in organizations. Automated resume screening systems often encompass a
+range of natural language processing (NLP) tasks. This paper introduces a novel
+Large Language Models (LLMs) based agent framework for resume screening, aimed
+at enhancing efficiency and time management in recruitment processes. Our
+framework is distinct in its ability to efficiently summarize and grade each
+resume from a large dataset. Moreover, it utilizes LLM agents for
+decision-making. To evaluate our framework, we constructed a dataset from
+actual resumes and simulated a resume screening process. Subsequently, the
+outcomes of the simulation experiment were compared and subjected to detailed
+analysis. The results demonstrate that our automated resume screening framework
+is 11 times faster than traditional manual methods. Furthermore, by fine-tuning
+the LLMs, we observed a significant improvement in the F1 score, reaching
+87.73%, during the resume sentence classification phase. In the resume
+summarization and grading phase, our fine-tuned model surpassed the baseline
+performance of the GPT-3.5 model. Analysis of the decision-making efficacy of
+the LLM agents in the final offer stage further underscores the potential of
+LLM agents in transforming resume screening processes.
+
+
+
+ comment: Accepted by Journal of Information Processing (2024), 18 pages, 19
+ figures
+
+
+
+
+
+ + ♻ ☆ Source-Aware Training Enables Knowledge Attribution in Language Models + + +
+ Large language models (LLMs) learn a vast amount of knowledge during
+pretraining, but they are often oblivious to the source(s) of such knowledge.
+We investigate the problem of intrinsic source citation, where LLMs are
+required to cite the pretraining source supporting a generated response.
+Intrinsic source citation can enhance LLM transparency, interpretability, and
+verifiability. To give LLMs such ability, we explore source-aware training -- a
+recipe that involves (i) training the LLM to associate unique source document
+identifiers with the knowledge in each document, followed by (ii) an
+instruction-tuning stage to teach the LLM to cite a supporting pretraining
+source when prompted. Source-aware training borrows from existing
+pretraining/fine-tuning frameworks and requires minimal changes to the model
+architecture or implementation. Through experiments on synthetic data, we
+demonstrate that our training recipe can enable faithful attribution to the
+pretraining data without a substantial impact on the model's perplexity
+compared to standard pretraining. Our findings also highlight the importance of
+pretraining data augmentation in achieving attribution. Code and data available
+here: https://github.com/mukhal/intrinsic-source-citation
+
+
+
+ comment: COLM '24 +
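+ A minimal sketch of the two-step recipe described above, assuming a toy corpus
+ and illustrative identifier formatting (not the paper's released data layout).
+ ```python
+ # Illustrative sketch: attach a unique source identifier to each pretraining
+ # document, then build instruction-tuning pairs that teach the model to cite it.
+ documents = {
+     "doc_001": "The Eiffel Tower was completed in 1889.",
+     "doc_002": "Mount Everest is 8,849 metres tall.",
+ }
+
+ # (i) pretraining: interleave each document's text with its identifier token.
+ pretraining_examples = [f"<src:{doc_id}> {text}" for doc_id, text in documents.items()]
+
+ # (ii) instruction tuning: answer the question and cite the supporting source.
+ instruction_examples = [
+     {
+         "prompt": "When was the Eiffel Tower completed? Cite your source.",
+         "response": "It was completed in 1889 [source: doc_001].",
+     }
+ ]
+
+ for ex in pretraining_examples:
+     print(ex)
+ ```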
+
+
+
+
+ + ♻ ☆ Judging the Judges: A Systematic Investigation of Position Bias in + Pairwise Comparative Assessments by LLMs + + +
+ LLM-as-a-Judge offers a promising alternative to human judges across various +tasks, yet inherent biases, particularly position bias - a systematic +preference for answers based on their position in the prompt - compromise its +effectiveness. Our study investigates this issue by developing a framework to +systematically study and quantify position bias using metrics such as +repetitional consistency, positional consistency, and positional fairness. We +conduct experiments with 9 judge models across 22 tasks from the MTBench and +DevBench benchmarks and nearly 40 answer-generating models, generating +approximately 80,000 evaluation instances. This comprehensive assessment +reveals significant variations in bias across judges and tasks. Although GPT-4 +often excels in positional consistency and fairness, some more cost-effective +models perform comparably or even better in specific tasks, highlighting +essential trade-offs between consistency, fairness, and cost. Our results also +demonstrate high consistency of judgment across repetitions, confirming that +position bias is not due to random variations. This research significantly +contributes to the field by introducing new concepts for understanding position +bias and providing a multi-dimensional framework for evaluation. These insights +guide the selection of optimal judge models, enhance benchmark design, and lay +the foundation for future research into effective debiasing strategies, +ultimately enhancing the reliability of LLM evaluators. + +
+
+ comment: 70 pages, around 200 figures and subfigures +
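+ A hedged sketch of one of the metrics named in the abstract above, positional
+ consistency: the fraction of pairs where a judge picks the same underlying
+ answer after the two candidates are swapped. The data layout is an assumption.
+ ```python
+ # judgements: list of (winner_in_order_AB, winner_in_order_BA) tuples, where
+ # each element is 'A' or 'B' and refers to the same underlying answers.
+ def positional_consistency(judgements):
+     consistent = sum(1 for ab, ba in judgements if ab == ba)
+     return consistent / len(judgements)
+
+ # A judge that always prefers the first-listed answer is maximally inconsistent:
+ print(positional_consistency([("A", "B"), ("A", "B")]))  # 0.0
+ ```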
+
+
+
+
+ + ♻ ☆ Unveiling Factual Recall Behaviors of Large Language Models through + Knowledge Neurons + + +
+ In this paper, we investigate whether Large Language Models (LLMs) actively +recall or retrieve their internal repositories of factual knowledge when faced +with reasoning tasks. Through an analysis of LLMs' internal factual recall at +each reasoning step via Knowledge Neurons, we reveal that LLMs fail to harness +the critical factual associations under certain circumstances. Instead, they +tend to opt for alternative, shortcut-like pathways to answer reasoning +questions. By manually manipulating the recall process of parametric knowledge +in LLMs, we demonstrate that enhancing this recall process directly improves +reasoning performance whereas suppressing it leads to notable degradation. +Furthermore, we assess the effect of Chain-of-Thought (CoT) prompting, a +powerful technique for addressing complex reasoning tasks. Our findings +indicate that CoT can intensify the recall of factual knowledge by encouraging +LLMs to engage in orderly and reliable reasoning. Furthermore, we explored how +contextual conflicts affect the retrieval of facts during the reasoning process +to gain a comprehensive understanding of the factual recall behaviors of LLMs. +Code and data will be available soon. + +
+
+
+
+
+ + ♻ ☆ Emotionally Numb or Empathetic? Evaluating How LLMs Feel Using + EmotionBench + + +
+ Evaluating Large Language Models' (LLMs) anthropomorphic capabilities has
+become increasingly important in contemporary discourse. Utilizing the emotion
+appraisal theory from psychology, we propose to evaluate the empathy ability of
+LLMs, i.e., how their feelings change when presented with specific situations.
+After a careful and comprehensive survey, we collect a dataset containing over
+400 situations that have proven effective in eliciting the eight emotions
+central to our study. Categorizing the situations into 36 factors, we conduct a
+human evaluation involving more than 1,200 subjects worldwide. With the human
+evaluation results as references, our evaluation includes seven LLMs, covering
+both commercial and open-source models, including variations in model sizes,
+featuring the latest iterations, such as GPT-4, Mixtral-8x22B, and LLaMA-3.1.
+We find that, despite several misalignments, LLMs can generally respond
+appropriately to certain situations. Nevertheless, they fall short in alignment
+with the emotional behaviors of human beings and cannot establish connections
+between similar situations. Our EmotionBench, including the collected dataset
+of situations, the human evaluation results, and the code of our testing
+framework, is publicly available at https://github.com/CUHK-ARISE/EmotionBench.
+
+
+ comment: Add LLaMA-3.1, Mixtral-8x22B; 10 pages of main text; 14 pages of + appendices +
+
+
+
+
+ + ♻ ☆ LePaRD: A Large-Scale Dataset of Judges Citing Precedents + + +
+ We present the Legal Passage Retrieval Dataset LePaRD. LePaRD is a massive +collection of U.S. federal judicial citations to precedent in context. The +dataset aims to facilitate work on legal passage prediction, a challenging +practice-oriented legal retrieval and reasoning task. Legal passage prediction +seeks to predict relevant passages from precedential court decisions given the +context of a legal argument. We extensively evaluate various retrieval +approaches on LePaRD, and find that classification appears to work best. +However, we note that legal precedent prediction is a difficult task, and there +remains significant room for improvement. We hope that by publishing LePaRD, we +will encourage others to engage with a legal NLP task that promises to help +expand access to justice by reducing the burden associated with legal research. +A subset of the LePaRD dataset is freely available and the whole dataset will +be released upon publication. + +
+
+
+
+
+ + ♻ ☆ Bi-DCSpell: A Bi-directional Detector-Corrector Interactive Framework + for Chinese Spelling Check + + +
+ Chinese Spelling Check (CSC) aims to detect and correct potentially +misspelled characters in Chinese sentences. Naturally, it involves the +detection and correction subtasks, which interact with each other dynamically. +Such interactions are bi-directional, i.e., the detection result would help +reduce the risk of over-correction and under-correction while the knowledge +learnt from correction would help prevent false detection. Current CSC +approaches are of two types: correction-only or single-directional +detection-to-correction interactive frameworks. Nonetheless, they overlook the +bi-directional interactions between detection and correction. This paper aims +to fill the gap by proposing a Bi-directional Detector-Corrector framework for +CSC (Bi-DCSpell). Notably, Bi-DCSpell contains separate detection and +correction encoders, followed by a novel interactive learning module +facilitating bi-directional feature interactions between detection and +correction to improve each other's representation learning. Extensive +experimental results demonstrate a robust correction performance of Bi-DCSpell +on widely used benchmarking datasets while possessing a satisfactory detection +ability. + +
+
+ comment: 12 pages, 6 figures +
+
+
+
+
+ + ♻ ☆ Training LLMs over Neurally Compressed Text + + +
+ In this paper, we explore the idea of training large language models (LLMs) +over highly compressed text. While standard subword tokenizers compress text by +a small factor, neural text compressors can achieve much higher rates of +compression. If it were possible to train LLMs directly over neurally +compressed text, this would confer advantages in training and serving +efficiency, as well as easier handling of long text spans. The main obstacle to +this goal is that strong compression tends to produce opaque outputs that are +not well-suited for learning. In particular, we find that text na\"ively +compressed via Arithmetic Coding is not readily learnable by LLMs. To overcome +this, we propose Equal-Info Windows, a novel compression technique whereby text +is segmented into blocks that each compress to the same bit length. Using this +method, we demonstrate effective learning over neurally compressed text that +improves with scale, and outperforms byte-level baselines by a wide margin on +perplexity and inference speed benchmarks. While our method delivers worse +perplexity than subword tokenizers for models trained with the same parameter +count, it has the benefit of shorter sequence lengths. Shorter sequence lengths +require fewer autoregressive generation steps, and reduce latency. Finally, we +provide extensive analysis of the properties that contribute to learnability, +and offer concrete suggestions for how to further improve the performance of +high-compression tokenizers. + +
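+ A toy sketch of the Equal-Info Windows idea described above: grow each window
+ until its compressed size reaches a fixed bit budget, then start a new window.
+ zlib is used purely as a stand-in for the paper's neural/arithmetic coder, and
+ the budget is an arbitrary illustrative value.
+ ```python
+ import zlib
+
+ def equal_info_windows(text, bits_per_window=256):
+     windows, current = [], ""
+     for ch in text:
+         candidate = current + ch
+         # Close the window once its compressed size would exceed the bit budget.
+         if len(zlib.compress(candidate.encode())) * 8 > bits_per_window and current:
+             windows.append(current)
+             current = ch
+         else:
+             current = candidate
+     if current:
+         windows.append(current)
+     return windows
+
+ chunks = equal_info_windows("the quick brown fox jumps over the lazy dog " * 20)
+ print(len(chunks), [len(c) for c in chunks[:5]])  # each chunk carries ~equal information
+ ```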
+
+
+
+
+ + ♻ ☆ DataNarrative: Automated Data-Driven Storytelling with Visualizations + and Texts + + +
+ Data-driven storytelling is a powerful method for conveying insights by +combining narrative techniques with visualizations and text. These stories +integrate visual aids, such as highlighted bars and lines in charts, along with +textual annotations explaining insights. However, creating such stories +requires a deep understanding of the data and meticulous narrative planning, +often necessitating human intervention, which can be time-consuming and +mentally taxing. While Large Language Models (LLMs) excel in various NLP tasks, +their ability to generate coherent and comprehensive data stories remains +underexplored. In this work, we introduce a novel task for data story +generation and a benchmark containing 1,449 stories from diverse sources. To +address the challenges of crafting coherent data stories, we propose a +multiagent framework employing two LLM agents designed to replicate the human +storytelling process: one for understanding and describing the data +(Reflection), generating the outline, and narration, and another for +verification at each intermediary step. While our agentic framework generally +outperforms non-agentic counterparts in both model-based and human evaluations, +the results also reveal unique challenges in data story generation. + +
+
+
+
+
+ + ♻ ☆ MiTTenS: A Dataset for Evaluating Gender Mistranslation + + +
+ Translation systems, including foundation models capable of translation, can +produce errors that result in gender mistranslation, and such errors can be +especially harmful. To measure the extent of such potential harms when +translating into and out of English, we introduce a dataset, MiTTenS, covering +26 languages from a variety of language families and scripts, including several +traditionally under-represented in digital resources. The dataset is +constructed with handcrafted passages that target known failure patterns, +longer synthetically generated passages, and natural passages sourced from +multiple domains. We demonstrate the usefulness of the dataset by evaluating +both neural machine translation systems and foundation models, and show that +all systems exhibit gender mistranslation and potential harm, even in high +resource languages. + +
+
+ comment: GitHub repository https://github.com/google-research-datasets/mittens +
+
+
+
+
+ + ♻ ☆ Sociodemographic Bias in Language Models: A Survey and Forward Path + + +
+ Sociodemographic bias in language models (LMs) has the potential for harm +when deployed in real-world settings. This paper presents a comprehensive +survey of the past decade of research on sociodemographic bias in LMs, +organized into a typology that facilitates examining the different aims: types +of bias, quantifying bias, and debiasing techniques. We track the evolution of +the latter two questions, then identify current trends and their limitations, +as well as emerging techniques. To guide future research towards more effective +and reliable solutions, and to help authors situate their work within this +broad landscape, we conclude with a checklist of open questions. + +
+
+ comment: 23 pages, 3 figures
+
+
+
+
+
+ + ♻ ☆ Retrieval-enhanced Knowledge Editing in Language Models for Multi-Hop + Question Answering CIKM 2024 + + +
+ Large Language Models (LLMs) have shown proficiency in question-answering +tasks but often struggle to integrate real-time knowledge, leading to +potentially outdated or inaccurate responses. This problem becomes even more +challenging when dealing with multi-hop questions, since they require LLMs to +update and integrate multiple knowledge pieces relevant to the questions. To +tackle the problem, we propose the Retrieval-Augmented model Editing (RAE) +framework for multi-hop question answering. RAE first retrieves edited facts +and then refines the language model through in-context learning. Specifically, +our retrieval approach, based on mutual information maximization, leverages the +reasoning abilities of LLMs to identify chain facts that traditional +similarity-based searches might miss. In addition, our framework includes a +pruning strategy to eliminate redundant information from the retrieved facts, +which enhances the editing accuracy and mitigates the hallucination problem. +Our framework is supported by theoretical justification for its fact retrieval +efficacy. Finally, comprehensive evaluation across various LLMs validates RAE's +ability in providing accurate answers with updated knowledge. Our code is +available at: https://github.com/sycny/RAE. + +
+
+ comment: Accepted by CIKM 2024 +
+
+
+
+
+ + ♻ ☆ Recent Advances in Generative AI and Large Language Models: Current + Status, Challenges, and Perspectives + + +
+ The emergence of Generative Artificial Intelligence (AI) and Large Language +Models (LLMs) has marked a new era of Natural Language Processing (NLP), +introducing unprecedented capabilities that are revolutionizing various +domains. This paper explores the current state of these cutting-edge +technologies, demonstrating their remarkable advancements and wide-ranging +applications. Our paper contributes to providing a holistic perspective on the +technical foundations, practical applications, and emerging challenges within +the evolving landscape of Generative AI and LLMs. We believe that understanding +the generative capabilities of AI systems and the specific context of LLMs is +crucial for researchers, practitioners, and policymakers to collaboratively +shape the responsible and ethical integration of these technologies into +various domains. Furthermore, we identify and address main research gaps, +providing valuable insights to guide future research endeavors within the AI +research community. + +
+
+ comment: This version is accepted for publication in IEEE Transactions on
+ Artificial Intelligence (TAI)
+
+
+
+
+
+ + ♻ ☆ Large Model Strategic Thinking, Small Model Efficiency: Transferring + Theory of Mind in Large Language Models + + +
+ As the performance of larger, newer Large Language Models continues to
+improve for strategic Theory of Mind (ToM) tasks, the demand for these
+state-of-the-art models increases commensurately. However, their deployment is
+costly both in terms of processing power and time. In this paper, we investigate
+the feasibility of creating smaller, simulation-ready agents by way of
+fine-tuning. To do this, we present a large pre-trained model with 20 unique
+scenarios that combine a social context with a social dilemma, recording its
+answers, and using them for Q&A fine-tuning on a smaller model of the same
+family. Our focus is on in-context game-theoretic decision-making, the same
+domain within which human interaction occurs and that requires both a theory of
+mind (or a semblance thereof) and an understanding of social dynamics. We find
+that the fine-tuned smaller language model exhibited performance significantly
+closer to that of its larger relative, and that its improvements extended to
+areas and contexts beyond the ones provided in the training examples. On
+average for all games, through fine-tuning, the smaller model showed a 46%
+improvement in aligning with the behavior of the larger model, with 100%
+representing complete alignment. This suggests that our pipeline represents an
+efficient method to transmit some form of theory of mind to smaller models,
+creating improved and cheaply deployable algorithms in the process. Despite
+their simplicity and their associated shortcomings and limitations, our findings
+represent a stepping stone in the pursuit and training of specialized models
+for strategic and social decision making.
+
+
+ comment: 18 pages, 6 figures +
+
+
+
+
+ + ♻ ☆ Never Lost in the Middle: Mastering Long-Context Question Answering with + Position-Agnostic Decompositional Training ACL 2024 + + +
+ While large language models (LLMs) are equipped with longer text input
+capabilities than before, they struggle to find correct information in
+long contexts. The "lost in the middle" problem challenges most LLMs, referring
+to the dramatic decline in accuracy when correct information is located in the
+middle. To overcome this crucial issue, this paper proposes to enhance the
+information searching and reflection ability of LLMs in long contexts via
+specially designed tasks called Attention Strengthening Multi-doc QA (ASM QA).
+Following these tasks, our model excels in focusing more precisely on the
+desired information. Experimental results show substantial improvement in
+Multi-doc QA and other benchmarks, surpassing state-of-the-art models by a
+13.7% absolute gain in shuffled settings and by 21.5% in the passage retrieval
+task. We release our model, Ziya-Reader, to promote related research in the
+community.
+
+
+ comment: Accepted by ACL 2024 main conference +
+
+
+
+
+ + ♻ ☆ Better Alignment with Instruction Back-and-Forth Translation + + +
+ We propose a new method, instruction back-and-forth translation, to construct +high-quality synthetic data grounded in world knowledge for aligning large +language models (LLMs). Given documents from a web corpus, we generate and +curate synthetic instructions using the backtranslation approach proposed by Li +et al.(2023a), and rewrite the responses to improve their quality further based +on the initial documents. Fine-tuning with the resulting (backtranslated +instruction, rewritten response) pairs yields higher win rates on AlpacaEval +than using other common instruction datasets such as Humpback, ShareGPT, Open +Orca, Alpaca-GPT4 and Self-instruct. We also demonstrate that rewriting the +responses with an LLM outperforms direct distillation, and the two generated +text distributions exhibit significant distinction in embedding space. Further +analysis shows that our backtranslated instructions are of higher quality than +other sources of synthetic instructions, while our responses are more diverse +and complex than those obtained from distillation. Overall we find that +instruction back-and-forth translation combines the best of both worlds -- +making use of the information diversity and quantity found on the web, while +ensuring the quality of the responses which is necessary for effective +alignment. + +
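+ A hedged outline of the back-and-forth recipe described above. `call_llm` is a
+ placeholder for whatever chat/completion API is available, and the prompts are
+ illustrative, not the authors' exact templates.
+ ```python
+ def call_llm(prompt: str) -> str:
+     raise NotImplementedError("plug in your LLM client here")  # placeholder
+
+ def back_and_forth(document: str) -> dict:
+     # 1) backtranslation: synthesise an instruction the document could answer.
+     instruction = call_llm(
+         f"Write an instruction that the following passage would answer:\n{document}"
+     )
+     # 2) rewriting: improve the response while staying grounded in the document.
+     response = call_llm(
+         f"Instruction: {instruction}\nRewrite this passage into a high-quality, "
+         f"directly responsive answer, preserving its facts:\n{document}"
+     )
+     return {"instruction": instruction, "response": response}
+ ```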
+
+
+
+
+
+
+
+ + Computer Vision and Pattern Recognition 131 + +
+
+
+ + ☆ Fingerspelling within Sign Language Translation + + +
+ Fingerspelling poses challenges for sign language processing due to its +high-frequency motion and use for open-vocabulary terms. While prior work has +studied fingerspelling recognition, there has been little attention to +evaluating how well sign language translation models understand fingerspelling +in the context of entire sentences -- and improving this capability. We +manually annotate instances of fingerspelling within FLEURS-ASL and use them to +evaluate the effect of two simple measures to improve fingerspelling +recognition within American Sign Language to English translation: 1) use a +model family (ByT5) with character- rather than subword-level tokenization, and +2) mix fingerspelling recognition data into the translation training mixture. +We find that 1) substantially improves understanding of fingerspelling (and +therefore translation quality overall), but the effect of 2) is mixed. + +
+
+
+
+
+ + ☆ PSM: Learning Probabilistic Embeddings for Multi-scale Zero-Shot + Soundscape Mapping ACM MM 2024 + + +
+ A soundscape is defined by the acoustic environment a person perceives at a
+location. In this work, we propose a framework for mapping soundscapes across
+the Earth. Since soundscapes involve sound distributions that span varying
+spatial scales, we represent locations with multi-scale satellite imagery and
+learn a joint representation among this imagery, audio, and text. To capture
+the inherent uncertainty in the soundscape of a location, we design the
+representation space to be probabilistic. We also fuse ubiquitous metadata
+(including geolocation, time, and data source) to enable learning of spatially
+and temporally dynamic representations of soundscapes. We demonstrate the
+utility of our framework by creating large-scale soundscape maps integrating
+both audio and text with temporal control. To facilitate future research on
+this task, we also introduce a large-scale dataset, GeoSound, containing over
+300k geotagged audio samples paired with both low- and high-resolution
+satellite imagery. We demonstrate that our method outperforms the existing
+state-of-the-art on both GeoSound and the existing SoundingEarth dataset. Our
+dataset and code are available at https://github.com/mvrl/PSM.
+
+
+ comment: Accepted at ACM MM 2024 +
+
+
+
+
+ + ☆ KAN You See It? KANs and Sentinel for Effective and Explainable Crop + Field Segmentation ECCV 2024 + + +
+ Segmentation of crop fields is essential for enhancing agricultural
+productivity, monitoring crop health, and promoting sustainable practices. Deep
+learning models adopted for this task must ensure accurate and reliable
+predictions to avoid economic losses and environmental impact. The newly
+proposed Kolmogorov-Arnold networks (KANs) offer promising advancements in the
+performance of neural networks. This paper analyzes the integration of KAN
+layers into the U-Net architecture (U-KAN) to segment crop fields using
+Sentinel-2 and Sentinel-1 satellite images and provides an analysis of the
+performance and explainability of these networks. Our findings indicate a 2%
+improvement in IoU compared to the traditional fully convolutional U-Net model,
+achieved with fewer GFLOPs. Furthermore, gradient-based explanation techniques
+show that U-KAN predictions are highly plausible and that the network has a
+very high ability to focus on the boundaries of cultivated areas rather than on
+the areas themselves. The per-channel relevance analysis also reveals that some
+channels are irrelevant to this task.
+
+
+ comment: Accepted at ECCV 2024 CVPPA Workshop +
+
+
+
+
+ + ☆ PathInsight: Instruction Tuning of Multimodal Datasets and Models for + Intelligence Assisted Diagnosis in Histopathology + + +
+ Pathological diagnosis remains the definitive standard for identifying
+tumors. The rise of multimodal large models has simplified the process of
+integrating image analysis with textual descriptions. Despite this advancement,
+the substantial costs associated with training and deploying these complex
+multimodal models, together with a scarcity of high-quality training datasets,
+create a significant divide between cutting-edge technology and its application
+in the clinical setting. We have meticulously compiled a dataset of
+approximately 45,000 cases, covering over 6 different tasks, including the
+classification of organ tissues, generating pathology report descriptions, and
+addressing pathology-related questions and answers. We have fine-tuned
+multimodal large models, specifically LLaVA, Qwen-VL, and InternLM, with this
+dataset to enhance instruction-based performance. We conducted a qualitative
+assessment of the capabilities of the base model and the fine-tuned model in
+performing image captioning and classification tasks on the specific dataset.
+The evaluation results demonstrate that the fine-tuned model exhibits
+proficiency in addressing typical pathological questions. We hope that by
+making both our models and datasets publicly available, they can be valuable to
+the medical and research communities.
+
+
+ comment: 10 pages, 2 figures +
+
+
+
+
+ + ☆ Efficient Human-Object-Interaction (EHOI) Detection via Interaction + Label Coding and Conditional Decision + + +
+ Human-Object Interaction (HOI) detection is a fundamental task in image +understanding. While deep-learning-based HOI methods provide high performance +in terms of mean Average Precision (mAP), they are computationally expensive +and opaque in training and inference processes. An Efficient HOI (EHOI) +detector is proposed in this work to strike a good balance between detection +performance, inference complexity, and mathematical transparency. EHOI is a +two-stage method. In the first stage, it leverages a frozen object detector to +localize the objects and extract various features as intermediate outputs. In +the second stage, the first-stage outputs predict the interaction type using +the XGBoost classifier. Our contributions include the application of error +correction codes (ECCs) to encode rare interaction cases, which reduces the +model size and the complexity of the XGBoost classifier in the second stage. +Additionally, we provide a mathematical formulation of the relabeling and +decision-making process. Apart from the architecture, we present qualitative +results to explain the functionalities of the feedforward modules. Experimental +results demonstrate the advantages of ECC-coded interaction labels and the +excellent balance of detection performance and complexity of the proposed EHOI +method. + +
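+ A rough analogue of the second stage described above: interaction labels are
+ encoded with error-correcting output codes and predicted by boosted trees.
+ This uses scikit-learn's generic output-code wrapper around XGBoost on
+ synthetic features as a stand-in for the paper's specific ECC design and
+ first-stage features.
+ ```python
+ from sklearn.datasets import make_classification
+ from sklearn.model_selection import train_test_split
+ from sklearn.multiclass import OutputCodeClassifier
+ from xgboost import XGBClassifier
+
+ # Synthetic stand-in for first-stage object/interaction features.
+ X, y = make_classification(n_samples=2000, n_features=32, n_informative=16,
+                            n_classes=6, random_state=0)
+ X_tr, X_te, y_tr, y_te = train_test_split(X, y, random_state=0)
+
+ # Output codes spread each class over several binary problems, so rare classes
+ # share bits with common ones; XGBoost solves each binary sub-problem.
+ clf = OutputCodeClassifier(XGBClassifier(n_estimators=100), code_size=2.0,
+                            random_state=0)
+ clf.fit(X_tr, y_tr)
+ print("accuracy:", clf.score(X_te, y_te))
+ ```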
+
+
+
+
+ + ☆ Imagen 3 + + +
+ We introduce Imagen 3, a latent diffusion model that generates high quality +images from text prompts. We describe our quality and responsibility +evaluations. Imagen 3 is preferred over other state-of-the-art (SOTA) models at +the time of evaluation. In addition, we discuss issues around safety and +representation, as well as methods we used to minimize the potential harm of +our models. + +
+
+
+
+
+ + ☆ Low-Bitwidth Floating Point Quantization for Efficient High-Quality + Diffusion Models + + +
+ Diffusion models are emerging models that generate images by iteratively
+denoising random Gaussian noise using deep neural networks. These models
+typically exhibit high computational and memory demands, necessitating
+effective post-training quantization for high-performance inference. Recent
+works propose low-bitwidth (e.g., 8-bit or 4-bit) quantization for diffusion
+models; however, 4-bit integer quantization typically results in low-quality
+images. We observe that on several widely used hardware platforms, there is
+little or no difference in compute capability between floating-point and
+integer arithmetic operations of the same bitwidth (e.g., 8-bit or 4-bit).
+Therefore, we propose an effective floating-point quantization method for
+diffusion models that provides better image quality compared to integer
+quantization methods. We employ a floating-point quantization method that was
+effective for other processing tasks, specifically computer vision and natural
+language tasks, and tailor it for diffusion models by integrating weight
+rounding learning during the mapping of the full-precision values to the
+quantized values in the quantization process. We comprehensively study integer
+and floating-point quantization methods in state-of-the-art diffusion models.
+Our floating-point quantization method not only generates higher-quality images
+than integer quantization methods, but also shows no noticeable degradation
+compared to full-precision models (32-bit floating-point) when both weights and
+activations are quantized to 8-bit floating-point values, while having minimal
+degradation with 4-bit weights and 8-bit activations.
+
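+ A simplified sketch of low-bitwidth floating-point quantization: values are
+ snapped to the nearest representable number of a tiny float format. The grid
+ below is an E2M1-style (4-bit) example; the learned weight rounding described
+ above is omitted, so this is an assumption-laden illustration, not the paper's
+ method.
+ ```python
+ import numpy as np
+
+ # Positive representable magnitudes of a 1-sign/2-exponent/1-mantissa format.
+ FP4_GRID = np.array([0.0, 0.5, 1.0, 1.5, 2.0, 3.0, 4.0, 6.0])
+
+ def quantize_fp4(x, scale=None):
+     # Per-tensor scale so the largest magnitude maps to the largest grid value.
+     scale = scale or (np.abs(x).max() / FP4_GRID.max() + 1e-12)
+     mag = np.abs(x) / scale
+     idx = np.abs(mag[..., None] - FP4_GRID).argmin(axis=-1)  # nearest grid point
+     return np.sign(x) * FP4_GRID[idx] * scale
+
+ w = np.random.randn(4, 4).astype(np.float32)
+ print(np.abs(w - quantize_fp4(w)).mean())  # mean quantization error
+ ```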
+
+
+
+
+ + ☆ SpectralGaussians: Semantic, spectral 3D Gaussian splatting for + multi-spectral scene representation, visualization and analysis + + +
+ We propose a novel cross-spectral rendering framework based on 3D Gaussian +Splatting (3DGS) that generates realistic and semantically meaningful splats +from registered multi-view spectrum and segmentation maps. This extension +enhances the representation of scenes with multiple spectra, providing insights +into the underlying materials and segmentation. We introduce an improved +physically-based rendering approach for Gaussian splats, estimating reflectance +and lights per spectra, thereby enhancing accuracy and realism. In a +comprehensive quantitative and qualitative evaluation, we demonstrate the +superior performance of our approach with respect to other recent +learning-based spectral scene representation approaches (i.e., XNeRF and +SpectralNeRF) as well as other non-spectral state-of-the-art learning-based +approaches. Our work also demonstrates the potential of spectral scene +understanding for precise scene editing techniques like style transfer, +inpainting, and removal. Thereby, our contributions address challenges in +multi-spectral scene representation, rendering, and editing, offering new +possibilities for diverse applications. + +
+
+
+
+
+ + ☆ Prompt-Based Segmentation at Multiple Resolutions and Lighting + Conditions using Segment Anything Model 2 + + +
+ This paper provides insight into the effectiveness of zero-shot, +prompt-based, Segment Anything Model (SAM), and its updated version, SAM 2, and +the non-promptable, conventional convolutional network (CNN), in segmenting +solar panels, in RGB aerial imagery, across lighting conditions, spatial +resolutions, and prompt strategies. SAM 2 demonstrates improvements over SAM, +particularly in sub-optimal lighting conditions when prompted by points. Both +SAMs, prompted by user-box, outperformed CNN, in all scenarios. Additionally, +YOLOv9 prompting outperformed user points prompting. In high-resolution +imagery, both in optimal and sub-optimal lighting conditions, Eff-UNet +outperformed both SAM models prompted by YOLOv9 boxes, positioning Eff-UNet as +the appropriate model for automatic segmentation in high-resolution data. In +low-resolution data, user box prompts were found crucial to achieve a +reasonable performance. This paper provides details on strengths and +limitations of each model and outlines robustness of user prompted image +segmentation models in inconsistent resolution and lighting conditions of +remotely sensed data. + +
+
+
+
+
+ + ☆ Event-Stream Super Resolution using Sigma-Delta Neural Network ECCV + + +
+ This study introduces a novel approach to enhance the spatial-temporal +resolution of time-event pixels based on luminance changes captured by event +cameras. These cameras present unique challenges due to their low resolution +and the sparse, asynchronous nature of the data they collect. Current event +super-resolution algorithms are not fully optimized for the distinct data +structure produced by event cameras, resulting in inefficiencies in capturing +the full dynamism and detail of visual scenes with improved computational +complexity. To bridge this gap, our research proposes a method that integrates +binary spikes with Sigma Delta Neural Networks (SDNNs), leveraging +spatiotemporal constraint learning mechanism designed to simultaneously learn +the spatial and temporal distributions of the event stream. The proposed +network is evaluated using widely recognized benchmark datasets, including +N-MNIST, CIFAR10-DVS, ASL-DVS, and Event-NFS. A comprehensive evaluation +framework is employed, assessing both the accuracy, through root mean square +error (RMSE), and the computational efficiency of our model. The findings +demonstrate significant improvements over existing state-of-the-art methods, +specifically, the proposed method outperforms state-of-the-art performance in +computational efficiency, achieving a 17.04-fold improvement in event sparsity +and a 32.28-fold increase in synaptic operation efficiency over traditional +artificial neural networks, alongside a two-fold better performance over +spiking neural networks. + +
+
+ comment: ECCV: The 18th European Conference on Computer Vision ECCV 2024 NeVi + Workshop +
+
+
+
+
+ + ☆ Breaking Class Barriers: Efficient Dataset Distillation via Inter-Class + Feature Compensator + + +
+ Dataset distillation has emerged as a technique aiming to condense +informative features from large, natural datasets into a compact and synthetic +form. While recent advancements have refined this technique, its performance is +bottlenecked by the prevailing class-specific synthesis paradigm. Under this +paradigm, synthetic data is optimized exclusively for a pre-assigned one-hot +label, creating an implicit class barrier in feature condensation. This leads +to inefficient utilization of the distillation budget and oversight of +inter-class feature distributions, which ultimately limits the effectiveness +and efficiency, as demonstrated in our analysis. + To overcome these constraints, this paper presents the Inter-class Feature +Compensator (INFER), an innovative distillation approach that transcends the +class-specific data-label framework widely utilized in current dataset +distillation methods. Specifically, INFER leverages a Universal Feature +Compensator (UFC) to enhance feature integration across classes, enabling the +generation of multiple additional synthetic instances from a single UFC input. +This significantly improves the efficiency of the distillation budget. + Moreover, INFER enriches inter-class interactions during the distillation, +thereby enhancing the effectiveness and generalizability of the distilled data. +By allowing for the linear interpolation of labels similar to those in the +original dataset, INFER meticulously optimizes the synthetic data and +dramatically reduces the size of soft labels in the synthetic dataset to almost +zero, establishing a new benchmark for efficiency and effectiveness in dataset +distillation. + +
+
+
+
+
+ + ☆ SceneGPT: A Language Model for 3D Scene Understanding + + +
+ Building models that can understand and reason about 3D scenes is difficult
+owing to the lack of data sources for 3D supervised training and large-scale
+training regimes. In this work we ask: how can the knowledge in a pre-trained
+language model be leveraged for 3D scene understanding without any 3D
+pre-training? The aim of this work is to establish whether pre-trained LLMs
+possess priors/knowledge required for reasoning in 3D space and how we can
+prompt them such that they can be used for general purpose spatial reasoning
+and object understanding in 3D. To this end, we present SceneGPT, an LLM-based
+scene understanding system which can perform 3D spatial reasoning without
+training or explicit 3D supervision. The key components of our framework are:
+1) a 3D scene graph that serves as the scene representation, encoding the
+objects in the scene and their spatial relationships, and 2) a pre-trained LLM
+that can be adapted with in-context learning for 3D spatial reasoning. We
+evaluate our framework qualitatively on object and scene understanding tasks
+including object semantics, physical properties and affordances (object-level)
+and spatial understanding (scene-level).
+
+
+ comment: UBC Report +
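+ A hedged sketch of the two components described above: a toy 3D scene graph
+ serialised into a prompt for an off-the-shelf LLM. Object names, fields, and
+ the prompt wording are illustrative assumptions.
+ ```python
+ import json
+
+ scene_graph = {
+     "objects": [
+         {"id": "chair_1", "category": "chair", "centroid": [1.2, 0.0, 0.4]},
+         {"id": "table_1", "category": "table", "centroid": [1.0, 0.0, 0.7]},
+     ],
+     "relations": [["chair_1", "next to", "table_1"]],
+ }
+
+ question = "Which object could a person sit on, and what is it next to?"
+ prompt = (
+     "You are given a 3D scene graph with object centroids in metres.\n"
+     f"{json.dumps(scene_graph, indent=2)}\n"
+     f"Question: {question}\nAnswer with the object ids."
+ )
+ print(prompt)  # send this prompt to any pre-trained LLM; no 3D training needed
+ ```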
+
+
+
+
+ + ☆ Divide and Conquer: Improving Multi-Camera 3D Perception with 2D + Semantic-Depth Priors and Input-Dependent Queries + + +
+ 3D perception tasks, such as 3D object detection and Bird's-Eye-View (BEV) +segmentation using multi-camera images, have drawn significant attention +recently. Despite the fact that accurately estimating both semantic and 3D +scene layouts are crucial for this task, existing techniques often neglect the +synergistic effects of semantic and depth cues, leading to the occurrence of +classification and position estimation errors. Additionally, the +input-independent nature of initial queries also limits the learning capacity +of Transformer-based models. To tackle these challenges, we propose an +input-aware Transformer framework that leverages Semantics and Depth as priors +(named SDTR). Our approach involves the use of an S-D Encoder that explicitly +models semantic and depth priors, thereby disentangling the learning process of +object categorization and position estimation. Moreover, we introduce a +Prior-guided Query Builder that incorporates the semantic prior into the +initial queries of the Transformer, resulting in more effective input-aware +queries. Extensive experiments on the nuScenes and Lyft benchmarks demonstrate +the state-of-the-art performance of our method in both 3D object detection and +BEV segmentation tasks. + +
+
+ comment: Accepted by TIP 2024 +
+
+
+
+
+ + ☆ EE3P3D: Event-based Estimation of Periodic Phenomena Frequency using 3D + Correlation + + +
+ We present a novel method for measuring the frequency of periodic phenomena, +e.g., rotation, flicker and vibration, by an event camera, a device +asynchronously reporting brightness changes at independently operating pixels +with high temporal resolution. The approach assumes that for a periodic +phenomenon, a highly similar set of events is generated within a specific +spatio-temporal window at a time difference corresponding to the phenomenon's +period. The sets of similar events are detected by 3D spatio-temporal +correlation in the event stream space. The proposed method, EE3P3D, is +evaluated on a dataset of 12 sequences of periodic phenomena, i.e. flashing +light and vibration, and periodic motion, e.g., rotation, ranging from 3.2 Hz +to 2 kHz (equivalent to 192 - 120 000 RPM). EE3P3D significantly outperforms +published methods on this dataset, achieving a mean relative error of 0.1%. + +
+
+ comment: 15 paper pages + 11 suppl pages, 15 figures, 4 tables
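+ A much-simplified 1D analogue of the idea above: bin event timestamps into a
+ rate signal and take the lag at which the signal best correlates with itself;
+ the reciprocal of that lag is the frequency estimate. EE3P3D itself correlates
+ full 3D spatio-temporal event windows, so this is only an illustrative sketch.
+ ```python
+ import numpy as np
+
+ def estimate_frequency(timestamps_s, bin_s=1e-3, min_lag_bins=5):
+     counts, _ = np.histogram(timestamps_s,
+                              bins=np.arange(0, timestamps_s.max() + bin_s, bin_s))
+     counts = counts - counts.mean()
+     ac = np.correlate(counts, counts, mode="full")[len(counts) - 1:]  # autocorrelation
+     lag = min_lag_bins + np.argmax(ac[min_lag_bins:])  # skip the zero-lag peak
+     return 1.0 / (lag * bin_s)
+
+ # Synthetic 50 Hz flicker: a burst of events every 20 ms.
+ t = np.concatenate([k * 0.02 + np.random.uniform(0, 1e-3, 40) for k in range(100)])
+ print(round(estimate_frequency(np.sort(t)), 1))  # ~50.0 Hz
+ ```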
+
+
+
+
+ + ☆ Automatic Feature Recognition and Dimensional Attributes Extraction From + CAD Models for Hybrid Additive-Subtractive Manufacturing + + +
+ The integration of Computer-Aided Design (CAD), Computer-Aided Process
+Planning (CAPP), and Computer-Aided Manufacturing (CAM) plays a crucial role in
+modern manufacturing, facilitating seamless transitions from digital designs to
+physical products. However, a significant challenge within this integration is
+the Automatic Feature Recognition (AFR) of CAD models, especially in the
+context of hybrid manufacturing that combines subtractive and additive
+manufacturing processes. Traditional AFR methods, focused mainly on the
+identification of subtractive (machined) features including holes, fillets,
+chamfers, pockets, and slots, fail to recognize features pertinent to additive
+manufacturing. Furthermore, the traditional methods fall short in accurately
+extracting geometric dimensions and orientations, which are also key factors
+for effective manufacturing process planning. This paper presents a novel
+approach for creating a synthetic CAD dataset that encompasses features
+relevant to both additive and subtractive machining through Python Open
+Cascade. The Hierarchical Graph Convolutional Neural Network (HGCNN) model is
+implemented to accurately identify the composite additive-subtractive features
+within the synthetic CAD dataset. The key novelty and contribution of the
+proposed methodology lie in its ability to recognize a wide range of
+manufacturing features and to precisely extract their dimensions,
+orientations, and stock sizes. The proposed model demonstrates remarkable
+feature recognition accuracy exceeding 97% and a dimension extraction accuracy
+of 100% for identified features. Therefore, the proposed methodology enhances
+the integration of CAD, CAPP, and CAM within hybrid manufacturing by providing
+precise feature recognition and dimension extraction. It facilitates improved
+manufacturing process planning by enabling more informed decision-making.
+
+
+ comment: 10 pages, 12 figures. This paper has been accepted for presentation + at the ASME IDETC-CIE 2024 conference +
+
+
+
+
+ + ☆ PBIR-NIE: Glossy Object Capture under Non-Distant Lighting + + +
+ Glossy objects present a significant challenge for 3D reconstruction from +multi-view input images under natural lighting. In this paper, we introduce +PBIR-NIE, an inverse rendering framework designed to holistically capture the +geometry, material attributes, and surrounding illumination of such objects. We +propose a novel parallax-aware non-distant environment map as a lightweight and +efficient lighting representation, accurately modeling the near-field +background of the scene, which is commonly encountered in real-world capture +setups. This feature allows our framework to accommodate complex parallax +effects beyond the capabilities of standard infinite-distance environment maps. +Our method optimizes an underlying signed distance field (SDF) through +physics-based differentiable rendering, seamlessly connecting surface gradients +between a triangle mesh and the SDF via neural implicit evolution (NIE). To +address the intricacies of highly glossy BRDFs in differentiable rendering, we +integrate the antithetic sampling algorithm to mitigate variance in the Monte +Carlo gradient estimator. Consequently, our framework exhibits robust +capabilities in handling glossy object reconstruction, showcasing superior +quality in geometry, relighting, and material estimation. + +
+
+
+
+
+ + ☆ A Comprehensive Survey on Synthetic Infrared Image synthesis + + +
+ Synthetic infrared (IR) scene and target generation is an important computer +vision problem as it allows the generation of realistic IR images and targets +for training and testing of various applications, such as remote sensing, +surveillance, and target recognition. It also helps reduce the cost and risk +associated with collecting real-world IR data. This survey paper aims to +provide a comprehensive overview of the conventional mathematical +modelling-based methods and deep learning-based methods used for generating +synthetic IR scenes and targets. The paper discusses the importance of +synthetic IR scene and target generation and briefly covers the mathematics of +blackbody and grey body radiations, as well as IR image-capturing methods. The +potential use cases of synthetic IR scenes and target generation are also +described, highlighting the significance of these techniques in various fields. +Additionally, the paper explores possible new ways of developing new techniques +to enhance the efficiency and effectiveness of synthetic IR scenes and target +generation while highlighting the need for further research to advance this +field. + +
+
+ comment: Submitted in Journal of Infrared Physics & Technology +
+
+
+
+
+ + ☆ Dynamic and Compressive Adaptation of Transformers From Images to Videos + + +
+ Recently, the remarkable success of pre-trained Vision Transformers (ViTs) +from image-text matching has sparked an interest in image-to-video adaptation. +However, most current approaches retain the full forward pass for each frame, +leading to a high computation overhead for processing entire videos. In this +paper, we present InTI, a novel approach for compressive image-to-video +adaptation using dynamic Inter-frame Token Interpolation. InTI aims to softly +preserve the informative tokens without disrupting their coherent +spatiotemporal structure. Specifically, each token pair at identical positions +within neighbor frames is linearly aggregated into a new token, where the +aggregation weights are generated by a multi-scale context-aware network. In +this way, the information of neighbor frames can be adaptively compressed in a +point-by-point manner, thereby effectively reducing the number of processed +frames by half each time. Importantly, InTI can be seamlessly integrated with +existing adaptation methods, achieving strong performance without extra-complex +design. On Kinetics-400, InTI reaches a top-1 accuracy of 87.1 with a +remarkable 37.5% reduction in GFLOPs compared to naive adaptation. When +combined with additional temporal modules, InTI achieves a top-1 accuracy of +87.6 with a 37% reduction in GFLOPs. Similar conclusions have been verified in +other common datasets. + +
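+ A minimal sketch of inter-frame token interpolation as described above: tokens
+ at the same spatial position in two neighbouring frames are merged into one
+ token with a learned, content-dependent weight. The tiny gating MLP below
+ stands in for the paper's multi-scale context-aware network and is an
+ assumption, not the authors' architecture.
+ ```python
+ import torch
+ import torch.nn as nn
+
+ class TokenInterpolation(nn.Module):
+     def __init__(self, dim):
+         super().__init__()
+         self.gate = nn.Sequential(nn.Linear(2 * dim, dim), nn.GELU(),
+                                   nn.Linear(dim, 1), nn.Sigmoid())
+
+     def forward(self, x):              # x: (B, T, N, D) frame tokens, T even
+         a, b = x[:, 0::2], x[:, 1::2]  # pair up neighbouring frames
+         w = self.gate(torch.cat([a, b], dim=-1))  # (B, T/2, N, 1) weights
+         return w * a + (1 - w) * b     # (B, T/2, N, D): half as many frames
+
+ tokens = torch.randn(2, 8, 196, 768)
+ print(TokenInterpolation(768)(tokens).shape)  # torch.Size([2, 4, 196, 768])
+ ```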
+
+
+
+
+ + ☆ GLGait: A Global-Local Temporal Receptive Field Network for Gait + Recognition in the Wild ACM MM2024 + + +
+ Gait recognition has attracted increasing attention from academia and
+industry as a human recognition technology from a distance in non-intrusive
+ways without requiring cooperation. Although advanced methods have achieved
+impressive success in lab scenarios, most of them perform poorly in the wild.
+Recently, some Convolutional Neural Network (ConvNet) based methods have been
+proposed to address the issue of gait recognition in the wild. However, the
+temporal receptive field obtained by convolution operations is limited for long
+gait sequences. If directly replacing convolution blocks with visual
+transformer blocks, the model may not enhance a local temporal receptive field,
+which is important for covering a complete gait cycle. To address this issue,
+we design a Global-Local Temporal Receptive Field Network (GLGait). GLGait
+employs a Global-Local Temporal Module (GLTM) to establish a global-local
+temporal receptive field, which mainly consists of a Pseudo Global Temporal
+Self-Attention (PGTA) and a temporal convolution operation. Specifically, PGTA
+is used to obtain a pseudo global temporal receptive field with less memory and
+computation complexity compared with a multi-head self-attention (MHSA). The
+temporal convolution operation is used to enhance the local temporal receptive
+field. Besides, it can also aggregate the pseudo global temporal receptive
+field into a true holistic temporal receptive field. Furthermore, we also
+propose a Center-Augmented Triplet Loss (CTL) in GLGait to reduce the
+intra-class distance and expand the positive samples in the training stage.
+Extensive experiments show that our method obtains state-of-the-art results on
+in-the-wild datasets, i.e., Gait3D and GREW. The code is available at
+https://github.com/bgdpgz/GLGait.
+
+
+ comment: Accepted by ACM MM2024 +
+
+
+
+
+ + ☆ FlatFusion: Delving into Details of Sparse Transformer-based + Camera-LiDAR Fusion for Autonomous Driving + + +
+ The integration of data from diverse sensor modalities (e.g., camera and
+LiDAR) constitutes a prevalent methodology within the ambit of autonomous
+driving scenarios. Recent advancements in efficient point cloud transformers
+have underscored the efficacy of integrating information in sparse formats.
+When it comes to fusion, since image patches are dense in pixel space with
+ambiguous depth, it necessitates additional design considerations for effective
+fusion. In this paper, we conduct a comprehensive exploration of design choices
+for Transformer-based sparse camera-LiDAR fusion. This investigation encompasses
+strategies for image-to-3D and LiDAR-to-2D mapping, attention neighbor
+grouping, single modal tokenizer, and micro-structure of Transformer. By
+amalgamating the most effective principles uncovered through our investigation,
+we introduce FlatFusion, a carefully designed framework for sparse camera-LiDAR
+fusion. Notably, FlatFusion significantly outperforms state-of-the-art sparse
+Transformer-based methods, including UniTR, CMT, and SparseFusion, achieving
+73.7 NDS on the nuScenes validation set at 10.1 FPS with PyTorch.
+
+
+
+
+
+ + ☆ Photometric Inverse Rendering: Shading Cues Modeling and Surface + Reflectance Regularization + + +
+ This paper addresses the problem of inverse rendering from photometric +images. Existing approaches for this problem suffer from the effects of +self-shadows, inter-reflections, and lack of constraints on the surface +reflectance, leading to inaccurate decomposition of reflectance and +illumination due to the ill-posed nature of inverse rendering. In this work, we +propose a new method for neural inverse rendering. Our method jointly optimizes +the light source position to account for the self-shadows in images, and +computes indirect illumination using a differentiable rendering layer and an +importance sampling strategy. To enhance surface reflectance decomposition, we +introduce a new regularization by distilling DINO features to foster accurate +and consistent material decomposition. Extensive experiments on synthetic and +real datasets demonstrate that our method outperforms the state-of-the-art +methods in reflectance decomposition. + +
+
+ comment: Project page: https://jzbao03.site/projects/PIR/ +
+
+
+
+
+ + ☆ Membership Inference Attack Against Masked Image Modeling + + +
+ Masked Image Modeling (MIM) has achieved significant success in the realm of
+self-supervised learning (SSL) for visual recognition. The image encoder
+pre-trained through MIM, involving the masking and subsequent reconstruction of
+input images, attains state-of-the-art performance in various downstream vision
+tasks. However, most existing works focus on improving the performance of
+MIM. In this work, we take a different angle by studying the pre-training data
+privacy of MIM. Specifically, we propose the first membership inference attack
+against image encoders pre-trained by MIM, which aims to determine whether an
+image is part of the MIM pre-training dataset. The key design is to simulate
+the pre-training paradigm of MIM, i.e., image masking and subsequent
+reconstruction, and then obtain reconstruction errors. These reconstruction
+errors can serve as membership signals for achieving attack goals, as the
+encoder is more capable of reconstructing the input image in its training set
+with lower errors. Extensive evaluations are conducted on three model
+architectures and three benchmark datasets. Empirical results show that our
+attack outperforms baseline methods. Additionally, we undertake intricate
+ablation studies to analyze multiple factors that could influence the
+performance of the attack.
+
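+ A hedged sketch of the attack intuition above: mask an image, let the
+ pre-trained MIM model reconstruct it, and use the reconstruction error as the
+ membership score. `mim_model` and its `reconstruct` method are placeholders for
+ any masked-autoencoder-style model, not a real API.
+ ```python
+ import torch
+
+ def membership_score(mim_model, image, mask_ratio=0.75, trials=4):
+     errors = []
+     for _ in range(trials):                      # average over random masks
+         # Placeholder call: returns a reconstruction and the binary patch mask.
+         recon, mask = mim_model.reconstruct(image, mask_ratio=mask_ratio)
+         err = ((recon - image) ** 2 * mask).sum() / mask.sum()
+         errors.append(err.item())
+     return -sum(errors) / len(errors)            # higher score => likely member
+
+ # A threshold (or a small attack classifier) over these scores completes the attack.
+ ```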
+
+
+
+
+ + ☆ Structure-preserving Planar Simplification for Indoor Environments + + +
+ This paper presents a novel approach for structure-preserving planar +simplification of indoor scene point clouds for both simulated and real-world +environments. Initially, the scene point cloud undergoes preprocessing steps, +including noise reduction and Manhattan world alignment, to ensure robustness +and coherence in subsequent analyses. We segment each captured scene into +structured (walls-ceiling-floor) and non-structured (indoor objects) scenes. +Leveraging a RANSAC algorithm, we extract primitive planes from the input point +cloud, facilitating the segmentation and simplification of the structured +scene. The best-fitting wall meshes are then generated from the primitives, +followed by adjacent mesh merging with the vertex-translation algorithm which +preserves the mesh layout. To accurately represent ceilings and floors, we +employ the mesh clipping algorithm which clips the ceiling and floor meshes +with respect to wall normals. In the case of indoor scenes, we apply a surface +reconstruction technique to enhance the fidelity. This paper focuses on the +intricate steps of the proposed scene simplification methodology, addressing +complex scenarios such as multi-story and slanted walls and ceilings. We also +conduct qualitative and quantitative performance comparisons against popular +surface reconstruction, shape approximation, and floorplan generation +approaches. + +
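+ A sketch of the primitive-plane extraction step described above, using
+ Open3D's RANSAC plane segmentation to iteratively peel dominant planes
+ (walls/ceiling/floor) off an indoor point cloud. The file path and thresholds
+ are placeholders; the full pipeline (alignment, mesh fitting, clipping) is not
+ shown.
+ ```python
+ import open3d as o3d
+
+ pcd = o3d.io.read_point_cloud("indoor_scan.ply")  # placeholder input
+ planes = []
+ for _ in range(6):                      # extract up to six dominant planes
+     model, inliers = pcd.segment_plane(distance_threshold=0.02,
+                                        ransac_n=3, num_iterations=1000)
+     if len(inliers) < 5000:             # stop when planes become insignificant
+         break
+     planes.append((model, pcd.select_by_index(inliers)))
+     pcd = pcd.select_by_index(inliers, invert=True)  # remove inliers, repeat
+
+ print(f"extracted {len(planes)} primitive planes")
+ ```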
+
+
+
+
+ + ☆ Oracle Bone Script Similiar Character Screening Approach Based on + Simsiam Contrastive Learning and Supervised Learning + + +
+ This project proposes a new method that uses the fuzzy comprehensive
+evaluation method to integrate ResNet-50 self-supervised learning and RepVGG
+supervised learning. The HWOBC oracle-bone source image dataset is taken as
+input, the target image is selected, and finally the most similar images are
+output in turn without any manual intervention. The same feature encoding
+method is not used for images of different modalities. Before model training,
+the image data is preprocessed, and the images are enhanced by random rotation,
+a histogram-equalization algorithm, and gamma transform, which effectively
+enhances key feature learning. Finally, the fuzzy comprehensive evaluation
+method is used to combine the results of supervised training and unsupervised
+training, which can better solve the "most similar" problem that is difficult
+to quantify. At present, many oracle-bone inscriptions remain undeciphered;
+linking them with similar known glyphs can provide new ideas for decipherment.
+
+
+
+
+
+ + ☆ Unmasking the Uniqueness: A Glimpse into Age-Invariant Face Recognition + of Indigenous African Faces + + +
+ The task of recognizing the age-separated faces of an individual,
+Age-Invariant Face Recognition (AIFR), has received considerable research
+efforts in Europe, America, and Asia, compared to Africa. Thus, AIFR research
+efforts have often under-represented/misrepresented the African ethnicity with
+non-indigenous Africans. This work developed an AIFR system for indigenous
+African faces to reduce the misrepresentation of African ethnicity in facial
+image analysis research. We adopted a pre-trained deep learning model (VGGFace)
+for AIFR on a dataset of 5,000 indigenous African faces (FAGE_v2) collected
+for this study. FAGE_v2 was curated via Internet image searches of 500
+individuals evenly distributed across 10 African countries. VGGFace was trained
+on FAGE_v2 to obtain the best accuracy of 81.80%. We also performed
+experiments on an African-American subset of the CACD dataset and obtained the
+best accuracy of 91.5%. The results show a significant difference in the
+recognition accuracies of indigenous versus non-indigenous Africans.
+
+
+ comment: Keywords: Age-Invariant Face Recognition, CACD, FAGE_v2, VGGFace +
+
+
+
+
+ + ☆ Integrating Saliency Ranking and Reinforcement Learning for Enhanced + Object Detection + + +
+ With the ever-growing variety of object detection approaches, this study +explores a series of experiments that combine reinforcement learning (RL)-based +visual attention methods with saliency ranking techniques to investigate +transparent and sustainable solutions. By integrating saliency ranking for +initial bounding box prediction and subsequently applying RL techniques to +refine these predictions through a finite set of actions over multiple time +steps, this study aims to enhance RL object detection accuracy. Presented as a +series of experiments, this research investigates the use of various image +feature extraction methods and explores diverse Deep Q-Network (DQN) +architectural variations for deep reinforcement learning-based localisation +agent training. Additionally, we focus on optimising the detection pipeline at +every step by prioritising lightweight and faster models, while also +incorporating the capability to classify detected objects, a feature absent in +previous RL approaches. We show that by evaluating the performance of these +trained agents using the Pascal VOC 2007 dataset, faster and more optimised +models were developed. Notably, the best mean Average Precision (mAP) achieved +in this study was 51.4, surpassing benchmarks set by RL-based single object +detectors in the literature. + +
+
+ comment: Resultant work from Dissertation, Department of AI, University of + Malta. Code available at: https://github.com/mbar0075/SaRLVision +
+
+
+
+
+ + ☆ Token Compensator: Altering Inference Cost of Vision Transformer without + Re-Tuning ECCV2024 + + +
+ Token compression expedites the training and inference of Vision Transformers +(ViTs) by reducing the number of the redundant tokens, e.g., pruning +inattentive tokens or merging similar tokens. However, when applied to +downstream tasks, these approaches suffer from significant performance drop +when the compression degrees are mismatched between training and inference +stages, which limits the application of token compression on off-the-shelf +trained models. In this paper, we propose a model arithmetic framework to +decouple the compression degrees between the two stages. In advance, we +additionally perform a fast parameter-efficient self-distillation stage on the +pre-trained models to obtain a small plugin, called Token Compensator (ToCom), +which describes the gap between models across different compression degrees. +During inference, ToCom can be directly inserted into any downstream +off-the-shelf models with any mismatched training and inference compression +degrees to acquire universal performance improvements without further training. +Experiments on over 20 downstream tasks demonstrate the effectiveness of our +framework. On CIFAR100, fine-grained visual classification, and VTAB-1k, ToCom +can yield up to a maximum improvement of 2.3%, 1.5%, and 2.0% in the average +performance of DeiT-B, respectively. Code: https://github.com/JieShibo/ToCom + +
+
+ comment: Accepted to ECCV2024 +
+
+
+
+
+ + ☆ Visual Neural Decoding via Improved Visual-EEG Semantic Consistency + + +
+ Visual neural decoding refers to the process of extracting and interpreting
+original visual experiences from human brain activity. Recent advances in
+metric learning-based EEG visual decoding methods have delivered promising
+results and demonstrated the feasibility of decoding novel visual categories
+from brain activity. However, methods that directly map EEG features to the
+CLIP embedding space may introduce mapping bias and cause semantic
+inconsistency among features, thereby degrading alignment and impairing
+decoding performance. To further explore the semantic consistency between
+visual and neural signals, in this work we construct a joint semantic space
+and propose a Visual-EEG Semantic Decouple Framework that explicitly extracts
+the semantic-related features of these two modalities to facilitate optimal
+alignment. Specifically, a cross-modal information decoupling module is
+introduced to guide the extraction of semantic-related information from both
+modalities. Then, by quantifying the mutual information between visual image
+and EEG features, we observe a strong positive correlation between the
+decoding performance and the magnitude of mutual information. Furthermore,
+inspired by the mechanisms of visual object understanding from neuroscience,
+we propose an intra-class geometric consistency approach during the alignment
+process. This strategy maps visual samples within the same class to
+consistent neural patterns, which further enhances the robustness and the
+performance of EEG visual decoding. Experiments on a large Image-EEG dataset
+show that our method achieves state-of-the-art results in zero-shot neural
+decoding tasks.
+
+
+
+
+
+
+ + ☆ Enhancing Diabetic Retinopathy Diagnosis: A Lightweight CNN Architecture + for Efficient Exudate Detection in Retinal Fundus Images + + +
+ Retinal fundus imaging plays an essential role in diagnosing various stages
+of diabetic retinopathy, where exudates are critical markers of early disease
+onset. Prompt detection of these exudates is pivotal for enabling
+optometrists to arrest or significantly decelerate the disease progression.
+This paper introduces a novel, lightweight convolutional neural network
+architecture tailored for automated exudate detection, designed to identify
+these markers efficiently and accurately. To address the challenge of limited
+training data, we have incorporated domain-specific data augmentations to
+enhance the model's generalizability. Furthermore, we applied a suite of
+regularization techniques within our custom architecture to boost diagnostic
+accuracy while optimizing computational efficiency. Remarkably, this
+streamlined model contains only 4.73 million parameters, a reduction of
+nearly 60% compared to the standard ResNet-18 model, which has 11.69 million
+parameters. Despite its reduced complexity, our model achieves an impressive
+F1 score of 90%, demonstrating its efficacy in the early detection of
+diabetic retinopathy through fundus imaging.
+
+
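+ As a quick sanity check on the quoted parameter counts (simple arithmetic on
+the numbers above, not taken from the paper), the reduction works out to
+roughly 60%:
+
+```python
+# Back-of-the-envelope check of the reported parameter reduction.
+ours = 4.73e6        # parameters of the proposed lightweight CNN (as quoted)
+resnet18 = 11.69e6   # parameters of a standard ResNet-18 (as quoted)
+print(f"reduction: {1.0 - ours / resnet18:.1%}")  # ~59.5%, i.e. "nearly 60%"
+```
+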
+
+
+
+
+ + ☆ Do Vision-Language Foundational models show Robust Visual Perception? + + +
+ Recent advances in vision-language foundational models have enabled
+development of systems that can perform visual understanding and reasoning
+tasks. However, it is unclear if these models are robust to distribution
+shifts, and how their performance and generalization capabilities vary under
+changes in data distribution. In this project we strive to answer the
+question "Are vision-language foundational models robust to distribution
+shifts like human perception?" Specifically, we consider a diverse range of
+vision-language models and compare how the performance of these systems is
+affected by corruption-based distribution shifts (such as \textit{motion
+blur, fog, snow, gaussian noise}) commonly found in practical real-world
+scenarios. We analyse the generalization capabilities qualitatively and
+quantitatively on the zero-shot image classification task under the
+aforementioned distribution shifts. Our code will be available at
+\url{https://github.com/shivam-chandhok/CPSC-540-Project}
+
+
+
+ comment: UBC Report +
+
+
+
+
+ + ☆ ED$^4$: Explicit Data-level Debiasing for Deepfake Detection + + +
+ Learning intrinsic bias from limited data has been considered the main
+reason for the failure of deepfake detection with generalizability. Apart
+from the discovered content and specific-forgery bias, we reveal a novel
+spatial bias, in which detectors inertly anticipate observing structural
+forgery clues at the image center, which can also lead to the poor
+generalization of existing methods. We present ED$^4$, a simple and effective
+strategy, to address the aforementioned biases explicitly at the data level
+in a unified framework rather than implicit disentanglement via network
+design. In particular, we develop ClockMix to produce facial-structure-
+preserved mixtures with arbitrary samples, which allows the detector to learn
+from an exponentially extended data distribution with much more diverse
+identities, backgrounds, local manipulation traces, and the co-occurrence of
+multiple forgery artifacts. We further propose the Adversarial Spatial
+Consistency Module (AdvSCM) to prevent extracting features with spatial bias,
+which adversarially generates spatial-inconsistent images and constrains
+their extracted features to be consistent. As a model-agnostic debiasing
+strategy, ED$^4$ is plug-and-play: it can be integrated with various deepfake
+detectors to obtain significant benefits. We conduct extensive experiments to
+demonstrate its effectiveness and superiority over existing deepfake
+detection approaches.
+
+
+
+
+
+
+ + ☆ Exploring Domain Shift on Radar-Based 3D Object Detection Amidst Diverse + Environmental Conditions SC + + +
+ The rapid evolution of deep learning and its integration with autonomous +driving systems have led to substantial advancements in 3D perception using +multimodal sensors. Notably, radar sensors show greater robustness compared to +cameras and lidar under adverse weather and varying illumination conditions. +This study delves into the often-overlooked yet crucial issue of domain shift +in 4D radar-based object detection, examining how varying environmental +conditions, such as different weather patterns and road types, impact 3D object +detection performance. Our findings highlight distinct domain shifts across +various weather scenarios, revealing unique dataset sensitivities that +underscore the critical role of radar point cloud generation. Additionally, we +demonstrate that transitioning between different road types, especially from +highways to urban settings, introduces notable domain shifts, emphasizing the +necessity for diverse data collection across varied road environments. To the +best of our knowledge, this is the first comprehensive analysis of domain shift +effects on 4D radar-based object detection. We believe this empirical study +contributes to understanding the complex nature of domain shifts in radar data +and suggests paths forward for data collection strategy in the face of +environmental variability. + +
+
+ comment: 6 pages, 5 figures, 3 tables, accepted in IEEE International + Conference on Intelligent Transportation Systems (ITSC) 2024 +
+
+
+
+
+ + ☆ Cross-View Geolocalization and Disaster Mapping with Street-View and VHR + Satellite Imagery: A Case Study of Hurricane IAN + + +
+ Natural disasters play a key role in shaping human-urban infrastructure
+interactions. Effective and efficient response to natural disasters is
+essential for building resilience and a sustainable urban environment. Two
+types of information are usually the most necessary and difficult to gather
+in disaster response. The first is disaster damage perception, which captures
+how badly people think that urban infrastructure has been damaged. The second
+is geolocation awareness, i.e., how people's whereabouts are made available.
+In this paper, we propose a novel disaster mapping framework, namely
+CVDisaster, aiming at simultaneously addressing geolocalization and damage
+perception estimation using cross-view Street-View Imagery (SVI) and Very
+High-Resolution satellite imagery. CVDisaster consists of two cross-view
+models, where CVDisaster-Geoloc refers to a cross-view geolocalization model
+based on a contrastive learning objective with a Siamese ConvNeXt image
+encoder, and CVDisaster-Est is a cross-view classification model based on a
+Couple Global Context Vision Transformer (CGCViT). Taking Hurricane IAN as a
+case study, we evaluate the CVDisaster framework by creating a novel
+cross-view dataset (CVIAN) and conducting extensive experiments. As a result,
+we show that CVDisaster can achieve highly competitive performance (over 80%
+for geolocalization and 75% for damage perception estimation) with even
+limited fine-tuning efforts, which largely motivates future cross-view models
+and applications within a broader GeoAI research community. The data and code
+are publicly available at: https://github.com/tum-bgd/CVDisaster.
+
+
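+ As an illustration of the kind of cross-view contrastive objective described
+above, here is a minimal sketch assuming PyTorch; the pairing convention,
+temperature, and loss form are illustrative assumptions, not the exact
+CVDisaster-Geoloc implementation:
+
+```python
+import torch
+import torch.nn.functional as F
+
+def cross_view_contrastive_loss(svi_emb, sat_emb, temperature=0.07):
+    """Symmetric InfoNCE over matched street-view / satellite embedding pairs.
+
+    svi_emb, sat_emb: (N, D) embeddings from the two branches of a Siamese
+    encoder; row i of each tensor is assumed to depict the same location.
+    """
+    svi = F.normalize(svi_emb, dim=-1)
+    sat = F.normalize(sat_emb, dim=-1)
+    logits = svi @ sat.t() / temperature          # (N, N) similarity matrix
+    targets = torch.arange(svi.size(0), device=svi.device)
+    return 0.5 * (F.cross_entropy(logits, targets) +
+                  F.cross_entropy(logits.t(), targets))
+```
+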
+
+
+
+
+ + ☆ Sumotosima: A Framework and Dataset for Classifying and Summarizing + Otoscopic Images + + +
+ Otoscopy is a diagnostic procedure to examine the ear canal and eardrum
+using an otoscope. It identifies conditions like infections, foreign bodies,
+ear drum perforations and ear abnormalities. We propose a novel
+resource-efficient deep learning and transformer based framework, Sumotosima
+(Summarizer for otoscopic images), an end-to-end pipeline for classification
+followed by summarization. Our framework is trained with a combination of
+triplet and cross-entropy losses. Additionally, we use Knowledge Enhanced
+Multimodal BART, whose input is a fusion of textual and image embeddings. The
+objective is to provide summaries that are well-suited for patients, ensuring
+clarity and efficiency in understanding otoscopic images. Given the lack of
+existing datasets, we have curated our own OCASD (Otoscopic Classification
+And Summary Dataset), which includes 500 images with 5 unique categories
+annotated with their class and summaries by Otolaryngologists. Sumotosima
+achieved a result of 98.03%, which is 7.00%, 3.10%, and 3.01% higher than
+K-Nearest Neighbors, Random Forest and Support Vector Machines, respectively,
+in classification tasks. For summarization, Sumotosima outperformed GPT-4o
+and LLaVA by 88.53% and 107.57% in ROUGE scores, respectively. We have made
+our code and dataset publicly available at https://github.com/anas2908/Sumotosima
+
+
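+ A minimal sketch of combining triplet and cross-entropy losses as described
+above (assuming PyTorch; the weighting factor and inputs are illustrative
+assumptions, not the paper's exact formulation):
+
+```python
+import torch.nn as nn
+
+triplet = nn.TripletMarginLoss(margin=1.0)
+ce = nn.CrossEntropyLoss()
+
+def combined_loss(anchor, positive, negative, logits, labels, alpha=0.5):
+    """Weighted sum of a triplet loss on image embeddings and a cross-entropy
+    loss on class logits (alpha is an assumed trade-off weight)."""
+    return alpha * triplet(anchor, positive, negative) + \
+        (1.0 - alpha) * ce(logits, labels)
+```
+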
+
+ comment: Work in Progress +
+
+
+
+
+ + ☆ Detecting Audio-Visual Deepfakes with Fine-Grained Inconsistencies BMVC 2024 + + +
+ Existing methods on audio-visual deepfake detection mainly focus on +high-level features for modeling inconsistencies between audio and visual data. +As a result, these approaches usually overlook finer audio-visual artifacts, +which are inherent to deepfakes. Herein, we propose the introduction of +fine-grained mechanisms for detecting subtle artifacts in both spatial and +temporal domains. First, we introduce a local audio-visual model capable of +capturing small spatial regions that are prone to inconsistencies with audio. +For that purpose, a fine-grained mechanism based on a spatially-local distance +coupled with an attention module is adopted. Second, we introduce a +temporally-local pseudo-fake augmentation to include samples incorporating +subtle temporal inconsistencies in our training set. Experiments on the DFDC +and the FakeAVCeleb datasets demonstrate the superiority of the proposed method +in terms of generalization as compared to the state-of-the-art under both +in-dataset and cross-dataset settings. + +
+
+ comment: Accepted in BMVC 2024 +
+
+
+
+
+ + ☆ ReCLIP++: Learn to Rectify the Bias of CLIP for Unsupervised Semantic + Segmentation CVPR 24 + + +
+ Recent works utilize CLIP to perform the challenging unsupervised semantic +segmentation task where only images without annotations are available. However, +we observe that when adopting CLIP to such a pixel-level understanding task, +unexpected bias (including class-preference bias and space-preference bias) +occurs. Previous works don't explicitly model the bias, which largely +constrains the segmentation performance. In this paper, we propose to +explicitly model and rectify the bias existing in CLIP to facilitate the +unsupervised semantic segmentation task. Specifically, we design a learnable +''Reference'' prompt to encode class-preference bias and a projection of the +positional embedding in vision transformer to encode space-preference bias +respectively. To avoid interference, two kinds of biases are firstly +independently encoded into the Reference feature and the positional feature. +Via a matrix multiplication between two features, a bias logit map is generated +to explicitly represent two kinds of biases. Then we rectify the logits of CLIP +via a simple element-wise subtraction. To make the rectified results smoother +and more contextual, we design a mask decoder which takes the feature of CLIP +and rectified logits as input and outputs a rectified segmentation mask with +the help of Gumbel-Softmax operation. To make the bias modeling and +rectification process meaningful and effective, a contrastive loss based on +masked visual features and the text features of different classes is imposed. +To further improve the segmentation, we distill the knowledge from the +rectified CLIP to the advanced segmentation architecture via minimizing our +designed mask-guided, feature-guided and text-guided loss terms. Extensive +experiments on various benchmarks demonstrate that ReCLIP++ performs favorably +against previous SOTAs. The implementation is available at: +https://github.com/dogehhh/ReCLIP. + +
+
+ comment: Extended version of our CVPR 24 paper +
+
+
+
+
+ + ☆ Long-Tailed Out-of-Distribution Detection: Prioritizing Attention to + Tail + + +
+ Current out-of-distribution (OOD) detection methods typically assume balanced +in-distribution (ID) data, while most real-world data follow a long-tailed +distribution. Previous approaches to long-tailed OOD detection often involve +balancing the ID data by reducing the semantics of head classes. However, this +reduction can severely affect the classification accuracy of ID data. The main +challenge of this task lies in the severe lack of features for tail classes, +leading to confusion with OOD data. To tackle this issue, we introduce a novel +Prioritizing Attention to Tail (PATT) method using augmentation instead of +reduction. Our main intuition involves using a mixture of von Mises-Fisher +(vMF) distributions to model the ID data and a temperature scaling module to +boost the confidence of ID data. This enables us to generate infinite +contrastive pairs, implicitly enhancing the semantics of ID classes while +promoting differentiation between ID and OOD data. To further strengthen the +detection of OOD data without compromising the classification performance of ID +data, we propose feature calibration during the inference phase. By extracting +an attention weight from the training set that prioritizes the tail classes and +reduces the confidence in OOD data, we improve the OOD detection capability. +Extensive experiments verified that our method outperforms the current +state-of-the-art methods on various benchmarks. + +
+
+
+
+
+ + ☆ Improving Synthetic Image Detection Towards Generalization: An Image + Transformation Perspective + + +
+ With recent generative models facilitating photo-realistic image synthesis,
+the proliferation of synthetic images has also engendered certain negative
+impacts on social platforms, thereby raising an urgent imperative to develop
+effective detectors. Current synthetic image detection (SID) pipelines are
+primarily dedicated to crafting universal artifact features, while
+overlooking the SID training paradigm. In this paper, we re-examine the SID
+problem and identify two prevalent biases in current training paradigms,
+i.e., weakened artifact features and overfitted artifact features. Meanwhile,
+we discover that the imaging mechanism of synthetic images contributes to
+heightened local correlations among pixels, suggesting that detectors should
+be equipped with local awareness. In this light, we propose SAFE, a
+lightweight and effective detector with three simple image transformations.
+Firstly, for weakened artifact features, we substitute the down-sampling
+operator with the crop operator in image pre-processing to help circumvent
+artifact distortion. Secondly, for overfitted artifact features, we include
+ColorJitter and RandomRotation as additional data augmentations, to help
+alleviate irrelevant biases from color discrepancies and semantic differences
+in limited training samples. Thirdly, for local awareness, we propose a
+patch-based random masking strategy tailored for SID, forcing the detector to
+focus on local regions during training. Comparative experiments are conducted
+on an open-world dataset, comprising synthetic images generated by 26
+distinct generative models. Our pipeline achieves a new state-of-the-art
+performance, with remarkable improvements of 4.5% in accuracy and 2.9% in
+average precision against existing methods.
+
+
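+ A hedged torchvision sketch of the three transformations named above (crop
+instead of down-sampling, ColorJitter/RandomRotation augmentation, and
+patch-wise random masking); all parameter values are illustrative assumptions
+rather than the paper's settings:
+
+```python
+import torch
+from torchvision import transforms
+
+# Crop instead of resize, plus the two extra augmentations named above.
+train_tf = transforms.Compose([
+    transforms.RandomCrop(224, pad_if_needed=True),
+    transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2),
+    transforms.RandomRotation(degrees=15),
+    transforms.ToTensor(),
+])
+
+def random_patch_mask(img, patch=16, mask_ratio=0.3):
+    """Zero out a random subset of non-overlapping patches (illustrative)."""
+    c, h, w = img.shape
+    gh, gw = h // patch, w // patch
+    keep = torch.rand(gh, gw) >= mask_ratio
+    mask = keep.repeat_interleave(patch, 0).repeat_interleave(patch, 1)
+    return img[:, :gh * patch, :gw * patch] * mask
+```
+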
+
+
+
+
+ + ☆ DiffLoRA: Generating Personalized Low-Rank Adaptation Weights with + Diffusion + + +
+ Personalized text-to-image generation has gained significant attention for
+its capability to generate high-fidelity portraits of specific identities
+conditioned on user-defined prompts. Existing methods typically involve
+test-time fine-tuning or incorporate an additional pre-trained branch.
+However, these approaches struggle to simultaneously address the demands of
+efficiency, identity fidelity, and preserving the model's original generative
+capabilities. In this paper, we propose DiffLoRA, a novel approach that
+leverages diffusion models as a hypernetwork to predict personalized low-rank
+adaptation (LoRA) weights based on the reference images. By integrating these
+LoRA weights into the text-to-image model, DiffLoRA achieves personalization
+during inference without further training. Additionally, we propose an
+identity-oriented LoRA weight construction pipeline to facilitate the
+training of DiffLoRA. By utilizing the dataset produced by this pipeline, our
+DiffLoRA consistently generates high-performance and accurate LoRA weights.
+Extensive evaluations demonstrate the effectiveness of our method, achieving
+time efficiency while maintaining identity fidelity throughout the
+personalization process.
+
+
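+ For context, a minimal LoRA-style linear layer of the kind whose low-rank
+factors a hypernetwork could predict (a sketch assuming PyTorch; the rank,
+scaling, and loading helper are illustrative assumptions, not DiffLoRA's
+implementation):
+
+```python
+import torch
+import torch.nn as nn
+
+class LoRALinear(nn.Module):
+    """Frozen base linear layer plus a low-rank update B @ A of rank r."""
+    def __init__(self, base: nn.Linear, r: int = 4, scale: float = 1.0):
+        super().__init__()
+        self.base = base
+        for p in self.base.parameters():
+            p.requires_grad = False
+        self.A = nn.Parameter(torch.randn(r, base.in_features) * 0.01)
+        self.B = nn.Parameter(torch.zeros(base.out_features, r))
+        self.scale = scale
+
+    def forward(self, x):
+        return self.base(x) + self.scale * (x @ self.A.t()) @ self.B.t()
+
+    def load_predicted_weights(self, A, B):
+        """Plug in LoRA factors produced elsewhere (e.g. by a hypernetwork)."""
+        with torch.no_grad():
+            self.A.copy_(A)
+            self.B.copy_(B)
+```
+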
+
+ comment: 9 pages,8 figures +
+
+
+
+
+ + ☆ Enhancing Visual Dialog State Tracking through Iterative Object-Entity + Alignment in Multi-Round Conversations + + +
+ Visual Dialog (VD) is a task where an agent answers a series of image-related +questions based on a multi-round dialog history. However, previous VD methods +often treat the entire dialog history as a simple text input, disregarding the +inherent conversational information flows at the round level. In this paper, we +introduce Multi-round Dialogue State Tracking model (MDST), a framework that +addresses this limitation by leveraging the dialogue state learned from dialog +history to answer questions. MDST captures each round of dialog history, +constructing internal dialogue state representations defined as 2-tuples of +vision-language representations. These representations effectively ground the +current question, enabling the generation of accurate answers. Experimental +results on the VisDial v1.0 dataset demonstrate that MDST achieves a new +state-of-the-art performance in generative setting. Furthermore, through a +series of human studies, we validate the effectiveness of MDST in generating +long, consistent, and human-like answers while consistently answering a series +of questions correctly. + +
+
+ comment: This article has been accepted in CAAI Transactions on Intelligence + Technology! Article ID: CIT2_12370, Article DOI: 10.1049/cit2.12370 +
+
+
+
+
+ + ☆ Response Wide Shut: Surprising Observations in Basic Vision Language + Model Capabilities + + +
+ Vision-Language Models (VLMs) have emerged as general purpose tools for
+addressing a variety of complex computer vision problems. Such models have
+been shown to be highly capable, but, at the same time, also lacking some
+basic visual understanding skills. In this paper, we set out to understand
+the limitations of SoTA VLMs on fundamental visual tasks: object
+classification, understanding spatial arrangement, and ability to delineate
+individual object instances (through counting), by constructing a series of
+tests that probe which components of the design, specifically, may be
+lacking. Importantly, we go significantly beyond current benchmarks, which
+simply measure the final performance of a VLM, by also comparing and
+contrasting it to the performance of probes trained directly on features
+obtained from the visual encoder (image embeddings), as well as on the
+intermediate vision-language projection used to bridge the image-encoder and
+LLM-decoder output in many SoTA models (e.g., LLaVA, BLIP, InstructBLIP). In
+doing so, we uncover nascent shortcomings in VLM responses and make a number
+of important observations which could help train and develop more effective
+VLM models in future.
+
+
+
+ comment: Under Submission +
+
+
+
+
+ + ☆ Multimodal Analysis of White Blood Cell Differentiation in Acute Myeloid + Leukemia Patients using a β-Variational Autoencoder MICCAI 2024 + + +
+ Biomedical imaging and RNA sequencing with single-cell resolution improve
+our understanding of white blood cell diseases like leukemia. By combining
+morphological and transcriptomic data, we can gain insights into cellular
+functions and trajectories involved in blood cell differentiation. However,
+existing methodologies struggle with integrating morphological and
+transcriptomic data, leaving a significant research gap in comprehensively
+understanding the dynamics of cell differentiation. Here, we introduce an
+unsupervised method that explores and reconstructs these two modalities and
+uncovers the relationship between different subtypes of white blood cells
+from human peripheral blood smears in terms of morphology and their
+corresponding transcriptome. Our method is based on a beta-variational
+autoencoder ($\beta$-VAE) with a customized loss function, incorporating an
+R-CNN architecture to distinguish single cells from background and to
+minimize any interference from artifacts. This implementation of the
+$\beta$-VAE shows good reconstruction capability along with continuous latent
+embeddings, while maintaining clear differentiation between single-cell
+classes. Our novel approach is especially helpful to uncover the correlation
+of two latent features in complex biological processes such as the formation
+of granules in the cell (granulopoiesis) with gene expression patterns. It
+thus provides a unique tool to improve the understanding of white blood cell
+maturation for biomedicine and diagnostics.
+
+
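+ A minimal sketch of the standard $\beta$-VAE objective underlying such a
+model (assuming PyTorch; the paper's customized loss terms and R-CNN masking
+are not reproduced here, and the beta value is illustrative):
+
+```python
+import torch
+import torch.nn.functional as F
+
+def beta_vae_loss(x, x_recon, mu, logvar, beta=4.0):
+    """Reconstruction term plus beta-weighted KL divergence to N(0, I).
+
+    mu, logvar: outputs of the encoder head; beta > 1 encourages a more
+    disentangled, continuous latent space.
+    """
+    recon = F.mse_loss(x_recon, x, reduction="sum") / x.size(0)
+    kl = -0.5 * torch.sum(1 + logvar - mu.pow(2) - logvar.exp()) / x.size(0)
+    return recon + beta * kl
+```
+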
+
+ comment: Accepted for publication at MICCAI 2024 workshop on AI for Imaging + Genomics Learning (AIIG) +
+
+
+
+
+ + ☆ Towards Cross-Domain Single Blood Cell Image Classification via + Large-Scale LoRA-based Segment Anything Model + + +
+ Accurate classification of blood cells plays a vital role in hematological +analysis as it aids physicians in diagnosing various medical conditions. In +this study, we present a novel approach for classifying blood cell images known +as BC-SAM. BC-SAM leverages the large-scale foundation model of Segment +Anything Model (SAM) and incorporates a fine-tuning technique using LoRA, +allowing it to extract general image embeddings from blood cell images. To +enhance the applicability of BC-SAM across different blood cell image datasets, +we introduce an unsupervised cross-domain autoencoder that focuses on learning +intrinsic features while suppressing artifacts in the images. To assess the +performance of BC-SAM, we employ four widely used machine learning classifiers +(Random Forest, Support Vector Machine, Artificial Neural Network, and XGBoost) +to construct blood cell classification models and compare them against existing +state-of-the-art methods. Experimental results conducted on two publicly +available blood cell datasets (Matek-19 and Acevedo-20) demonstrate that our +proposed BC-SAM achieves a new state-of-the-art result, surpassing the baseline +methods with a significant improvement. The source code of this paper is +available at https://github.com/AnoK3111/BC-SAM. + +
+
+
+
+
+ + ☆ Review Learning: Advancing All-in-One Ultra-High-Definition Image + Restoration Training Method + + +
+ All-in-one image restoration tasks are becoming increasingly important,
+especially for ultra-high-definition (UHD) images. Existing all-in-one UHD
+image restoration methods usually boost the model's performance by
+introducing prompts or customized dynamic networks for different degradation
+types. This may be convenient at the inference stage, but during training,
+since the model encounters multiple degraded images of different quality
+within an epoch, these cluttered learning objectives can become information
+pollution for the model. To address this problem, we propose a new training
+paradigm for general image restoration models, which we name \textbf{Review
+Learning}, and which enables image restoration models to be capable enough to
+handle multiple types of degradation without prior knowledge or prompts. This
+approach begins with sequential training of an image restoration model on
+several degraded datasets, combined with a review mechanism that enhances the
+image restoration model's memory for several previous classes of degraded
+datasets. In addition, we design a lightweight all-purpose image restoration
+network that can efficiently reason about degraded images with 4K ($3840
+\times 2160$) resolution on a single consumer-grade GPU.
+
+
+
+
+
+
+ + ☆ MAIR++: Improving Multi-view Attention Inverse Rendering with Implicit + Lighting Representation + + +
+ In this paper, we propose a scene-level inverse rendering framework that uses +multi-view images to decompose the scene into geometry, SVBRDF, and 3D +spatially-varying lighting. While multi-view images have been widely used for +object-level inverse rendering, scene-level inverse rendering has primarily +been studied using single-view images due to the lack of a dataset containing +high dynamic range multi-view images with ground-truth geometry, material, and +spatially-varying lighting. To improve the quality of scene-level inverse +rendering, a novel framework called Multi-view Attention Inverse Rendering +(MAIR) was recently introduced. MAIR performs scene-level multi-view inverse +rendering by expanding the OpenRooms dataset, designing efficient pipelines to +handle multi-view images, and splitting spatially-varying lighting. Although +MAIR showed impressive results, its lighting representation is fixed to +spherical Gaussians, which limits its ability to render images realistically. +Consequently, MAIR cannot be directly used in applications such as material +editing. Moreover, its multi-view aggregation networks have difficulties +extracting rich features because they only focus on the mean and variance +between multi-view features. In this paper, we propose its extended version, +called MAIR++. MAIR++ addresses the aforementioned limitations by introducing +an implicit lighting representation that accurately captures the lighting +conditions of an image while facilitating realistic rendering. Furthermore, we +design a directional attention-based multi-view aggregation network to infer +more intricate relationships between views. Experimental results show that +MAIR++ not only achieves better performance than MAIR and single-view-based +methods, but also displays robust performance on unseen real-world scenes. + +
+
+
+
+
+ + ☆ SlotLifter: Slot-guided Feature Lifting for Learning Object-centric + Radiance Fields ECCV 2024 + + +
+ The ability to distill object-centric abstractions from intricate visual +scenes underpins human-level generalization. Despite the significant progress +in object-centric learning methods, learning object-centric representations in +the 3D physical world remains a crucial challenge. In this work, we propose +SlotLifter, a novel object-centric radiance model addressing scene +reconstruction and decomposition jointly via slot-guided feature lifting. Such +a design unites object-centric learning representations and image-based +rendering methods, offering state-of-the-art performance in scene decomposition +and novel-view synthesis on four challenging synthetic and four complex +real-world datasets, outperforming existing 3D object-centric learning methods +by a large margin. Through extensive ablative studies, we showcase the efficacy +of designs in SlotLifter, revealing key insights for potential future +directions. + +
+
+ comment: Accepted by ECCV 2024. Project website: https://slotlifter.github.io +
+
+
+
+
+ + ☆ DC3DO: Diffusion Classifier for 3D Objects + + +
+ Inspired by Geoffrey Hinton's emphasis on generative modeling, "To recognize
+shapes, first learn to generate them", we explore the use of 3D diffusion
+models for object classification. Leveraging the density estimates from these
+models, our approach, the Diffusion Classifier for 3D Objects (DC3DO),
+enables zero-shot classification of 3D shapes without additional training. On
+average, our method achieves a 12.5 percent improvement compared to its
+multiview counterparts, demonstrating superior multimodal reasoning over
+discriminative approaches. DC3DO employs a class-conditional diffusion model
+trained on ShapeNet, and we run inferences on point clouds of chairs and
+cars. This work highlights the potential of generative models in 3D object
+classification.
+
+
+
+
+
+
+ + ☆ Masked Image Modeling: A Survey + + +
+ In this work, we survey recent studies on masked image modeling (MIM), an
+approach that emerged as a powerful self-supervised learning technique in
+computer vision. The MIM task involves masking some information, e.g. pixels,
+patches, or even latent representations, and training a model, usually an
+autoencoder, to predict the missing information by using the context
+available in the visible part of the input. We identify and formalize two
+categories of approaches for implementing MIM as a pretext task, one based on
+reconstruction and one based on contrastive learning. Then, we construct a
+taxonomy and review the most prominent papers in recent years. We complement
+the manually constructed taxonomy with a dendrogram obtained by applying a
+hierarchical clustering algorithm. We further identify relevant clusters by
+manually inspecting the resulting dendrogram. Our review also includes
+datasets that are commonly used in MIM research. We aggregate the performance
+results of various masked image modeling methods on the most popular
+datasets, to facilitate the comparison of competing methods. Finally, we
+identify research gaps and propose several interesting directions of future
+work.
+
+
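+ A minimal sketch of the reconstruction-based MIM pretext task described
+above (assuming PyTorch; the model interface, patch size, and masking ratio
+are illustrative assumptions):
+
+```python
+import torch
+import torch.nn.functional as F
+
+def mim_step(model, images, patch=16, mask_ratio=0.75):
+    """One masked-image-modeling step: mask random patches, reconstruct them.
+
+    `model` is assumed to map (images, mask) -> per-patch pixel predictions of
+    shape (B, num_patches, patch*patch*C); only masked patches are scored.
+    """
+    B, C, H, W = images.shape
+    num_patches = (H // patch) * (W // patch)
+    mask = torch.rand(B, num_patches, device=images.device) < mask_ratio
+
+    # Flatten ground-truth patches to (B, num_patches, patch*patch*C).
+    target = images.unfold(2, patch, patch).unfold(3, patch, patch)
+    target = target.permute(0, 2, 3, 1, 4, 5).reshape(B, num_patches, -1)
+
+    pred = model(images, mask)
+    return F.mse_loss(pred[mask], target[mask])
+```
+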
+
+
+
+
+ + ☆ How to Best Combine Demosaicing and Denoising? + + +
+ Image demosaicing and denoising play a critical role in the raw imaging
+pipeline. These processes have often been treated as independent, without
+considering their interactions. Indeed, most classic denoising methods handle
+noisy RGB images, not raw images. Conversely, most demosaicing methods
+address the demosaicing of noise-free images. The real problem is to jointly
+denoise and demosaic noisy raw images. But the question of how best to
+proceed remains open. In this paper, we carry out extensive experiments and a
+mathematical analysis to tackle this problem by low complexity algorithms.
+Indeed, both problems have only been addressed jointly by end-to-end
+heavyweight convolutional neural networks (CNNs), which are currently
+incompatible with low power portable imaging devices and remain by nature
+domain (or device) dependent. Our study leads us to conclude that, with
+moderate noise, demosaicing should be applied first, followed by denoising.
+This requires a simple adaptation of classic denoising algorithms to
+demosaiced noise, which we justify and specify. Although our main conclusion
+is ``demosaic first, then denoise'', we also discover that for high noise,
+there is a moderate PSNR gain from a more complex strategy: partial CFA
+denoising followed by demosaicing, and then a second denoising on the RGB
+image. These surprising results are obtained by a black-box optimization of
+the pipeline, which could be applied to any other pipeline. We validate our
+results on simulated and real noisy CFA images obtained from several
+benchmarks.
+
+
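+ A minimal OpenCV sketch of the recommended order for moderate noise,
+demosaic first and then denoise; the Bayer pattern and denoiser parameters
+are assumptions, and the paper's specific adaptation of the denoiser to
+demosaiced noise is not reproduced here:
+
+```python
+import cv2
+import numpy as np
+
+def demosaic_then_denoise(raw_bayer: np.ndarray) -> np.ndarray:
+    """raw_bayer: single-channel 8-bit CFA image (assumed BG Bayer pattern).
+
+    Following the "demosaic first, then denoise" conclusion: interpolate the
+    CFA to RGB, then run a classic denoiser on the resulting color image.
+    """
+    rgb = cv2.cvtColor(raw_bayer, cv2.COLOR_BayerBG2BGR)            # demosaic
+    return cv2.fastNlMeansDenoisingColored(rgb, None, 10, 10, 7, 21)  # denoise
+```
+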
+
+ comment: This paper was accepted by Inverse Problems and Imaging on October, + 2023 +
+
+
+
+
+ + ☆ Coherence Awareness in Diffractive Neural Networks + + +
+ Diffractive neural networks hold great promise for applications requiring +intensive computational processing. Considerable attention has focused on +diffractive networks for either spatially coherent or spatially incoherent +illumination. Here we illustrate that, as opposed to imaging systems, in +diffractive networks the degree of spatial coherence has a dramatic effect. In +particular, we show that when the spatial coherence length on the object is +comparable to the minimal feature size preserved by the optical system, neither +the incoherent nor the coherent extremes serve as acceptable approximations. +Importantly, this situation is inherent to many settings involving active +illumination, including reflected light microscopy, autonomous vehicles and +smartphones. Following this observation, we propose a general framework for +training diffractive networks for any specified degree of spatial and temporal +coherence, supporting all types of linear and nonlinear layers. Using our +method, we numerically optimize networks for image classification, and +thoroughly investigate their performance dependence on the illumination +coherence properties. We further introduce the concept of coherence-blind +networks, which have enhanced resilience to changes in illumination conditions. +Our findings serve as a steppingstone toward adopting all-optical neural +networks in real-world applications, leveraging nothing but natural light. + +
+
+
+
+
+ + ☆ Bi-directional Contextual Attention for 3D Dense Captioning ECCV 2024 + + +
+ 3D dense captioning is a task involving the localization of objects and the +generation of descriptions for each object in a 3D scene. Recent approaches +have attempted to incorporate contextual information by modeling relationships +with object pairs or aggregating the nearest neighbor features of an object. +However, the contextual information constructed in these scenarios is limited +in two aspects: first, objects have multiple positional relationships that +exist across the entire global scene, not only near the object itself. Second, +it faces with contradicting objectives--where localization and attribute +descriptions are generated better with tight localization, while descriptions +involving global positional relations are generated better with contextualized +features of the global scene. To overcome this challenge, we introduce BiCA, a +transformer encoder-decoder pipeline that engages in 3D dense captioning for +each object with Bi-directional Contextual Attention. Leveraging parallelly +decoded instance queries for objects and context queries for non-object +contexts, BiCA generates object-aware contexts, where the contexts relevant to +each object is summarized, and context-aware objects, where the objects +relevant to the summarized object-aware contexts are aggregated. This extension +relieves previous methods from the contradicting objectives, enhancing both +localization performance and enabling the aggregation of contextual features +throughout the global scene; thus improving caption generation performance +simultaneously. Extensive experiments on two of the most widely-used 3D dense +captioning datasets demonstrate that our proposed method achieves a significant +improvement over prior methods. + +
+
+ comment: Accepted to ECCV 2024 (Oral) +
+
+
+
+
+ + ☆ Hybrid SD: Edge-Cloud Collaborative Inference for Stable Diffusion + Models + + +
+ Stable Diffusion Models (SDMs) have shown remarkable proficiency in image +synthesis. However, their broad application is impeded by their large model +sizes and intensive computational requirements, which typically require +expensive cloud servers for deployment. On the flip side, while there are many +compact models tailored for edge devices that can reduce these demands, they +often compromise on semantic integrity and visual quality when compared to +full-sized SDMs. To bridge this gap, we introduce Hybrid SD, an innovative, +training-free SDMs inference framework designed for edge-cloud collaborative +inference. Hybrid SD distributes the early steps of the diffusion process to +the large models deployed on cloud servers, enhancing semantic planning. +Furthermore, small efficient models deployed on edge devices can be integrated +for refining visual details in the later stages. Acknowledging the diversity of +edge devices with differing computational and storage capacities, we employ +structural pruning to the SDMs U-Net and train a lightweight VAE. Empirical +evaluations demonstrate that our compressed models achieve state-of-the-art +parameter efficiency (225.8M) on edge devices with competitive image quality. +Additionally, Hybrid SD reduces the cloud cost by 66% with edge-cloud +collaborative inference. + +
+
+
+
+
+ + ☆ Specialized Change Detection using Segment Anything + + +
+ Change detection (CD) is a fundamental task in Earth observation. While most +change detection methods detect all changes, there is a growing need for +specialized methods targeting specific changes relevant to particular +applications while discarding the other changes. For instance, urban management +might prioritize detecting the disappearance of buildings due to natural +disasters or other reasons. Furthermore, while most supervised change detection +methods require large-scale training datasets, in many applications only one or +two training examples might be available instead of large datasets. Addressing +such needs, we propose a focused CD approach using the Segment Anything Model +(SAM), a versatile vision foundation model. Our method leverages a binary mask +of the object of interest in pre-change images to detect their disappearance in +post-change images. By using SAM's robust segmentation capabilities, we create +prompts from the pre-change mask, use those prompts to segment the post-change +image, and identify missing objects. This unsupervised approach demonstrated +for building disappearance detection, is adaptable to various domains requiring +specialized CD. Our contributions include defining a novel CD problem, +proposing a method using SAM, and demonstrating its effectiveness. The proposed +method also has benefits related to privacy preservation. + +
+
+
+
+
+ + ☆ Attention Based Feature Fusion Network for Monkeypox Skin Lesion + Detection + + +
+ The recent monkeypox outbreak has raised significant public health concerns
+due to its rapid spread across multiple countries. Monkeypox can be difficult
+to distinguish from chickenpox and measles in the early stages because the
+symptoms of all three diseases are similar. Modern deep learning algorithms
+can be used to identify diseases, including COVID-19, by analyzing images of
+the affected areas. In this study, we introduce a lightweight model that
+merges two pre-trained architectures, EfficientNetV2B3 and ResNet151V2, to
+classify human monkeypox disease. We have also incorporated the
+squeeze-and-excitation attention network module to focus on the important
+parts of the feature maps for classifying the monkeypox images. This
+attention module provides channel and spatial attention to highlight
+significant areas within feature maps. We evaluated the effectiveness of our
+model by extensively testing it on a publicly available Monkeypox Skin
+Lesions Dataset using a four-fold cross-validation approach. The evaluation
+metrics of our model were compared with those of existing methods. Our model
+achieves a mean validation accuracy of 96.52%, with precision, recall, and
+F1-score values of 96.58%, 96.52%, and 96.51%, respectively.
+
+
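+ For reference, a standard squeeze-and-excitation block of the kind mentioned
+above (a sketch assuming PyTorch; the reduction ratio is an illustrative
+assumption, and the paper's full fusion network is not reproduced):
+
+```python
+import torch.nn as nn
+
+class SEBlock(nn.Module):
+    """Squeeze-and-excitation: global-average 'squeeze', two-layer 'excite',
+    then channel-wise re-weighting of the input feature map."""
+    def __init__(self, channels: int, reduction: int = 16):
+        super().__init__()
+        self.fc = nn.Sequential(
+            nn.Linear(channels, channels // reduction),
+            nn.ReLU(inplace=True),
+            nn.Linear(channels // reduction, channels),
+            nn.Sigmoid(),
+        )
+
+    def forward(self, x):                      # x: (B, C, H, W)
+        w = x.mean(dim=(2, 3))                 # squeeze -> (B, C)
+        w = self.fc(w).unsqueeze(-1).unsqueeze(-1)
+        return x * w                           # channel-wise re-scaling
+```
+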
+
+ comment: 6 pages with 6 figures +
+
+
+
+
+ + ☆ COD: Learning Conditional Invariant Representation for Domain Adaptation + Regression ECCV 2024 + + +
+ Aiming to generalize the label knowledge from a source domain with continuous +outputs to an unlabeled target domain, Domain Adaptation Regression (DAR) is +developed for complex practical learning problems. However, due to the +continuity problem in regression, existing conditional distribution alignment +theory and methods with discrete prior, which are proven to be effective in +classification settings, are no longer applicable. In this work, focusing on +the feasibility problems in DAR, we establish the sufficiency theory for the +regression model, which shows the generalization error can be sufficiently +dominated by the cross-domain conditional discrepancy. Further, to characterize +conditional discrepancy with continuous conditioning variable, a novel +Conditional Operator Discrepancy (COD) is proposed, which admits the metric +property on conditional distributions via the kernel embedding theory. Finally, +to minimize the discrepancy, a COD-based conditional invariant representation +learning model is proposed, and the reformulation is derived to show that +reasonable modifications on moment statistics can further improve the +discriminability of the adaptation model. Extensive experiments on standard DAR +datasets verify the validity of theoretical results and the superiority over +SOTA DAR methods. + +
+
+ comment: Accepted to ECCV 2024 (oral) +
+
+
+
+
+ + ☆ Unified-IoU: For High-Quality Object Detection + + +
+ Object detection is an important part of the field of computer vision, and
+the effect of object detection is directly determined by the regression
+accuracy of the prediction box. As a key quantity in model training, IoU
+(Intersection over Union) reflects the difference between the current
+prediction box and the Ground Truth box. Subsequent researchers have
+continuously added more considerations to IoU, such as center distance,
+aspect ratio, and so on. However, there is an upper limit to merely refining
+the geometric differences; moreover, there is a potential connection between
+each newly considered index and the IoU itself, and directly adding or
+subtracting the two may lead to the problem of "over-consideration". Based on
+this, we propose a new IoU loss function, called Unified-IoU (UIoU), which is
+more concerned with the weight assignment between prediction boxes of
+different quality. Specifically, the loss function dynamically shifts the
+model's attention from low-quality prediction boxes to high-quality
+prediction boxes in a novel way to enhance the model's detection performance
+on high-precision or intensive datasets and achieve a balance in training
+speed. Our proposed method achieves better performance on multiple datasets;
+especially at a high IoU threshold, UIoU has a more significant improvement
+effect compared with other improved IoU losses. Our code is publicly
+available at: https://github.com/lxj-drifter/UIOU_files.
+
+
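+ For reference, the plain axis-aligned IoU computation that losses such as
+UIoU build on (a short Python sketch; the UIoU weighting scheme itself is not
+reproduced here):
+
+```python
+def iou(box_a, box_b):
+    """Plain IoU between two (x1, y1, x2, y2) boxes; the quantity that
+    DIoU/CIoU/UIoU-style losses extend with extra terms or weights."""
+    x1 = max(box_a[0], box_b[0]); y1 = max(box_a[1], box_b[1])
+    x2 = min(box_a[2], box_b[2]); y2 = min(box_a[3], box_b[3])
+    inter = max(0.0, x2 - x1) * max(0.0, y2 - y1)
+    area_a = (box_a[2] - box_a[0]) * (box_a[3] - box_a[1])
+    area_b = (box_b[2] - box_b[0]) * (box_b[3] - box_b[1])
+    return inter / (area_a + area_b - inter + 1e-9)
+```
+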
+
+
+
+
+ + ☆ IDRetracor: Towards Visual Forensics Against Malicious Face Swapping + + +
+ The face swapping technique based on deepfake methods poses significant +social risks to personal identity security. While numerous deepfake detection +methods have been proposed as countermeasures against malicious face swapping, +they can only output binary labels (Fake/Real) for distinguishing fake content +without reliable and traceable evidence. To achieve visual forensics and target +face attribution, we propose a novel task named face retracing, which considers +retracing the original target face from the given fake one via inverse mapping. +Toward this goal, we propose an IDRetracor that can retrace arbitrary original +target identities from fake faces generated by multiple face swapping methods. +Specifically, we first adopt a mapping resolver to perceive the possible +solution space of the original target face for the inverse mappings. Then, we +propose mapping-aware convolutions to retrace the original target face from the +fake one. Such convolutions contain multiple kernels that can be combined under +the control of the mapping resolver to tackle different face swapping mappings +dynamically. Extensive experiments demonstrate that the IDRetracor exhibits +promising retracing performance from both quantitative and qualitative +perspectives. + +
+
+
+
+
+ + ☆ A lightweight YOLOv5-FFM model for occlusion pedestrian detection + + +
+ The development of autonomous driving technology must be inseparable from +pedestrian detection. Because of the fast speed of the vehicle, the accuracy +and real-time performance of the pedestrian detection algorithm are very +important. YOLO, as an efficient and simple one-stage target detection method, +is often used for pedestrian detection in various environments. However, this +series of detectors face some challenges, such as excessive computation and +undesirable detection rate when facing occluded pedestrians. In this paper, we +propose an improved lightweight YOLOv5 model to deal with these problems. This +model can achieve better pedestrian detection accuracy with fewer +floating-point operations (FLOPs), especially for occluded targets. In order to +achieve the above goals, we made improvements based on the YOLOv5 model +framework and introduced Ghost module and SE block. Furthermore, we designed a +local feature fusion module (FFM) to deal with occlusion in pedestrian +detection. To verify the validity of our method, two datasets, Citypersons and +CUHK Occlusion, were selected for the experiment. The experimental results show +that, compared with the original yolov5s model, the average precision (AP) of +our method is significantly improved, while the number of parameters is reduced +by 27.9% and FLOPs are reduced by 19.0%. + +
+
+
+
+
+ + ☆ Fast Information Streaming Handler (FisH): A Unified Seismic Neural + Network for Single Station Real-Time Earthquake Early Warning + + +
+ Existing EEW approaches often treat phase picking, location estimation, and
+magnitude estimation as separate tasks, lacking a unified framework.
+Additionally, most deep learning models in seismology rely on full
+three-component waveforms and are not suitable for real-time streaming data.
+To address these limitations, we propose a novel unified seismic neural
+network called Fast Information Streaming Handler (FisH). FisH is designed to
+process real-time streaming seismic data and generate simultaneous results
+for phase picking, location estimation, and magnitude estimation in an
+end-to-end fashion. By integrating these tasks within a single model, FisH
+simplifies the overall process and leverages the nonlinear relationships
+between tasks for improved performance. The FisH model utilizes RetNet as its
+backbone, enabling parallel processing during training and recurrent handling
+during inference. This capability makes FisH suitable for real-time
+applications, reducing latency in EEW systems. Extensive experiments
+conducted on the STEAD benchmark dataset provide strong validation for the
+effectiveness of our proposed FisH model. The results demonstrate that FisH
+achieves impressive performance across multiple seismic event detection and
+characterization tasks. Specifically, it achieves an F1 score of 0.99/0.96.
+FisH also demonstrates precise earthquake location estimation, with a
+location error of only 6.0 km, a distance error of 2.6 km, and a back-azimuth
+error of 19°. The model also exhibits accurate earthquake magnitude
+estimation, with a magnitude error of just 0.14. Additionally, FisH is
+capable of generating real-time estimations, providing location and magnitude
+estimations with a location error of 8.06 km and a magnitude error of 0.18
+within a mere 3 seconds after the P-wave arrives.
+
+
+
+
+
+
+ + ☆ DePatch: Towards Robust Adversarial Patch for Evading Person Detectors + in the Real World + + +
+ Recent years have seen an increasing interest in physical adversarial +attacks, which aim to craft deployable patterns for deceiving deep neural +networks, especially for person detectors. However, the adversarial patterns of +existing patch-based attacks heavily suffer from the self-coupling issue, where +a degradation, caused by physical transformations, in any small patch segment +can result in a complete adversarial dysfunction, leading to poor robustness in +the complex real world. Upon this observation, we introduce the Decoupled +adversarial Patch (DePatch) attack to address the self-coupling issue of +adversarial patches. Specifically, we divide the adversarial patch into +block-wise segments, and reduce the inter-dependency among these segments +through randomly erasing out some segments during the optimization. We further +introduce a border shifting operation and a progressive decoupling strategy to +improve the overall attack capabilities. Extensive experiments demonstrate the +superior performance of our method over other physical adversarial attacks, +especially in the real world. + +
+
+
+
+
+ + ☆ ActPrompt: In-Domain Feature Adaptation via Action Cues for Video + Temporal Grounding + + +
+ Video temporal grounding is an emerging topic aiming to identify specific +clips within videos. In addition to pre-trained video models, contemporary +methods utilize pre-trained vision-language models (VLM) to capture detailed +characteristics of diverse scenes and objects from video frames. However, as +pre-trained on images, VLM may struggle to distinguish action-sensitive +patterns from static objects, making it necessary to adapt them to specific +data domains for effective feature representation over temporal grounding. We +address two primary challenges to achieve this goal. Specifically, to mitigate +high adaptation costs, we propose an efficient preliminary in-domain +fine-tuning paradigm for feature adaptation, where downstream-adaptive features +are learned through several pretext tasks. Furthermore, to integrate +action-sensitive information into VLM, we introduce Action-Cue-Injected +Temporal Prompt Learning (ActPrompt), which injects action cues into the image +encoder of VLM for better discovering action-sensitive patterns. Extensive +experiments demonstrate that ActPrompt is an off-the-shelf training framework +that can be effectively applied to various SOTA methods, resulting in notable +improvements. The complete code used in this study is provided in the +supplementary materials. + +
+
+ comment: 9 pages, 5 figures +
+
+
+
+
+ + ☆ ViMo: Generating Motions from Casual Videos + + +
+ Although humans have the innate ability to imagine multiple possible actions
+from videos, it remains an extraordinary challenge for computers due to the
+intricate camera movements and montages. Most existing motion generation
+methods predominantly rely on manually collected motion datasets, usually
+tediously sourced from motion capture (Mocap) systems or Multi-View cameras,
+unavoidably resulting in a limited size that severely undermines their
+generalizability. Inspired by recent advances in diffusion models, we probe a
+simple and effective way to capture motions from videos and propose a novel
+Video-to-Motion-Generation framework (ViMo) which could leverage the immense
+trove of untapped video content to produce abundant and diverse 3D human
+motions. Distinct from prior work, our videos can be more casual, including
+complicated camera movements and occlusions. Striking experimental results
+demonstrate the proposed model could generate natural motions even for videos
+where rapid movements, varying perspectives, or frequent occlusions might
+exist. We also show this work could enable three important downstream
+applications, such as generating dancing motions according to arbitrary music
+and source video style. Extensive experimental results prove that our model
+offers an effective and scalable way to generate diverse and realistic
+motions. Code and demos will be public soon.
+
+
+
+
+
+
+ + ☆ CROME: Cross-Modal Adapters for Efficient Multimodal LLM + + +
+ Multimodal Large Language Models (MLLMs) demonstrate remarkable +image-language capabilities, but their widespread use faces challenges in +cost-effective training and adaptation. Existing approaches often necessitate +expensive language model retraining and limited adaptability. Additionally, the +current focus on zero-shot performance improvements offers insufficient +guidance for task-specific tuning. We propose CROME, an efficient +vision-language instruction tuning framework. It features a novel gated +cross-modal adapter that effectively combines visual and textual +representations prior to input into a frozen LLM. This lightweight adapter, +trained with minimal parameters, enables efficient cross-modal understanding. +Notably, CROME demonstrates superior zero-shot performance on standard visual +question answering and instruction-following benchmarks. Moreover, it yields +fine-tuning with exceptional parameter efficiency, competing with task-specific +specialist state-of-the-art methods. CROME demonstrates the potential of pre-LM +alignment for building scalable, adaptable, and parameter-efficient multimodal +models. + +
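+ A hedged sketch of a gated cross-modal adapter in the spirit described above
+(assuming PyTorch; the dimensions, pooling, and gating form are illustrative
+assumptions, not CROME's actual design):
+
+```python
+import torch
+import torch.nn as nn
+
+class GatedCrossModalAdapter(nn.Module):
+    """Projects visual tokens into the LLM embedding space and mixes them with
+    text embeddings through a learned, input-dependent gate (illustrative)."""
+    def __init__(self, vis_dim: int, llm_dim: int):
+        super().__init__()
+        self.proj = nn.Linear(vis_dim, llm_dim)
+        self.gate = nn.Sequential(nn.Linear(2 * llm_dim, llm_dim), nn.Sigmoid())
+
+    def forward(self, vis_tokens, txt_tokens):
+        # vis_tokens: (B, Nv, vis_dim), txt_tokens: (B, Nt, llm_dim)
+        v = self.proj(vis_tokens)                       # (B, Nv, llm_dim)
+        ctx = txt_tokens.mean(dim=1, keepdim=True)      # pooled text context
+        g = self.gate(torch.cat([v, ctx.expand_as(v)], dim=-1))
+        fused_vis = g * v                               # gated visual tokens
+        return torch.cat([fused_vis, txt_tokens], dim=1)  # fed to frozen LLM
+```
+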
+
+
+
+
+ + ☆ MV-DETR: Multi-modality indoor object detection by Multi-View DEtecton + TRansformers + + +
+ We introduce MV-DETR, a novel pipeline that is an effective yet efficient
+transformer-based detection method. Given input RGBD data, we notice that
+very strong pretrained weights exist for RGB data, whereas pretraining for
+depth-related data is much less effective. First and foremost, we argue that
+geometry and texture cues are both of vital importance and can be encoded
+separately. Second, we find that visual texture features are relatively hard
+to extract compared with geometry features in 3D space. Unfortunately, a
+single RGBD dataset with thousands of samples is not enough for training a
+discriminative filter for visual texture feature extraction. Last but
+certainly not least, we design a lightweight VG module consisting of a visual
+texture encoder, a geometry encoder and a VG connector. Compared with
+previous state-of-the-art works like V-DETR, gains from the pretrained visual
+encoder can be seen. Extensive experiments on the ScanNetV2 dataset show the
+effectiveness of our method. It is worth mentioning that our method achieves
+78% AP, which sets a new state of the art on the ScanNetV2 benchmark.
+
+
+
+
+
+
+ + ☆ Deep Inertia $L_p$ Half-Quadratic Splitting Unrolling Network for Sparse + View CT Reconstruction + + +
+ Sparse view computed tomography (CT) reconstruction poses a challenging +ill-posed inverse problem, necessitating effective regularization techniques. +In this letter, we employ $L_p$-norm ($0 +
+
+ comment: This paper was accepted by IEEE Signal Processing Letters on July 28, + 2024 +
+
+
+
+
+ + ☆ GeoFormer: Learning Point Cloud Completion with Tri-Plane Integrated + Transformer + + +
+ Point cloud completion aims to recover accurate global geometry and preserve +fine-grained local details from partial point clouds. Conventional methods +typically predict unseen points directly from 3D point cloud coordinates or use +self-projected multi-view depth maps to ease this task. However, these +gray-scale depth maps cannot reach multi-view consistency, consequently +restricting the performance. In this paper, we introduce a GeoFormer that +simultaneously enhances the global geometric structure of the points and +improves the local details. Specifically, we design a CCM Feature Enhanced +Point Generator to integrate image features from multi-view consistent +canonical coordinate maps (CCMs) and align them with pure point features, +thereby enhancing the global geometry feature. Additionally, we employ the +Multi-scale Geometry-aware Upsampler module to progressively enhance local +details. This is achieved through cross attention between the multi-scale +features extracted from the partial input and the features derived from +previously estimated points. Extensive experiments on the PCN, ShapeNet-55/34, +and KITTI benchmarks demonstrate that our GeoFormer outperforms recent methods, +achieving the state-of-the-art performance. Our code is available at +\href{https://github.com/Jinpeng-Yu/GeoFormer}{https://github.com/Jinpeng-Yu/GeoFormer}. + +
+
+ comment: accepted by the 32nd ACM International Conference on Multimedia + (MM'24) +
+
+
+
+
+ + ☆ ActiveNeRF: Learning Accurate 3D Geometry by Active Pattern Projection + + +
+ NeRFs have achieved incredible success in novel view synthesis. However, the accuracy of the implicit geometry is unsatisfactory because the passive static environmental illumination has low spatial frequency and cannot provide enough information for accurate geometry reconstruction. In this work, we propose ActiveNeRF, a 3D geometry reconstruction framework, which improves the geometry quality of NeRF by actively projecting patterns of high spatial frequency onto the scene using a projector that has a constant relative pose to the camera. We design a learnable active pattern rendering pipeline which jointly learns the scene geometry and the active pattern. We find that, by adding the active pattern and imposing its consistency across different views, our proposed method outperforms state-of-the-art geometry reconstruction methods qualitatively and quantitatively in both simulation and real experiments. Code is available at https://github.com/hcp16/active_nerf
+
+ comment: 18 pages, 10 figures +
+
+
+
+
+ + ☆ HDRGS: High Dynamic Range Gaussian Splatting + + +
+ Recent years have witnessed substantial advancements in the field of 3D reconstruction from 2D images, particularly following the introduction of the neural radiance field (NeRF) technique. However, reconstructing a 3D high dynamic range (HDR) radiance field, which aligns more closely with real-world conditions, from 2D multi-exposure low dynamic range (LDR) images continues to pose significant challenges. Approaches to this issue fall into two categories: grid-based and implicit-based. Implicit methods, using multi-layer perceptrons (MLP), face inefficiencies, limited solvability, and overfitting risks. Conversely, grid-based methods require significant memory and struggle with image quality and long training times. In this paper, we introduce Gaussian Splatting, a recent, high-quality, real-time 3D reconstruction technique, into this domain. We further develop the High Dynamic Range Gaussian Splatting (HDR-GS) method, designed to address the aforementioned challenges. This method enhances color dimensionality by including luminance and uses an asymmetric grid for tone-mapping, swiftly and precisely converting pixel irradiance to color. Our approach improves HDR scene recovery accuracy and integrates a novel coarse-to-fine strategy to speed up model convergence, enhancing robustness against sparse viewpoints and exposure extremes, and preventing local optima. Extensive testing confirms that our method surpasses current state-of-the-art techniques in both synthetic and real-world scenarios. Code will be released at \url{https://github.com/WuJH2001/HDRGS}
+
+
+
+
+ + ☆ A Review of Pseudo-Labeling for Computer Vision + + +
+ Deep neural models have achieved state-of-the-art performance on a wide range of problems in computer science, especially in computer vision. However, deep neural networks often require large datasets of labeled samples to generalize effectively, and an important area of active research is semi-supervised learning, which attempts to instead utilize large quantities of (easily acquired) unlabeled samples. One family of methods in this space is pseudo-labeling, a class of algorithms that use model outputs to assign labels to unlabeled samples, which are then used as labeled samples during training. Such assigned labels, called pseudo-labels, are most commonly associated with the field of semi-supervised learning. In this work we explore a broader interpretation of pseudo-labels within both self-supervised and unsupervised methods. By drawing the connection between these areas, we identify new directions where advancements in one area would likely benefit others, such as curriculum learning and self-supervised regularization.
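+
+ As a concrete reference point for the mechanism discussed above, a minimal confidence-thresholded pseudo-labeling step in PyTorch might look as follows; the threshold and the masking scheme are illustrative choices, not taken from the survey.
+
+ import torch
+ import torch.nn.functional as F
+
+ def pseudo_label_loss(model, unlabeled_batch, threshold=0.95):
+     """One semi-supervised step: keep only confident model predictions as hard labels."""
+     with torch.no_grad():
+         probs = F.softmax(model(unlabeled_batch), dim=-1)
+         conf, pseudo = probs.max(dim=-1)
+         mask = conf >= threshold                 # train only on confident samples
+     logits = model(unlabeled_batch)              # second pass, this time with gradients
+     loss = F.cross_entropy(logits, pseudo, reduction="none")
+     return (loss * mask.float()).mean()          # masked samples contribute zero loss
+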
+
+ comment: 21 pages, 4 figures +
+
+
+
+
+ + ☆ SeLoRA: Self-Expanding Low-Rank Adaptation of Latent Diffusion Model for + Medical Image Synthesis + + +
+ The persistent challenge of medical image synthesis, posed by the scarcity of annotated data and the need to synthesize `missing modalities' for multi-modal analysis, underscores the need for effective synthesis methods. Recently, the combination of Low-Rank Adaptation (LoRA) with latent diffusion models (LDMs) has emerged as a viable approach for efficiently adapting pre-trained large models in the medical field. However, the direct application of LoRA assumes a uniform rank across all linear layers, overlooking the significance of different weight matrices and leading to sub-optimal outcomes. Prior works on LoRA prioritize the reduction of trainable parameters, and there exists an opportunity to further tailor this adaptation process to the intricate demands of medical image synthesis. In response, we present SeLoRA, a Self-Expanding Low-Rank Adaptation Module that dynamically expands its rank across layers during training, strategically placing additional ranks on crucial layers, allowing the model to elevate synthesis quality where it matters most. The proposed method not only enables LDMs to fine-tune on medical data efficiently but also empowers the model to achieve improved image quality with minimal rank. The code of our SeLoRA method is publicly available at https://anonymous.4open.science/r/SeLoRA-980D .
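+
+ A minimal sketch of a LoRA layer whose rank can grow during training is shown below, as one way to picture the self-expanding idea; the initialization, the expansion trigger, and the assumption that new components start at zero (so the current mapping is preserved) are illustrative, not the paper's exact procedure.
+
+ import torch
+ import torch.nn as nn
+
+ class ExpandableLoRALinear(nn.Module):
+     """Hypothetical LoRA adapter around a frozen linear layer, with growable rank."""
+     def __init__(self, base: nn.Linear, rank=2, alpha=1.0):
+         super().__init__()
+         self.base = base.requires_grad_(False)   # frozen pretrained weight
+         self.A = nn.Parameter(torch.randn(rank, base.in_features) * 0.01)
+         self.B = nn.Parameter(torch.zeros(base.out_features, rank))
+         self.alpha = alpha
+
+     def forward(self, x):
+         return self.base(x) + self.alpha * (x @ self.A.T @ self.B.T)
+
+     @torch.no_grad()
+     def expand_rank(self, extra=1):
+         """Append `extra` rank-1 components; zero-initialized B keeps the output unchanged.
+         The optimizer must be re-created afterwards to track the new parameters."""
+         self.A = nn.Parameter(torch.cat([self.A, torch.randn(extra, self.A.shape[1]) * 0.01]))
+         self.B = nn.Parameter(torch.cat([self.B, torch.zeros(self.B.shape[0], extra)], dim=1))
+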
+
+ comment: Project Page: https://yuchen20.github.io/SeLoRA.github.io/ +
+
+
+
+
+ + ☆ BVI-UGC: A Video Quality Database for User-Generated Content Transcoding + + +
+ In recent years, user-generated content (UGC) has become one of the major +video types consumed via streaming networks. Numerous research contributions +have focused on assessing its visual quality through subjective tests and +objective modeling. In most cases, objective assessments are based on a +no-reference scenario, where the corresponding reference content is assumed not +to be available. However, full-reference video quality assessment is also +important for UGC in the delivery pipeline, particularly associated with the +video transcoding process. In this context, we present a new UGC video quality +database, BVI-UGC, for user-generated content transcoding, which contains 60 +(non-pristine) reference videos and 1,080 test sequences. In this work, we +simulated the creation of non-pristine reference sequences (with a wide range +of compression distortions), typical of content uploaded to UGC platforms for +transcoding. A comprehensive crowdsourced subjective study was then conducted +involving more than 3,500 human participants. Based on this collected +subjective data, we benchmarked the performance of 10 full-reference and 11 +no-reference quality metrics. Our results demonstrate the poor performance +(SROCC values are lower than 0.6) of these metrics in predicting the perceptual +quality of UGC in two different scenarios (with or without a reference). + +
+
+ comment: 12 pages, 11 figures +
+
+
+
+
+ + ☆ Flexible 3D Lane Detection by Hierarchical Shape Matching + + +
+ As one of the basic yet vital technologies for HD map construction, 3D lane detection is still an open problem due to varying visual conditions, complex topologies, and strict demands for precision. In this paper, an end-to-end flexible and hierarchical lane detector is proposed to precisely predict 3D lane lines from point clouds. Specifically, we design a hierarchical network predicting flexible representations of lane shapes at different levels, simultaneously collecting global instance semantics and avoiding local errors. In the global scope, we propose to regress parametric curves w.r.t. adaptive axes that help to make more robust predictions towards complex scenes, while in the local view the structure of each lane segment is detected in each of the dynamic anchor cells sampled along the globally predicted curves. Moreover, corresponding global and local shape matching losses and anchor cell generation strategies are designed. Experiments on two datasets show that we outperform current top methods under high-precision standards, and full ablation studies also verify each part of our method. Our codes will be released at https://github.com/Doo-do/FHLD.
+
+
+
+
+ + ☆ Controlling the World by Sleight of Hand + + +
+ Humans naturally build mental models of object interactions and dynamics, allowing them to imagine how their surroundings will change if they take a certain action. While generative models today have shown impressive results on generating/editing images unconditionally or conditioned on text, current methods do not provide the ability to perform object manipulation conditioned on actions, an important tool for world modeling and action planning. Therefore, we propose to learn an action-conditional generative model by learning from unlabeled videos of human hands interacting with objects. The vast quantity of such data on the internet allows for efficient scaling, which can enable high-performing action-conditional models. Given an image and the shape/location of a desired hand interaction, CosHand synthesizes an image of the future after the interaction has occurred. Experiments show that the resulting model can predict the effects of hand-object interactions well, with strong generalization particularly to translation, stretching, and squeezing interactions of unseen objects in unseen environments. Further, CosHand can be sampled many times to predict multiple possible effects, modeling the uncertainty of forces in the interaction/environment. Finally, the method generalizes to different embodiments, including non-human hands, i.e., robot hands, suggesting that generative video models can be powerful models for robotics.
+
+
+
+
+ + ☆ Vision Language Model for Interpretable and Fine-grained Detection of + Safety Compliance in Diverse Workplaces + + +
+ Workplace accidents due to personal protective equipment (PPE) non-compliance +raise serious safety concerns and lead to legal liabilities, financial +penalties, and reputational damage. While object detection models have shown +the capability to address this issue by identifying safety items, most existing +models, such as YOLO, Faster R-CNN, and SSD, are limited in verifying the +fine-grained attributes of PPE across diverse workplace scenarios. Vision +language models (VLMs) are gaining traction for detection tasks by leveraging +the synergy between visual and textual information, offering a promising +solution to traditional object detection limitations in PPE recognition. +Nonetheless, VLMs face challenges in consistently verifying PPE attributes due +to the complexity and variability of workplace environments, requiring them to +interpret context-specific language and visual cues simultaneously. We +introduce Clip2Safety, an interpretable detection framework for diverse +workplace safety compliance, which comprises four main modules: scene +recognition, the visual prompt, safety items detection, and fine-grained +verification. The scene recognition identifies the current scenario to +determine the necessary safety gear. The visual prompt formulates the specific +visual prompts needed for the detection process. The safety items detection +identifies whether the required safety gear is being worn according to the +specified scenario. Lastly, the fine-grained verification assesses whether the +worn safety equipment meets the fine-grained attribute requirements. We conduct +real-world case studies across six different scenarios. The results show that +Clip2Safety not only demonstrates an accuracy improvement over state-of-the-art +question-answering based VLMs but also achieves inference times two hundred +times faster. + +
+
+ comment: 20 pages, 7 figures +
+
+
+
+
+ + ☆ Generative Photomontage + + +
+ Text-to-image models are powerful tools for image creation. However, the +generation process is akin to a dice roll and makes it difficult to achieve a +single image that captures everything a user wants. In this paper, we propose a +framework for creating the desired image by compositing it from various parts +of generated images, in essence forming a Generative Photomontage. Given a +stack of images generated by ControlNet using the same input condition and +different seeds, we let users select desired parts from the generated results +using a brush stroke interface. We introduce a novel technique that takes in +the user's brush strokes, segments the generated images using a graph-based +optimization in diffusion feature space, and then composites the segmented +regions via a new feature-space blending method. Our method faithfully +preserves the user-selected regions while compositing them harmoniously. We +demonstrate that our flexible framework can be used for many applications, +including generating new appearance combinations, fixing incorrect shapes and +artifacts, and improving prompt alignment. We show compelling results for each +application and demonstrate that our method outperforms existing image blending +methods and various baselines. + +
+
+ comment: Project webpage: https://lseancs.github.io/generativephotomontage/ +
+
+
+
+
+ + ♻ ☆ TraceFL: Achieving Interpretability in Federated Learning via Neuron + Provenance + + +
+ In Federated Learning, clients train models on local data and send updates to a central server, which aggregates them into a global model using a fusion algorithm. This collaborative yet privacy-preserving training comes at a cost: FL developers face significant challenges in attributing global model predictions to specific clients. Localizing responsible clients is a crucial step towards (a) excluding clients primarily responsible for incorrect predictions and (b) encouraging clients who contributed high-quality models to continue participating in the future. Existing ML explainability approaches are inherently inapplicable as they are designed for single-model, centralized training.
+ We introduce TraceFL, a fine-grained neuron provenance capturing mechanism that identifies clients responsible for the global model's prediction by tracking the flow of information from individual clients to the global model. Since inference on different inputs activates a different set of neurons of the global model, TraceFL dynamically quantifies the significance of the global model's neurons in a given prediction. It then selectively picks a slice of the most crucial neurons in the global model and maps them to the corresponding neurons in every participating client to determine each client's contribution, ultimately localizing the responsible client. We evaluate TraceFL on six datasets, including two real-world medical imaging datasets, and on four neural networks, including advanced models such as GPT. TraceFL achieves 99% accuracy in localizing the responsible client in FL tasks spanning both image and text classification. At a time when state-of-the-art ML debugging approaches are mostly domain-specific (e.g., image classification only), TraceFL is the first technique to enable highly accurate automated reasoning across a wide range of FL applications.
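+
+ The neuron-provenance idea can be pictured with a small scoring routine like the one below, which attributes a prediction to clients by comparing the most active global-model neurons with the corresponding client-model activations; the top-k selection and cosine scoring are illustrative simplifications of the mechanism described above, not the paper's implementation.
+
+ import torch
+
+ def client_contributions(global_acts, client_acts, k=50):
+     """Score each client by how similarly its neurons activate on the k most
+     influential neurons of the global model for a given input.
+     global_acts: 1-D activation vector of a global-model layer.
+     client_acts: dict mapping client id -> activation vector of the same layer."""
+     k = min(k, global_acts.numel())
+     top = global_acts.abs().topk(k).indices          # most influential global neurons
+     scores = {}
+     for cid, acts in client_acts.items():
+         scores[cid] = torch.cosine_similarity(global_acts[top], acts[top], dim=0).item()
+     total = sum(max(s, 0.0) for s in scores.values()) or 1.0
+     return {cid: max(s, 0.0) / total for cid, s in scores.items()}  # normalized contributions
+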
+
+ comment: 13 pages. TraceFL is the first interpretability technique in FL that + can work on both image and text classification tasks. For source code please + contact at waris@vt.edu +
+
+
+
+
+ + ♻ ☆ From NeRFs to Gaussian Splats, and Back + + +
+ For robotics applications where there is a limited number of (typically +ego-centric) views, parametric representations such as neural radiance fields +(NeRFs) generalize better than non-parametric ones such as Gaussian splatting +(GS) to views that are very different from those in the training data; GS +however can render much faster than NeRFs. We develop a procedure to convert +back and forth between the two. Our approach achieves the best of both NeRFs +(superior PSNR, SSIM, and LPIPS on dissimilar views, and a compact +representation) and GS (real-time rendering and ability for easily modifying +the representation); the computational cost of these conversions is minor +compared to training the two from scratch. + +
+
+
+
+
+ + ♻ ☆ GarmentCodeData: A Dataset of 3D Made-to-Measure Garments With Sewing + Patterns ECCV 2024 + + +
+ Recent research interest in the learning-based processing of garments, from +virtual fitting to generation and reconstruction, stumbles on a scarcity of +high-quality public data in the domain. We contribute to resolving this need by +presenting the first large-scale synthetic dataset of 3D made-to-measure +garments with sewing patterns, as well as its generation pipeline. +GarmentCodeData contains 115,000 data points that cover a variety of designs in +many common garment categories: tops, shirts, dresses, jumpsuits, skirts, +pants, etc., fitted to a variety of body shapes sampled from a custom +statistical body model based on CAESAR, as well as a standard reference body +shape, applying three different textile materials. To enable the creation of +datasets of such complexity, we introduce a set of algorithms for automatically +taking tailor's measures on sampled body shapes, sampling strategies for sewing +pattern design, and propose an automatic, open-source 3D garment draping +pipeline based on a fast XPBD simulator, while contributing several solutions +for collision resolution and drape correctness to enable scalability. + Project Page: https://igl.ethz.ch/projects/GarmentCodeData/ + Dataset: https://doi.org/10.3929/ethz-b-000673889 + +
+
+ comment: Accepted to ECCV 2024 +
+
+
+
+
+ + ♻ ☆ Hierarchical Quantum Control Gates for Functional MRI Understanding + + +
+ Quantum computing has emerged as a powerful tool for solving complex problems +intractable for classical computers, particularly in popular fields such as +cryptography, optimization, and neurocomputing. In this paper, we present a new +quantum-based approach named the Hierarchical Quantum Control Gates (HQCG) +method for efficient understanding of Functional Magnetic Resonance Imaging +(fMRI) data. This approach includes two novel modules: the Local Quantum +Control Gate (LQCG) and the Global Quantum Control Gate (GQCG), which are +designed to extract local and global features of fMRI signals, respectively. +Our method operates end-to-end on a quantum machine, leveraging quantum +mechanics to learn patterns within extremely high-dimensional fMRI signals, +such as 30,000 samples which is a challenge for classical computers. Empirical +results demonstrate that our approach significantly outperforms classical +methods. Additionally, we found that the proposed quantum model is more stable +and less prone to overfitting than the classical methods. + +
+
+
+
+
+ + ♻ ☆ The Visual Experience Dataset: Over 200 Recorded Hours of Integrated Eye + Movement, Odometry, and Egocentric Video + + +
+ We introduce the Visual Experience Dataset (VEDB), a compilation of over 240 +hours of egocentric video combined with gaze- and head-tracking data that +offers an unprecedented view of the visual world as experienced by human +observers. The dataset consists of 717 sessions, recorded by 58 observers +ranging from 6-49 years old. This paper outlines the data collection, +processing, and labeling protocols undertaken to ensure a representative sample +and discusses the potential sources of error or bias within the dataset. The +VEDB's potential applications are vast, including improving gaze tracking +methodologies, assessing spatiotemporal image statistics, and refining deep +neural networks for scene and activity recognition. The VEDB is accessible +through established open science platforms and is intended to be a living +dataset with plans for expansion and community contributions. It is released +with an emphasis on ethical considerations, such as participant privacy and the +mitigation of potential biases. By providing a dataset grounded in real-world +experiences and accompanied by extensive metadata and supporting code, the +authors invite the research community to utilize and contribute to the VEDB, +facilitating a richer understanding of visual perception and behavior in +naturalistic settings. + +
+
+ comment: 40 pages, 1 table, 9 figures +
+
+
+
+
+ + ♻ ☆ HeadGaS: Real-Time Animatable Head Avatars via 3D Gaussian Splatting ECCV 2024 + + +
+ 3D head animation has seen major quality and runtime improvements over the +last few years, particularly empowered by the advances in differentiable +rendering and neural radiance fields. Real-time rendering is a highly desirable +goal for real-world applications. We propose HeadGaS, a model that uses 3D +Gaussian Splats (3DGS) for 3D head reconstruction and animation. In this paper +we introduce a hybrid model that extends the explicit 3DGS representation with +a base of learnable latent features, which can be linearly blended with +low-dimensional parameters from parametric head models to obtain +expression-dependent color and opacity values. We demonstrate that HeadGaS +delivers state-of-the-art results in real-time inference frame rates, +surpassing baselines by up to 2dB, while accelerating rendering speed by over +x10. + +
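+
+ The hybrid representation described above, in which per-Gaussian latent features are linearly blended by low-dimensional expression parameters to produce color and opacity, can be sketched as follows; the feature and expression dimensions, the per-Gaussian basis layout, and the small decoding head are all illustrative assumptions.
+
+ import torch
+ import torch.nn as nn
+
+ class BlendedGaussianAppearance(nn.Module):
+     """Hypothetical expression-dependent appearance head for 3D Gaussian splats."""
+     def __init__(self, num_gaussians, expr_dim=10, feat_dim=16):
+         super().__init__()
+         # one small latent basis per Gaussian, blended by expression parameters
+         self.basis = nn.Parameter(torch.randn(num_gaussians, expr_dim, feat_dim) * 0.01)
+         self.head = nn.Linear(feat_dim, 4)        # decode blended feature to RGB + opacity
+
+     def forward(self, expr_params):               # expr_params: (expr_dim,)
+         blended = torch.einsum("e,gef->gf", expr_params, self.basis)
+         return torch.sigmoid(self.head(blended))  # (num_gaussians, 4) in [0, 1]
+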
+
+ comment: accepted to ECCV 2024 +
+
+
+
+
+ + ♻ ☆ V4d: voxel for 4d novel view synthesis + + +
+ Neural radiance fields have made a remarkable breakthrough in the novel view +synthesis task at the 3D static scene. However, for the 4D circumstance (e.g., +dynamic scene), the performance of the existing method is still limited by the +capacity of the neural network, typically in a multilayer perceptron network +(MLP). In this paper, we utilize 3D Voxel to model the 4D neural radiance +field, short as V4D, where the 3D voxel has two formats. The first one is to +regularly model the 3D space and then use the sampled local 3D feature with the +time index to model the density field and the texture field by a tiny MLP. The +second one is in look-up tables (LUTs) format that is for the pixel-level +refinement, where the pseudo-surface produced by the volume rendering is +utilized as the guidance information to learn a 2D pixel-level refinement +mapping. The proposed LUTs-based refinement module achieves the performance +gain with little computational cost and could serve as the plug-and-play module +in the novel view synthesis task. Moreover, we propose a more effective +conditional positional encoding toward the 4D data that achieves performance +gain with negligible computational burdens. Extensive experiments demonstrate +that the proposed method achieves state-of-the-art performance at a low +computational cost. + +
+
+ comment: Code released. Accepted by IEEE TVCG 2023 +
+
+
+
+
+ + ♻ ☆ Deepfake Media Forensics: State of the Art and Challenges Ahead + + +
+ AI-generated synthetic media, also called Deepfakes, have significantly +influenced so many domains, from entertainment to cybersecurity. Generative +Adversarial Networks (GANs) and Diffusion Models (DMs) are the main frameworks +used to create Deepfakes, producing highly realistic yet fabricated content. +While these technologies open up new creative possibilities, they also bring +substantial ethical and security risks due to their potential misuse. The rise +of such advanced media has led to the development of a cognitive bias known as +Impostor Bias, where individuals doubt the authenticity of multimedia due to +the awareness of AI's capabilities. As a result, Deepfake detection has become +a vital area of research, focusing on identifying subtle inconsistencies and +artifacts with machine learning techniques, especially Convolutional Neural +Networks (CNNs). Research in forensic Deepfake technology encompasses five main +areas: detection, attribution and recognition, passive authentication, +detection in realistic scenarios, and active authentication. This paper reviews +the primary algorithms that address these challenges, examining their +advantages, limitations, and future prospects. + +
+
+
+
+
+ + ♻ ☆ DynaSeg: A Deep Dynamic Fusion Method for Unsupervised Image + Segmentation Incorporating Feature Similarity and Spatial Continuity + + +
+ Our work tackles the fundamental challenge of image segmentation in computer vision, which is crucial for diverse applications. While supervised methods demonstrate proficiency, their reliance on extensive pixel-level annotations limits scalability. We introduce DynaSeg, an innovative unsupervised image segmentation approach that overcomes the challenge of balancing feature similarity and spatial continuity without relying on extensive hyperparameter tuning. Unlike traditional methods, DynaSeg employs a dynamic weighting scheme that automates parameter tuning, adapts flexibly to image characteristics, and facilitates easy integration with other segmentation networks. By incorporating a Silhouette Score Phase, DynaSeg prevents undersegmentation failures where the number of predicted clusters might converge to one. DynaSeg uses CNN-based and pre-trained ResNet feature extraction, making it computationally efficient and more straightforward than other complex models. Experimental results showcase state-of-the-art performance, achieving a 12.2% and 14.12% mIOU improvement over current unsupervised segmentation approaches on COCO-All and COCO-Stuff datasets, respectively. We provide qualitative and quantitative results on five benchmark datasets, demonstrating the efficacy of the proposed approach. Code is available at https://github.com/RyersonMultimediaLab/DynaSeg
+
+ comment: In Press: Image and Vision Computing Journal +
+
+
+
+
+ + ♻ ☆ Regularizing Self-supervised 3D Scene Flows with Surface Awareness and + Cyclic Consistency + + +
+ Learning without supervision how to predict 3D scene flows from point clouds +is essential to many perception systems. We propose a novel learning framework +for this task which improves the necessary regularization. Relying on the +assumption that scene elements are mostly rigid, current smoothness losses are +built on the definition of "rigid clusters" in the input point clouds. The +definition of these clusters is challenging and has a significant impact on the +quality of predicted flows. We introduce two new consistency losses that +enlarge clusters while preventing them from spreading over distinct objects. In +particular, we enforce \emph{temporal} consistency with a forward-backward +cyclic loss and \emph{spatial} consistency by considering surface orientation +similarity in addition to spatial proximity. The proposed losses are +model-independent and can thus be used in a plug-and-play fashion to +significantly improve the performance of existing models, as demonstrated on +two most widely used architectures. We also showcase the effectiveness and +generalization capability of our framework on four standard sensor-unique +driving datasets, achieving state-of-the-art performance in 3D scene flow +estimation. Our codes are available on https://github.com/ctu-vras/sac-flow. + +
+
+
+
+
+ + ♻ ☆ Let-It-Flow: Simultaneous Optimization of 3D Flow and Object Clustering + + +
+ We study the problem of self-supervised 3D scene flow estimation from real large-scale raw point cloud sequences, which is crucial to various tasks like trajectory prediction or instance segmentation. In the absence of ground truth scene flow labels, contemporary approaches concentrate on optimizing flow across sequential pairs of point clouds by incorporating structure-based regularization on flow and object rigidity. The rigid objects are estimated by a variety of 3D spatial clustering methods. While state-of-the-art methods successfully capture overall scene motion using the Neural Prior structure, they encounter challenges in discerning multi-object motions. We identify the structural constraints and the use of large and strict rigid clusters as the main pitfalls of the current approaches, and we propose a novel clustering approach that allows for a combination of overlapping soft clusters and non-overlapping rigid clusters. Flow is then jointly estimated with progressively growing non-overlapping rigid clusters together with fixed-size overlapping soft clusters. We evaluate our method on multiple datasets with LiDAR point clouds, demonstrating superior performance over the self-supervised baselines and reaching new state-of-the-art results. Our method especially excels in resolving flow in complicated dynamic scenes with multiple independently moving objects close to each other, which includes pedestrians, cyclists, and other vulnerable road users. Our codes are publicly available on https://github.com/ctu-vras/let-it-flow.
+
+
+
+
+ + ♻ ☆ Weakly Supervised Video Anomaly Detection and Localization with + Spatio-Temporal Prompts + + +
+ The current weakly supervised video anomaly detection (WSVAD) task aims to achieve frame-level anomalous event detection with only coarse video-level annotations available. Existing works typically involve extracting global features from full-resolution video frames and training frame-level classifiers to detect anomalies in the temporal dimension. However, most anomalous events tend to occur in localized spatial regions rather than the entire video frame, which implies that existing frame-level feature based works may be misled by the dominant background information and lack interpretation of the detected anomalies. To address this dilemma, this paper introduces a novel method called STPrompt that learns spatio-temporal prompt embeddings for weakly supervised video anomaly detection and localization (WSVADL) based on pre-trained vision-language models (VLMs). Our proposed method employs a two-stream network structure, with one stream focusing on the temporal dimension and the other primarily on the spatial dimension. By leveraging the learned knowledge from pre-trained VLMs and incorporating natural motion priors from raw videos, our model learns prompt embeddings that are aligned with spatio-temporal regions of videos (e.g., patches of individual frames) to identify specific local regions of anomalies, enabling accurate video anomaly detection while mitigating the influence of background information. Without relying on detailed spatio-temporal annotations or auxiliary object detection/tracking, our method achieves state-of-the-art performance on three public benchmarks for the WSVADL task.
+
+ comment: Accepted by ACMMM2024 +
+
+
+
+
+ + ♻ ☆ Power Variable Projection for Initialization-Free Large-Scale Bundle + Adjustment + + +
+ Most Bundle Adjustment (BA) solvers like the Levenberg-Marquardt algorithm require a good initialization. Instead, initialization-free BA remains a largely uncharted territory. The under-explored Variable Projection algorithm (VarPro) exhibits a wide convergence basin even without initialization. Coupled with object space error formulation, recent works have shown its ability to solve small-scale initialization-free bundle adjustment problems. To make such initialization-free BA approaches scalable, we introduce Power Variable Projection (PoVar), extending a recent inverse expansion method based on power series. Importantly, we link the power series expansion to Riemannian manifold optimization. This projective framework is crucial to solving large-scale bundle adjustment problems without initialization. Using the real-world BAL dataset, we experimentally demonstrate that our solver achieves state-of-the-art results in terms of speed and accuracy. To our knowledge, this work is the first to address the scalability of BA without initialization, opening new avenues for initialization-free structure-from-motion.
+
+
+
+
+ + ♻ ☆ kNN-CLIP: Retrieval Enables Training-Free Segmentation on Continually + Expanding Large Vocabularies + + +
+ Continual segmentation has not yet tackled the challenge of improving +open-vocabulary segmentation models with training data for accurate +segmentation across large, continually expanding vocabularies. We discover that +traditional continual training results in severe catastrophic forgetting, +failing to outperform a zero-shot segmentation baseline. We introduce a novel +training-free strategy, kNN-CLIP, which augments the model with a database of +instance embeddings for semantic and panoptic segmentation that achieves zero +forgetting. We demonstrate that kNN-CLIP can adapt to continually growing +vocabularies without the need for retraining or large memory costs. kNN-CLIP +enables open-vocabulary segmentation methods to expand their vocabularies on +any domain with a single pass through the data, while only storing compact +embeddings. This approach minimizes both compute and memory costs. kNN-CLIP +achieves state-of-the-art performance across large-vocabulary semantic and +panoptic segmentation datasets. We hope kNN-CLIP represents a significant step +forward in enabling more efficient and adaptable continual segmentation, paving +the way for advances in real-world large-vocabulary continual segmentation +methods. + +
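+
+ The training-free retrieval idea can be sketched as a simple embedding database with a nearest-neighbour vote, as below; storing normalized instance embeddings once and classifying queries by majority label follows the spirit of the description above, while the unbounded list storage, k value, and majority vote are illustrative choices.
+
+ import torch
+ import torch.nn.functional as F
+
+ class KNNLabelBank:
+     """Hypothetical retrieval head: store instance embeddings with integer class ids
+     in a single pass, then classify new embeddings without any retraining."""
+     def __init__(self):
+         self.keys, self.labels = [], []
+
+     def add(self, embeddings, labels):
+         self.keys.append(F.normalize(embeddings, dim=-1))
+         self.labels.append(labels)                 # integer class ids
+
+     def predict(self, queries, k=5):
+         keys = torch.cat(self.keys)
+         labels = torch.cat(self.labels)
+         q = F.normalize(queries, dim=-1)
+         sims = q @ keys.T                          # cosine similarity to the database
+         nn_idx = sims.topk(k, dim=-1).indices      # (num_queries, k)
+         votes = labels[nn_idx]
+         return torch.mode(votes, dim=-1).values    # majority label per query
+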
+
+
+
+
+ + ♻ ☆ Refractive COLMAP: Refractive Structure-from-Motion Revisited IROS + 2024 + + +
+ In this paper, we present a complete refractive Structure-from-Motion (RSfM) +framework for underwater 3D reconstruction using refractive camera setups (for +both, flat- and dome-port underwater housings). Despite notable achievements in +refractive multi-view geometry over the past decade, a robust, complete and +publicly available solution for such tasks is not available at present, and +often practical applications have to resort to approximating refraction effects +by the intrinsic (distortion) parameters of a pinhole camera model. To fill +this gap, we have integrated refraction considerations throughout the entire +SfM process within the state-of-the-art, open-source SfM framework COLMAP. +Numerical simulations and reconstruction results on synthetically generated but +photo-realistic images with ground truth validate that enabling refraction does +not compromise accuracy or robustness as compared to in-air reconstructions. +Finally, we demonstrate the capability of our approach for large-scale +refractive scenarios using a dataset consisting of nearly 6000 images. The +implementation is released as open-source at: +https://cau-git.rz.uni-kiel.de/inf-ag-koeser/colmap_underwater. + +
+
+ comment: 8 pages, 7 figures, the paper is accepted to be published at the 2024 + IEEE/RSJ International Conference on Intelligent Robots and Systems (IROS + 2024) +
+
+
+
+
+ + ♻ ☆ Neural Architecture Search based Global-local Vision Mamba for Palm-Vein + Recognition + + +
+ Due to advantages such as high security, high privacy, and liveness recognition, vein recognition has received more and more attention in recent years. Recently, deep learning models such as Mamba have shown robust feature representation with linear computational complexity and have been successfully applied to visual tasks. However, vision Mamba can capture long-distance feature dependencies but unfortunately deteriorates local feature details. Besides, manually designing a Mamba architecture based on human prior knowledge is very time-consuming and error-prone. In this paper, first, we propose a hybrid network structure named Global-local Vision Mamba (GLVM) to learn the local correlations in images explicitly and the global dependencies among tokens for vein feature representation. Secondly, we design a Multi-head Mamba to learn the dependencies along different directions, so as to improve the feature representation ability of vision Mamba. Thirdly, to learn complementary features, we propose a ConvMamba block consisting of three branches, named the Multi-head Mamba branch (MHMamba), the Feature Iteration Unit branch (FIU), and the Convolutional Neural Network (CNN) branch, where the Feature Iteration Unit branch aims to fuse convolutional local features with Mamba-based global representations. Finally, a Global-local Alternate Neural Architecture Search (GLNAS) method is proposed to search for the optimal architecture of GLVM alternately with an evolutionary algorithm, thereby improving the recognition performance for vein recognition tasks. We conduct rigorous experiments on three public palm-vein databases to evaluate the performance. The experimental results demonstrate that the proposed method outperforms representative approaches and achieves state-of-the-art recognition accuracy.
+
+
+
+
+ + ♻ ☆ CLIP4Sketch: Enhancing Sketch to Mugshot Matching through Dataset + Augmentation using Diffusion Models + + +
+ Forensic sketch-to-mugshot matching is a challenging task in face +recognition, primarily hindered by the scarcity of annotated forensic sketches +and the modality gap between sketches and photographs. To address this, we +propose CLIP4Sketch, a novel approach that leverages diffusion models to +generate a large and diverse set of sketch images, which helps in enhancing the +performance of face recognition systems in sketch-to-mugshot matching. Our +method utilizes Denoising Diffusion Probabilistic Models (DDPMs) to generate +sketches with explicit control over identity and style. We combine CLIP and +Adaface embeddings of a reference mugshot, along with textual descriptions of +style, as the conditions to the diffusion model. We demonstrate the efficacy of +our approach by generating a comprehensive dataset of sketches corresponding to +mugshots and training a face recognition model on our synthetic data. Our +results show significant improvements in sketch-to-mugshot matching accuracy +over training on an existing, limited amount of real face sketch data, +validating the potential of diffusion models in enhancing the performance of +face recognition systems across modalities. We also compare our dataset with +datasets generated using GAN-based methods to show its superiority. + +
+
+
+
+
+ + ♻ ☆ Out of Length Text Recognition with Sub-String Matching + + +
+ Scene Text Recognition (STR) methods have demonstrated robust performance in word-level text recognition. However, in real applications the text image is sometimes long because it is detected across multiple horizontal words. This triggers the requirement to build long text recognition models from readily available short (i.e., word-level) text datasets, which has been less studied previously. In this paper, we term this task Out of Length (OOL) text recognition. We establish the first Long Text Benchmark (LTB) to facilitate the assessment of different methods in long text recognition. Meanwhile, we propose a novel method called OOL Text Recognition with sub-String Matching (SMTR). SMTR comprises two cross-attention-based modules: one encodes a sub-string containing multiple characters into next and previous queries, and the other employs the queries to attend to the image features, matching the sub-string and simultaneously recognizing its next and previous character. SMTR can recognize text of arbitrary length by iterating the process above. To avoid being trapped in recognizing highly similar sub-strings, we introduce regularization training to compel SMTR to effectively discover subtle differences between similar sub-strings for precise matching. In addition, we propose an inference augmentation strategy to alleviate confusion caused by identical sub-strings in the same text and improve the overall recognition efficiency. Extensive experimental results reveal that SMTR, even when trained exclusively on short text, outperforms existing methods on public short text benchmarks and exhibits a clear advantage on LTB. Code: https://github.com/Topdu/OpenOCR.
+
+ comment: Preprint, 16 pages +
+
+
+
+
+ + ♻ ☆ MLAAN: Scaling Supervised Local Learning with Multilaminar Leap + Augmented Auxiliary Network + + +
+ Deep neural networks (DNNs) typically employ an end-to-end (E2E) training +paradigm which presents several challenges, including high GPU memory +consumption, inefficiency, and difficulties in model parallelization during +training. Recent research has sought to address these issues, with one +promising approach being local learning. This method involves partitioning the +backbone network into gradient-isolated modules and manually designing +auxiliary networks to train these local modules. Existing methods often neglect +the interaction of information between local modules, leading to myopic issues +and a performance gap compared to E2E training. To address these limitations, +we propose the Multilaminar Leap Augmented Auxiliary Network (MLAAN). +Specifically, MLAAN comprises Multilaminar Local Modules (MLM) and Leap +Augmented Modules (LAM). MLM captures both local and global features through +independent and cascaded auxiliary networks, alleviating performance issues +caused by insufficient global features. However, overly simplistic auxiliary +networks can impede MLM's ability to capture global information. To address +this, we further design LAM, an enhanced auxiliary network that uses the +Exponential Moving Average (EMA) method to facilitate information exchange +between local modules, thereby mitigating the shortsightedness resulting from +inadequate interaction. The synergy between MLM and LAM has demonstrated +excellent performance. Our experiments on the CIFAR-10, STL-10, SVHN, and +ImageNet datasets show that MLAAN can be seamlessly integrated into existing +local learning frameworks, significantly enhancing their performance and even +surpassing end-to-end (E2E) training methods, while also reducing GPU memory +consumption. + +
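+
+ The EMA-based information exchange between gradient-isolated modules can be pictured with the small helper below; the decay value and the idea of folding an exponential moving average of one module's parameters into another are a hedged illustration of the mechanism named above, not the exact MLAAN update.
+
+ import torch
+
+ @torch.no_grad()
+ def ema_exchange(target_module, source_module, decay=0.99):
+     """Blend a source module's parameters into a target module as an exponential
+     moving average, passing information without back-propagating across modules."""
+     for p_t, p_s in zip(target_module.parameters(), source_module.parameters()):
+         p_t.mul_(decay).add_(p_s, alpha=1.0 - decay)
+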
+
+
+
+
+ + ♻ ☆ A Practical Solver for Scalar Data Topological Simplification IEEE VIS 2024 + + +
+ This paper presents a practical approach for the optimization of topological +simplification, a central pre-processing step for the analysis and +visualization of scalar data. Given an input scalar field f and a set of +"signal" persistence pairs to maintain, our approach produces an output field g +that is close to f and which optimizes (i) the cancellation of "non-signal" +pairs, while (ii) preserving the "signal" pairs. In contrast to pre-existing +simplification algorithms, our approach is not restricted to persistence pairs +involving extrema and can thus address a larger class of topological features, +in particular saddle pairs in three-dimensional scalar data. Our approach +leverages recent generic persistence optimization frameworks and extends them +with tailored accelerations specific to the problem of topological +simplification. Extensive experiments report substantial accelerations over +these frameworks, thereby making topological simplification optimization +practical for real-life datasets. Our approach enables a direct visualization +and analysis of the topologically simplified data, e.g., via isosurfaces of +simplified topology (fewer components and handles). We apply our approach to +the extraction of prominent filament structures in three-dimensional data. +Specifically, we show that our pre-simplification of the data leads to +practical improvements over standard topological techniques for removing +filament loops. We also show how our approach can be used to repair genus +defects in surface processing. Finally, we provide a C++ implementation for +reproducibility purposes. + +
+
+ comment: 13 pages, 10 figures, IEEE VIS 2024 +
+
+
+
+
+ + ♻ ☆ S$^2$Mamba: A Spatial-spectral State Space Model for Hyperspectral Image + Classification + + +
+ Land cover analysis using hyperspectral images (HSI) remains an open problem +due to their low spatial resolution and complex spectral information. Recent +studies are primarily dedicated to designing Transformer-based architectures +for spatial-spectral long-range dependencies modeling, which is computationally +expensive with quadratic complexity. Selective structured state space model +(Mamba), which is efficient for modeling long-range dependencies with linear +complexity, has recently shown promising progress. However, its potential in +hyperspectral image processing that requires handling numerous spectral bands +has not yet been explored. In this paper, we innovatively propose S$^2$Mamba, a +spatial-spectral state space model for hyperspectral image classification, to +excavate spatial-spectral contextual features, resulting in more efficient and +accurate land cover analysis. In S$^2$Mamba, two selective structured state +space models through different dimensions are designed for feature extraction, +one for spatial, and the other for spectral, along with a spatial-spectral +mixture gate for optimal fusion. More specifically, S$^2$Mamba first captures +spatial contextual relations by interacting each pixel with its adjacent +through a Patch Cross Scanning module and then explores semantic information +from continuous spectral bands through a Bi-directional Spectral Scanning +module. Considering the distinct expertise of the two attributes in homogenous +and complicated texture scenes, we realize the Spatial-spectral Mixture Gate by +a group of learnable matrices, allowing for the adaptive incorporation of +representations learned across different dimensions. Extensive experiments +conducted on HSI classification benchmarks demonstrate the superiority and +prospect of S$^2$Mamba. The code will be made available at: +https://github.com/PURE-melo/S2Mamba. + +
+
+ comment: 12 pages, 7 figures +
+
+
+
+
+ + ♻ ☆ DA-BEV: Unsupervised Domain Adaptation for Bird's Eye View Perception + + +
+ Camera-only Bird's Eye View (BEV) has demonstrated great potential in environment perception in a 3D space. However, most existing studies were conducted under a supervised setup which cannot scale well while handling various new data. Unsupervised domain adaptive BEV, which enables effective learning from various unlabelled target data, is far under-explored. In this work, we design DA-BEV, the first domain adaptive camera-only BEV framework that addresses domain adaptive BEV challenges by exploiting the complementary nature of image-view features and BEV features. DA-BEV introduces the idea of query into the domain adaptation framework to derive useful information from image-view and BEV features. It consists of two query-based designs, namely, query-based adversarial learning (QAL) and query-based self-training (QST), which exploit image-view features or BEV features to regularize the adaptation of the other. Extensive experiments show that DA-BEV achieves superior domain adaptive BEV perception performance consistently across multiple datasets and tasks such as 3D object detection and 3D scene segmentation.
+
+
+
+
+ + ♻ ☆ InterCLIP-MEP: Interactive CLIP and Memory-Enhanced Predictor for + Multi-modal Sarcasm Detection + + +
+ The prevalence of sarcasm in social media, conveyed through text-image +combinations, presents significant challenges for sentiment analysis and +intention mining. Existing multi-modal sarcasm detection methods have been +proven to overestimate performance, as they struggle to effectively capture the +intricate sarcastic cues that arise from the interaction between an image and +text. To address these issues, we propose InterCLIP-MEP, a novel framework for +multi-modal sarcasm detection. Specifically, we introduce an Interactive CLIP +(InterCLIP) as the backbone to extract text-image representations, enhancing +them by embedding cross-modality information directly within each encoder, +thereby improving the representations to capture text-image interactions +better. Furthermore, an efficient training strategy is designed to adapt +InterCLIP for our proposed Memory-Enhanced Predictor (MEP). MEP uses a dynamic, +fixed-length dual-channel memory to store historical knowledge of valuable test +samples during inference. It then leverages this memory as a non-parametric +classifier to derive the final prediction, offering a more robust recognition +of multi-modal sarcasm. Experiments demonstrate that InterCLIP-MEP achieves +state-of-the-art performance on the MMSD2.0 benchmark, with an accuracy +improvement of 1.08% and an F1 score improvement of 1.51% over the previous +best method. + +
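+
+ The memory-enhanced predictor can be pictured as a fixed-length, per-class feature memory used as a non-parametric classifier, as in the sketch below; the capacity, FIFO eviction, and mean-similarity scoring are illustrative assumptions rather than the paper's exact design.
+
+ import torch
+ import torch.nn.functional as F
+
+ class DualChannelMemory:
+     """Hypothetical dual-channel memory: one bank per class (non-sarcastic / sarcastic)."""
+     def __init__(self, capacity=128):
+         self.banks = {0: [], 1: []}
+         self.capacity = capacity
+
+     def update(self, feat, pred_label):
+         bank = self.banks[int(pred_label)]
+         bank.append(F.normalize(feat, dim=-1))
+         if len(bank) > self.capacity:           # keep each channel at a fixed length
+             bank.pop(0)
+
+     def predict(self, feat):
+         q = F.normalize(feat, dim=-1)
+         means = []
+         for label in (0, 1):
+             bank = self.banks[label]
+             sims = torch.stack(bank) @ q if bank else torch.zeros(1)
+             means.append(sims.mean())           # average similarity to each channel
+         return int(torch.stack(means).argmax())
+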
+
+ comment: 9 pages, 6 figures, 3 tables; Code and data are available at + https://github.com/CoderChen01/InterCLIP-MEP +
+
+
+
+
+ + ♻ ☆ Cross Pseudo Supervision Framework for Sparsely Labelled Geospatial + Images + + +
+ Land Use Land Cover (LULC) mapping is a vital tool for urban and resource +planning, playing a key role in the development of innovative and sustainable +cities. This study introduces a semi-supervised segmentation model for LULC +prediction using high-resolution satellite images with a vast diversity of data +distributions in different areas of India. Our approach ensures a robust +generalization across different types of buildings, roads, trees, and water +bodies within these distinct areas. We propose a modified Cross Pseudo +Supervision framework to train image segmentation models on sparsely labelled +data. The proposed framework addresses the limitations of the famous 'Cross +Pseudo Supervision' technique for semi-supervised learning, specifically +tackling the challenges of training segmentation models on noisy satellite +image data with sparse and inaccurate labels. This comprehensive approach +significantly enhances the accuracy and utility of LULC mapping, providing +valuable insights for urban and resource planning applications. + +
+
+
+
+
+ + ♻ ☆ Performance Evaluation of YOLOv8 Model Configurations, for Instance + Segmentation of Strawberry Fruit Development Stages in an Open Field + Environment + + +
+ Accurate identification of strawberries during their maturing stages is crucial for optimizing yield management and pest control, and for making informed decisions related to harvest and post-harvest logistics. This study evaluates the performance of YOLOv8 model configurations for instance segmentation of strawberries into ripe and unripe stages in an open field environment. The YOLOv8n model demonstrated superior segmentation accuracy with a mean Average Precision (mAP) of 80.9\%, outperforming other YOLOv8 configurations. In terms of inference speed, YOLOv8n processed images at 12.9 milliseconds, while YOLOv8s, the least-performing model, processed at 22.2 milliseconds. Over 86 test images with 348 ground truth labels, YOLOv8n detected 235 ripe fruit classes and 51 unripe fruit classes out of 251 ripe and 97 unripe ground truth labels, respectively. In comparison, YOLOv8s detected 204 ripe fruits and 37 unripe fruits. Overall, YOLOv8n achieved the fastest inference speed of 24.2 milliseconds, outperforming YOLOv8s, YOLOv8m, YOLOv8l, and YOLOv8x, which processed images at 33.0 milliseconds, 44.3 milliseconds, 53.6 milliseconds, and 62.5 milliseconds, respectively. These results underscore the potential of advanced object segmentation algorithms to effectively address complex visual recognition tasks in open-field agriculture.
+
+ comment: 15 page, 18 figures +
+
+
+
+
+ + ♻ ☆ Can virtual staining for high-throughput screening generalize? + + +
+ The large volume and variety of imaging data from high-throughput screening +(HTS) in the pharmaceutical industry present an excellent resource for training +virtual staining models. However, the potential of models trained under one set +of experimental conditions to generalize to other conditions remains +underexplored. This study systematically investigates whether data from three +cell types (lung, ovarian, and breast) and two phenotypes (toxic and non-toxic +conditions) commonly found in HTS can effectively train virtual staining models +to generalize across three typical HTS distribution shifts: unseen phenotypes, +unseen cell types, and the combination of both. Utilizing a dataset of 772,416 +paired bright-field, cytoplasm, nuclei, and DNA-damage stain images, we +evaluate the generalization capabilities of models across pixel-based, +instance-wise, and biological-feature-based levels. Our findings indicate that +training virtual nuclei and cytoplasm models on non-toxic condition samples not +only generalizes to toxic condition samples but leads to improved performance +across all evaluation levels compared to training on toxic condition samples. +Generalization to unseen cell types shows variability depending on the cell +type; models trained on ovarian or lung cell samples often perform well under +other conditions, while those trained on breast cell samples consistently show +poor generalization. Generalization to unseen cell types and phenotypes shows +good generalization across all levels of evaluation compared to addressing +unseen cell types alone. This study represents the first large-scale, +data-centric analysis of the generalization capability of virtual staining +models trained on diverse HTS datasets, providing valuable strategies for +experimental training data generation. + +
+
+
+
+
+ + ♻ ☆ Rectified Iterative Disparity for Stereo Matching + + +
+ Both uncertainty-assisted and iteration-based methods have achieved great success in stereo matching. However, existing uncertainty estimation methods take a single image and the corresponding disparity as input, which imposes higher demands on the estimation network. In this paper, we propose Cost volume-based disparity Uncertainty Estimation (UEC). Based on the rich similarity information in the cost volume coming from the image pairs, the proposed UEC can achieve competitive performance with low computational cost. Secondly, we propose two methods of uncertainty-assisted disparity estimation, Uncertainty-based Disparity Rectification (UDR) and Uncertainty-based Disparity update Conditioning (UDC). These two methods optimise the disparity update process of the iteration-based approach without adding extra parameters. In addition, we propose a Disparity Rectification loss that significantly improves the accuracy of small disparity updates. We present a high-performance stereo architecture, DR-Stereo, which is a combination of the proposed methods. Experimental results from SceneFlow, KITTI, Middlebury 2014, and ETH3D show that DR-Stereo achieves very competitive disparity estimation performance.
+
+
+
+
+ + ♻ ☆ EasyInv: Toward Fast and Better DDIM Inversion + + +
+ This paper introduces EasyInv, an easy yet novel approach that significantly +advances the field of DDIM Inversion by addressing the inherent inefficiencies +and performance limitations of traditional iterative optimization methods. At +the core of our EasyInv is a refined strategy for approximating inversion +noise, which is pivotal for enhancing the accuracy and reliability of the +inversion process. By prioritizing the initial latent state, which encapsulates +rich information about the original images, EasyInv steers clear of the +iterative refinement of noise items. Instead, we introduce a methodical +aggregation of the latent state from the preceding time step with the current +state, effectively increasing the influence of the initial latent state and +mitigating the impact of noise. We illustrate that EasyInv is capable of +delivering results that are either on par with or exceed those of the +conventional DDIM Inversion approach, especially under conditions where the +model's precision is limited or computational resources are scarce. +Concurrently, our EasyInv offers an approximate threefold enhancement regarding +inference efficiency over off-the-shelf iterative optimization techniques. + +
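+
+ The latent aggregation step described above amounts to a simple weighted blend of consecutive inversion latents, as sketched below; the weight value and the surrounding loop structure are illustrative assumptions, and invert_step stands in for whatever per-step DDIM inversion update is used.
+
+ def blend_latents(prev_latent, curr_latent, beta=0.7):
+     """Weighted aggregation that strengthens the influence of the earlier,
+     information-rich latent state over the freshly computed one."""
+     return beta * prev_latent + (1.0 - beta) * curr_latent
+
+ # inside a DDIM inversion loop (invert_step is a hypothetical per-step function):
+ #     curr = invert_step(prev, t)
+ #     prev = blend_latents(prev, curr)
+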
+
+ comment: 9 pages not including reference +
+
+
+
+
+ + ♻ ☆ Automatic Spatial Calibration of Near-Field MIMO Radar With Respect to + Optical Depth Sensors IROS 2024 + + +
+ Despite an emerging interest in MIMO radar, the utilization of its complementary strengths in combination with optical depth sensors has so far been limited to far-field applications, due to the challenges that arise from mutual sensor calibration in the near field. In fact, most related approaches in the autonomous industry propose target-based calibration methods using corner reflectors that have proven to be unsuitable for the near field. In contrast, we propose a novel, joint calibration approach for optical RGB-D sensors and MIMO radars that is designed to operate in the radar's near-field range, within decimeters from the sensors. Our pipeline consists of a bespoke calibration target, allowing for automatic target detection and localization, followed by the spatial calibration of the two sensor coordinate systems through target registration. We validate our approach using two different depth sensing technologies from the optical domain. The experiments show the efficiency and accuracy of our calibration for various target displacements, as well as the robustness of our localization with respect to signal ambiguities.
+
+ comment: 8 pages, 9 figures, accepted to IROS 2024 +
+
+
+
+
+ + ♻ ☆ mPLUG-Owl3: Towards Long Image-Sequence Understanding in Multi-Modal + Large Language Models + + +
+ Multi-modal Large Language Models (MLLMs) have demonstrated remarkable +capabilities in executing instructions for a variety of single-image tasks. +Despite this progress, significant challenges remain in modeling long image +sequences. In this work, we introduce the versatile multi-modal large language +model, mPLUG-Owl3, which enhances the capability for long image-sequence +understanding in scenarios that incorporate retrieved image-text knowledge, +interleaved image-text, and lengthy videos. Specifically, we propose novel +hyper attention blocks to efficiently integrate vision and language into a +common language-guided semantic space, thereby facilitating the processing of +extended multi-image scenarios. Extensive experimental results suggest that +mPLUG-Owl3 achieves state-of-the-art performance among models with a similar +size on single-image, multi-image, and video benchmarks. Moreover, we propose a +challenging long visual sequence evaluation named Distractor Resistance to +assess the ability of models to maintain focus amidst distractions. Finally, +with the proposed architecture, mPLUG-Owl3 demonstrates outstanding performance +on ultra-long visual sequence inputs. We hope that mPLUG-Owl3 can contribute to +the development of more efficient and powerful multimodal large language +models. + +
+
+
+
+
+ + ♻ ☆ PromptKD: Unsupervised Prompt Distillation for Vision-Language Models CVPR 2024 + + +
+ Prompt learning has emerged as a valuable technique in enhancing +vision-language models (VLMs) such as CLIP for downstream tasks in specific +domains. Existing work mainly focuses on designing various learning forms of +prompts, neglecting the potential of prompts as effective distillers for +learning from larger teacher models. In this paper, we introduce an +unsupervised domain prompt distillation framework, which aims to transfer the +knowledge of a larger teacher model to a lightweight target model through +prompt-driven imitation using unlabeled domain images. Specifically, our +framework consists of two distinct stages. In the initial stage, we pre-train a +large CLIP teacher model using domain (few-shot) labels. After pre-training, we +leverage the unique decoupled-modality characteristics of CLIP by pre-computing +and storing the text features as class vectors only once through the teacher +text encoder. In the subsequent stage, the stored class vectors are shared +across teacher and student image encoders for calculating the predicted logits. +Further, we align the logits of both the teacher and student models via KL +divergence, encouraging the student image encoder to generate similar +probability distributions to the teacher through the learnable prompts. The +proposed prompt distillation process eliminates the reliance on labeled data, +enabling the algorithm to leverage a vast amount of unlabeled images within the +domain. Finally, the well-trained student image encoders and pre-stored text +features (class vectors) are utilized for inference. To our best knowledge, we +are the first to (1) perform unsupervised domain-specific prompt-driven +knowledge distillation for CLIP, and (2) establish a practical pre-storing +mechanism of text features as shared class vectors between teacher and student. +Extensive experiments on 11 datasets demonstrate the effectiveness of our +method. + +
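+ The distillation step described above can be sketched compactly: both image
+encoders score the same pre-stored text class vectors, and the student matches
+the teacher's distribution through a KL term. This is a hedged sketch, not the
+released implementation; the temperature `tau` and tensor shapes are assumed.
+
+     import torch.nn.functional as F
+
+     def promptkd_style_loss(student_feat, teacher_feat, class_vectors, tau=2.0):
+         # class_vectors: text features computed once by the teacher text encoder
+         # and shared by teacher and student image encoders.
+         s_logits = student_feat @ class_vectors.T
+         t_logits = teacher_feat @ class_vectors.T
+         return F.kl_div(F.log_softmax(s_logits / tau, dim=-1),
+                         F.softmax(t_logits / tau, dim=-1),
+                         reduction="batchmean") * tau * tau
+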
+
+ comment: CVPR 2024. Project Page: https://zhengli97.github.io/PromptKD. Code: + https://github.com/zhengli97/PromptKD +
+
+
+
+
+ + ♻ ☆ Deep Non-rigid Structure-from-Motion: A Sequence-to-Sequence Translation + Perspective + + +
+ Directly regressing the non-rigid shape and camera pose from the individual +2D frame is ill-suited to the Non-Rigid Structure-from-Motion (NRSfM) problem. +This frame-by-frame 3D reconstruction pipeline overlooks the inherent +spatial-temporal nature of NRSfM, i.e., reconstructing the whole 3D sequence +from the input 2D sequence. In this paper, we propose to model deep NRSfM from +a sequence-to-sequence translation perspective, where the input 2D frame +sequence is taken as a whole to reconstruct the deforming 3D non-rigid shape +sequence. First, we apply a shape-motion predictor to estimate the initial +non-rigid shape and camera motion from a single frame. Then we propose a +context modeling module to model camera motions and complex non-rigid shapes. +To tackle the difficulty in enforcing the global structure constraint within +the deep framework, we propose to impose the union-of-subspace structure by +replacing the self-expressiveness layer with multi-head attention and delayed +regularizers, which enables end-to-end batch-wise training. Experimental +results across different datasets such as Human3.6M, CMU Mocap and InterHand +prove the superiority of our framework. + +
+
+ comment: has been accepted by IEEE Transactions on Pattern Analysis and + Machine Intelligence +
+
+
+
+
+ + ♻ ☆ etuner: Redundancy-Aware Efficient Continual Learning on Edge Devices + + +
+ Many emerging applications, such as robot-assisted eldercare and object
+recognition, generally employ deep neural networks (DNNs) and require the
+deployment of DNN models on edge devices. These applications naturally require
+i) handling streaming-in inference requests and ii) fine-tuning the deployed
+models to adapt to possible deployment scenario changes. Continual learning
+(CL) is widely adopted to satisfy these needs. CL is a popular deep learning
+paradigm that handles both continuous model fine-tuning and inference requests
+over time. However, an inappropriate model fine-tuning scheme could involve
+significant redundancy and consume considerable time and energy, making it
+challenging to apply CL on edge devices. In this paper, we propose ETuner, an
+efficient edge continual learning framework that optimizes inference accuracy,
+fine-tuning execution time, and energy efficiency through both inter-tuning
+and intra-tuning optimizations. Experimental results show that, on average,
+ETuner reduces overall fine-tuning execution time by 64% and energy consumption
+by 56%, and improves average inference accuracy by 1.75% over the immediate
+model fine-tuning approach.
+
+
+
+
+
+
+ + ♻ ☆ FreeDiff: Progressive Frequency Truncation for Image Editing with + Diffusion Models ECCV-2024 + + +
+ Precise image editing with text-to-image models has attracted increasing
+interest due to their remarkable generative capabilities and user-friendly
+nature. However, such attempts face the pivotal challenge of misalignment
+between the intended precise editing target regions and the broader area
+impacted by the guidance in practice. Despite excellent methods leveraging
+attention mechanisms that have been developed to refine the editing guidance,
+these approaches necessitate modifications through complex network architecture
+and are limited to specific editing tasks. In this work, we re-examine the
+diffusion process and misalignment problem from a frequency perspective,
+revealing that, due to the power law of natural images and the decaying noise
+schedule, the denoising network primarily recovers low-frequency image
+components during the earlier timesteps and thus brings excessive low-frequency
+signals for editing. Leveraging this insight, we introduce a novel
+fine-tuning-free approach that employs progressive $\textbf{Fre}$qu$\textbf{e}$ncy
+truncation to refine the guidance of $\textbf{Diff}$usion models for universal
+editing tasks ($\textbf{FreeDiff}$). Our method achieves comparable results
+with state-of-the-art methods across a variety of editing tasks and on a
+diverse set of images, highlighting its potential as a versatile tool in image
+editing applications.
+
+
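+ The frequency-truncation idea lends itself to a short sketch: move the
+guidance into Fourier space and zero out its lowest band. This is only an
+assumed illustration; FreeDiff's actual truncation schedule over timesteps and
+the exact band it removes follow the paper, not this snippet.
+
+     import torch
+     import torch.fft as fft
+
+     def truncate_low_frequencies(guidance, cutoff):
+         # guidance: (B, C, H, W) editing guidance; cutoff in [0, 1] sets how much
+         # of the lowest-frequency band is removed (a per-timestep schedule is assumed)
+         h, w = guidance.shape[-2:]
+         spec = fft.fftshift(fft.fft2(guidance), dim=(-2, -1))
+         yy, xx = torch.meshgrid(torch.arange(h, device=guidance.device),
+                                 torch.arange(w, device=guidance.device), indexing="ij")
+         radius = ((yy - h / 2) ** 2 + (xx - w / 2) ** 2).sqrt()
+         mask = (radius > cutoff * min(h, w) / 2).to(spec.dtype)  # drop lowest frequencies
+         return fft.ifft2(fft.ifftshift(spec * mask, dim=(-2, -1))).real
+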
+
+ comment: Accepted by ECCV-2024 +
+
+
+
+
+ + ♻ ☆ PAtt-Lite: Lightweight Patch and Attention MobileNet for Challenging + Facial Expression Recognition + + +
+ Facial Expression Recognition (FER) is a machine learning problem that deals +with recognizing human facial expressions. While existing work has achieved +performance improvements in recent years, FER in the wild and under challenging +conditions remains a challenge. In this paper, a lightweight patch and +attention network based on MobileNetV1, referred to as PAtt-Lite, is proposed +to improve FER performance under challenging conditions. A truncated +ImageNet-pre-trained MobileNetV1 is utilized as the backbone feature extractor +of the proposed method. In place of the truncated layers is a patch extraction +block that is proposed for extracting significant local facial features to +enhance the representation from MobileNetV1, especially under challenging +conditions. An attention classifier is also proposed to improve the learning of +these patched feature maps from the extremely lightweight feature extractor. +The experimental results on public benchmark databases proved the effectiveness +of the proposed method. PAtt-Lite achieved state-of-the-art results on CK+, +RAF-DB, FER2013, FERPlus, and the challenging conditions subsets for RAF-DB and +FERPlus. + +
+
+ comment: Copyright 2024 IEEE. Personal use of this material is permitted. IEEE + Access 2024 +
+
+
+
+
+ + ♻ ☆ Negative Object Presence Evaluation (NOPE) to Measure Object + Hallucination in Vision-Language Models ACL 2024 + + +
+ Object hallucination poses a significant challenge in vision-language (VL)
+models, often leading to the generation of nonsensical or unfaithful responses
+with non-existent objects. However, the absence of a general measurement for
+evaluating object hallucination in VL models has hindered our understanding and
+ability to mitigate this issue. In this work, we present NOPE (Negative Object
+Presence Evaluation), a novel benchmark designed to assess object hallucination
+in VL models through visual question answering (VQA). We propose a
+cost-effective and scalable approach utilizing large language models to
+generate 29.5k synthetic negative pronoun (NegP) data of high quality for NOPE.
+We extensively investigate the performance of 10 state-of-the-art VL models in
+discerning the non-existence of objects in visual questions, where the ground
+truth answers are denoted as NegP (e.g., "none"). Additionally, we evaluate
+their standard performance on visual questions on 9 other VQA datasets. Through
+our experiments, we demonstrate that no VL model is immune to the vulnerability
+of object hallucination, as all models achieve accuracy below 10\% on NegP.
+Furthermore, we uncover that lexically diverse visual questions, question types
+with large scopes, and scene-relevant objects amplify the risk of object
+hallucination in VL models.
+
+
+
+ comment: Published in ALVR Workshop at ACL 2024 +
+
+
+
+
+ + ♻ ☆ Spb3DTracker: A Robust LiDAR-Based Person Tracker for Noisy Environment + + +
+ Person detection and tracking (PDT) has seen significant advancements with 2D +camera-based systems in the autonomous vehicle field, leading to widespread +adoption of these algorithms. However, growing privacy concerns have recently +emerged as a major issue, prompting a shift towards LiDAR-based PDT as a viable +alternative. Within this domain, "Tracking-by-Detection" (TBD) has become a +prominent methodology. Despite its effectiveness, LiDAR-based PDT has not yet +achieved the same level of performance as camera-based PDT. This paper examines +key components of the LiDAR-based PDT framework, including detection +post-processing, data association, motion modeling, and lifecycle management. +Building upon these insights, we introduce SpbTrack, a robust person tracker +designed for diverse environments. Our method achieves superior performance on +noisy datasets and state-of-the-art results on KITTI Dataset benchmarks and +custom office indoor dataset among LiDAR-based trackers. + +
+
+ comment: 17 pages, 5 figures +
+
+
+
+
+ + ♻ ☆ Long working distance portable smartphone microscopy for metallic mesh + defect detection + + +
+ Metallic mesh is a transparent electromagnetic shielding film with a fine +metal line structure. However, it can develop defects that affect the +optoelectronic performance whether in the production preparation or in actual +use. The development of in-situ non-destructive testing (NDT) devices for +metallic mesh requires long working distances, reflective optical path design, +and miniaturization. To address the limitations of existing smartphone +microscopes, which feature short working distances and inadequate transmission +imaging for industrial in-situ inspection, we propose a novel long-working +distance reflective smartphone microscopy system (LD-RSM). LD-RSM builds a 4f +optical imaging system with external optical components and a smartphone, +utilizing a beam splitter to achieve reflective imaging with the illumination +system and imaging system on the same side of the sample. It achieves an +optical resolution of 4.92$\mu$m and a working distance of up to 22.23 mm. +Additionally, we introduce a dual prior weighted Robust Principal Component +Analysis (DW-RPCA) for defect detection. This approach leverages spectral +filter fusion and Hough transform to model different defect types, enhancing +the accuracy and efficiency of defect identification. Coupled with an optimized +threshold segmentation algorithm, DW-RPCA method achieves a pixel-level +accuracy of 84.8%. Our work showcases strong potential for growth in the field +of in-situ on-line inspection of industrial products. + +
+
+
+
+
+ + ♻ ☆ Indoor Scene Reconstruction with Fine-Grained Details Using Hybrid + Representation and Normal Prior Enhancement + + +
+ The reconstruction of indoor scenes from multi-view RGB images is challenging +due to the coexistence of flat and texture-less regions alongside delicate and +fine-grained regions. Recent methods leverage neural radiance fields aided by +predicted surface normal priors to recover the scene geometry. These methods +excel in producing complete and smooth results for floor and wall areas. +However, they struggle to capture complex surfaces with high-frequency +structures due to the inadequate neural representation and the inaccurately +predicted normal priors. This work aims to reconstruct high-fidelity surfaces +with fine-grained details by addressing the above limitations. To improve the +capacity of the implicit representation, we propose a hybrid architecture to +represent low-frequency and high-frequency regions separately. To enhance the +normal priors, we introduce a simple yet effective image sharpening and +denoising technique, coupled with a network that estimates the pixel-wise +uncertainty of the predicted surface normal vectors. Identifying such +uncertainty can prevent our model from being misled by unreliable surface +normal supervisions that hinder the accurate reconstruction of intricate +geometries. Experiments on the benchmark datasets show that our method +outperforms existing methods in terms of reconstruction quality. Furthermore, +the proposed method also generalizes well to real-world indoor scenarios +captured by our hand-held mobile phones. Our code is publicly available at: +https://github.com/yec22/Fine-Grained-Indoor-Recon. + +
+
+ comment: accepted by TVCG +
+
+
+
+
+ + ♻ ☆ VolumeDiffusion: Flexible Text-to-3D Generation with Efficient + Volumetric Encoder + + +
+ This paper introduces a pioneering 3D volumetric encoder designed for
+text-to-3D generation. To scale up the training data for the diffusion model, a
+lightweight network is developed to efficiently acquire feature volumes from
+multi-view images. A diffusion model, built on a 3D U-Net, is then trained on
+these feature volumes for text-to-3D generation. This research further
+addresses the challenges of inaccurate object captions and high-dimensional
+feature volumes. The proposed model, trained on the public Objaverse dataset,
+demonstrates promising outcomes in producing diverse and recognizable samples
+from text prompts. Notably, it empowers finer control over object part
+characteristics through textual cues, fostering model creativity by seamlessly
+combining multiple concepts within a single object. This research significantly
+contributes to the progress of 3D generation by introducing an efficient,
+flexible, and scalable representation methodology.
+
+
+
+
+
+
+ + ♻ ☆ Simplified Diffusion Schrödinger Bridge + + +
+ This paper introduces a novel theoretical simplification of the Diffusion +Schr\"odinger Bridge (DSB) that facilitates its unification with Score-based +Generative Models (SGMs), addressing the limitations of DSB in complex data +generation and enabling faster convergence and enhanced performance. By +employing SGMs as an initial solution for DSB, our approach capitalizes on the +strengths of both frameworks, ensuring a more efficient training process and +improving the performance of SGM. We also propose a reparameterization +technique that, despite theoretical approximations, practically improves the +network's fitting capabilities. Our extensive experimental evaluations confirm +the effectiveness of the simplified DSB, demonstrating its significant +improvements. We believe the contributions of this work pave the way for +advanced generative modeling. + +
+
+
+
+
+ + ♻ ☆ Enhancing Visual Place Recognition via Fast and Slow Adaptive Biasing in + Event Cameras IROS 2024 + + +
+ Event cameras are increasingly popular in robotics due to beneficial features +such as low latency, energy efficiency, and high dynamic range. Nevertheless, +their downstream task performance is greatly influenced by the optimization of +bias parameters. These parameters, for instance, regulate the necessary change +in light intensity to trigger an event, which in turn depends on factors such +as the environment lighting and camera motion. This paper introduces feedback +control algorithms that automatically tune the bias parameters through two +interacting methods: 1) An immediate, on-the-fly \textit{fast} adaptation of +the refractory period, which sets the minimum interval between consecutive +events, and 2) if the event rate exceeds the specified bounds even after +changing the refractory period repeatedly, the controller adapts the pixel +bandwidth and event thresholds, which stabilizes after a short period of noise +events across all pixels (\textit{slow} adaptation). Our evaluation focuses on +the visual place recognition task, where incoming query images are compared to +a given reference database. We conducted comprehensive evaluations of our +algorithms' adaptive feedback control in real-time. To do so, we collected the +QCR-Fast-and-Slow dataset that contains DAVIS346 event camera streams from 366 +repeated traversals of a Scout Mini robot navigating through a 100 meter long +indoor lab setting (totaling over 35km distance traveled) in varying brightness +conditions with ground truth location information. Our proposed feedback +controllers result in superior performance when compared to the standard bias +settings and prior feedback control methods. Our findings also detail the +impact of bias adjustments on task performance and feature ablation studies on +the fast and slow adaptation mechanisms. + +
+
+ comment: 8 pages, 9 figures, paper accepted to the 2024 IEEE/RSJ International + Conference on Intelligent Robots and Systems (IROS 2024) +
+
+
+
+
+ + ♻ ☆ WaveShot: A Compact Portable Unmanned Surface Vessel for Dynamic Water + Surface Videography and Media Production + + +
+ This paper presents WaveShot, an innovative portable unmanned surface vessel
+that aims to transform water surface videography by offering a highly
+maneuverable, cost-effective, and safe alternative to traditional filming
+methods. WaveShot is designed for the modern demands of film production,
+advertising, documentaries, and visual arts, equipped with professional-grade
+waterproof cameras and advanced technology to capture static and dynamic scenes
+on waterways. We discuss the development and advantages of WaveShot,
+highlighting its portability, ease of transport, and rapid deployment
+capabilities. Experimental validation showcases WaveShot's stability and
+high-quality video capture in various water conditions, as well as the
+integration of monocular depth estimation algorithms to enhance the operator's
+spatial perception. The paper concludes by exploring WaveShot's real-world
+applications, its user-friendly remote operation, and future enhancements such
+as gimbal integration and advanced computer vision for optimized videography on
+water surfaces.
+
+
+
+
+
+
+ + ♻ ☆ Enhancing Vision-Language Models Generalization via Diversity-Driven + Novel Feature Synthesis + + +
+ Vision-language foundation models like CLIP have shown impressive zero-shot +generalization, but finetuning on downstream datasets can cause overfitting and +loss of its generalization ability on unseen domains. Although collecting +additional data from new domains of interest is possible, this method is often +impractical due to the challenges in obtaining annotated data. To address this, +we propose a plug-and-play feature synthesis method called LDFS +(Language-Guided Diverse Feature Synthesis) to synthesize new domain features +and improve existing CLIP fine-tuning strategies. LDFS has three main +contributions: 1) To synthesize novel domain features and promote diversity, we +propose an instance-conditional feature augmentation strategy based on a +text-guided feature augmentation loss. 2) To maintain feature quality after +augmenting, we introduce a pairwise regularizer to preserve augmented feature +coherence within the CLIP feature space. 3) We propose to use stochastic text +feature augmentation to reduce the modality gap and further facilitate the +process of text-guided feature synthesis. Extensive experiments show LDFS +superiority in improving CLIP generalization ability on unseen domains without +collecting data from those domains. The code will be made publicly available. + +
+
+
+
+
+ + ♻ ☆ StylizedGS: Controllable Stylization for 3D Gaussian Splatting + + +
+ As XR technology continues to advance rapidly, 3D generation and editing are +increasingly crucial. Among these, stylization plays a key role in enhancing +the appearance of 3D models. By utilizing stylization, users can achieve +consistent artistic effects in 3D editing using a single reference style image, +making it a user-friendly editing method. However, recent NeRF-based 3D +stylization methods encounter efficiency issues that impact the user +experience, and their implicit nature limits their ability to accurately +transfer geometric pattern styles. Additionally, the ability for artists to +apply flexible control over stylized scenes is considered highly desirable to +foster an environment conducive to creative exploration. To address the above +issues, we introduce StylizedGS, an efficient 3D neural style transfer +framework with adaptable control over perceptual factors based on 3D Gaussian +Splatting (3DGS) representation. We propose a filter-based refinement to +eliminate floaters that affect the stylization effects in the scene +reconstruction process. The nearest neighbor-based style loss is introduced to +achieve stylization by fine-tuning the geometry and color parameters of 3DGS, +while a depth preservation loss with other regularizations is proposed to +prevent the tampering of geometry content. Moreover, facilitated by specially +designed losses, StylizedGS enables users to control color, stylized scale, and +regions during the stylization to possess customization capabilities. Our +method achieves high-quality stylization results characterized by faithful +brushstrokes and geometric consistency with flexible controls. Extensive +experiments across various scenes and styles demonstrate the effectiveness and +efficiency of our method concerning both stylization quality and inference +speed. + +
+
+
+
+
+ + ♻ ☆ Made to Order: Discovering monotonic temporal changes via + self-supervised video ordering ECCV 2024 + + +
+ Our objective is to discover and localize monotonic temporal changes in a +sequence of images. To achieve this, we exploit a simple proxy task of ordering +a shuffled image sequence, with `time' serving as a supervisory signal, since +only changes that are monotonic with time can give rise to the correct +ordering. We also introduce a transformer-based model for ordering of image +sequences of arbitrary length with built-in attribution maps. After training, +the model successfully discovers and localizes monotonic changes while ignoring +cyclic and stochastic ones. We demonstrate applications of the model in +multiple domains covering different scene and object types, discovering both +object-level and environmental changes in unseen sequences. We also demonstrate +that the attention-based attribution maps function as effective prompts for +segmenting the changing regions, and that the learned representations can be +used for downstream applications. Finally, we show that the model achieves the +state-of-the-art on standard benchmarks for image ordering. + +
+
+ comment: ECCV 2024 Oral. Project page: https://charigyang.github.io/order/ +
+
+
+
+
+ + ♻ ☆ Spatial-Temporal Graph Enhanced DETR Towards Multi-Frame 3D Object + Detection + + +
+ The Detection Transformer (DETR) has revolutionized the design of CNN-based
+object detection systems, showcasing impressive performance. However, its
+potential in the domain of multi-frame 3D object detection remains largely
+unexplored. In this paper, we present STEMD, a novel end-to-end framework that
+enhances the DETR-like paradigm for multi-frame 3D object detection by
+addressing three key aspects specifically tailored for this task. First, to
+model the inter-object spatial interaction and complex temporal dependencies,
+we introduce the spatial-temporal graph attention network, which represents
+queries as nodes in a graph and enables effective modeling of object
+interactions within a social context. Second, to compensate for hard cases
+missing from the encoder's proposals in the current frame, we incorporate the
+output of the previous frame to initialize the query input of the decoder.
+Finally, it is challenging for the network to distinguish the positive query
+from other highly similar queries that are not the best match; such queries are
+insufficiently suppressed and turn into redundant prediction boxes. To address
+this issue, our proposed IoU regularization term encourages similar queries to
+be distinct during the refinement. Through extensive experiments, we
+demonstrate the effectiveness of our approach in handling challenging
+scenarios, while incurring only a minor additional computational overhead. The
+code is publicly available at https://github.com/Eaphan/STEMD.
+
+
+
+ comment: 16 pages, 9 figures; Accepted by IEEE TPAMI +
+
+
+
+
+ + ♻ ☆ SFMViT: SlowFast Meet ViT in Chaotic World + + +
+ Spatiotemporal action localization in chaotic scenes is a challenging task
+toward advanced video understanding. Paving the way with high-quality video
+feature extraction and enhancing the precision of detector-predicted anchors
+can effectively improve model performance. To this end, we propose a
+high-performance dual-stream spatiotemporal feature extraction network SFMViT
+with an anchor pruning strategy. The backbone of our SFMViT is composed of ViT
+and SlowFast with prior knowledge of spatiotemporal action localization, which
+fully utilizes ViT's excellent global feature extraction capabilities and
+SlowFast's spatiotemporal sequence modeling capabilities. In addition, we
+introduce a confidence maximum heap that prunes the anchors detected in each
+frame, retaining only the effective ones. These designs enable our SFMViT to
+achieve a mAP of 26.62% in the Chaotic World dataset, far exceeding existing
+models. Code is available at https://github.com/jfightyr/SlowFast-Meet-ViT.
+
+
+
+
+
+
+ + ♻ ☆ ClickAttention: Click Region Similarity Guided Interactive Segmentation + + +
+ Interactive segmentation algorithms based on click points have garnered
+significant attention from researchers in recent years. However, existing
+studies typically use sparse click maps as model inputs to segment specific
+target objects, which primarily affect local regions and have limited ability
+to focus on the whole target object, leading to an increased number of clicks.
+In addition, most existing algorithms cannot balance high performance and
+efficiency well. To address these issues, we propose a click attention
+algorithm that expands the influence range of positive clicks based on the
+similarity between positively-clicked regions and the whole input. We also
+propose a discriminative affinity loss to reduce the attention coupling between
+positive and negative click regions to avoid an accuracy decrease caused by
+mutual interference between positive and negative clicks. Extensive experiments
+demonstrate that our approach is superior to existing methods and achieves
+cutting-edge performance with fewer parameters. An interactive demo and all
+reproducible code will be released at https://github.com/hahamyt/ClickAttention.
+
+
+
+
+
+
+ + ♻ ☆ HcNet: Image Modeling with Heat Conduction Equation + + +
+ Foundation models, such as CNNs and ViTs, have powered the development of +image modeling. However, general guidance to model architecture design is still +missing. The design of many modern model architectures, such as residual +structures, multiplicative gating signal, and feed-forward networks, can be +interpreted in terms of the heat conduction equation. This finding inspired us +to model images by the heat conduction equation, where the essential idea is to +conceptualize image features as temperatures and model their information +interaction as the diffusion of thermal energy. We can take advantage of the +rich knowledge in the heat conduction equation to guide us in designing new and +more interpretable models. As an example, we propose Heat Conduction Layer and +Refine Approximation Layer inspired by solving the heat conduction equation +using Finite Difference Method and Fourier series, respectively. This paper +does not aim to present a state-of-the-art model; instead, it seeks to +integrate the overall architectural design of the model into the heat +conduction theory framework. Nevertheless, our Heat Conduction Network (HcNet) +still shows competitive performance. Code available at +\url{https://github.com/ZheminZhang1/HcNet}. + +
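+ The "features as temperatures" analogy above can be illustrated with a single
+explicit finite-difference diffusion step on a feature map. This is a hedged
+toy version; HcNet's actual Heat Conduction Layer is learned and considerably
+more elaborate.
+
+     import torch
+     import torch.nn.functional as F
+
+     def heat_conduction_step(features, alpha=0.1):
+         # One explicit step of u_{t+1} = u_t + alpha * Laplacian(u_t),
+         # applied per channel to a (B, C, H, W) feature map.
+         c = features.shape[1]
+         laplace = torch.tensor([[0., 1., 0.],
+                                 [1., -4., 1.],
+                                 [0., 1., 0.]], dtype=features.dtype, device=features.device)
+         kernel = laplace.view(1, 1, 3, 3).repeat(c, 1, 1, 1)
+         return features + alpha * F.conv2d(features, kernel, padding=1, groups=c)
+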
+
+
+
+
+ + ♻ ☆ CS-TRD: a Cross Sections Tree Ring Detection method + + +
+ This work describes a Tree Ring Detection method for complete Cross-Sections +of Trees (CS-TRD) that detects, processes and connects edges corresponding to +the tree's growth rings. The method depends on the parameters for the Canny +Devernay edge detector (sigma), a resize factor, the number of rays, and the +pith location. The first five are fixed by default. The pith location can be +marked manually or using an automatic pith detection algorithm. Besides the +pith localization, CS-TRD is fully automated and achieves an F-Score of 89% in +the UruDendro dataset (of Pinus taeda) and 97% in the Kennel dataset (of Abies +alba) without specialized hardware requirements. + +
+
+ comment: presented to Ipol +
+
+
+
+
+ + ♻ ☆ GeoDTR+: Toward generic cross-view geolocalization via geometric + disentanglement + + +
+ Cross-View Geo-Localization (CVGL) estimates the location of a ground image +by matching it to a geo-tagged aerial image in a database. Recent works achieve +outstanding progress on CVGL benchmarks. However, existing methods still suffer +from poor performance in cross-area evaluation, in which the training and +testing data are captured from completely distinct areas. We attribute this +deficiency to the lack of ability to extract the geometric layout of visual +features and models' overfitting to low-level details. Our preliminary work +introduced a Geometric Layout Extractor (GLE) to capture the geometric layout +from input features. However, the previous GLE does not fully exploit +information in the input feature. In this work, we propose GeoDTR+ with an +enhanced GLE module that better models the correlations among visual features. +To fully explore the LS techniques from our preliminary work, we further +propose Contrastive Hard Samples Generation (CHSG) to facilitate model +training. Extensive experiments show that GeoDTR+ achieves state-of-the-art +(SOTA) results in cross-area evaluation on CVUSA, CVACT, and VIGOR by a large +margin ($16.44\%$, $22.71\%$, and $13.66\%$ without polar transformation) while +keeping the same-area performance comparable to existing SOTA. Moreover, we +provide detailed analyses of GeoDTR+. Our code will be available at +https://gitlab.com/vail-uvm/geodtr plus. + +
+
+ comment: arXiv admin note: text overlap with arXiv:2212.04074 +
+
+
+
+
+ + ♻ ☆ Layer-Specific Optimization: Sensitivity Based Convolution Layers Basis + Search + + +
+ Deep neural network models have complex architectures and are
+overparameterized. The number of parameters often exceeds the size of the whole
+dataset, which makes them highly resource-consuming. This complicates their
+application and limits their usage on different devices. Reducing the number of
+network parameters shrinks the model but, if applied thoughtlessly, can lead to
+a deterioration in the quality of the network. One way to reduce the number of
+model parameters is matrix decomposition, where a matrix is represented as a
+product of smaller matrices. In this paper, we propose a new way of applying
+the matrix decomposition with respect to the weights of convolutional layers.
+The essence of the method is to train not all convolutions, but only the subset
+of convolutions (basis convolutions), and represent the rest as linear
+combinations of the basis ones. Experiments on models from the ResNet family
+and the CIFAR-10 dataset demonstrate that basis convolutions can not only
+reduce the size of the model but also accelerate the forward and backward
+passes of the network. Another contribution of this work is that we propose a
+fast method for selecting a subset of network layers in which the use of matrix
+decomposition does not degrade the quality of the final model.
+
+
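+ A hedged sketch of the basis-convolution idea described above: only a small
+set of kernels is trained directly, and the remaining output channels are
+formed as linear combinations (a 1x1 convolution) of the basis responses. The
+paper's exact parameterisation and layer-selection procedure may differ.
+
+     import torch.nn as nn
+
+     class BasisConv2d(nn.Module):
+         def __init__(self, in_ch, out_ch, num_basis, kernel_size=3, padding=1):
+             super().__init__()
+             # only `num_basis` full convolution kernels are trained
+             self.basis = nn.Conv2d(in_ch, num_basis, kernel_size, padding=padding)
+             # remaining channels are linear combinations of the basis outputs
+             self.mix = nn.Conv2d(num_basis, out_ch, kernel_size=1, bias=False)
+
+         def forward(self, x):
+             return self.mix(self.basis(x))
+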
+
+ comment: Increase the size of matrix pictures for better UX in PDF view +
+
+
+
+
+ + ♻ ☆ Multi-view Hybrid Graph Convolutional Network for Volume-to-mesh + Reconstruction in Cardiovascular MRI + + +
+ Cardiovascular magnetic resonance imaging is emerging as a crucial tool to +examine cardiac morphology and function. Essential to this endeavour are +anatomical 3D surface and volumetric meshes derived from CMR images, which +facilitate computational anatomy studies, biomarker discovery, and in-silico +simulations. However, conventional surface mesh generation methods, such as +active shape models and multi-atlas segmentation, are highly time-consuming and +require complex processing pipelines to generate simulation-ready 3D meshes. In +response, we introduce HybridVNet, a novel architecture for direct +image-to-mesh extraction seamlessly integrating standard convolutional neural +networks with graph convolutions, which we prove can efficiently handle surface +and volumetric meshes by encoding them as graph structures. To further enhance +accuracy, we propose a multiview HybridVNet architecture which processes both +long axis and short axis CMR, showing that it can increase the performance of +cardiac MR mesh generation. Our model combines traditional convolutional +networks with variational graph generative models, deep supervision and +mesh-specific regularisation. Experiments on a comprehensive dataset from the +UK Biobank confirm the potential of HybridVNet to significantly advance cardiac +imaging and computational cardiology by efficiently generating high-fidelity +and simulation ready meshes from CMR images. + +
+
+
+
+
+ + ♻ ☆ KAN-RCBEVDepth: A multi-modal fusion algorithm in object detection for + autonomous driving + + +
+ Accurate 3D object detection in autonomous driving is critical yet +challenging due to occlusions, varying object scales, and complex urban +environments. This paper introduces the RCBEV-KAN algorithm, a pioneering +method designed to enhance 3D object detection by fusing multimodal sensor data +from cameras, LiDAR, and millimeter-wave radar. Our innovative Bird's Eye View +(BEV)-based approach, utilizing a Transformer architecture, significantly +boosts detection precision and efficiency by seamlessly integrating diverse +data sources, improving spatial relationship handling, and optimizing +computational processes. Experimental results show that the RCBEV-KAN model +demonstrates superior performance across most detection categories, achieving +higher Mean Distance AP (0.389 vs. 0.316, a 23% improvement), better ND Score +(0.484 vs. 0.415, a 17% improvement), and faster Evaluation Time (71.28s, 8% +faster). These results indicate that RCBEV-KAN is more accurate, reliable, and +efficient, making it ideal for dynamic and challenging autonomous driving +environments. + +
+
+
+
+
+ + ♻ ☆ Garment3DGen: 3D Garment Stylization and Texture Generation + + +
+ We introduce Garment3DGen, a new method to synthesize 3D garment assets from a
+base mesh given a single input image as guidance. Our proposed approach allows
+users to generate 3D textured clothes based on both real and synthetic images,
+such as those generated by text prompts. The generated assets can be directly
+draped and simulated on human bodies. We leverage the recent progress of
+image-to-3D diffusion methods to generate 3D garment geometries. However, since
+these geometries cannot be utilized directly for downstream tasks, we propose
+to use them as pseudo ground-truth and set up a mesh deformation optimization
+procedure that deforms a base template mesh to match the generated 3D target.
+Carefully designed losses allow the base mesh to freely deform towards the
+desired target, yet preserve mesh quality and topology such that they can be
+simulated. Finally, we generate high-fidelity texture maps that are globally
+and locally consistent and faithfully capture the input guidance, allowing us
+to render the generated 3D assets. With Garment3DGen, users can generate the
+simulation-ready 3D garment of their choice without the need for artist
+intervention. We present a plethora of quantitative and qualitative comparisons
+on various assets and demonstrate that Garment3DGen unlocks key applications
+ranging from sketch-to-simulated garments to interacting with the garments in
+VR. Code is publicly available.
+
+
+
+ comment: Project Page and Code: https://nsarafianos.github.io/garment3dgen +
+
+
+
+
+
+
+
+ + Information Retrieval 11 + +
+
+
+ + ☆ TableGuard -- Securing Structured & Unstructured Data + + +
+ With the increasing demand for data sharing across platforms and
+organizations, ensuring the privacy and security of sensitive information has
+become a critical challenge. This paper introduces "TableGuard", an innovative
+approach to data obfuscation tailored for relational databases. Building on the
+principles and techniques developed in prior work on context-sensitive
+obfuscation, TableGuard applies these methods to ensure that API calls return
+only obfuscated data, thereby safeguarding privacy when sharing data with third
+parties. TableGuard leverages advanced context-sensitive obfuscation techniques
+to replace sensitive data elements with contextually appropriate alternatives.
+By maintaining the relational integrity and coherence of the data, our approach
+mitigates the risks of cognitive dissonance and data leakage. We demonstrate
+the implementation of TableGuard using a BERT-based transformer model, which
+identifies and obfuscates sensitive entities within relational tables. Our
+evaluation shows that TableGuard effectively balances privacy protection with
+data utility, minimizing information loss while ensuring that the obfuscated
+data remains functionally useful for downstream applications. The results
+highlight the importance of domain-specific obfuscation strategies and the role
+of context length in preserving data integrity. The implications of this
+research are significant for organizations that need to share data securely
+with external parties. TableGuard offers a robust framework for implementing
+privacy-preserving data sharing mechanisms, thereby contributing to the broader
+field of data privacy and security.
+
+
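+ The BERT-based entity obfuscation step can be pictured with an off-the-shelf
+token-classification pipeline that detects sensitive entities and replaces
+them with placeholders. This is an assumed, simplified sketch: the model name
+is only an example, and TableGuard's context-sensitive replacements are far
+more sophisticated than plain placeholder tokens.
+
+     from transformers import pipeline
+
+     # example public NER model; TableGuard's own fine-tuned model is not shown here
+     ner = pipeline("ner", model="dslim/bert-base-NER", aggregation_strategy="simple")
+
+     def obfuscate_cell(text):
+         redacted = text
+         # replace detected entities from right to left so character offsets stay valid
+         for ent in sorted(ner(text), key=lambda e: e["start"], reverse=True):
+             redacted = redacted[:ent["start"]] + f"[{ent['entity_group']}]" + redacted[ent["end"]:]
+         return redacted
+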
+
+ comment: 7 pages, 3 tables, 1 figure +
+
+
+
+
+ + ☆ OpenResearcher: Unleashing AI for Accelerated Scientific Research + + +
+ The rapid growth of scientific literature imposes significant challenges for +researchers endeavoring to stay updated with the latest advancements in their +fields and delve into new areas. We introduce OpenResearcher, an innovative +platform that leverages Artificial Intelligence (AI) techniques to accelerate +the research process by answering diverse questions from researchers. +OpenResearcher is built based on Retrieval-Augmented Generation (RAG) to +integrate Large Language Models (LLMs) with up-to-date, domain-specific +knowledge. Moreover, we develop various tools for OpenResearcher to understand +researchers' queries, search from the scientific literature, filter retrieved +information, provide accurate and comprehensive answers, and self-refine these +answers. OpenResearcher can flexibly use these tools to balance efficiency and +effectiveness. As a result, OpenResearcher enables researchers to save time and +increase their potential to discover new insights and drive scientific +breakthroughs. Demo, video, and code are available at: +https://github.com/GAIR-NLP/OpenResearcher. + +
+
+
+
+
+ + ☆ Diffusion Model for Slate Recommendation + + +
+ Slate recommendation is a technique commonly used on streaming platforms and +e-commerce sites to present multiple items together. A significant challenge +with slate recommendation is managing the complex combinatorial choice space. +Traditional methods often simplify this problem by assuming users engage with +only one item at a time. However, this simplification does not reflect the +reality, as users often interact with multiple items simultaneously. In this +paper, we address the general slate recommendation problem, which accounts for +simultaneous engagement with multiple items. We propose a generative approach +using Diffusion Models, leveraging their ability to learn structures in +high-dimensional data. Our model generates high-quality slates that maximize +user satisfaction by overcoming the challenges of the combinatorial choice +space. Furthermore, our approach enhances the diversity of recommendations. +Extensive offline evaluations on applications such as music playlist generation +and e-commerce bundle recommendations show that our model outperforms +state-of-the-art baselines in both relevance and diversity. + +
+
+ comment: 9 pages, 5 figures, 3 tables +
+
+
+
+
+ + ☆ Reformulating Conversational Recommender Systems as Tri-Phase Offline + Policy Learning CIKM 2024 + + +
+ Existing Conversational Recommender Systems (CRS) predominantly utilize user +simulators for training and evaluating recommendation policies. These +simulators often oversimplify the complexity of user interactions by focusing +solely on static item attributes, neglecting the rich, evolving preferences +that characterize real-world user behavior. This limitation frequently leads to +models that perform well in simulated environments but falter in actual +deployment. Addressing these challenges, this paper introduces the Tri-Phase +Offline Policy Learning-based Conversational Recommender System (TPCRS), which +significantly reduces dependency on real-time interactions and mitigates +overfitting issues prevalent in traditional approaches. TPCRS integrates a +model-based offline learning strategy with a controllable user simulation that +dynamically aligns with both personalized and evolving user preferences. +Through comprehensive experiments, TPCRS demonstrates enhanced robustness, +adaptability, and accuracy in recommendations, outperforming traditional CRS +models in diverse user scenarios. This approach not only provides a more +realistic evaluation environment but also facilitates a deeper understanding of +user behavior dynamics, thereby refining the recommendation process. + +
+
+ comment: Accepted at CIKM 2024 +
+
+
+
+
+ + ☆ Hierarchical Structured Neural Network for Retrieval + + +
+ Embedding Based Retrieval (EBR) is a crucial component of the retrieval stage
+in (Ads) Recommendation Systems that utilizes Two Tower or Siamese Networks to
+learn embeddings for both users and items (ads). It then employs an Approximate
+Nearest Neighbor Search (ANN) to efficiently retrieve the most relevant ads for
+a specific user. Despite the recent rise to popularity in the industry, they
+have a couple of limitations. Firstly, the Two Tower model architecture uses a
+single dot-product interaction which, despite its efficiency, fails to capture
+the data distribution in practice. Secondly, the centroid representation and
+cluster assignment, which are components of ANN, occur after the training
+process has been completed. As a result, they do not take into account the
+optimization criteria used for the retrieval model. In this paper, we present
+Hierarchical Structured Neural Network (HSNN), a deployed jointly optimized
+hierarchical clustering and neural network model that can take advantage of
+sophisticated interactions and model architectures that are more common in the
+ranking stages while maintaining a sub-linear inference cost. We achieve a 6.5%
+improvement in offline evaluation and also demonstrate 1.22% online gains
+through A/B experiments. HSNN has been successfully deployed into the Ads
+Recommendation system and is currently handling a major portion of the traffic.
+The paper shares our experience in developing this system, dealing with
+challenges like freshness, volatility, cold-start recommendations, and cluster
+collapse, and lessons learned from deploying the model in a large-scale
+retrieval production system.
+
+
+
+ comment: 9 pages +
+
+
+
+
+ + ☆ BMX: Entropy-weighted Similarity and Semantic-enhanced Lexical Search + + +
+ BM25, a widely-used lexical search algorithm, remains crucial in information +retrieval despite the rise of pre-trained and large language models +(PLMs/LLMs). However, it neglects query-document similarity and lacks semantic +understanding, limiting its performance. We revisit BM25 and introduce BMX, a +novel extension of BM25 incorporating entropy-weighted similarity and semantic +enhancement techniques. Extensive experiments demonstrate that BMX consistently +outperforms traditional BM25 and surpasses PLM/LLM-based dense retrieval in +long-context and real-world retrieval benchmarks. This study bridges the gap +between classical lexical search and modern semantic approaches, offering a +promising direction for future information retrieval research. The reference +implementation of BMX can be found in Baguetter, which was created in the +context of this work. The code can be found here: +https://github.com/mixedbread-ai/baguetter. + +
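+ The flavour of the extension can be sketched as a BM25 backbone plus an extra
+query-document similarity term. This is a hedged illustration only: BMX's
+actual entropy weighting and semantic component are defined in the paper, so
+the `sim` callable and the mixing weight `alpha` below are purely assumed.
+
+     import math
+     from collections import Counter
+
+     def bmx_style_score(query_tokens, doc_tokens, corpus_stats, sim,
+                         k1=1.5, b=0.75, alpha=0.5):
+         N, avgdl, df = corpus_stats["N"], corpus_stats["avgdl"], corpus_stats["df"]
+         tf = Counter(doc_tokens)
+         score = 0.0
+         for term in set(query_tokens):
+             if term not in tf:
+                 continue
+             idf = math.log(1 + (N - df.get(term, 0) + 0.5) / (df.get(term, 0) + 0.5))
+             norm = tf[term] * (k1 + 1) / (tf[term] + k1 * (1 - b + b * len(doc_tokens) / avgdl))
+             score += idf * norm                       # classic BM25 contribution
+         return score + alpha * sim(query_tokens, doc_tokens)  # similarity / semantic bonus
+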
+
+
+
+
+ + ☆ Generalized knowledge-enhanced framework for biomedical entity and + relation extraction + + +
+ In recent years, there has been an increasing number of frameworks developed
+for biomedical entity and relation extraction. This research effort aims to
+address the accelerating growth in biomedical publications and the intricate
+nature of biomedical texts, which are written mainly for domain experts. To
+handle these challenges, we develop a novel framework that utilizes external
+knowledge to construct a task-independent and reusable background knowledge
+graph for biomedical entity and relation extraction. The design of our model is
+inspired by how humans learn domain-specific topics. In particular, humans
+often first acquire the most basic and common knowledge regarding a field to
+build the foundational knowledge and then use that as a basis for extending to
+various specialized topics. Our framework employs such a
+common-knowledge-sharing mechanism to build a general neural-network knowledge
+graph whose learned knowledge transfers effectively to different
+domain-specific biomedical texts. Experimental evaluations demonstrate that our
+model, equipped with this generalized and cross-transferable knowledge base,
+achieves competitive performance on benchmarks, including BioRelEx for binding
+interaction detection and ADE for Adverse Drug Effect identification.
+
+
+
+
+
+
+ + ☆ Prompt Tuning as User Inherent Profile Inference Machine + + +
+ Large Language Models (LLMs) have exhibited significant promise in
+recommender systems by empowering user profiles with their extensive world
+knowledge and superior reasoning capabilities. However, LLMs face challenges
+like unstable instruction compliance, modality gaps, and high inference
+latency, leading to textual noise and limiting their effectiveness in
+recommender systems. To address these challenges, we propose UserIP-Tuning,
+which uses prompt-tuning to infer user profiles. It integrates the causal
+relationship between user profiles and behavior sequences into LLMs' prompts,
+and employs expectation maximization to infer the embedded latent profile,
+minimizing textual noise by fixing the prompt template. Furthermore, a profile
+quantization codebook bridges the modality gap by categorizing profile
+embeddings into collaborative IDs, which are pre-stored for online deployment.
+This improves time efficiency and reduces memory usage. Experiments on four
+public datasets show that UserIP-Tuning outperforms state-of-the-art
+recommendation algorithms. Additional tests and case studies confirm its
+effectiveness, robustness, and transferability.
+
+
+
+
+
+
+ + ☆ On the Local Ultrametricity of Finite Metric Data + + +
+ New local ultrametricity measures for finite metric data are proposed through +the viewpoint that their Vietoris-Rips corners are samples from p-adic Mumford +curves endowed with a Radon measure coming from a regular differential 1-form. +This is experimentally applied to the iris dataset. + +
+
+ comment: 12 pages, 3 figures, 3 tables +
+
+
+
+
+ + ♻ ☆ CAPRI-FAIR: Integration of Multi-sided Fairness in Contextual POI + Recommendation Framework + + +
+ Point-of-interest (POI) recommendation considers spatio-temporal constraints +like distance, peak hours, and user check-ins. Given their influence on both +consumer experience and POI business, it's crucial to consider fairness from +multiple perspectives. Unfortunately, these systems often provide less accurate +recommendations to inactive users and less exposure to unpopular POIs. This +paper develops a post-filter methodology that integrates provider and consumer +fairness into existing models, aiming to balance fairness metrics like item +exposure with performance metrics such as precision and distance. Experiments +show that a linear scoring model for provider fairness in re-scoring items +offers the best tradeoff between performance and long-tail exposure, sometimes +without significant precision loss. Addressing consumer fairness by +recommending more popular POIs to inactive users increased precision in some +models and datasets. However, combinations that reached the Pareto front of +consumer and provider fairness yielded the lowest precision values, +underscoring that tradeoffs depend heavily on the model and dataset. + +
+
+
+
+
+ + ♻ ☆ An Empirical Study of Training ID-Agnostic Multi-modal Sequential + Recommenders + + +
+ Sequential Recommendation (SR) aims to predict future user-item interactions
+based on historical interactions. While many SR approaches concentrate on user
+IDs and item IDs, the human perception of the world through multi-modal
+signals, like text and images, has inspired researchers to delve into
+constructing SR from multi-modal information without using IDs. However, the
+complexity of multi-modal learning manifests in diverse feature extractors,
+fusion methods, and pre-trained models. Consequently, designing a simple and
+universal \textbf{M}ulti-\textbf{M}odal \textbf{S}equential
+\textbf{R}ecommendation (\textbf{MMSR}) framework remains a formidable
+challenge. We systematically summarize the existing multi-modal SR methods and
+distill the essence into four core components: visual encoder, text encoder,
+multimodal fusion module, and sequential architecture. Along these dimensions,
+we dissect the model designs, and answer the following sub-questions: First, we
+explore how to construct MMSR from scratch, ensuring that its performance is
+either on par with or exceeds that of existing SR methods without complex
+techniques. Second, we examine if MMSR can benefit from existing multi-modal
+pre-training paradigms. Third, we assess MMSR's capability in tackling common
+challenges like cold start and domain transferring. Our experimental results
+across four real-world recommendation scenarios demonstrate the great potential
+of ID-agnostic multi-modal sequential recommendation. Our framework can be
+found at: https://github.com/MMSR23/MMSR.
+
+
+
+ comment: A significant error in our methodology was discovered, which impacts + the reliability of the findings. We are revising the study to correct these + issues and will submit a corrected version in the future +
+
+
+
+
+
+
+
+ + Machine Learning 135 + +
+
+
+ + ☆ Approaches for enhancing extrapolability in process-based and + data-driven models in hydrology + + +
+ The application of process-based and data-driven hydrological models is
+crucial in modern hydrological research, especially for predicting key water
+cycle variables such as runoff, evapotranspiration (ET), and soil moisture.
+These models provide a scientific basis for water resource management, flood
+forecasting, and ecological protection. Process-based models simulate the
+physical mechanisms of watershed hydrological processes, while data-driven
+models leverage large datasets and advanced machine learning algorithms. This
+paper reviews and compares methods for assessing and enhancing the
+extrapolability of both model types, and discusses their prospects and
+limitations. Key strategies include the use of leave-one-out cross-validation
+and similarity-based methods to evaluate model performance in ungauged regions.
+Deep learning, transfer learning, and domain adaptation techniques also show
+promise for improving model predictions in data-sparse and extreme conditions.
+Interdisciplinary collaboration and continuous algorithmic advancements are
+likewise important for strengthening the global applicability and reliability
+of hydrological models.
+
+
+
+
+
+
+ + ☆ Diversity Empowers Intelligence: Integrating Expertise of Software + Engineering Agents + + +
+ Large language model (LLM) agents have shown great potential in solving +real-world software engineering (SWE) problems. The most advanced open-source +SWE agent can resolve over 27% of real GitHub issues in SWE-Bench Lite. +However, these sophisticated agent frameworks exhibit varying strengths, +excelling in certain tasks while underperforming in others. To fully harness +the diversity of these agents, we propose DEI (Diversity Empowered +Intelligence), a framework that leverages their unique expertise. DEI functions +as a meta-module atop existing SWE agent frameworks, managing agent collectives +for enhanced problem-solving. Experimental results show that a DEI-guided +committee of agents is able to surpass the best individual agent's performance +by a large margin. For instance, a group of open-source SWE agents, with a +maximum individual resolve rate of 27.3% on SWE-Bench Lite, can achieve a 34.3% +resolve rate with DEI, making a 25% improvement and beating most closed-source +solutions. Our best-performing group excels with a 55% resolve rate, securing +the highest ranking on SWE-Bench Lite. Our findings contribute to the growing +body of research on collaborative AI systems and their potential to solve +complex software engineering challenges. + +
+
+
+
+
+ + ☆ A Survey on Model MoErging: Recycling and Routing Among Specialized + Experts for Collaborative Learning + + +
+ The availability of performant pre-trained models has led to a proliferation +of fine-tuned expert models that are specialized to a particular domain or +task. Model MoErging methods aim to recycle expert models to create an +aggregate system with improved performance or generalization. A key component +of MoErging methods is the creation of a router that decides which expert +model(s) to use for a particular input or application. The promise, +effectiveness, and large design space of MoErging has spurred the development +of many new methods over the past few years. This rapid pace of development has +made it challenging to compare different MoErging methods, which are rarely +compared to one another and are often validated in different experimental +setups. To remedy such gaps, we present a comprehensive survey of MoErging +methods that includes a novel taxonomy for cataloging key design choices and +clarifying suitable applications for each method. Apart from surveying MoErging +research, we inventory software tools and applications that make use of +MoErging. We additionally discuss related fields of study such as model +merging, multitask learning, and mixture-of-experts models. Taken as a whole, +our survey provides a unified overview of existing MoErging methods and creates +a solid foundation for future work in this burgeoning field. + +
+
+ comment: 26 pages +
+
+
+
+
+ + ☆ LongWriter: Unleashing 10,000+ Word Generation from Long Context LLMs + + +
+ Current long context large language models (LLMs) can process inputs up to +100,000 tokens, yet struggle to generate outputs exceeding even a modest length +of 2,000 words. Through controlled experiments, we find that the model's +effective generation length is inherently bounded by the samples it has seen +during supervised fine-tuning (SFT). In other words, this output limitation is +due to the scarcity of long-output examples in existing SFT datasets. To +address this, we introduce AgentWrite, an agent-based pipeline that decomposes +ultra-long generation tasks into subtasks, enabling off-the-shelf LLMs to +generate coherent outputs exceeding 20,000 words. Leveraging AgentWrite, we +construct LongWriter-6k, a dataset containing 6,000 SFT examples with output +lengths ranging from 2k to 32k words. By incorporating this dataset into model +training, we successfully scale the output length of existing models to over +10,000 words while maintaining output quality. We also develop LongBench-Write, +a comprehensive benchmark for evaluating ultra-long generation capabilities. +Our 9B parameter model, further improved through DPO, achieves state-of-the-art +performance on this benchmark, surpassing even much larger proprietary models. +In general, our work demonstrates that existing long context LLMs already +possess the potential for a larger output window--all you need is data with +extended output during model alignment to unlock this capability. Our code & +models are at: https://github.com/THUDM/LongWriter. + 
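+
+ The abstract describes a decompose-then-write pipeline. Below is a minimal Python
+ sketch of that idea, assuming a generic call_llm chat-completion stub; the prompts,
+ section count, and context window are illustrative placeholders, not the authors'
+ AgentWrite implementation.
+
+ # Minimal sketch of an AgentWrite-style plan-then-write loop (illustrative only).
+ def call_llm(prompt: str) -> str:
+     raise NotImplementedError("plug in an LLM client here")
+
+ def agent_write(instruction: str, n_sections: int = 10) -> str:
+     # Step 1: ask the model for a section-by-section outline with target lengths.
+     plan = call_llm(
+         f"Write a numbered outline with {n_sections} sections for the task below, "
+         f"giving each section a title and a target word count.\n\nTask: {instruction}"
+     )
+     # Step 2: generate each section separately, conditioning on the plan and the
+     # text written so far, then concatenate the pieces into one long output.
+     sections = []
+     for i in range(1, n_sections + 1):
+         context = "\n\n".join(sections)[-4000:]  # keep only a recent window
+         sections.append(call_llm(
+             f"Task: {instruction}\nOutline:\n{plan}\n"
+             f"Previously written text (may be truncated):\n{context}\n"
+             f"Now write section {i} in full, following its target word count."
+         ))
+     return "\n\n".join(sections)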
+
+
+
+
+ + ☆ TableGuard -- Securing Structured & Unstructured Data + + +
+ With the increasing demand for data sharing across platforms and +organizations, ensuring the privacy and security of sensitive information has +become a critical challenge. This paper introduces "TableGuard", an innovative +approach to data obfuscation tailored for relational databases. Building on the +principles and techniques developed in prior work on context-sensitive +obfuscation, TableGuard applies these methods to ensure that API calls return +only obfuscated data, thereby safeguarding privacy when sharing data with third +parties. TableGuard leverages advanced context-sensitive obfuscation techniques +to replace sensitive data elements with contextually appropriate alternatives. +By maintaining the relational integrity and coherence of the data, our approach +mitigates the risks of cognitive dissonance and data leakage. We demonstrate +the implementation of TableGuard using a BERT-based transformer model, which +identifies and obfuscates sensitive entities within relational tables. Our +evaluation shows that TableGuard effectively balances privacy protection with +data utility, minimizing information loss while ensuring that the obfuscated +data remains functionally useful for downstream applications. The results +highlight the importance of domain-specific obfuscation strategies and the role +of context length in preserving data integrity. The implications of this +research are significant for organizations that need to share data securely +with external parties. TableGuard offers a robust framework for implementing +privacy-preserving data sharing mechanisms, thereby contributing to the broader +field of data privacy and security. + 
+
+ comment: 7 pages, 3 tables, 1 figure +
+
+
+
+
+ + ☆ Defining and Measuring Disentanglement for non-Independent Factors of + Variation + + +
+ Representation learning is an approach that allows us to discover and extract +the factors of variation from the data. Intuitively, a representation is said +to be disentangled if it separates the different factors of variation in a way +that is understandable to humans. Definitions of disentanglement and metrics to +measure it usually assume that the factors of variation are independent of each +other. However, this is generally false in the real world, which limits the use +of these definitions and metrics to very specific and unrealistic scenarios. In +this paper, we give a definition of disentanglement based on information theory +that is also valid when the factors of variation are not independent. +Furthermore, we relate this definition to the Information Bottleneck Method. +Finally, we propose a method to measure the degree of disentanglement from the +given definition that works when the factors of variation are not independent. +We show through different experiments that the method proposed in this paper +correctly measures disentanglement with non-independent factors of variation, +while other methods fail in this scenario. + 
+
+
+
+
+ + ☆ Faster Private Minimum Spanning Trees + + +
+ Motivated by applications in clustering and synthetic data generation, we +consider the problem of releasing a minimum spanning tree (MST) under +edge-weight differential privacy constraints where a graph topology $G=(V,E)$ +with $n$ vertices and $m$ edges is public, the weight matrix $\vec{W}\in +\mathbb{R}^{n \times n}$ is private, and we wish to release an approximate MST +under $\rho$-zero-concentrated differential privacy. Weight matrices are +considered neighboring if they differ by at most $\Delta_\infty$ in each entry, +i.e., we consider an $\ell_\infty$ neighboring relationship. Existing private +MST algorithms either add noise to each entry in $\vec{W}$ and estimate the MST +by post-processing or add noise to weights in-place during the execution of a +specific MST algorithm. Using the post-processing approach with an efficient +MST algorithm takes $O(n^2)$ time on dense graphs but results in an additive +error on the weight of the MST of magnitude $O(n^2\log n)$. In-place algorithms +give asymptotically better utility, but the running time of existing in-place +algorithms is $O(n^3)$ for dense graphs. Our main result is a new +differentially private MST algorithm that matches the utility of existing +in-place methods while running in time $O(m + n^{3/2}\log n)$ for fixed privacy +parameter $\rho$. The technical core of our algorithm is an efficient sublinear +time simulation of Report-Noisy-Max that works by discretizing all edge weights +to a multiple of $\Delta_\infty$ and forming groups of edges with identical +weights. Specifically, we present a data structure that allows us to sample a +noisy minimum weight edge among at most $O(n^2)$ cut edges in $O(\sqrt{n} \log +n)$ time. Experimental evaluations support our claims that our algorithm +significantly improves previous algorithms either in utility or running time. + +
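+
+ As a point of reference, here is a small numpy sketch of the classic Report-Noisy-Max
+ primitive applied to selecting a minimum-weight edge. It only illustrates the basic
+ mechanism the paper accelerates; the Laplace noise choice, its scale, and the privacy
+ accounting are assumptions, and the paper's sublinear-time simulation over groups of
+ discretized edge weights is not reproduced here.
+
+ import numpy as np
+
+ def report_noisy_min(weights, sensitivity, epsilon, rng=None):
+     """Pick an (approximately) minimum-weight edge with Report-Noisy-Max.
+
+     Textbook version: add independent Laplace noise to each negated weight and
+     return the argmax. The paper's contribution is simulating this selection in
+     sublinear time over cut edges; this sketch is the naive O(m) primitive.
+     """
+     rng = np.random.default_rng() if rng is None else rng
+     noisy = -np.asarray(weights, dtype=float) + rng.laplace(
+         scale=2.0 * sensitivity / epsilon, size=len(weights))
+     return int(np.argmax(noisy))
+
+ # Toy usage: 5 cut edges with private weights.
+ edge_weights = [3.2, 1.1, 4.0, 0.9, 2.5]
+ idx = report_noisy_min(edge_weights, sensitivity=0.1, epsilon=1.0)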
+
+
+
+
+ + ☆ Blessing of Dimensionality for Approximating Sobolev Classes on + Manifolds + + +
+ The manifold hypothesis says that natural high-dimensional data is actually +supported on or around a low-dimensional manifold. Recent success of +statistical and learning-based methods empirically supports this hypothesis, +as these methods outperform classical statistical intuition in very high dimensions. A +natural step for analysis is thus to assume the manifold hypothesis and derive +bounds that are independent of any embedding space. Theoretical implications in +this direction have recently been explored in terms of generalization of ReLU +networks and convergence of Langevin methods. We complement existing results by +providing theoretical statistical complexity results, which directly relate to +generalization properties. In particular, we demonstrate that the statistical +complexity required to approximate a class of bounded Sobolev functions on a +compact manifold is bounded from below, and moreover that this bound is +dependent only on the intrinsic properties of the manifold. These provide +complementary bounds for existing approximation results for ReLU networks on +manifolds, which give upper bounds on generalization capacity. + 
+
+
+
+
+ + ☆ IRS-Assisted Lossy Communications Under Correlated Rayleigh Fading: + Outage Probability Analysis and Optimization + + +
+ This paper focuses on an intelligent reflecting surface (IRS)-assisted lossy +communication system with correlated Rayleigh fading. We analyze the correlated +channel model and derive the outage probability of the system. Then, we design +a deep reinforcement learning (DRL) method to optimize the phase shift of the IRS, in +order to maximize the received signal power. Moreover, this paper presents +results of the simulations conducted to evaluate the performance of the +DRL-based method. The simulation results indicate that the outage probability +of the considered system increases significantly with more correlated channel +coefficients. Furthermore, the performance gap between the DRL method and the theoretical limit +increases with higher transmit power and/or larger distortion requirement. + 
+
+
+
+
+ + ☆ Event-Stream Super Resolution using Sigma-Delta Neural Network ECCV + + +
+ This study introduces a novel approach to enhance the spatial-temporal +resolution of time-event pixels based on luminance changes captured by event +cameras. These cameras present unique challenges due to their low resolution +and the sparse, asynchronous nature of the data they collect. Current event +super-resolution algorithms are not fully optimized for the distinct data +structure produced by event cameras, resulting in inefficiencies in capturing +the full dynamism and detail of visual scenes with improved computational +complexity. To bridge this gap, our research proposes a method that integrates +binary spikes with Sigma Delta Neural Networks (SDNNs), leveraging +spatiotemporal constraint learning mechanism designed to simultaneously learn +the spatial and temporal distributions of the event stream. The proposed +network is evaluated using widely recognized benchmark datasets, including +N-MNIST, CIFAR10-DVS, ASL-DVS, and Event-NFS. A comprehensive evaluation +framework is employed, assessing both the accuracy, through root mean square +error (RMSE), and the computational efficiency of our model. The findings +demonstrate significant improvements over existing state-of-the-art methods, +specifically, the proposed method outperforms state-of-the-art performance in +computational efficiency, achieving a 17.04-fold improvement in event sparsity +and a 32.28-fold increase in synaptic operation efficiency over traditional +artificial neural networks, alongside a two-fold better performance over +spiking neural networks. + +
+
+ comment: ECCV: The 18th European Conference on Computer Vision ECCV 2024 NeVi + Workshop +
+
+
+
+
+ + ☆ Stabilizer bootstrapping: A recipe for efficient agnostic tomography and + magic estimation + + +
+ We study the task of agnostic tomography: given copies of an unknown +$n$-qubit state $\rho$ which has fidelity $\tau$ with some state in a given +class $C$, find a state which has fidelity $\ge \tau - \epsilon$ with $\rho$. +We give a new framework, stabilizer bootstrapping, for designing +computationally efficient protocols for this task, and use this to get new +agnostic tomography protocols for the following classes: + Stabilizer states: We give a protocol that runs in time +$\mathrm{poly}(n,1/\epsilon)\cdot (1/\tau)^{O(\log(1/\tau))}$, answering an +open question posed by Grewal, Iyer, Kretschmer, Liang [40] and Anshu and +Arunachalam [6]. Previous protocols ran in time $\mathrm{exp}(\Theta(n))$ or +required $\tau>\cos^2(\pi/8)$. + States with stabilizer dimension $n - t$: We give a protocol that runs in +time $n^3\cdot(2^t/\tau)^{O(\log(1/\epsilon))}$, extending recent work on +learning quantum states prepared by circuits with few non-Clifford gates, which +only applied in the realizable setting where $\tau = 1$ [30, 37, 46, 61]. + Discrete product states: If $C = K^{\otimes n}$ for some $\mu$-separated +discrete set $K$ of single-qubit states, we give a protocol that runs in time +$(n/\mu)^{O((1 + \log (1/\tau))/\mu)}/\epsilon^2$. This strictly generalizes a +prior guarantee which applied to stabilizer product states [39]. For stabilizer +product states, we give a further improved protocol that runs in time +$(n^2/\epsilon^2)\cdot (1/\tau)^{O(\log(1/\tau))}$. + As a corollary, we give the first protocol for estimating stabilizer +fidelity, a standard measure of magic for quantum states, to error $\epsilon$ +in $n^3 \mathrm{quasipoly}(1/\epsilon)$ time. + +
+
+ comment: 68 pages +
+
+
+
+
+ + ☆ DyG-Mamba: Continuous State Space Modeling on Dynamic Graphs + + +
+ Dynamic graph learning aims to uncover evolutionary laws in real-world +systems, enabling accurate social recommendation (link prediction) or early +detection of cancer cells (classification). Inspired by the success of state +space models, e.g., Mamba, for efficiently capturing long-term dependencies in +language modeling, we propose DyG-Mamba, a new continuous state space model +(SSM) for dynamic graph learning. Specifically, we first found that using +inputs as control signals for SSM is not suitable for continuous-time dynamic +network data with irregular sampling intervals, resulting in models being +insensitive to time information and lacking generalization properties. Drawing +inspiration from the Ebbinghaus forgetting curve, which suggests that memory of +past events is strongly correlated with time intervals rather than specific +details of the events themselves, we directly utilize irregular time spans as +control signals for SSM to achieve significant robustness and generalization. +Through exhaustive experiments on 12 datasets for dynamic link prediction and +dynamic node classification tasks, we found that DyG-Mamba achieves +state-of-the-art performance on most of the datasets, while also demonstrating +significantly improved computation and memory efficiency. + +
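+
+ A toy numpy sketch of the core idea, letting the irregular time gap between events
+ (rather than the input itself) control a diagonal state-space recurrence, is shown
+ below; the zero-order-hold style parameterization is an assumption for illustration,
+ not DyG-Mamba's actual architecture.
+
+ import numpy as np
+
+ def time_controlled_ssm(x, dt, A, B, C):
+     """Toy diagonal SSM where the irregular time gap dt[t] acts as the control.
+
+     x:  (T, d_in) event features, dt: (T,) time since the previous event,
+     A:  (d_state,) negative decay rates, B: (d_state, d_in), C: (d_out, d_state).
+     """
+     h = np.zeros(A.shape[0])
+     outputs = []
+     for t in range(x.shape[0]):
+         decay = np.exp(A * dt[t])            # longer gaps -> stronger forgetting
+         h = decay * h + dt[t] * (B @ x[t])   # discretized update driven by the gap
+         outputs.append(C @ h)
+     return np.stack(outputs)
+
+ # Toy usage with random parameters and irregular time stamps.
+ rng = np.random.default_rng(0)
+ A = -np.abs(rng.normal(size=8))
+ B = rng.normal(size=(8, 4))
+ C = rng.normal(size=(2, 8))
+ y = time_controlled_ssm(rng.normal(size=(5, 4)), np.array([0.1, 2.0, 0.5, 0.5, 3.0]), A, B, C)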
+
+
+
+
+ + ☆ Measuring User Understanding in Dialogue-based XAI Systems ECAI 2024 + + +
+ The field of eXplainable Artificial Intelligence (XAI) is increasingly +recognizing the need to personalize and/or interactively adapt the explanation +to better reflect users' explanation needs. While dialogue-based approaches to +XAI have been proposed recently, the state-of-the-art in XAI is still +characterized by what we call one-shot, non-personalized and one-way +explanations. In contrast, dialogue-based systems that can adapt explanations +through interaction with a user promise to be superior to GUI-based or +dashboard explanations as they offer a more intuitive way of requesting +information. In general, while interactive XAI systems are often evaluated in +terms of user satisfaction, there are limited studies that assess users' +objective model understanding. This is in particular the case for +dialogue-based XAI approaches. In this paper, we close this gap by carrying out +controlled experiments within a dialogue framework in which we measure users' +understanding in three phases by asking them to simulate the +predictions of the model they are learning about. By this, we can quantify the +level of (improved) understanding w.r.t. how the model works, comparing the +state prior to and after the interaction. We further analyze the data to reveal +patterns of how the interaction differs between groups with high vs. low +understanding gain. Overall, our work thus contributes to our understanding of the +effectiveness of XAI approaches. + 
+
+ comment: Accepted at the ECAI 2024 main conference - final version and code + coming soon. 8 pages, 5 figures +
+
+
+
+
+ + ☆ AuToMATo: A Parameter-Free Persistence-Based Clustering Algorithm + + +
+ We present AuToMATo, a novel parameter-free clustering algorithm based on +persistent homology. AuToMATo combines the existing ToMATo clustering algorithm +with a bootstrapping procedure in order to separate significant peaks of an +estimated density function from non-significant ones. We perform a thorough +comparison of AuToMATo against many other state-of-the-art clustering +algorithms. We find not only that AuToMATo compares favorably against +other parameter-free clustering algorithms, but that in many instances it also +significantly outperforms even the best selection of parameters for other +algorithms. AuToMATo is motivated by applications in topological data analysis, +in particular the Mapper algorithm, where it is desirable to work with a +parameter-free clustering algorithm. Indeed, we provide evidence that AuToMATo +performs well when used with Mapper. Finally, we provide an open-source +implementation of AuToMATo in Python that is fully compatible with the +standard scikit-learn architecture. + 
+
+
+
+
+ + ☆ Heavy-Ball Momentum Accelerated Actor-Critic With Function Approximation + + +
+ By using a parametric value function to replace Monte-Carlo rollouts for +value estimation, actor-critic (AC) algorithms can reduce the variance of +the stochastic policy gradient and thereby improve the convergence rate. While +existing works mainly focus on analyzing the convergence rate of AC algorithms +under Markovian noise, the impacts of momentum on AC algorithms remain largely +unexplored. In this work, we first propose a heavy-ball momentum based +advantage actor-critic (\mbox{HB-A2C}) algorithm by integrating the heavy-ball +momentum into the critic recursion that is parameterized by a linear function. +When the sample trajectory follows a Markov decision process, we quantitatively +certify the acceleration capability of the proposed HB-A2C algorithm. Our +theoretical results demonstrate that the proposed HB-A2C finds an +$\epsilon$-approximate stationary point with $\mathcal{O}(\epsilon^{-2})$ iterations +for reinforcement learning tasks with Markovian noise. Moreover, we also reveal +the dependence of learning rates on the length of the sample trajectory. By +carefully selecting the momentum factor of the critic recursion, the proposed +HB-A2C can balance the errors introduced by the initialization and the +stochastic approximation. + 
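+
+ A minimal numpy sketch of the critic-side idea, a TD(0) update for a linear value
+ function with an added heavy-ball momentum term, is shown below; the step sizes,
+ the advantage estimator, and the exact recursion used in HB-A2C are assumptions
+ rather than the paper's formulation.
+
+ import numpy as np
+
+ def hb_critic_update(w, w_prev, phi_s, phi_next, reward, gamma, alpha, beta):
+     """One heavy-ball TD(0) step for a linear critic V(s) = w . phi(s).
+
+     Standard TD step plus a momentum term beta * (w - w_prev); this shows the
+     generic heavy-ball mechanism, not necessarily the paper's exact recursion.
+     """
+     td_error = reward + gamma * (w @ phi_next) - (w @ phi_s)
+     w_new = w + alpha * td_error * phi_s + beta * (w - w_prev)
+     return w_new, w  # (new weights, weights to reuse as "previous" next step)
+
+ # Toy usage with a 4-dimensional feature vector.
+ rng = np.random.default_rng(0)
+ w = rng.normal(size=4)
+ w, w_prev = hb_critic_update(w, w.copy(), rng.normal(size=4), rng.normal(size=4),
+                              reward=1.0, gamma=0.99, alpha=0.05, beta=0.5)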
+
+
+
+
+ + ☆ Towards Holistic Disease Risk Prediction using Small Language Models ICML + + +
+ Data in the healthcare domain arise from a variety of sources and modalities, +such as x-ray images, continuous measurements, and clinical notes. Medical +practitioners integrate these diverse data types daily to make informed and +accurate decisions. With recent advancements in language models capable of +handling multimodal data, it is a logical progression to apply these models to +the healthcare sector. In this work, we introduce a framework that connects +small language models to multiple data sources, aiming to predict the risk of +various diseases simultaneously. Our experiments encompass 12 different tasks +within a multitask learning setup. Although our approach does not surpass +state-of-the-art methods specialized for single tasks, it demonstrates +competitive performance and underscores the potential of small language models +for multimodal reasoning in healthcare. + +
+
+ comment: 6 pages, submitted to ICMLA +
+
+
+
+
+ + ☆ Breaking Class Barriers: Efficient Dataset Distillation via Inter-Class + Feature Compensator + + +
+ Dataset distillation has emerged as a technique aiming to condense +informative features from large, natural datasets into a compact and synthetic +form. While recent advancements have refined this technique, its performance is +bottlenecked by the prevailing class-specific synthesis paradigm. Under this +paradigm, synthetic data is optimized exclusively for a pre-assigned one-hot +label, creating an implicit class barrier in feature condensation. This leads +to inefficient utilization of the distillation budget and oversight of +inter-class feature distributions, which ultimately limits the effectiveness +and efficiency, as demonstrated in our analysis. + To overcome these constraints, this paper presents the Inter-class Feature +Compensator (INFER), an innovative distillation approach that transcends the +class-specific data-label framework widely utilized in current dataset +distillation methods. Specifically, INFER leverages a Universal Feature +Compensator (UFC) to enhance feature integration across classes, enabling the +generation of multiple additional synthetic instances from a single UFC input. +This significantly improves the efficiency of the distillation budget. + Moreover, INFER enriches inter-class interactions during the distillation, +thereby enhancing the effectiveness and generalizability of the distilled data. +By allowing for the linear interpolation of labels similar to those in the +original dataset, INFER meticulously optimizes the synthetic data and +dramatically reduces the size of soft labels in the synthetic dataset to almost +zero, establishing a new benchmark for efficiency and effectiveness in dataset +distillation. + +
+
+
+
+
+ + ☆ Heterogeneity: An Open Challenge for Federated On-board Machine Learning SP + + +
+ The design of satellite missions is currently undergoing a paradigm shift +from the historical approach of individualised monolithic satellites towards +distributed mission configurations, consisting of multiple small satellites. +With a rapidly growing number of such satellites now deployed in orbit, each +collecting large amounts of data, interest in on-board orbital edge computing +is rising. Federated Learning is a promising distributed computing approach in +this context, allowing multiple satellites to collaborate efficiently in +training on-board machine learning models. Though recent works on the use of +Federated Learning in orbital edge computing have focused largely on +homogeneous satellite constellations, Federated Learning could also be employed +to allow heterogeneous satellites to form ad-hoc collaborations, e.g. in the +case of communications satellites operated by different providers. Such an +application presents additional challenges to the Federated Learning paradigm, +arising largely from the heterogeneity of such a system. In this position +paper, we offer a systematic review of these challenges in the context of the +cross-provider use case, giving a brief overview of the state-of-the-art for +each, and providing an entry point for deeper exploration of each issue. + +
+
+ comment: Accepted to the ESA SPAICE conference 2024 +
+
+
+
+
+ + ☆ Automatic Feature Recognition and Dimensional Attributes Extraction From + CAD Models for Hybrid Additive-Subtractive Manufacturing + + +
+ The integration of Computer-Aided Design (CAD), Computer-Aided Process +Planning (CAPP), and Computer-Aided Manufacturing (CAM) plays a crucial role in +modern manufacturing, facilitating seamless transitions from digital designs to +physical products. However, a significant challenge within this integration is +the Automatic Feature Recognition (AFR) of CAD models, especially in the +context of hybrid manufacturing that combines subtractive and additive +manufacturing processes. Traditional AFR methods, focused mainly on the +identification of subtractive (machined) features including holes, fillets, +chamfers, pockets, and slots, fail to recognize features pertinent to additive +manufacturing. Furthermore, the traditional methods fall short in accurately +extracting geometric dimensions and orientations, which are also key factors +for effective manufacturing process planning. This paper presents a novel +approach for creating a synthetic CAD dataset that encompasses features +relevant to both additive and subtractive machining through Python Open +Cascade. The Hierarchical Graph Convolutional Neural Network (HGCNN) model is +implemented to accurately identify the composite additive-subtractive features +within the synthetic CAD dataset. The key novelty and contribution of the +proposed methodology lie in its ability to recognize a wide range of +manufacturing features and precisely extract their dimensions, +orientations, and stock sizes. The proposed model demonstrates remarkable +feature recognition accuracy exceeding 97% and a dimension extraction accuracy +of 100% for identified features. Therefore, the proposed methodology enhances +the integration of CAD, CAPP, and CAM within hybrid manufacturing by providing +precise feature recognition and dimension extraction. It facilitates improved +manufacturing process planning by enabling more informed decision-making. + 
+
+ comment: 10 pages, 12 figures. This paper has been accepted for presentation + at the ASME IDETC-CIE 2024 conference +
+
+
+
+
+ + ☆ BMFT: Achieving Fairness via Bias-based Weight Masking Fine-tuning MICCAI 2024 + + +
+ Developing models with robust group fairness properties is paramount, +particularly in ethically sensitive domains such as medical diagnosis. Recent +approaches to achieving fairness in machine learning require a substantial +amount of training data and depend on model retraining, which may not be +practical in real-world scenarios. To mitigate these challenges, we propose +Bias-based Weight Masking Fine-Tuning (BMFT), a novel post-processing method +that enhances the fairness of a trained model in significantly fewer epochs +without requiring access to the original training data. BMFT produces a mask +over model parameters, which efficiently identifies the weights contributing +the most towards biased predictions. Furthermore, we propose a two-step +debiasing strategy, wherein the feature extractor undergoes initial fine-tuning +on the identified bias-influenced weights, succeeded by a fine-tuning phase on +a reinitialised classification layer to uphold discriminative performance. +Extensive experiments across four dermatological datasets and two sensitive +attributes demonstrate that BMFT outperforms existing state-of-the-art (SOTA) +techniques in both diagnostic accuracy and fairness metrics. Our findings +underscore the efficacy and robustness of BMFT in advancing fairness across +various out-of-distribution (OOD) settings. Our code is available at: +https://github.com/vios-s/BMFT + +
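+
+ A schematic PyTorch sketch of the two ingredients named in the abstract, scoring
+ weights by their contribution to a group-wise loss gap and then updating only the
+ top-scoring weights through a binary mask, follows; the gradient-based scoring rule
+ and the top-fraction selection are assumptions for illustration, not BMFT's
+ published procedure.
+
+ import torch
+
+ def bias_weight_mask(model, loss_group_a, loss_group_b, top_frac=0.05):
+     """Score each parameter by |d(gap)/d(theta)| and keep the top fraction.
+
+     loss_group_a / loss_group_b are scalar losses computed on two sensitive
+     groups; their difference is used here as a crude proxy for the bias.
+     """
+     gap = loss_group_a - loss_group_b
+     params = [p for p in model.parameters()]
+     grads = torch.autograd.grad(gap, params, allow_unused=True)
+     scores = torch.cat([g.abs().flatten() for g in grads if g is not None])
+     threshold = torch.quantile(scores, 1.0 - top_frac)
+     masks = []
+     for p, g in zip(params, grads):
+         masks.append((g.abs() >= threshold).float() if g is not None else torch.zeros_like(p))
+     return masks
+
+ def masked_sgd_step(model, masks, lr=1e-3):
+     """Update only the mask-selected weights (gradients assumed already populated)."""
+     with torch.no_grad():
+         for p, m in zip(model.parameters(), masks):
+             if p.grad is not None:
+                 p -= lr * m * p.grad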
+
+ comment: Accepted by MICCAI 2024 FAIMI Workshop Oral +
+
+
+
+
+ + ☆ Optimal Bound for PCA with Outliers using Higher-Degree Voronoi Diagrams + + +
+ In this paper, we introduce new algorithms for Principal Component Analysis +(PCA) with outliers. Utilizing techniques from computational geometry, +specifically higher-degree Voronoi diagrams, we navigate to the optimal +subspace for PCA even in the presence of outliers. This approach achieves an +optimal solution with a time complexity of +$n^{d+\mathcal{O}(1)}\text{poly}(n,d)$. Additionally, we present a randomized +algorithm with a complexity of $2^{\mathcal{O}(r(d-r))} \times \text{poly}(n, +d)$. This algorithm samples subspaces characterized in terms of a Grassmannian +manifold. By employing such a sampling method, we ensure a high likelihood of +capturing the optimal subspace, with success probability $(1 - \delta)^T$, +where $\delta$ represents the probability that a sampled subspace does not +contain the optimal solution and $T$ is the number of subspaces sampled, +proportional to $2^{r(d-r)}$. Our use of higher-degree Voronoi diagrams and +Grassmannian-based sampling offers a clearer conceptual pathway and practical +advantages, particularly in handling large datasets or higher-dimensional +settings. + 
+
+
+
+
+ + ☆ PRESENT: Zero-Shot Text-to-Prosody Control + + +
+ Current strategies for achieving fine-grained prosody control in speech +synthesis entail extracting additional style embeddings or adopting more +complex architectures. To enable zero-shot application of pretrained +text-to-speech (TTS) models, we present PRESENT (PRosody Editing without Style +Embeddings or New Training), which exploits explicit prosody prediction in +FastSpeech2-based models by modifying the inference process directly. We apply +our text-to-prosody framework to zero-shot language transfer using a JETS model +exclusively trained on English LJSpeech data. We obtain character error rates +(CER) of 12.8%, 18.7% and 5.9% for German, Hungarian and Spanish respectively, +beating the previous state-of-the-art CER by over 2x for all three languages. +Furthermore, we allow subphoneme-level control, a first in this field. To +evaluate its effectiveness, we show that PRESENT can improve the prosody of +questions, and use it to generate Mandarin, a tonal language where vowel pitch +varies at subphoneme level. We attain 25.3% hanzi CER and 13.0% pinyin CER with +the JETS model. All our code and audio samples are available online. + +
+
+
+
+
+ + ☆ Efficient Search for Customized Activation Functions with Gradient + Descent + + +
+ Different activation functions work best for different deep learning models. +To exploit this, we leverage recent advancements in gradient-based search +techniques for neural architectures to efficiently identify high-performing +activation functions for a given application. We propose a fine-grained search +cell that combines basic mathematical operations to model activation functions, +allowing for the exploration of novel activations. Our approach enables the +identification of specialized activations, leading to improved performance in +every model we tried, from image classification to language models. Moreover, +the identified activations exhibit strong transferability to larger models of +the same type, as well as new datasets. Importantly, our automated process for +creating customized activation functions is orders of magnitude more efficient +than previous approaches. It can easily be applied on top of arbitrary deep +learning pipelines and thus offers a promising practical avenue for enhancing +deep learning architectures. + +
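+
+ A compact PyTorch sketch of the general mechanism, a DARTS-style softmax mixture
+ over basic unary operations whose architecture weights are learned by gradient
+ descent, is given below; the candidate operation set and the fine-grained cell
+ structure used in the paper may differ.
+
+ import torch
+ import torch.nn as nn
+
+ class ActivationSearchCell(nn.Module):
+     """Softmax-weighted mixture of candidate unary ops, trainable by gradient descent."""
+
+     def __init__(self):
+         super().__init__()
+         self.ops = [
+             lambda x: torch.relu(x),
+             lambda x: torch.tanh(x),
+             lambda x: torch.sigmoid(x) * x,   # SiLU-like
+             lambda x: torch.sin(x),
+             lambda x: x,                      # identity
+         ]
+         # Architecture parameters: one logit per candidate operation.
+         self.alpha = nn.Parameter(torch.zeros(len(self.ops)))
+
+     def forward(self, x):
+         weights = torch.softmax(self.alpha, dim=0)
+         return sum(w * op(x) for w, op in zip(weights, self.ops))
+
+ # Usage: drop the cell in place of a fixed activation; alpha can be trained with
+ # the rest of the model or on a separate validation loss, as in bi-level NAS.
+ layer = nn.Sequential(nn.Linear(16, 32), ActivationSearchCell(), nn.Linear(32, 1))
+ out = layer(torch.randn(4, 16))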
+
+ comment: 10 pages, 1 figure, excluding references and appendix +
+
+
+
+
+ + ☆ Enhancing Multiview Synergy: Robust Learning by Exploiting the Wave Loss + Function with Consensus and Complementarity Principles + + +
+ Multiview learning (MvL) is an advancing domain in machine learning, +leveraging multiple data perspectives to enhance model performance through +view-consistency and view-discrepancy. Despite numerous successful +multiview-based SVM models, existing frameworks predominantly focus on the +consensus principle, often overlooking the complementarity principle. +Furthermore, they exhibit limited robustness against noisy, error-prone, and +view-inconsistent samples, prevalent in multiview datasets. To tackle the +aforementioned limitations, this paper introduces Wave-MvSVM, a novel multiview +support vector machine framework leveraging the wave loss (W-loss) function, +specifically designed to harness both consensus and complementarity principles. +Unlike traditional approaches that often overlook the complementary information +among different views, the proposed Wave-MvSVM ensures a more comprehensive and +resilient learning process by integrating both principles effectively. The +W-loss function, characterized by its smoothness, asymmetry, and bounded +nature, is particularly effective in mitigating the adverse effects of noisy +and outlier data, thereby enhancing model stability. Theoretically, the W-loss +function also exhibits a crucial classification-calibrated property, further +boosting its effectiveness. Wave-MvSVM employs a between-view co-regularization +term to enforce view consistency and utilizes an adaptive combination weight +strategy to maximize the discriminative power of each view. The optimization +problem is efficiently solved using a combination of GD and the ADMM, ensuring +reliable convergence to optimal solutions. Theoretical analyses, grounded in +Rademacher complexity, validate the generalization capabilities of the +Wave-MvSVM model. Extensive empirical evaluations across diverse datasets +demonstrate the superior performance of Wave-MvSVM in comparison to existing +benchmark models. + +
+
+
+
+
+ + ☆ On a Scale-Invariant Approach to Bundle Recommendations in Candy Crush + Saga + + +
+ A good understanding of player preferences is crucial for increasing content +relevancy, especially in mobile games. This paper illustrates the use of +attentive models for producing item recommendations in a mobile game scenario. +The methodology comprises a combination of supervised and unsupervised +approaches to create user-level recommendations while introducing a novel +scale-invariant approach to the prediction. The methodology is subsequently +applied to a bundle recommendation in Candy Crush Saga. The strategy of +deployment, maintenance, and monitoring of ML models that are scaled up to +serve millions of users is presented, along with the best practices and design +patterns adopted to minimize technical debt typical of ML systems. The +recommendation approach is evaluated both offline and online, with a focus on +understanding the increase in engagement, click- and take rates, novelty +effects, recommendation diversity, and the impact of degenerate feedback loops. +We have demonstrated that the recommendation enhances user engagement by 30% +concerning click rate and by more than 40% concerning take rate. In addition, +we empirically quantify the diminishing effects of recommendation accuracy on +user engagement. + +
+
+
+
+
+ + ☆ Enhancing Diabetic Retinopathy Diagnosis: A Lightweight CNN Architecture + for Efficient Exudate Detection in Retinal Fundus Images + + +
+ Retinal fundus imaging plays an essential role in diagnosing various stages +of diabetic retinopathy, where exudates are critical markers of early disease +onset. Prompt detection of these exudates is pivotal for enabling optometrists +to arrest or significantly decelerate the disease progression. This paper +introduces a novel, lightweight convolutional neural network architecture +tailored for automated exudate detection, designed to identify these markers +efficiently and accurately. To address the challenge of limited training data, +we have incorporated domain-specific data augmentations to enhance the model's +generalizability. Furthermore, we applied a suite of regularization techniques +within our custom architecture to boost diagnostic accuracy while optimizing +computational efficiency. Remarkably, this streamlined model contains only 4.73 +million parameters, a reduction of nearly 60% compared to the standard ResNet-18 +model, which has 11.69 million parameters. Despite its reduced complexity, our +model achieves an impressive F1 score of 90%, demonstrating its efficacy in the +early detection of diabetic retinopathy through fundus imaging. + 
+
+
+
+
+ + ☆ Exploring Domain Shift on Radar-Based 3D Object Detection Amidst Diverse + Environmental Conditions SC + + +
+ The rapid evolution of deep learning and its integration with autonomous +driving systems have led to substantial advancements in 3D perception using +multimodal sensors. Notably, radar sensors show greater robustness compared to +cameras and lidar under adverse weather and varying illumination conditions. +This study delves into the often-overlooked yet crucial issue of domain shift +in 4D radar-based object detection, examining how varying environmental +conditions, such as different weather patterns and road types, impact 3D object +detection performance. Our findings highlight distinct domain shifts across +various weather scenarios, revealing unique dataset sensitivities that +underscore the critical role of radar point cloud generation. Additionally, we +demonstrate that transitioning between different road types, especially from +highways to urban settings, introduces notable domain shifts, emphasizing the +necessity for diverse data collection across varied road environments. To the +best of our knowledge, this is the first comprehensive analysis of domain shift +effects on 4D radar-based object detection. We believe this empirical study +contributes to understanding the complex nature of domain shifts in radar data +and suggests paths forward for data collection strategy in the face of +environmental variability. + +
+
+ comment: 6 pages, 5 figures, 3 tables, accepted in IEEE International + Conference on Intelligent Transportation Systems (ITSC) 2024 +
+
+
+
+
+ + ☆ Robust Black-box Testing of Deep Neural Networks using Co-Domain + Coverage + + +
+ Rigorous testing of machine learning models is necessary for trustworthy +deployments. We present a novel black-box approach for generating test-suites +for robust testing of deep neural networks (DNNs). Most existing methods create +test inputs based on maximizing some "coverage" criterion/metric such as a +fraction of neurons activated by the test inputs. Such approaches, however, can +only analyze each neuron's behavior or each layer's output in isolation and are +unable to capture their collective effect on the DNN's output, resulting in +test suites that often do not capture the various failure modes of the DNN +adequately. These approaches also require white-box access, i.e., access to the +DNN's internals (node activations). We present a novel black-box coverage +criterion called Co-Domain Coverage (CDC), which is defined as a function of +the model's output and thus takes into account its end-to-end behavior. +Subsequently, we develop a new fuzz testing procedure named CoDoFuzz, which +uses CDC to guide the fuzzing process to generate a test suite for a DNN. We +extensively compare the test suite generated by CoDoFuzz with those generated +using several state-of-the-art coverage-based fuzz testing methods for the DNNs +trained on six publicly available datasets. Experimental results establish the +efficiency and efficacy of CoDoFuzz in generating the largest number of +misclassified inputs and the inputs for which the model lacks confidence in its +decision. + +
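+
+ The abstract does not spell out the CDC formula, so the sketch below is only one
+ plausible instantiation of a co-domain coverage measure: discretize the model's
+ output space into (predicted class, confidence bin) cells and report the fraction
+ of cells reached by a test suite. The binning scheme is an assumption, not the
+ paper's definition.
+
+ import numpy as np
+
+ def co_domain_coverage(probs, n_conf_bins=10):
+     """Fraction of (predicted class, confidence bin) cells hit by a test suite.
+
+     probs: (N, C) softmax outputs of the model on the generated test inputs.
+     """
+     probs = np.asarray(probs)
+     n_classes = probs.shape[1]
+     preds = probs.argmax(axis=1)
+     conf_bins = np.minimum((probs.max(axis=1) * n_conf_bins).astype(int), n_conf_bins - 1)
+     hit = set(zip(preds.tolist(), conf_bins.tolist()))
+     return len(hit) / (n_classes * n_conf_bins)
+
+ # Toy usage: random "model outputs" for 100 test inputs over 5 classes.
+ rng = np.random.default_rng(0)
+ logits = rng.normal(size=(100, 5))
+ softmax = np.exp(logits) / np.exp(logits).sum(axis=1, keepdims=True)
+ coverage = co_domain_coverage(softmax)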
+
+ comment: 20 pages (including references), 4 figures, 7 tables +
+
+
+
+
+ + ☆ Class-aware and Augmentation-free Contrastive Learning from Label + Proportion + + +
+ Learning from Label Proportion (LLP) is a weakly supervised learning scenario +in which training data is organized into predefined bags of instances, +disclosing only the class label proportions per bag. This paradigm is essential +for user modeling and personalization, where user privacy is paramount, +offering insights into user preferences without revealing individual data. LLP +faces a unique difficulty: the misalignment between bag-level supervision and +the objective of instance-level prediction, primarily due to the inherent +ambiguity in label proportion matching. Previous studies have demonstrated deep +representation learning can generate auxiliary signals to promote the +supervision level in the image domain. However, applying these techniques to +tabular data presents significant challenges: 1) they rely heavily on +label-invariant augmentation to establish multi-view, which is not feasible +with the heterogeneous nature of tabular datasets, and 2) tabular datasets +often lack sufficient semantics for perfect class distinction, making them +prone to suboptimality caused by the inherent ambiguity of label proportion +matching. + To address these challenges, we propose an augmentation-free contrastive +framework TabLLP-BDC that introduces class-aware supervision (explicitly aware +of class differences) at the instance level. Our solution features a two-stage +Bag Difference Contrastive (BDC) learning mechanism that establishes robust +class-aware instance-level supervision by disassembling the nuance between bag +label proportions, without relying on augmentations. Concurrently, our model +presents a pioneering multi-task pretraining pipeline tailored for +tabular-based LLP, capturing intrinsic tabular feature correlations in +alignment with label proportion distribution. Extensive experiments demonstrate +that TabLLP-BDC achieves state-of-the-art performance for LLP in the tabular +domain. + +
+
+
+
+
+ + ☆ Multimodal Analysis of White Blood Cell Differentiation in Acute Myeloid + Leukemia Patients using a β-Variational Autoencoder MICCAI 2024 + + +
+ Biomedical imaging and RNA sequencing with single-cell resolution improve +our understanding of white blood cell diseases like leukemia. By combining +morphological and transcriptomic data, we can gain insights into cellular +functions and trajectories involved in blood cell differentiation. However, +existing methodologies struggle with integrating morphological and +transcriptomic data, leaving a significant research gap in comprehensively +understanding the dynamics of cell differentiation. Here, we introduce an +unsupervised method that explores and reconstructs these two modalities and +uncovers the relationship between different subtypes of white blood cells from +human peripheral blood smears in terms of morphology and their corresponding +transcriptome. Our method is based on a beta-variational autoencoder +($\beta$-VAE) with a customized loss function, incorporating an R-CNN architecture +to distinguish single cells from the background and to minimize any interference +from artifacts. This implementation of the $\beta$-VAE shows good reconstruction +capability along with continuous latent embeddings, while maintaining clear +differentiation between single-cell classes. Our novel approach is especially +helpful for uncovering the correlation of two latent features in complex biological +processes such as formation of granules in the cell (granulopoiesis) with gene +expression patterns. It thus provides a unique tool to improve the +understanding of white blood cell maturation for biomedicine and diagnostics. + 
+
+ comment: Accepted for publication at MICCAI 2024 workshop on AI for Imaging + Genomics Learning (AIIG) +
+
+
+
+
+ + ☆ Computation-friendly Graph Neural Network Design by Accumulating + Knowledge on Large Language Models + + +
+ Graph Neural Networks (GNNs), like other neural networks, have shown +remarkable success but are hampered by the complexity of their architecture +designs, which heavily depend on specific data and tasks. Traditionally, +designing proper architectures involves trial and error, which requires +intensive manual effort to optimize various components. To reduce human +workload, researchers try to develop automated algorithms to design GNNs. +However, both experts and automated algorithms suffer from two major issues in +designing GNNs: 1) the substantial computational resources expended in +repeatedly trying candidate GNN architectures until a feasible design is +achieved, and 2) the intricate and prolonged processes required for humans or +algorithms to accumulate knowledge of the interrelationship between graphs, +GNNs, and performance. + To further enhance the automation of GNN architecture design, we propose a +computation-friendly way to empower Large Language Models (LLMs) with +specialized knowledge in designing GNNs, thereby drastically shortening the +computational overhead and development cycle of designing GNN architectures. +Our framework begins by establishing a knowledge retrieval pipeline that +comprehends the intercorrelations between graphs, GNNs, and performance. This +pipeline converts past model design experiences into structured knowledge for +LLM reference, allowing it to quickly suggest initial model proposals. +Subsequently, we introduce a knowledge-driven search strategy that emulates the +exploration-exploitation process of human experts, enabling quick refinement of +initial proposals within a promising scope. Extensive experiments demonstrate +that our framework can efficiently deliver promising (e.g., Top-5.77%) initial +model proposals for unseen datasets within seconds and without any prior +training and achieve outstanding search performance in a few iterations. + +
+
+
+
+
+ + ☆ Variational Learning of Gaussian Process Latent Variable Models through + Stochastic Gradient Annealed Importance Sampling + + +
+ Gaussian Process Latent Variable Models (GPLVMs) have become increasingly +popular for unsupervised tasks such as dimensionality reduction and missing +data recovery due to their flexibility and non-linear nature. An +importance-weighted version of the Bayesian GPLVMs has been proposed to obtain +a tighter variational bound. However, this version of the approach is primarily +limited to analyzing simple data structures, as the generation of an effective +proposal distribution can become quite challenging in high-dimensional spaces +or with complex data sets. In this work, we propose an Annealed Importance +Sampling (AIS) approach to address these issues. By transforming the posterior +into a sequence of intermediate distributions using annealing, we combine the +strengths of Sequential Monte Carlo samplers and VI to explore a wider range of +posterior distributions and gradually approach the target distribution. We +further propose an efficient algorithm by reparameterizing all variables in the +evidence lower bound (ELBO). Experimental results on both toy and image +datasets demonstrate that our method outperforms state-of-the-art methods in +terms of tighter variational bounds, higher log-likelihoods, and more robust +convergence. + +
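+
+ For readers unfamiliar with the primitive, a generic numpy sketch of annealed
+ importance sampling between a standard Gaussian prior and an unnormalized
+ one-dimensional target is shown below; the paper applies the same annealing idea
+ inside the GPLVM's variational bound with reparameterized gradients, which this toy
+ example does not attempt to cover.
+
+ import numpy as np
+
+ def ais_log_normalizer(log_target, n_chains=500, n_steps=50, step=0.5, rng=None):
+     """Estimate log Z of an unnormalized 1-D target with annealed importance sampling.
+
+     Intermediate densities: pi_k(x) proportional to N(x; 0, 1)^(1-b_k) * target(x)^b_k,
+     bridged with one random-walk Metropolis move per temperature.
+     """
+     rng = np.random.default_rng() if rng is None else rng
+     log_prior = lambda x: -0.5 * x**2 - 0.5 * np.log(2 * np.pi)
+     betas = np.linspace(0.0, 1.0, n_steps + 1)
+     x = rng.normal(size=n_chains)                  # exact samples from the prior
+     log_w = np.zeros(n_chains)
+     for b_prev, b in zip(betas[:-1], betas[1:]):
+         log_w += (b - b_prev) * (log_target(x) - log_prior(x))
+         # One Metropolis step targeting the current intermediate distribution.
+         prop = x + step * rng.normal(size=n_chains)
+         log_acc = ((1 - b) * (log_prior(prop) - log_prior(x))
+                    + b * (log_target(prop) - log_target(x)))
+         accept = np.log(rng.uniform(size=n_chains)) < log_acc
+         x = np.where(accept, prop, x)
+     return np.log(np.mean(np.exp(log_w - log_w.max()))) + log_w.max()
+
+ # Toy usage: unnormalized Gaussian target centered at 2 with std 0.5.
+ log_Z = ais_log_normalizer(lambda x: -0.5 * ((x - 2.0) / 0.5) ** 2)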
+
+
+
+
+ + ☆ DiffSG: A Generative Solver for Network Optimization with Diffusion + Model + + +
+ Diffusion generative models, famous for their performance in image +generation, are popular in various cross-domain applications. However, their +use in the communication community has been mostly limited to auxiliary tasks +like data modeling and feature extraction. These models hold greater promise +for fundamental problems in network optimization compared to traditional +machine learning methods. Discriminative deep learning often falls short due to +its single-step input-output mapping and lack of global awareness of the +solution space, especially given the complexity of network optimization's +objective functions. In contrast, diffusion generative models can consider a +broader range of solutions and exhibit stronger generalization by learning +parameters that describe the distribution of the underlying solution space, +with higher probabilities assigned to better solutions. We propose a new +framework Diffusion Model-based Solution Generation (DiffSG), which leverages +the intrinsic distribution learning capabilities of diffusion generative models +to learn high-quality solution distributions based on given inputs. The optimal +solution within this distribution is highly probable, allowing it to be +effectively reached through repeated sampling. We validate the performance of +DiffSG on several typical network optimization problems, including +mixed-integer non-linear programming, convex optimization, and hierarchical +non-convex optimization. Our results show that DiffSG outperforms existing +baselines. In summary, we demonstrate the potential of diffusion generative +models in tackling complex network optimization problems and outline a +promising path for their broader application in the communication community. + +
+
+ comment: 8 pages, 5 figures +
+
+
+
+
+ + ☆ Information Geometry and Beta Link for Optimizing Sparse Variational + Student-t Processes + + +
+ Recently, a sparse version of Student-t Processes, termed sparse variational +Student-t Processes, has been proposed to enhance computational efficiency and +flexibility for real-world datasets using stochastic gradient descent. However, +traditional gradient descent methods like Adam may not fully exploit the +parameter space geometry, potentially leading to slower convergence and +suboptimal performance. To mitigate these issues, we adopt natural gradient +methods from information geometry for variational parameter optimization of +Student-t Processes. This approach leverages the curvature and structure of the +parameter space, utilizing tools such as the Fisher information matrix which is +linked to the Beta function in our model. This method provides robust +mathematical support for the natural gradient algorithm when using Student's +t-distribution as the variational distribution. Additionally, we present a +mini-batch algorithm for efficiently computing natural gradients. Experimental +results across four benchmark datasets demonstrate that our method consistently +accelerates convergence speed. + +
+
+
+
+
+ + ☆ SlotLifter: Slot-guided Feature Lifting for Learning Object-centric + Radiance Fields ECCV 2024 + + +
+ The ability to distill object-centric abstractions from intricate visual +scenes underpins human-level generalization. Despite the significant progress +in object-centric learning methods, learning object-centric representations in +the 3D physical world remains a crucial challenge. In this work, we propose +SlotLifter, a novel object-centric radiance model addressing scene +reconstruction and decomposition jointly via slot-guided feature lifting. Such +a design unites object-centric learning representations and image-based +rendering methods, offering state-of-the-art performance in scene decomposition +and novel-view synthesis on four challenging synthetic and four complex +real-world datasets, outperforming existing 3D object-centric learning methods +by a large margin. Through extensive ablative studies, we showcase the efficacy +of designs in SlotLifter, revealing key insights for potential future +directions. + +
+
+ comment: Accepted by ECCV 2024. Project website: https://slotlifter.github.io +
+
+
+
+
+ + ☆ Masked Image Modeling: A Survey + + +
+ In this work, we survey recent studies on masked image modeling (MIM), an +approach that emerged as a powerful self-supervised learning technique in +computer vision. The MIM task involves masking some information, e.g., pixels, +patches, or even latent representations, and training a model, usually an +autoencoder, to predict the missing information by using the context +available in the visible part of the input. We identify and formalize two +categories of approaches for implementing MIM as a pretext task, one based +on reconstruction and one based on contrastive learning. Then, we construct a +taxonomy and review the most prominent papers in recent years. We complement +the manually constructed taxonomy with a dendrogram obtained by applying a +hierarchical clustering algorithm. We further identify relevant clusters via +manually inspecting the resulting dendrogram. Our review also includes datasets +that are commonly used in MIM research. We aggregate the performance results of +various masked image modeling methods on the most popular datasets, to +facilitate the comparison of competing methods. Finally, we identify research +gaps and propose several interesting directions for future work. + 
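+
+ A minimal PyTorch sketch of the reconstruction-based flavor of MIM, masking random
+ patches and regressing the hidden pixels with a small convolutional autoencoder, is
+ given below; production systems typically use ViT backbones and richer targets, so
+ treat this only as an illustration of the pretext task.
+
+ import torch
+ import torch.nn as nn
+
+ def mask_patches(images, patch=4, mask_ratio=0.6):
+     """Zero out a random subset of non-overlapping patches; return masked images and the keep-mask."""
+     b, c, h, w = images.shape
+     keep = torch.rand(b, 1, h // patch, w // patch, device=images.device) > mask_ratio
+     mask = keep.repeat_interleave(patch, dim=2).repeat_interleave(patch, dim=3)
+     return images * mask, mask
+
+ encoder = nn.Sequential(nn.Conv2d(3, 32, 3, padding=1), nn.ReLU(), nn.Conv2d(32, 32, 3, padding=1))
+ decoder = nn.Sequential(nn.ReLU(), nn.Conv2d(32, 3, 3, padding=1))
+
+ images = torch.randn(8, 3, 32, 32)
+ masked, mask = mask_patches(images)
+ recon = decoder(encoder(masked))
+ # Reconstruction loss only on the masked (hidden) pixels, as in most MIM setups.
+ loss = ((recon - images) ** 2 * (~mask)).sum() / (~mask).sum().clamp(min=1)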
+
+
+
+
+ + ☆ Coherence Awareness in Diffractive Neural Networks + + +
+ Diffractive neural networks hold great promise for applications requiring +intensive computational processing. Considerable attention has focused on +diffractive networks for either spatially coherent or spatially incoherent +illumination. Here we illustrate that, as opposed to imaging systems, in +diffractive networks the degree of spatial coherence has a dramatic effect. In +particular, we show that when the spatial coherence length on the object is +comparable to the minimal feature size preserved by the optical system, neither +the incoherent nor the coherent extremes serve as acceptable approximations. +Importantly, this situation is inherent to many settings involving active +illumination, including reflected light microscopy, autonomous vehicles and +smartphones. Following this observation, we propose a general framework for +training diffractive networks for any specified degree of spatial and temporal +coherence, supporting all types of linear and nonlinear layers. Using our +method, we numerically optimize networks for image classification, and +thoroughly investigate their performance dependence on the illumination +coherence properties. We further introduce the concept of coherence-blind +networks, which have enhanced resilience to changes in illumination conditions. +Our findings serve as a steppingstone toward adopting all-optical neural +networks in real-world applications, leveraging nothing but natural light. + +
+
+
+
+
+ + ☆ Case-based Explainability for Random Forest: Prototypes, Critics, + Counter-factuals and Semi-factuals + + +
+ The explainability of black-box machine learning algorithms, commonly known +as Explainable Artificial Intelligence (XAI), has become crucial for financial +and other regulated industrial applications due to regulatory requirements and +the need for transparency in business practices. Among the various paradigms of +XAI, Explainable Case-Based Reasoning (XCBR) stands out as a pragmatic approach +that elucidates the output of a model by referencing actual examples from the +data used to train or test the model. Despite its potential, XCBR has been +relatively underexplored for many algorithms such as tree-based models until +recently. We start by observing that most XCBR methods are defined based on the +distance metric learned by the algorithm. By utilizing a recently proposed +technique to extract the distance metric learned by Random Forests (RFs), which +is both geometry- and accuracy-preserving, we investigate various XCBR methods. +These methods amount to identifying special points from the training datasets, +such as prototypes, critics, counter-factuals, and semi-factuals, to explain +the RF's prediction for a given query. We evaluate these special points +using various evaluation metrics to assess their explanatory power and +effectiveness. + 
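+
+ A small scikit-learn sketch of the example-based idea follows: use leaf co-occurrence
+ across the forest as a similarity and return the most similar same-class training
+ point (prototype-like) and the most similar other-class point (counterfactual-like)
+ for a query. Plain leaf co-occurrence is only a stand-in for the geometry- and
+ accuracy-preserving RF metric used in the paper.
+
+ import numpy as np
+ from sklearn.datasets import load_breast_cancer
+ from sklearn.ensemble import RandomForestClassifier
+
+ X, y = load_breast_cancer(return_X_y=True)
+ rf = RandomForestClassifier(n_estimators=200, random_state=0).fit(X, y)
+
+ def rf_similarity(rf, X_train, x_query):
+     """Fraction of trees in which each training point shares a leaf with the query."""
+     train_leaves = rf.apply(X_train)               # (n_train, n_trees)
+     query_leaves = rf.apply(x_query.reshape(1, -1))
+     return (train_leaves == query_leaves).mean(axis=1)
+
+ def explain_by_examples(rf, X_train, y_train, x_query):
+     sim = rf_similarity(rf, X_train, x_query)
+     pred = rf.predict(x_query.reshape(1, -1))[0]
+     same, other = (y_train == pred), (y_train != pred)
+     prototype_idx = np.argmax(np.where(same, sim, -1.0))        # closest same-class example
+     counterfactual_idx = np.argmax(np.where(other, sim, -1.0))  # closest other-class example
+     return prototype_idx, counterfactual_idx
+
+ proto, cf = explain_by_examples(rf, X, y, X[0])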
+
+ comment: 8 pages, 2 figures, 5 tables +
+
+
+
+
+ + ☆ Leveraging Priors via Diffusion Bridge for Time Series Generation + + +
+ Time series generation is widely used in real-world applications such as +simulation, data augmentation, and hypothesis test techniques. Recently, +diffusion models have emerged as the de facto approach for time series +generation, emphasizing diverse synthesis scenarios based on historical or +correlated time series data streams. Since time series have unique +characteristics, such as fixed time order and data scaling, standard Gaussian +prior might be ill-suited for general time series generation. In this paper, we +exploit the usage of diverse prior distributions for synthesis. Then, we +propose TimeBridge, a framework that enables flexible synthesis by leveraging +diffusion bridges to learn the transport between chosen prior and data +distributions. Our model covers a wide range of scenarios in time series +diffusion models, which leverages (i) data- and time-dependent priors for +unconditional synthesis, and (ii) data-scale preserving synthesis with a +constraint as a prior for conditional generation. Experimentally, our model +achieves state-of-the-art performance in both unconditional and conditional +time series generation tasks. + +
+
+
+
+
+ + ☆ RW-NSGCN: A Robust Approach to Structural Attacks via Negative Sampling + + +
+ Node classification using Graph Neural Networks (GNNs) has been widely +applied in various practical scenarios, such as predicting user interests and +detecting communities in social networks. However, recent studies have shown +that graph-structured networks often contain potential noise and attacks, in +the form of topological perturbations and weight disturbances, which can lead +to decreased classification performance in GNNs. To improve the robustness of +the model, we propose a novel method: Random Walk Negative Sampling Graph +Convolutional Network (RW-NSGCN). Specifically, RW-NSGCN integrates the Random +Walk with Restart (RWR) and PageRank (PGR) algorithms for negative sampling and +employs a Determinantal Point Process (DPP)-based GCN for convolution +operations. RWR leverages both global and local information to manage noise and +local variations, while PGR assesses node importance to stabilize the +topological structure. The DPP-based GCN ensures diversity among negative +samples and aggregates their features to produce robust node embeddings, +thereby improving classification performance. Experimental results demonstrate +that the RW-NSGCN model effectively addresses network topology attacks and +weight instability, increasing the accuracy of anomaly detection and overall +stability. In terms of classification accuracy, RW-NSGCN significantly +outperforms existing methods, showing greater resilience across various +scenarios and effectively mitigating the impact of such vulnerabilities. + +
+
+
+
+
+ + ☆ COD: Learning Conditional Invariant Representation for Domain Adaptation + Regression ECCV 2024 + + +
+ Aiming to generalize the label knowledge from a source domain with continuous +outputs to an unlabeled target domain, Domain Adaptation Regression (DAR) is +developed for complex practical learning problems. However, due to the +continuity problem in regression, existing conditional distribution alignment +theory and methods with discrete prior, which are proven to be effective in +classification settings, are no longer applicable. In this work, focusing on +the feasibility problems in DAR, we establish the sufficiency theory for the +regression model, which shows the generalization error can be sufficiently +dominated by the cross-domain conditional discrepancy. Further, to characterize +conditional discrepancy with continuous conditioning variable, a novel +Conditional Operator Discrepancy (COD) is proposed, which admits the metric +property on conditional distributions via the kernel embedding theory. Finally, +to minimize the discrepancy, a COD-based conditional invariant representation +learning model is proposed, and the reformulation is derived to show that +reasonable modifications on moment statistics can further improve the +discriminability of the adaptation model. Extensive experiments on standard DAR +datasets verify the validity of theoretical results and the superiority over +SOTA DAR methods. + +
+
+ comment: Accepted to ECCV 2024 (oral) +
+
+
+
+
+ + ☆ Harnessing Earnings Reports for Stock Predictions: A QLoRA-Enhanced LLM + Approach + + +
+ Accurate stock market predictions following earnings reports are crucial for +investors. Traditional methods, particularly classical machine learning models, +struggle with these predictions because they cannot effectively process and +interpret extensive textual data contained in earnings reports and often +overlook nuances that influence market movements. This paper introduces an +advanced approach by employing Large Language Models (LLMs) instruction +fine-tuned with a novel combination of instruction-based techniques and +quantized low-rank adaptation (QLoRA) compression. Our methodology integrates +'base factors', such as financial metric growth and earnings transcripts, with +'external factors', including recent market indices performances and analyst +grades, to create a rich, supervised dataset. This comprehensive dataset +enables our models to achieve superior predictive performance in terms of +accuracy, weighted F1, and Matthews correlation coefficient (MCC), especially +evident in the comparison with benchmarks such as GPT-4. We specifically +highlight the efficacy of the llama-3-8b-Instruct-4bit model, which showcases +significant improvements over baseline models. The paper also discusses the +potential of expanding the output capabilities to include a 'Hold' option and +extending the prediction horizon, aiming to accommodate various investment +styles and time frames. This study not only demonstrates the power of +integrating cutting-edge AI with fine-tuned financial data but also paves the +way for future research in enhancing AI-driven financial analysis tools. + +
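+ A hedged configuration sketch of instruction fine-tuning with QLoRA using the Hugging Face transformers, peft, and bitsandbytes libraries is shown below; the checkpoint name, adapter rank, and target modules are assumptions rather than the paper's exact settings.

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

model_name = "meta-llama/Meta-Llama-3-8B-Instruct"  # assumed checkpoint name

# 4-bit NF4 quantization so the 8B model fits on a single GPU.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=bnb_config)
model = prepare_model_for_kbit_training(model)

# Low-rank adapters on the attention projections; only these weights are trained.
lora_config = LoraConfig(
    r=16, lora_alpha=32, lora_dropout=0.05,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()
```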
+
+ comment: Accepted by 2024 6th International Conference on Data-driven + Optimization of Complex Systems +
+
+
+
+
+ + ☆ Towards Robust and Cost-Efficient Knowledge Unlearning for Large + Language Models + + +
+ Large Language Models (LLMs) have demonstrated strong reasoning and
+ memorization capabilities via pretraining on massive textual corpora. However,
+ training LLMs on human-written text entails significant risk of privacy and
+ copyright violations, which demands an efficient machine unlearning framework
+ to remove knowledge of sensitive data without retraining the model from
+ scratch. While Gradient Ascent (GA) is widely used for unlearning by reducing
+ the likelihood of generating unwanted information, the unboundedness of
+ increasing the cross-entropy loss causes not only unstable optimization, but
+ also catastrophic forgetting of knowledge that needs to be retained. We also
+ discover that its joint application with low-rank adaptation results in a
+ significantly suboptimal trade-off between computational cost and generative
+ performance. In light of these limitations, we propose two novel techniques for
+ robust and cost-efficient unlearning on LLMs. We first design an Inverted Hinge
+ loss that suppresses unwanted tokens by increasing the probability of the next
+ most likely token, thereby retaining fluency and structure in language
+ generation. We also propose to initialize low-rank adapter weights based on
+ Fisher-weighted low-rank approximation, which induces faster unlearning and
+ better knowledge retention by allowing model updates to be focused on
+ parameters that are important in generating the textual data we wish to remove.
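+ The abstract does not spell out the loss formula, so the following PyTorch snippet is only one plausible reading of an "inverted hinge" objective: it penalizes the token being unlearned whenever its probability still exceeds that of the best alternative token. The function name, margin, and exact form are assumptions, not the paper's definition.

```python
import torch
import torch.nn.functional as F

def inverted_hinge_loss(logits: torch.Tensor, targets: torch.Tensor,
                        margin: float = 0.0) -> torch.Tensor:
    """Hypothetical inverted-hinge objective for unlearning: penalize the target
    (to-be-forgotten) token whenever its probability exceeds that of the most
    likely alternative token by more than `margin`.

    logits:  (batch, vocab) next-token logits
    targets: (batch,) token ids whose likelihood should be suppressed
    """
    probs = F.softmax(logits, dim=-1)
    p_target = probs.gather(1, targets.unsqueeze(1)).squeeze(1)
    # Mask the target token out and take the best remaining alternative.
    masked = probs.scatter(1, targets.unsqueeze(1), float("-inf"))
    p_alt = masked.max(dim=-1).values
    return torch.clamp(margin + p_target - p_alt, min=0.0).mean()

loss = inverted_hinge_loss(torch.randn(4, 32000), torch.randint(0, 32000, (4,)))
print(loss)
```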
+
+ comment: Preprint +
+
+
+
+
+ + ☆ Unveiling the Flaws: A Critical Analysis of Initialization Effect on + Time Series Anomaly Detection + + +
+ Deep learning for time-series anomaly detection (TSAD) has gained significant +attention over the past decade. Despite the reported improvements in several +papers, the practical application of these models remains limited. Recent +studies have cast doubt on these models, attributing their results to flawed +evaluation techniques. However, the impact of initialization has largely been +overlooked. This paper provides a critical analysis of the initialization +effects on TSAD model performance. Our extensive experiments reveal that TSAD +models are highly sensitive to hyperparameters such as window size, seed +number, and normalization. This sensitivity often leads to significant +variability in performance, which can be exploited to artificially inflate the +reported efficacy of these models. We demonstrate that even minor changes in +initialization parameters can result in performance variations that overshadow +the claimed improvements from novel model architectures. Our findings highlight +the need for rigorous evaluation protocols and transparent reporting of +preprocessing steps to ensure the reliability and fairness of anomaly detection +methods. This paper calls for a more cautious interpretation of TSAD +advancements and encourages the development of more robust and transparent +evaluation practices to advance the field and its practical applications. + +
+
+
+
+
+ + ☆ Generalized knowledge-enhanced framework for biomedical entity and + relation extraction + + +
+ In recent years, a growing number of frameworks have been developed for
+ biomedical entity and relation extraction. This research effort aims to address
+ the accelerating growth in biomedical publications and the intricate nature of
+ biomedical texts, which are written mainly for domain experts. To handle these
+ challenges, we develop a novel framework that utilizes external knowledge to
+ construct a task-independent and reusable background knowledge graph for
+ biomedical entity and relation extraction. The design of our model is inspired
+ by how humans learn domain-specific topics. In particular, humans often first
+ acquire the most basic and common knowledge regarding a field to build a
+ foundation, and then use that as a basis for extending to various specialized
+ topics. Our framework employs such a common-knowledge-sharing mechanism to
+ build a general neural-network knowledge graph whose learned knowledge
+ transfers effectively to different domain-specific biomedical texts.
+ Experimental evaluations demonstrate that our model, equipped with this
+ generalized and cross-transferable knowledge base, achieves competitive
+ performance on benchmarks including BioRelEx for binding interaction detection
+ and ADE for Adverse Drug Effect identification.
+
+
+
+
+ + ☆ CROME: Cross-Modal Adapters for Efficient Multimodal LLM + + +
+ Multimodal Large Language Models (MLLMs) demonstrate remarkable
+ image-language capabilities, but their widespread use faces challenges in
+ cost-effective training and adaptation. Existing approaches often necessitate
+ expensive language model retraining and offer limited adaptability.
+ Additionally, the current focus on zero-shot performance improvements offers
+ insufficient guidance for task-specific tuning. We propose CROME, an efficient
+ vision-language instruction tuning framework. It features a novel gated
+ cross-modal adapter that effectively combines visual and textual
+ representations prior to input into a frozen LLM. This lightweight adapter,
+ trained with minimal parameters, enables efficient cross-modal understanding.
+ Notably, CROME demonstrates superior zero-shot performance on standard visual
+ question answering and instruction-following benchmarks. Moreover, it supports
+ fine-tuning with exceptional parameter efficiency, competing with task-specific
+ specialist state-of-the-art methods. CROME demonstrates the potential of pre-LM
+ alignment for building scalable, adaptable, and parameter-efficient multimodal
+ models.
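+ Below is a schematic PyTorch module for a gated cross-modal adapter of the kind described: projected visual and textual representations are mixed through a learned sigmoid gate before entering a frozen LLM. Dimensions, the gating form, and all names are illustrative assumptions, not CROME's exact architecture.

```python
import torch
import torch.nn as nn

class GatedCrossModalAdapter(nn.Module):
    """Schematic gated adapter that fuses projected visual tokens with text
    embeddings before they enter a frozen language model."""

    def __init__(self, vis_dim: int, txt_dim: int, hidden_dim: int):
        super().__init__()
        self.vis_proj = nn.Linear(vis_dim, hidden_dim)
        self.txt_proj = nn.Linear(txt_dim, hidden_dim)
        self.gate = nn.Sequential(nn.Linear(2 * hidden_dim, hidden_dim), nn.Sigmoid())

    def forward(self, vis_feats: torch.Tensor, txt_feats: torch.Tensor) -> torch.Tensor:
        v = self.vis_proj(vis_feats)                 # (batch, seq, hidden)
        t = self.txt_proj(txt_feats)                 # (batch, seq, hidden)
        g = self.gate(torch.cat([v, t], dim=-1))     # per-position mixing weights
        return g * v + (1.0 - g) * t                 # gated fusion fed to the frozen LLM

adapter = GatedCrossModalAdapter(vis_dim=1024, txt_dim=768, hidden_dim=768)
fused = adapter(torch.randn(2, 16, 1024), torch.randn(2, 16, 768))
print(fused.shape)  # torch.Size([2, 16, 768])
```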
+
+
+
+
+ + ☆ Prioritizing Modalities: Flexible Importance Scheduling in Federated + Multimodal Learning + + +
+ Federated Learning (FL) is a distributed machine learning approach that +enables devices to collaboratively train models without sharing their local +data, ensuring user privacy and scalability. However, applying FL to real-world +data presents challenges, particularly as most existing FL research focuses on +unimodal data. Multimodal Federated Learning (MFL) has emerged to address these +challenges, leveraging modality-specific encoder models to process diverse +datasets. Current MFL methods often uniformly allocate computational +frequencies across all modalities, which is inefficient for IoT devices with +limited resources. In this paper, we propose FlexMod, a novel approach to +enhance computational efficiency in MFL by adaptively allocating training +resources for each modality encoder based on their importance and training +requirements. We employ prototype learning to assess the quality of modality +encoders, use Shapley values to quantify the importance of each modality, and +adopt the Deep Deterministic Policy Gradient (DDPG) method from deep +reinforcement learning to optimize the allocation of training resources. Our +method prioritizes critical modalities, optimizing model performance and +resource utilization. Experimental results on three real-world datasets +demonstrate that our proposed method significantly improves the performance of +MFL models. + +
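+ The Shapley-value step of such a pipeline can be approximated as below with Monte Carlo permutation sampling over modality subsets; the `utility` callback standing in for validation performance, the toy modality values, and the sample count are assumptions for illustration.

```python
import random
from typing import Callable, Dict, FrozenSet, Sequence

def shapley_values(modalities: Sequence[str],
                   utility: Callable[[FrozenSet[str]], float],
                   n_permutations: int = 200) -> Dict[str, float]:
    """Monte Carlo permutation estimate of each modality's Shapley value.
    utility(S) is assumed to return validation performance with modality subset S."""
    values = {m: 0.0 for m in modalities}
    for _ in range(n_permutations):
        order = list(modalities)
        random.shuffle(order)
        coalition: FrozenSet[str] = frozenset()
        prev = utility(coalition)
        for m in order:
            coalition = coalition | {m}
            curr = utility(coalition)
            values[m] += curr - prev
            prev = curr
    return {m: v / n_permutations for m, v in values.items()}

# Toy utility: image carries most signal, audio adds a little, depth is nearly redundant.
def toy_utility(subset: FrozenSet[str]) -> float:
    score = 0.5
    if "image" in subset: score += 0.30
    if "audio" in subset: score += 0.10
    if "depth" in subset: score += 0.02
    return score

print(shapley_values(["image", "audio", "depth"], toy_utility))
```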
+
+ comment: Submitted to IEEE TMC, under review +
+
+
+
+
+ + ☆ Variance-Reduced Cascade Q-learning: Algorithms and Sample Complexity + + +
+ We study the problem of estimating the optimal Q-function of +$\gamma$-discounted Markov decision processes (MDPs) under the synchronous +setting, where independent samples for all state-action pairs are drawn from a +generative model at each iteration. We introduce and analyze a novel model-free +algorithm called Variance-Reduced Cascade Q-learning (VRCQ). VRCQ comprises two +key building blocks: (i) the established direct variance reduction technique +and (ii) our proposed variance reduction scheme, Cascade Q-learning. By +leveraging these techniques, VRCQ provides superior guarantees in the +$\ell_\infty$-norm compared with the existing model-free stochastic +approximation-type algorithms. Specifically, we demonstrate that VRCQ is +minimax optimal. Additionally, when the action set is a singleton (so that the +Q-learning problem reduces to policy evaluation), it achieves non-asymptotic +instance optimality while requiring the minimum number of samples theoretically +possible. Our theoretical results and their practical implications are +supported by numerical experiments. + +
+
+
+
+
+ + ☆ Value of Information and Reward Specification in Active Inference and + POMDPs + + +
+ Expected free energy (EFE) is a central quantity in active inference which +has recently gained popularity due to its intuitive decomposition of the +expected value of control into a pragmatic and an epistemic component. While +numerous conjectures have been made to justify EFE as a decision making +objective function, the most widely accepted is still its intuitiveness and +resemblance to variational free energy in approximate Bayesian inference. In +this work, we take a bottom up approach and ask: taking EFE as given, what's +the resulting agent's optimality gap compared with a reward-driven +reinforcement learning (RL) agent, which is well understood? By casting EFE +under a particular class of belief MDP and using analysis tools from RL theory, +we show that EFE approximates the Bayes optimal RL policy via information +value. We discuss the implications for objective specification of active +inference agents. + +
+
+
+
+
+ + ☆ Dynamic Exclusion of Low-Fidelity Data in Bayesian Optimization for + Autonomous Beamline Alignment + + +
+ Aligning beamlines at synchrotron light sources is a high-dimensional, +expensive-to-sample optimization problem, as beams are focused using a series +of dynamic optical components. Bayesian Optimization is an efficient machine +learning approach to finding global optima of beam quality, but the model can +easily be impaired by faulty data points caused by the beam going off the edge +of the sensor or by background noise. This study, conducted at the National +Synchrotron Light Source II (NSLS-II) facility at Brookhaven National +Laboratory (BNL), is an investigation of methods to identify untrustworthy +readings of beam quality and discourage the optimization model from seeking out +points likely to yield low-fidelity beams. The approaches explored include +dynamic pruning using loss analysis of size and position models and a +lengthscale-based genetic algorithm to determine which points to include in the +model for optimal fit. Each method successfully classified high and low +fidelity points. This research advances BNL's mission to tackle our nation's +energy challenges by providing scientists at all beamlines with access to +higher quality beams, and faster convergence to these optima for their +experiments. + +
+
+ comment: 12 pages, 6 figure sets +
+
+
+
+
+ + ☆ A Comparison of Imitation Learning Algorithms for Bimanual Manipulation + + +
+ Amidst the wide popularity of imitation learning algorithms in robotics, +their properties regarding hyperparameter sensitivity, ease of training, data +efficiency, and performance have not been well-studied in high-precision +industry-inspired environments. In this work, we demonstrate the limitations +and benefits of prominent imitation learning approaches and analyze their +capabilities regarding these properties. We evaluate each algorithm on a +complex bimanual manipulation task involving an over-constrained dynamics +system in a setting involving multiple contacts between the manipulated object +and the environment. While we find that imitation learning is well suited to +solve such complex tasks, not all algorithms are equal in terms of handling +environmental and hyperparameter perturbations, training requirements, +performance, and ease of use. We investigate the empirical influence of these +key characteristics by employing a carefully designed experimental procedure +and learning environment. Paper website: https://bimanual-imitation.github.io/ + +
+
+
+
+
+ + ☆ Using Advanced LLMs to Enhance Smaller LLMs: An Interpretable Knowledge + Distillation Approach + + +
+ Advanced Large Language Models (LLMs) like GPT-4 or Llama 3 provide superior
+ performance in complex human-like interactions. However, they are costly, too
+ large for edge devices such as smartphones, and harder to self-host, leading to
+ security and privacy concerns. This paper introduces a novel interpretable
+ knowledge distillation approach to enhance the performance of smaller, more
+ economical LLMs that firms can self-host. We study this problem in the context
+ of building a customer service agent aimed at achieving high customer
+ satisfaction through goal-oriented dialogues. Unlike traditional knowledge
+ distillation, where the "student" model learns directly from the "teacher"
+ model's responses via fine-tuning, our interpretable "strategy" teaching
+ approach involves the teacher providing strategies to improve the student's
+ performance in various scenarios. This method alternates between a "scenario
+ generation" step and a "strategies for improvement" step, creating a customized
+ library of scenarios and optimized strategies for automated prompting. The
+ method requires only black-box access to both student and teacher models; hence
+ it can be used without manipulating model parameters. In our customer service
+ application, the method improves performance, and the learned strategies are
+ transferable to other LLMs and scenarios beyond the training set. The method's
+ interpretability helps safeguard against potential harms through human audit.
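+ The alternation between scenario generation and strategy teaching could look roughly like the sketch below, where `teacher_llm` and `student_llm` are hypothetical black-box calls (e.g., API requests); the prompts, round counts, and bookkeeping are illustrative, not the paper's exact protocol.

```python
def teacher_llm(prompt: str) -> str:
    # Placeholder for a call to the large "teacher" model (e.g., an API request).
    return "Acknowledge the issue, apologize once, and offer a concrete next step."

def student_llm(prompt: str) -> str:
    # Placeholder for a call to the smaller, self-hosted "student" model.
    return "I'm sorry to hear that. Let me check your order status."

def distill_strategies(seed_scenarios, n_rounds: int = 3):
    strategies, scenarios = [], list(seed_scenarios)
    for _ in range(n_rounds):
        # Step 1: the teacher proposes new, harder customer-service scenarios.
        new = teacher_llm("Generate challenging customer-service scenarios similar to:\n"
                          + "\n".join(scenarios[-5:]))
        scenarios.extend(line for line in new.splitlines() if line.strip())
        # Step 2: the student attempts recent scenarios; the teacher critiques the
        # transcript and returns a reusable strategy instead of a verbatim answer.
        for scenario in scenarios[-5:]:
            reply = student_llm(f"Scenario: {scenario}\nRespond to the customer.")
            strategy = teacher_llm(f"Scenario: {scenario}\nStudent reply: {reply}\n"
                                   "Give a general strategy the student should follow next time.")
            strategies.append(strategy)
    return strategies  # later injected into the student's prompt as a strategy library

print(len(distill_strategies(["The customer received a damaged package."])))
```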
+
+
+
+
+ + ☆ Pan-cancer gene set discovery via scRNA-seq for optimal deep learning + based downstream tasks + + +
+ The application of machine learning to transcriptomics data has led to +significant advances in cancer research. However, the high dimensionality and +complexity of RNA sequencing (RNA-seq) data pose significant challenges in +pan-cancer studies. This study hypothesizes that gene sets derived from +single-cell RNA sequencing (scRNA-seq) data will outperform those selected +using bulk RNA-seq in pan-cancer downstream tasks. We analyzed scRNA-seq data +from 181 tumor biopsies across 13 cancer types. High-dimensional weighted gene +co-expression network analysis (hdWGCNA) was performed to identify relevant +gene sets, which were further refined using XGBoost for feature selection. +These gene sets were applied to downstream tasks using TCGA pan-cancer RNA-seq +data and compared to six reference gene sets and oncogenes from OncoKB +evaluated with deep learning models, including multilayer perceptrons (MLPs) +and graph neural networks (GNNs). The XGBoost-refined hdWGCNA gene set +demonstrated higher performance in most tasks, including tumor mutation burden +assessment, microsatellite instability classification, mutation prediction, +cancer subtyping, and grading. In particular, genes such as DPM1, BAD, and +FKBP4 emerged as important pan-cancer biomarkers, with DPM1 consistently +significant across tasks. This study presents a robust approach for feature +selection in cancer genomics by integrating scRNA-seq data and advanced +analysis techniques, offering a promising avenue for improving predictive +accuracy in cancer research. + +
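+ The XGBoost refinement step might resemble the sketch below: fit a classifier on the candidate gene set and keep features whose importance clears a threshold. The synthetic matrix, labels, and selection rule are placeholders, not the study's data or exact criterion.

```python
import numpy as np
from xgboost import XGBClassifier

# Toy stand-in for an expression matrix restricted to an hdWGCNA module:
# rows = samples, columns = candidate genes, y = binary labels.
rng = np.random.default_rng(0)
X = rng.normal(size=(200, 50))
y = (X[:, 3] + 0.5 * X[:, 7] + rng.normal(scale=0.5, size=200) > 0).astype(int)
genes = [f"gene_{i}" for i in range(X.shape[1])]

model = XGBClassifier(n_estimators=200, max_depth=3, learning_rate=0.1,
                      eval_metric="logloss")
model.fit(X, y)

# Keep genes whose importance exceeds the mean importance
# (an illustrative rule, not necessarily the paper's).
importances = model.feature_importances_
keep = importances > importances.mean()
refined_gene_set = [g for g, k in zip(genes, keep) if k]
print(len(refined_gene_set), refined_gene_set[:5])
```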
+
+ comment: 16 pages, 3 figures, 1 table, and 6 supplementary tables
+
+
+
+
+ + ☆ A Review of Pseudo-Labeling for Computer Vision + + +
+ Deep neural models have achieved state of the art performance on a wide range +of problems in computer science, especially in computer vision. However, deep +neural networks often require large datasets of labeled samples to generalize +effectively, and an important area of active research is semi-supervised +learning, which attempts to instead utilize large quantities of (easily +acquired) unlabeled samples. One family of methods in this space is +pseudo-labeling, a class of algorithms that use model outputs to assign labels +to unlabeled samples which are then used as labeled samples during training. +Such assigned labels, called pseudo-labels, are most commonly associated with +the field of semi-supervised learning. In this work we explore a broader +interpretation of pseudo-labels within both self-supervised and unsupervised +methods. By drawing the connection between these areas we identify new +directions when advancements in one area would likely benefit others, such as +curriculum learning and self-supervised regularization. + +
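+ For concreteness, a minimal self-training loop of the kind surveyed is sketched below with scikit-learn: train on the labeled pool, pseudo-label unlabeled points the model is confident about, and retrain. The confidence threshold, base classifier, and toy data are assumptions.

```python
import numpy as np
from sklearn.linear_model import LogisticRegression

def pseudo_label_rounds(X_lab, y_lab, X_unlab, threshold=0.95, rounds=5):
    """Basic self-training: repeatedly add confidently predicted unlabeled samples."""
    X_train, y_train = X_lab.copy(), y_lab.copy()
    unlab = X_unlab.copy()
    clf = LogisticRegression(max_iter=1000)
    for _ in range(rounds):
        clf.fit(X_train, y_train)
        if len(unlab) == 0:
            break
        probs = clf.predict_proba(unlab)
        mask = probs.max(axis=1) >= threshold
        if not mask.any():
            break
        pseudo = clf.classes_[probs[mask].argmax(axis=1)]   # pseudo-labels
        X_train = np.vstack([X_train, unlab[mask]])
        y_train = np.concatenate([y_train, pseudo])
        unlab = unlab[~mask]
    return clf

# Toy data: two Gaussian blobs, only a handful of labeled points.
rng = np.random.default_rng(0)
X_lab = np.vstack([rng.normal(-2, 1, (5, 2)), rng.normal(2, 1, (5, 2))])
y_lab = np.array([0] * 5 + [1] * 5)
X_unlab = np.vstack([rng.normal(-2, 1, (100, 2)), rng.normal(2, 1, (100, 2))])
clf = pseudo_label_rounds(X_lab, y_lab, X_unlab)
```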
+
+ comment: 21 pages, 4 figures +
+
+
+
+
+ + ☆ Causal Effect Estimation using identifiable Variational AutoEncoder with + Latent Confounders and Post-Treatment Variables + + +
+ Estimating causal effects from observational data is challenging, especially +in the presence of latent confounders. Much work has been done on addressing +this challenge, but most of the existing research ignores the bias introduced +by the post-treatment variables. In this paper, we propose a novel method of +joint Variational AutoEncoder (VAE) and identifiable Variational AutoEncoder +(iVAE) for learning the representations of latent confounders and latent +post-treatment variables from their proxy variables, termed CPTiVAE, to achieve +unbiased causal effect estimation from observational data. We further prove the +identifiability in terms of the representation of latent post-treatment +variables. Extensive experiments on synthetic and semi-synthetic datasets +demonstrate that the CPTiVAE outperforms the state-of-the-art methods in the +presence of latent confounders and post-treatment variables. We further apply +CPTiVAE to a real-world dataset to show its potential application. + +
+
+
+
+
+ + ☆ Deep Index Policy for Multi-Resource Restless Matching Bandit and Its + Application in Multi-Channel Scheduling + + +
+ Scheduling in multi-channel wireless communication systems presents formidable
+ challenges in effectively allocating resources. To address these challenges, we
+ investigate the multi-resource restless matching bandit (MR-RMB) model for
+ heterogeneous resource systems with an objective of maximizing long-term
+ discounted total rewards while respecting resource constraints. We also
+ generalize the model to applications beyond multi-channel wireless
+ communication. We discuss the Max-Weight Index Matching algorithm, which
+ optimizes resource allocation based on learned partial indexes. We derive the
+ policy gradient theorem for index learning. Our main contribution is the
+ introduction of a new Deep Index Policy (DIP), an online learning algorithm
+ tailored for MR-RMB. DIP learns the partial index by leveraging the policy
+ gradient theorem for restless arms with convoluted and unknown transition
+ kernels of heterogeneous resources. We demonstrate the utility of DIP by
+ evaluating its performance for three different MR-RMB problems. Our simulation
+ results show that DIP indeed learns the partial indexes efficiently.
+
+
+
+
+ + ☆ Quantification of total uncertainty in the physics-informed + reconstruction of CVSim-6 physiology + + +
+ When predicting physical phenomena through simulation, quantification of the +total uncertainty due to multiple sources is as crucial as making sure the +underlying numerical model is accurate. Possible sources include irreducible +aleatoric uncertainty due to noise in the data, epistemic uncertainty induced +by insufficient data or inadequate parameterization, and model-form uncertainty +related to the use of misspecified model equations. Physics-based +regularization interacts in nontrivial ways with aleatoric, epistemic and +model-form uncertainty and their combination, and a better understanding of +this interaction is needed to improve the predictive performance of +physics-informed digital twins that operate under real conditions. With a +specific focus on biological and physiological models, this study investigates +the decomposition of total uncertainty in the estimation of states and +parameters of a differential system simulated with MC X-TFC, a new +physics-informed approach for uncertainty quantification based on random +projections and Monte-Carlo sampling. MC X-TFC is applied to a six-compartment +stiff ODE system, the CVSim-6 model, developed in the context of human +physiology. The system is analyzed by progressively removing data while +estimating an increasing number of parameters and by investigating total +uncertainty under model-form misspecification of non-linear resistance in the +pulmonary compartment. In particular, we focus on the interaction between the +formulation of the discrepancy term and quantification of model-form +uncertainty, and show how additional physics can help in the estimation +process. The method demonstrates robustness and efficiency in estimating +unknown states and parameters, even with limited, sparse, and noisy data. It +also offers great flexibility in integrating data with physics for improved +estimation, even in cases of model misspecification. + +
+
+
+
+
+ + ☆ Agent Q: Advanced Reasoning and Learning for Autonomous AI Agents + + +
+ Large Language Models (LLMs) have shown remarkable capabilities in natural
+ language tasks requiring complex reasoning, yet their application in agentic,
+ multi-step reasoning within interactive environments remains a difficult
+ challenge. Traditional supervised pre-training on static datasets falls short
+ in enabling autonomous agent capabilities needed to perform complex
+ decision-making in dynamic settings like web navigation. Previous attempts to
+ bridge this gap (through supervised fine-tuning on curated expert
+ demonstrations) often suffer from compounding errors and limited exploration
+ data, resulting in sub-optimal policy outcomes. To overcome these challenges,
+ we propose a framework that combines guided Monte Carlo Tree Search (MCTS)
+ with a self-critique mechanism and iterative fine-tuning on agent interactions
+ using an off-policy variant of the Direct Preference Optimization (DPO)
+ algorithm. Our method allows LLM agents to learn effectively from both
+ successful and unsuccessful trajectories, thereby improving their
+ generalization in complex, multi-step reasoning tasks. We validate our approach
+ in the WebShop environment, a simulated e-commerce platform, where it
+ consistently outperforms behavior cloning and reinforced fine-tuning baselines
+ and beats average human performance when equipped with the capability to do
+ online search. In real-world booking scenarios, our methodology boosts the
+ Llama-3 70B model's zero-shot performance from 18.6% to 81.7% success rate (a
+ 340% relative increase) after a single day of data collection and further to
+ 95.4% with online search. We believe this represents a substantial leap forward
+ in the capabilities of autonomous agents, paving the way for more sophisticated
+ and reliable decision-making in real-world settings.
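+ The DPO building block referenced above is sketched below in its standard pairwise form (the paper uses an off-policy variant); the beta value and example log-probabilities are illustrative.

```python
import torch
import torch.nn.functional as F

def dpo_loss(policy_chosen_logps: torch.Tensor,
             policy_rejected_logps: torch.Tensor,
             ref_chosen_logps: torch.Tensor,
             ref_rejected_logps: torch.Tensor,
             beta: float = 0.1) -> torch.Tensor:
    """Standard Direct Preference Optimization loss over (chosen, rejected)
    trajectory pairs; treat this as the vanilla on-policy form, not the paper's
    off-policy variant."""
    chosen_ratio = policy_chosen_logps - ref_chosen_logps
    rejected_ratio = policy_rejected_logps - ref_rejected_logps
    logits = beta * (chosen_ratio - rejected_ratio)
    return -F.logsigmoid(logits).mean()

# Example with summed per-token log-probabilities for two trajectory pairs.
loss = dpo_loss(torch.tensor([-12.3, -15.0]), torch.tensor([-14.1, -15.2]),
                torch.tensor([-12.8, -15.1]), torch.tensor([-13.9, -15.0]))
print(loss)
```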
+
+
+
+
+ + ☆ Massive Dimensions Reduction and Hybridization with Meta-heuristics in + Deep Learning CEC + + +
+ Deep learning is mainly based on utilizing gradient-based optimization for
+ training Deep Neural Network (DNN) models. Although robust and widely used,
+ gradient-based optimization algorithms are prone to getting stuck in local
+ minima. In this modern deep learning era, state-of-the-art DNN models have
+ millions or even billions of parameters, including weights and biases, making
+ them huge-scale optimization problems in terms of search space. Tuning such a
+ huge number of parameters is a challenging task that causes vanishing/exploding
+ gradients and overfitting; likewise, the loss functions used do not exactly
+ represent our targeted performance metrics. A practical solution for exploring
+ large and complex solution spaces is meta-heuristic algorithms. However, even
+ robust meta-heuristic algorithms, such as Differential Evolution, struggle to
+ efficiently explore and converge in such huge-dimensional search spaces,
+ leading to very slow convergence and high memory demand. To tackle this curse
+ of dimensionality, the concept of blocking was recently proposed as a technique
+ that reduces the search space dimensions by grouping them into blocks. In this
+ study, we introduce Histogram-based Blocking Differential Evolution (HBDE), a
+ novel approach that hybridizes gradient-based and gradient-free algorithms to
+ optimize parameters. Experimental results demonstrate that HBDE reduces the
+ number of parameters optimized by the metaheuristic in a ResNet-18 model from
+ 11M to 3K during the training/optimization phase, and that it outperforms the
+ baseline gradient-based optimizer and the parent gradient-free DE algorithm on
+ the CIFAR-10 and CIFAR-100 datasets, showcasing for the first time its
+ effectiveness with reduced computational demands.
+
+ comment: 8 pages, 5 figures, 3 tables, accepted at IEEE CCECE 2024 (updated + Fig. 1 and conclusion remarks) +
+
+
+
+
+ + ☆ Solving Truly Massive Budgeted Monotonic POMDPs with Oracle-Guided + Meta-Reinforcement Learning + + +
+ Monotonic Partially Observable Markov Decision Processes (POMDPs), where the +system state progressively decreases until a restorative action is performed, +can be used to model sequential repair problems effectively. This paper +considers the problem of solving budget-constrained multi-component monotonic +POMDPs, where a finite budget limits the maximal number of restorative actions. +For a large number of components, solving such a POMDP using current methods is +computationally intractable due to the exponential growth in the state space +with an increasing number of components. To address this challenge, we propose +a two-step approach. Since the individual components of a budget-constrained +multi-component monotonic POMDP are only connected via the shared budget, we +first approximate the optimal budget allocation among these components using an +approximation of each component POMDP's optimal value function which is +obtained through a random forest model. Subsequently, we introduce an +oracle-guided meta-trained Proximal Policy Optimization (PPO) algorithm to +solve each of the independent budget-constrained single-component monotonic +POMDPs. The oracle policy is obtained by performing value iteration on the +corresponding monotonic Markov Decision Process (MDP). This two-step method +provides scalability in solving truly massive multi-component monotonic POMDPs. +To demonstrate the efficacy of our approach, we consider a real-world +maintenance scenario that involves inspection and repair of an administrative +building by a team of agents within a maintenance budget. Finally, we perform a +computational complexity analysis for a varying number of components to show +the scalability of the proposed approach. + +
+
+
+
+
+ + ☆ Joint Graph Rewiring and Feature Denoising via Spectral Resonance + + +
+ Graph neural networks (GNNs) take as input the graph structure and the +feature vectors associated with the nodes. Both contain noisy information about +the labels. Here we propose joint denoising and rewiring (JDR)--an algorithm to +jointly denoise the graph structure and features, which can improve the +performance of any downstream algorithm. We do this by defining and maximizing +the alignment between the leading eigenspaces of graph and feature matrices. To +approximately solve this computationally hard problem, we propose a heuristic +that efficiently handles real-world graph datasets with many classes and +different levels of homophily or heterophily. We experimentally verify the +effectiveness of our approach on synthetic data and real-world graph datasets. +The results show that JDR consistently outperforms existing rewiring methods on +node classification tasks using GNNs as downstream models. + +
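+ A simplified NumPy sketch of the alignment idea, measuring how well the leading eigenspaces of the graph and of the feature Gram matrix agree, is given below; the alignment metric and the toy two-block graph are assumptions for illustration, not JDR's actual objective or optimization.

```python
import numpy as np

def leading_eigvecs(M: np.ndarray, k: int) -> np.ndarray:
    """Top-k eigenvectors of a symmetric matrix, sorted by eigenvalue magnitude."""
    vals, vecs = np.linalg.eigh(M)
    order = np.argsort(-np.abs(vals))[:k]
    return vecs[:, order]

def eigenspace_alignment(adj: np.ndarray, feats: np.ndarray, k: int = 2) -> float:
    """Alignment between the leading eigenspaces of the graph and of the feature
    Gram matrix, measured as the mean squared singular value of U_A^T U_X
    (1.0 means the spans coincide)."""
    U_a = leading_eigvecs((adj + adj.T) / 2.0, k)
    U_x = leading_eigvecs(feats @ feats.T, k)
    sing = np.linalg.svd(U_a.T @ U_x, compute_uv=False)
    return float(np.mean(sing ** 2))

# Toy two-block graph whose features echo the block structure: alignment is high.
A = np.block([[np.ones((5, 5)), np.zeros((5, 5))],
              [np.zeros((5, 5)), np.ones((5, 5))]]) - np.eye(10)
X = np.vstack([np.tile([1.0, 0.0], (5, 1)), np.tile([0.0, 1.0], (5, 1))])
print(eigenspace_alignment(A, X, k=2))
```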
+
+
+
+
+ + ☆ VulCatch: Enhancing Binary Vulnerability Detection through CodeT5 + Decompilation and KAN Advanced Feature Extraction + + +
+ Binary program vulnerability detection is critical for software security, yet +existing deep learning approaches often rely on source code analysis, limiting +their ability to detect unknown vulnerabilities. To address this, we propose +VulCatch, a binary-level vulnerability detection framework. VulCatch introduces +a Synergy Decompilation Module (SDM) and Kolmogorov-Arnold Networks (KAN) to +transform raw binary code into pseudocode using CodeT5, preserving high-level +semantics for deep analysis with tools like Ghidra and IDA. KAN further +enhances feature transformation, enabling the detection of complex +vulnerabilities. VulCatch employs word2vec, Inception Blocks, BiLSTM Attention, +and Residual connections to achieve high detection accuracy (98.88%) and +precision (97.92%), while minimizing false positives (1.56%) and false +negatives (2.71%) across seven CVE datasets. + +
+
+
+
+
+ + ♻ ☆ TraceFL: Achieving Interpretability in Federated Learning via Neuron + Provenance + + +
+ In Federated Learning, clients train models on local data and send updates to +a central server, which aggregates them into a global model using a fusion +algorithm. This collaborative yet privacy-preserving training comes at a +cost--FL developers face significant challenges in attributing global model +predictions to specific clients. Localizing responsible clients is a crucial +step towards (a) excluding clients primarily responsible for incorrect +predictions and (b) encouraging clients who contributed high-quality models to +continue participating in the future. Existing ML explainability approaches are +inherently inapplicable as they are designed for single-model, centralized +training. + We introduce TraceFL, a fine-grained neuron provenance capturing mechanism +that identifies clients responsible for the global model's prediction by +tracking the flow of information from individual clients to the global model. +Since inference on different inputs activates a different set of neurons of the +global model, TraceFL dynamically quantifies the significance of the global +model's neurons in a given prediction. It then selectively picks a slice of the +most crucial neurons in the global model and maps them to the corresponding +neurons in every participating client to determine each client's contribution, +ultimately localizing the responsible client. We evaluate TraceFL on six +datasets, including two real-world medical imaging datasets and four neural +networks, including advanced models such as GPT. TraceFL achieves 99% accuracy +in localizing the responsible client in FL tasks spanning both image and text +classification tasks. At a time when state-of-the-art ML debugging approaches +are mostly domain-specific (e.g., image classification only), TraceFL is the +first technique to enable highly accurate automated reasoning across a wide +range of FL applications. + +
+
+ comment: 13 pages. TraceFL is the first interpretability technique in FL that + can work on both image and text classification tasks. For source code please + contact at waris@vt.edu +
+
+
+
+
+ + ♻ ☆ RealGen: Retrieval Augmented Generation for Controllable Traffic + Scenarios ECCV 2024 + + +
+ Simulation plays a crucial role in the development of autonomous vehicles +(AVs) due to the potential risks associated with real-world testing. Although +significant progress has been made in the visual aspects of simulators, +generating complex behavior among agents remains a formidable challenge. It is +not only imperative to ensure realism in the scenarios generated but also +essential to incorporate preferences and conditions to facilitate controllable +generation for AV training and evaluation. Traditional methods, mainly relying +on memorizing the distribution of training datasets, often fall short in +generating unseen scenarios. Inspired by the success of retrieval augmented +generation in large language models, we present RealGen, a novel +retrieval-based in-context learning framework for traffic scenario generation. +RealGen synthesizes new scenarios by combining behaviors from multiple +retrieved examples in a gradient-free way, which may originate from templates +or tagged scenarios. This in-context learning framework endows versatile +generative capabilities, including the ability to edit scenarios, compose +various behaviors, and produce critical scenarios. Evaluations show that +RealGen offers considerable flexibility and controllability, marking a new +direction in the field of controllable traffic scenario generation. Check our +project website for more information: https://realgen.github.io. + +
+
+ comment: Accepted by ECCV 2024, Oral +
+
+
+
+
+ + ♻ ☆ The logic of rational graph neural networks + + +
+ The expressivity of Graph Neural Networks (GNNs) can be described via
+ appropriate fragments of first-order logic. Any query of the two-variable
+ fragment of graded modal logic (GC2) interpreted over labeled graphs can be
+ expressed using a Rectified Linear Unit (ReLU) GNN whose size does not grow
+ with graph input sizes [Barcelo & Al., 2020]. Conversely, a GNN expresses at
+ most a query of GC2, for any choice of activation function. In this article, we
+ prove that some GC2 queries of depth $3$ cannot be expressed by GNNs with any
+ rational activation function. This shows that not all non-polynomial activation
+ functions confer maximal expressivity on GNNs, answering an open question
+ formulated by [Grohe, 2021]. This result is also in contrast with the efficient
+ universal approximation properties of rational feedforward neural networks
+ investigated by [Boullé & Al., 2020]. We also present a rational subfragment
+ of first-order logic (RGC2), and prove that rational GNNs can express RGC2
+ queries uniformly over all graphs.
+
+
+
+
+ + ♻ ☆ The Physics-Informed Neural Network Gravity Model: Generation III + + +
+ Scientific machine learning and the advent of the Physics-Informed Neural +Network (PINN) have shown high potential in their ability to solve complex +differential equations. One example is the use of PINNs to solve the gravity +field modeling problem -- learning convenient representations of the +gravitational potential from position and acceleration data. These PINN gravity +models, or PINN-GMs, have demonstrated advantages in model compactness, +robustness to noise, and sample efficiency when compared to popular +alternatives; however, further investigation has revealed various failure modes +for these and other machine learning gravity models which this manuscript aims +to address. Specifically, this paper introduces the third generation +Physics-Informed Neural Network Gravity Model (PINN-GM-III) which includes +design changes that solve the problems of feature divergence, bias towards +low-altitude samples, numerical instability, and extrapolation error. Six +evaluation metrics are proposed to expose these past pitfalls and illustrate +the PINN-GM-III's robustness to them. This study concludes by evaluating the +PINN-GM-III modeling accuracy on a heterogeneous density asteroid, and +comparing its performance to other analytic and machine learning gravity +models. + +
+
+ comment: 40 pages, 10 figures, submitted to The Journal of Astronautical + Sciences +
+
+
+
+
+ + ♻ ☆ SSHPool: The Separated Subgraph-based Hierarchical Pooling + + +
+ In this paper, we develop a novel local graph pooling method, namely the +Separated Subgraph-based Hierarchical Pooling (SSHPool), for graph +classification. We commence by assigning the nodes of a sample graph into +different clusters, resulting in a family of separated subgraphs. We +individually employ the local graph convolution units as the local structure to +further compress each subgraph into a coarsened node, transforming the original +graph into a coarsened graph. Since these subgraphs are separated by different +clusters and the structural information cannot be propagated between them, the +local convolution operation can significantly avoid the over-smoothing problem +caused by message passing through edges in most existing Graph Neural Networks +(GNNs). By hierarchically performing the proposed procedures on the resulting +coarsened graph, the proposed SSHPool can effectively extract the hierarchical +global features of the original graph structure, encapsulating rich intrinsic +structural characteristics. Furthermore, we develop an end-to-end GNN framework +associated with the SSHPool module for graph classification. Experimental +results demonstrate the superior performance of the proposed model on +real-world datasets. + +
+
+
+
+
+ + ♻ ☆ The Distributional Uncertainty of the SHAP score in Explainable Machine + Learning ECAI 2024 + + +
+ Attribution scores reflect how important the feature values in an input +entity are for the output of a machine learning model. One of the most popular +attribution scores is the SHAP score, which is an instantiation of the general +Shapley value used in coalition game theory. The definition of this score +relies on a probability distribution on the entity population. Since the exact +distribution is generally unknown, it needs to be assigned subjectively or be +estimated from data, which may lead to misleading feature scores. In this +paper, we propose a principled framework for reasoning on SHAP scores under +unknown entity population distributions. In our framework, we consider an +uncertainty region that contains the potential distributions, and the SHAP +score of a feature becomes a function defined over this region. We study the +basic problems of finding maxima and minima of this function, which allows us +to determine tight ranges for the SHAP scores of all features. In particular, +we pinpoint the complexity of these problems, and other related ones, showing +them to be NP-complete. Finally, we present experiments on a real-world +dataset, showing that our framework may contribute to a more robust feature +scoring. + +
+
+ comment: In ECAI 2024 proceedings +
+
+
+
+
+ + ♻ ☆ Active Learning for Control-Oriented Identification of Nonlinear Systems + + +
+ Model-based reinforcement learning is an effective approach for controlling +an unknown system. It is based on a longstanding pipeline familiar to the +control community in which one performs experiments on the environment to +collect a dataset, uses the resulting dataset to identify a model of the +system, and finally performs control synthesis using the identified model. As +interacting with the system may be costly and time consuming, targeted +exploration is crucial for developing an effective control-oriented model with +minimal experimentation. Motivated by this challenge, recent work has begun to +study finite sample data requirements and sample efficient algorithms for the +problem of optimal exploration in model-based reinforcement learning. However, +existing theory and algorithms are limited to model classes which are linear in +the parameters. Our work instead focuses on models with nonlinear parameter +dependencies, and presents the first finite sample analysis of an active +learning algorithm suitable for a general class of nonlinear dynamics. In +certain settings, the excess control cost of our algorithm achieves the optimal +rate, up to logarithmic factors. We validate our approach in simulation, +showcasing the advantage of active, control-oriented exploration for +controlling nonlinear systems. + +
+
+
+
+
+ + ♻ ☆ EEG-MACS: Manifold Attention and Confidence Stratification for EEG-based + Cross-Center Brain Disease Diagnosis under Unreliable Annotations + + +
+ Cross-center data heterogeneity and annotation unreliability significantly +challenge the intelligent diagnosis of diseases using brain signals. A notable +example is the EEG-based diagnosis of neurodegenerative diseases, which +features subtler abnormal neural dynamics typically observed in small-group +settings. To advance this area, in this work, we introduce a transferable +framework employing Manifold Attention and Confidence Stratification (MACS) to +diagnose neurodegenerative disorders based on EEG signals sourced from four +centers with unreliable annotations. The MACS framework's effectiveness stems +from these features: 1) The Augmentor generates various EEG-represented brain +variants to enrich the data space; 2) The Switcher enhances the feature space +for trusted samples and reduces overfitting on incorrectly labeled samples; 3) +The Encoder uses the Riemannian manifold and Euclidean metrics to capture +spatiotemporal variations and dynamic synchronization in EEG; 4) The Projector, +equipped with dual heads, monitors consistency across multiple brain variants +and ensures diagnostic accuracy; 5) The Stratifier adaptively stratifies +learned samples by confidence levels throughout the training process; 6) +Forward and backpropagation in MACS are constrained by confidence +stratification to stabilize the learning system amid unreliable annotations. +Our subject-independent experiments, conducted on both neurocognitive and +movement disorders using cross-center corpora, have demonstrated superior +performance compared to existing related algorithms. This work not only +improves EEG-based diagnostics for cross-center and small-setting brain +diseases but also offers insights into extending MACS techniques to other data +analyses, tackling data heterogeneity and annotation unreliability in +multimedia and multimodal content understanding. + +
+
+
+
+
+ + ♻ ☆ AKBR: Learning Adaptive Kernel-based Representations for Graph + Classification + + +
+ In this paper, we propose a new model to learn Adaptive Kernel-based +Representations (AKBR) for graph classification. Unlike state-of-the-art +R-convolution graph kernels that are defined by merely counting any pair of +isomorphic substructures between graphs and cannot provide an end-to-end +learning mechanism for the classifier, the proposed AKBR approach aims to +define an end-to-end representation learning model to construct an adaptive +kernel matrix for graphs. To this end, we commence by leveraging a novel +feature-channel attention mechanism to capture the interdependencies between +different substructure invariants of original graphs. The proposed AKBR model +can thus effectively identify the structural importance of different +substructures, and compute the R-convolution kernel between pairwise graphs +associated with the more significant substructures specified by their +structural attentions. Since each row of the resulting kernel matrix can be +theoretically seen as the embedding vector of a sample graph, the proposed AKBR +model is able to directly employ the resulting kernel matrix as the graph +feature matrix and input it into the classifier for classification (i.e., the +SoftMax layer), naturally providing an end-to-end learning architecture between +the kernel computation as well as the classifier. Experimental results show +that the proposed AKBR model outperforms existing state-of-the-art graph +kernels and deep learning methods on standard graph benchmarks. + +
+
+
+
+
+ + ♻ ☆ How Transformers Learn Causal Structure with Gradient Descent ICML 2024 + + +
+ The incredible success of transformers on sequence modeling tasks can be +largely attributed to the self-attention mechanism, which allows information to +be transferred between different parts of a sequence. Self-attention allows +transformers to encode causal structure which makes them particularly suitable +for sequence modeling. However, the process by which transformers learn such +causal structure via gradient-based training algorithms remains poorly +understood. To better understand this process, we introduce an in-context +learning task that requires learning latent causal structure. We prove that +gradient descent on a simplified two-layer transformer learns to solve this +task by encoding the latent causal graph in the first attention layer. The key +insight of our proof is that the gradient of the attention matrix encodes the +mutual information between tokens. As a consequence of the data processing +inequality, the largest entries of this gradient correspond to edges in the +latent causal graph. As a special case, when the sequences are generated from +in-context Markov chains, we prove that transformers learn an induction head +(Olsson et al., 2022). We confirm our theoretical findings by showing that +transformers trained on our in-context learning task are able to recover a wide +variety of causal structures. + +
+
+ comment: v2: ICML 2024 camera ready +
+
+
+
+
+ + ♻ ☆ Optimizing Emotion Recognition with Wearable Sensor Data: Unveiling + Patterns in Body Movements and Heart Rate through Random Forest + Hyperparameter Tuning + + +
+ This research delves into the utilization of smartwatch sensor data and heart
+ rate monitoring to discern individual emotions based on body movement and heart
+ rate. Emotions play a pivotal role in human life, influencing mental
+ well-being, quality of life, and even physical and physiological responses. The
+ data were sourced from prior research by Juan C. Quiroz, PhD. The study
+ enlisted 50 participants who donned smartwatches and heart rate monitors while
+ completing a 250-meter walk. Emotions were induced through both audio-visual
+ and audio stimuli, with participants' emotional states evaluated using the
+ PANAS questionnaire. The study scrutinized three scenarios: viewing a movie
+ before walking, listening to music before walking, and listening to music while
+ walking. Personal baselines were established using DummyClassifier with the
+ 'most_frequent' strategy from the sklearn library, and various models,
+ including Logistic Regression and Random Forest, were employed to gauge the
+ impacts of these activities. Notably, a novel approach was taken by applying
+ hyperparameter tuning to the Random Forest model using RandomizedSearchCV. The
+ outcomes showcased substantial enhancements with hyperparameter tuning in the
+ Random Forest model, yielding mean accuracies of 86.63% for happy vs. sad and
+ 76.33% for happy vs. neutral vs. sad.
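+ The hyperparameter search described can be reproduced in outline with scikit-learn's RandomizedSearchCV, as sketched below; the feature matrix, labels, and search ranges are placeholders rather than the study's data or exact grid.

```python
import numpy as np
from scipy.stats import randint
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV

# Stand-in features (e.g., summary statistics of accelerometer and heart-rate
# windows) and binary emotion labels; replace with the real smartwatch data.
rng = np.random.default_rng(42)
X = rng.normal(size=(300, 12))
y = rng.integers(0, 2, size=300)

param_distributions = {
    "n_estimators": randint(100, 600),
    "max_depth": randint(3, 20),
    "min_samples_split": randint(2, 11),
    "min_samples_leaf": randint(1, 5),
    "max_features": ["sqrt", "log2", None],
}
search = RandomizedSearchCV(
    RandomForestClassifier(random_state=42),
    param_distributions=param_distributions,
    n_iter=50, cv=5, scoring="accuracy", random_state=42, n_jobs=-1,
)
search.fit(X, y)
print(search.best_params_, search.best_score_)
```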
+
+ comment: 12 pages. Accepted by Jurnal Media Informatika Budidarma (Open + Access) +
+
+
+
+
+ + ♻ ☆ Maintaining Adversarial Robustness in Continuous Learning + + +
+ Adversarial robustness is essential for the security and reliability of machine
+ learning systems. However, adversarial robustness gained through defense
+ algorithms is easily erased as the neural network's weights update to learn new
+ tasks. To address this vulnerability, it is essential to improve the capability
+ of neural networks in terms of robust continual learning. Specifically, we
+ propose a novel gradient projection technique that effectively stabilizes
+ sample gradients from previous data by orthogonally projecting back-propagation
+ gradients onto a crucial subspace before using them for weight updates. This
+ technique can maintain robustness by collaborating with a class of defense
+ algorithms through sample gradient smoothing. Experimental results on four
+ benchmarks, including Split-CIFAR100 and Split-miniImageNet, demonstrate the
+ superiority of the proposed approach in mitigating the rapid degradation of
+ robustness during continual learning, even when facing strong adversarial
+ attacks.
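+ A minimal sketch of gradient projection onto a stored subspace is shown below; how the "crucial subspace" basis is constructed (e.g., from previous-task sample gradients) is left out, and the function and dictionary names are assumptions.

```python
import torch

def project_gradients(model: torch.nn.Module, bases: dict) -> None:
    """Project each parameter's gradient onto a stored subspace before the
    optimizer step. bases[name] is an orthonormal matrix (numel x k) spanning
    the subspace considered crucial for previously learned robust behavior."""
    for name, param in model.named_parameters():
        if param.grad is None or name not in bases:
            continue
        g = param.grad.detach().reshape(-1)
        U = bases[name]                       # (numel, k), orthonormal columns
        g_proj = U @ (U.T @ g)                # component lying inside the subspace
        param.grad.copy_(g_proj.reshape(param.grad.shape))

# Usage inside a training loop (schematic):
# loss.backward()
# project_gradients(model, subspace_bases)
# optimizer.step()
```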
+
+
+
+
+ + ♻ ☆ SE(3)-Hyena Operator for Scalable Equivariant Learning + + +
+ Modeling global geometric context while maintaining equivariance is crucial
+ for accurate predictions in many fields such as biology, chemistry, or vision.
+ Yet, this is challenging due to the computational demands of processing
+ high-dimensional data at scale. Existing approaches, such as equivariant
+ self-attention or distance-based message passing, suffer from quadratic
+ complexity with respect to sequence length, while localized methods sacrifice
+ global information. Inspired by the recent success of state-space and
+ long-convolutional models, in this work, we introduce the SE(3)-Hyena operator,
+ an equivariant long-convolutional model based on the Hyena operator.
+ SE(3)-Hyena captures global geometric context at sub-quadratic complexity while
+ maintaining equivariance to rotations and translations. Evaluated on
+ equivariant associative recall and n-body modeling, SE(3)-Hyena matches or
+ outperforms equivariant self-attention while requiring significantly less
+ memory and computational resources for long sequences. Our model processes the
+ geometric context of 20k tokens 3.5x faster than the equivariant transformer
+ and allows a 175x longer context within the same memory budget.
+
+
+
+
+ + ♻ ☆ FUGNN: Harmonizing Fairness and Utility in Graph Neural Networks KDD 2024 + + +
+ Fairness-aware Graph Neural Networks (GNNs) often face a challenging +trade-off, where prioritizing fairness may require compromising utility. In +this work, we re-examine fairness through the lens of spectral graph theory, +aiming to reconcile fairness and utility within the framework of spectral graph +learning. We explore the correlation between sensitive features and spectrum in +GNNs, using theoretical analysis to delineate the similarity between original +sensitive features and those after convolution under different spectra. Our +analysis reveals a reduction in the impact of similarity when the eigenvectors +associated with the largest magnitude eigenvalue exhibit directional +similarity. Based on these theoretical insights, we propose FUGNN, a novel +spectral graph learning approach that harmonizes the conflict between fairness +and utility. FUGNN ensures algorithmic fairness and utility by truncating the +spectrum and optimizing eigenvector distribution during the encoding process. +The fairness-aware eigenvector selection reduces the impact of convolution on +sensitive features while concurrently minimizing the sacrifice of utility. +FUGNN further optimizes the distribution of eigenvectors through a transformer +architecture. By incorporating the optimized spectrum into the graph +convolution network, FUGNN effectively learns node representations. Experiments +on six real-world datasets demonstrate the superiority of FUGNN over baseline +methods. The codes are available at https://github.com/yushuowiki/FUGNN. + +
+
+ comment: Accepted in SIGKDD 2024 +
+
+
+
+
+ + ♻ ☆ Learning Minimal Neural Specifications + + +
+ Formal verification is only as good as the specification of a system, which is
+ also true for neural network verification. Existing specifications follow the
+ paradigm of data as specification, where the local neighborhood around a
+ reference data point is considered correct or robust. While these
+ specifications provide a fair testbed for assessing model robustness, they are
+ too restrictive for verifying unseen test data, a challenging task with
+ significant real-world implications. Recent work shows great promise through a
+ new paradigm, neural representation as specification, which uses neural
+ activation patterns (NAPs) for this purpose. However, it computes the most
+ refined NAPs, which include many redundant neurons. In this paper, we study the
+ following problem: Given a neural network, find a minimal (general) NAP
+ specification that is sufficient for formal verification of the network's
+ robustness. Finding the minimal NAP specification not only expands verifiable
+ bounds but also provides insights into which neurons contribute to the model's
+ robustness. To address this problem, we propose several exact and approximate
+ approaches. Our exact approaches leverage the verification tool to find minimal
+ NAP specifications in either a deterministic or statistical manner. The
+ approximate methods, in contrast, efficiently estimate minimal NAPs using
+ adversarial examples and local gradients, without making calls to the
+ verification tool. This allows us to inspect potential causal links between
+ neurons and the robustness of state-of-the-art neural networks, a task for
+ which existing verification frameworks fail to scale. Our experimental results
+ suggest that minimal NAP specifications require much smaller fractions of
+ neurons compared to the most refined NAP specifications computed by previous
+ work, yet they can expand the verifiable boundaries by several orders of
+ magnitude.
+
+ comment: 31 pages,9 figures +
+
+
+
+
+ + ♻ ☆ An Ensemble Score Filter for Tracking High-Dimensional Nonlinear + Dynamical Systems + + +
+ We propose an ensemble score filter (EnSF) for solving high-dimensional
+ nonlinear filtering problems with superior accuracy. A major drawback of
+ existing filtering methods, e.g., particle filters or ensemble Kalman filters,
+ is the low accuracy in handling high-dimensional and highly nonlinear problems.
+ EnSF attacks this challenge by exploiting the score-based diffusion model,
+ defined in a pseudo-temporal domain, to characterize the evolution of the
+ filtering density. EnSF stores the information of the recursively updated
+ filtering density function in the score function, instead of storing the
+ information in a set of finite Monte Carlo samples (used in particle filters
+ and ensemble Kalman filters). Unlike existing diffusion models that train
+ neural networks to approximate the score function, we develop a training-free
+ score estimator that uses a mini-batch-based Monte Carlo estimator to directly
+ approximate the score function at any pseudo-spatial-temporal location, which
+ provides sufficient accuracy in solving high-dimensional nonlinear problems as
+ well as saves a tremendous amount of time spent on training neural networks.
+ High-dimensional Lorenz-96 systems are used to demonstrate the performance of
+ our method. EnSF provides surprisingly strong performance, compared with the
+ state-of-the-art Local Ensemble Transform Kalman Filter method, in reliably and
+ efficiently tracking extremely high-dimensional Lorenz systems (up to 1,000,000
+ dimensions) with highly nonlinear observation processes.
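+ In the same spirit, a training-free score estimate for a Gaussian-smoothed ensemble can be computed with a mini-batch Monte Carlo sum, as in the NumPy sketch below; the kernel bandwidth, batch size, and toy check are assumptions, not EnSF's exact estimator.

```python
import numpy as np

def mc_score_estimate(x: np.ndarray, ensemble: np.ndarray, sigma: float,
                      batch_size: int = 256, rng=None) -> np.ndarray:
    """Training-free score estimate at point x for a Gaussian-smoothed ensemble:
    the exact score of a kernel density estimate, approximated with a random
    mini-batch of ensemble members."""
    rng = rng or np.random.default_rng()
    idx = rng.choice(len(ensemble), size=min(batch_size, len(ensemble)), replace=False)
    batch = ensemble[idx]                                    # (m, d)
    diffs = batch - x                                        # (m, d)
    logw = -np.sum(diffs ** 2, axis=1) / (2.0 * sigma ** 2)
    w = np.exp(logw - logw.max())
    w /= w.sum()
    return (w[:, None] * diffs).sum(axis=0) / sigma ** 2     # weighted pull toward members

# Toy check: for an ensemble drawn from N(0, I), the estimated score at a point
# far in the tail points roughly back toward the origin.
samples = np.random.default_rng(0).normal(size=(5000, 2))
print(mc_score_estimate(np.array([3.0, 3.0]), samples, sigma=1.0))
```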
+
+ comment: arXiv admin note: text overlap with arXiv:2306.09282 +
+
+
+
+
+ + ♻ ☆ Prioritize Alignment in Dataset Distillation + + +
+ Dataset Distillation aims to compress a large dataset into a significantly
+ more compact, synthetic one without compromising the performance of the trained
+ models. To achieve this, existing methods use the agent model to extract
+ information from the target dataset and embed it into the distilled dataset.
+ Consequently, the quality of extracted and embedded information determines the
+ quality of the distilled dataset. In this work, we find that existing methods
+ introduce misaligned information in both the information extraction and
+ embedding stages. To alleviate this, we propose Prioritize Alignment in Dataset
+ Distillation (PAD), which aligns information from the following two
+ perspectives. 1) We prune the target dataset according to the compression ratio
+ to filter the information that can be extracted by the agent model. 2) We use
+ only deep layers of the agent model to perform the distillation to avoid
+ excessively introducing low-level information. This simple strategy effectively
+ filters out misaligned information and brings non-trivial improvements to
+ mainstream matching-based distillation algorithms. Furthermore, built on
+ trajectory matching, PAD achieves remarkable improvements on various
+ benchmarks, reaching state-of-the-art performance.
+
+ comment: 18 pages, 9 figures +
+
+
+
+
+ + ♻ ☆ A Novel Cartography-Based Curriculum Learning Method Applied on RoNLI: + The First Romanian Natural Language Inference Corpus ACL 2024 + + +
+ Natural language inference (NLI), the task of recognizing the entailment +relationship in sentence pairs, is an actively studied topic serving as a proxy +for natural language understanding. Despite the relevance of the task in +building conversational agents and improving text classification, machine +translation and other NLP tasks, to the best of our knowledge, there is no +publicly available NLI corpus for the Romanian language. To this end, we +introduce the first Romanian NLI corpus (RoNLI) comprising 58K training +sentence pairs, which are obtained via distant supervision, and 6K validation +and test sentence pairs, which are manually annotated with the correct labels. +We conduct experiments with multiple machine learning methods based on distant +learning, ranging from shallow models based on word embeddings to +transformer-based neural networks, to establish a set of competitive baselines. +Furthermore, we improve on the best model by employing a new curriculum +learning strategy based on data cartography. Our dataset and code to reproduce +the baselines are available at https://github.com/Eduard6421/RONLI. + +
+
+ comment: Accepted at ACL 2024 (Main) +
+
+
+
+
+ + ♻ ☆ Continual Driving Policy Optimization with Closed-Loop Individualized + Curricula ICRA 2024 + + +
+ The safety of autonomous vehicles (AV) has been a long-standing top concern, +stemming from the absence of rare and safety-critical scenarios in the +long-tail naturalistic driving distribution. To tackle this challenge, a surge +of research in scenario-based autonomous driving has emerged, with a focus on +generating high-risk driving scenarios and applying them to conduct +safety-critical testing of AV models. However, little work has explored +reusing these extensive scenarios to iteratively improve AV models. +Moreover, it remains intractable and challenging to filter through gigantic +scenario libraries collected from other AV models with distinct behaviors, +attempting to extract transferable information for current AV improvement. +Therefore, we develop a continual driving policy optimization framework +featuring Closed-Loop Individualized Curricula (CLIC), which we factorize into +a set of standardized sub-modules for flexible implementation choices: AV +Evaluation, Scenario Selection, and AV Training. CLIC frames AV Evaluation as a +collision prediction task, where it estimates the chance of AV failures in +these scenarios at each iteration. Subsequently, by re-sampling from historical +scenarios based on these failure probabilities, CLIC tailors individualized +curricula for downstream training, aligning them with the evaluated capability +of AV. Accordingly, CLIC not only maximizes the utilization of the vast +pre-collected scenario library for closed-loop driving policy optimization but +also facilitates AV improvement by individualizing its training with more +challenging cases out of those poorly organized scenarios. Experimental results +clearly indicate that CLIC surpasses other curriculum-based training +strategies, showing substantial improvement in managing risky scenarios, while +still maintaining proficiency in handling simpler cases. + +
+
+ comment: ICRA 2024 +
+
+
+
+
+ + ♻ ☆ Improved Random Features for Dot Product Kernels + + +
+ Dot product kernels, such as polynomial and exponential (softmax) kernels, +are among the most widely used kernels in machine learning, as they enable +modeling the interactions between input features, which is crucial in +applications like computer vision, natural language processing, and recommender +systems. We make several novel contributions for improving the efficiency of +random feature approximations for dot product kernels, to make these kernels +more useful in large scale learning. First, we present a generalization of +existing random feature approximations for polynomial kernels, such as +Rademacher and Gaussian sketches and TensorSRHT, using complex-valued random +features. We show empirically that the use of complex features can +significantly reduce the variances of these approximations. Second, we provide +a theoretical analysis for understanding the factors affecting the efficiency +of various random feature approximations, by deriving closed-form expressions +for their variances. These variance formulas elucidate conditions under which +certain approximations (e.g., TensorSRHT) achieve lower variances than others +(e.g., Rademacher sketches), and conditions under which the use of complex +features leads to lower variances than real features. Third, by using these +variance formulas, which can be evaluated in practice, we develop a data-driven +optimization approach to improve random feature approximations for general dot +product kernels, which is also applicable to the Gaussian kernel. We describe +the improvements brought by these contributions with extensive experiments on a +variety of tasks and datasets. + +
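+ For context, here is a sketch of the classical real-valued Rademacher sketch that the abstract generalizes (the complex-valued features and the data-driven optimization are the paper's contributions and are not reproduced here): each random feature for the degree-p homogeneous polynomial kernel (x . y)^p is a product of p independent +/-1 projections, which is unbiased in expectation.
+ import numpy as np
+
+ def rademacher_poly_features(X, degree, n_features, seed=0):
+     """Random features Z such that Z @ Z.T approximates the kernel (X @ X.T) ** degree."""
+     rng = np.random.default_rng(seed)
+     n, d = X.shape
+     Z = np.ones((n, n_features))
+     for _ in range(degree):
+         W = rng.choice([-1.0, 1.0], size=(d, n_features))   # independent Rademacher projections
+         Z *= X @ W
+     return Z / np.sqrt(n_features)
+
+ X = np.random.default_rng(1).normal(size=(5, 10))
+ Z = rademacher_poly_features(X, degree=3, n_features=20000)
+ print(np.abs(Z @ Z.T - (X @ X.T) ** 3).max())   # approximation error shrinks as n_features grows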
+
+ comment: To appear in Journal of Machine Learning Research (JMLR) +
+
+
+
+
+ + ♻ ☆ Detectability of hierarchical communities in networks + + +
+ We study the problem of recovering a planted hierarchy of partitions in a +network. The detectability of a single planted partition has previously been +analysed in detail and a phase transition has been identified below which the +partition cannot be detected. Here we show that, in the hierarchical setting, +there exist additional phases in which the presence of multiple consistent +partitions can either help or hinder detection. Accordingly, the detectability +limit for non-hierarchical partitions typically provides insufficient +information about the detectability of the complete hierarchical structure, as +we highlight with several constructive examples. + +
+
+ comment: 7 pages, 4 figures +
+
+
+
+
+ + ♻ ☆ Figure it Out: Analyzing-based Jailbreak Attack on Large Language Models + + +
+ The rapid development of Large Language Models (LLMs) has brought remarkable +generative capabilities across diverse tasks. However, despite the impressive +achievements, these LLMs still have numerous inherent vulnerabilities, +particularly when faced with jailbreak attacks. By investigating jailbreak +attacks, we can uncover hidden weaknesses in LLMs and inform the development of +more robust defense mechanisms to fortify their security. In this paper, we +further explore the boundary of jailbreak attacks on LLMs and propose +Analyzing-based Jailbreak (ABJ). This effective jailbreak attack method takes +advantage of LLMs' growing analyzing and reasoning capability and reveals their +underlying vulnerabilities when facing analyzing-based tasks. We conduct a +detailed evaluation of ABJ across various open-source and closed-source LLMs, +which achieves a 94.8% attack success rate (ASR) and 1.06 attack efficiency (AE) +on GPT-4-turbo-0409, demonstrating state-of-the-art attack effectiveness and +efficiency. Our research highlights the importance of prioritizing and +enhancing the safety of LLMs to mitigate the risks of misuse. The code is +publicly available at https://github.com/theshi-1128/ABJ-Attack. Warning: This +paper contains examples of LLMs that might be offensive or harmful. + +
+
+
+
+
+ + ♻ ☆ Decentralized Intelligence Network (DIN) + + +
+ Decentralized Intelligence Network (DIN) is a theoretical framework +addressing data fragmentation and siloing challenges, enabling scalable AI +through data sovereignty. It facilitates effective AI utilization within +sovereign networks by overcoming barriers to accessing diverse data sources, +leveraging: 1) personal data stores to ensure data sovereignty, where data +remains securely within Participants' control; 2) a scalable federated learning +protocol implemented on a public blockchain for decentralized AI training, +where only model parameter updates are shared, keeping data within the personal +data stores; and 3) a scalable, trustless cryptographic rewards mechanism on a +public blockchain to incentivize participation and ensure fair reward +distribution through a decentralized auditing protocol. This approach +guarantees that no entity can prevent or control access to training data or +influence financial benefits, as coordination and reward distribution are +managed on the public blockchain with an immutable record. The framework +supports effective AI training by allowing Participants to maintain control +over their data, benefit financially, and contribute to a decentralized, +scalable ecosystem that leverages collective AI to develop beneficial +algorithms. + +
+
+ comment: 14 pages, 1 figure. DIN was presented by the author as a speaker at + the Summit on Responsible Decentralized Intelligence - Future of + Decentralization and AI, hosted by Berkeley RDI on August 6, 2024, at the + Verizon Center, Cornell Tech Campus, Roosevelt Island, NYC +
+
+
+
+
+ + ♻ ☆ kNN-CLIP: Retrieval Enables Training-Free Segmentation on Continually + Expanding Large Vocabularies + + +
+ Continual segmentation has not yet tackled the challenge of improving +open-vocabulary segmentation models with training data for accurate +segmentation across large, continually expanding vocabularies. We discover that +traditional continual training results in severe catastrophic forgetting, +failing to outperform a zero-shot segmentation baseline. We introduce a novel +training-free strategy, kNN-CLIP, which augments the model with a database of +instance embeddings for semantic and panoptic segmentation that achieves zero +forgetting. We demonstrate that kNN-CLIP can adapt to continually growing +vocabularies without the need for retraining or large memory costs. kNN-CLIP +enables open-vocabulary segmentation methods to expand their vocabularies on +any domain with a single pass through the data, while only storing compact +embeddings. This approach minimizes both compute and memory costs. kNN-CLIP +achieves state-of-the-art performance across large-vocabulary semantic and +panoptic segmentation datasets. We hope kNN-CLIP represents a significant step +forward in enabling more efficient and adaptable continual segmentation, paving +the way for advances in real-world large-vocabulary continual segmentation +methods. + +
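+ A toy sketch of the retrieval mechanism described above (the class and method names, the cosine similarity, and the majority-vote rule are simplifications, not the paper's exact procedure): instance embeddings are stored with their labels once, and new embeddings are labelled by nearest-neighbour lookup, so growing the vocabulary only means adding rows to the database rather than retraining.
+ import numpy as np
+
+ class EmbeddingDatabase:
+     def __init__(self):
+         self.embeddings, self.labels = [], []
+
+     def add(self, embedding, label):
+         self.embeddings.append(embedding / np.linalg.norm(embedding))
+         self.labels.append(label)
+
+     def query(self, embedding, k=5):
+         E = np.stack(self.embeddings)
+         sims = E @ (embedding / np.linalg.norm(embedding))   # cosine similarity to every stored instance
+         top = np.argsort(-sims)[:k]
+         votes = [self.labels[i] for i in top]
+         return max(set(votes), key=votes.count)              # majority label among the k neighbours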
+
+
+
+
+ + ♻ ☆ Learning Optimal Filters Using Variational Inference ICML + + +
+ Filtering - the task of estimating the conditional distribution of states of +a dynamical system given partial, noisy, observations - is important in many +areas of science and engineering, including weather and climate prediction. +However, the filtering distribution is generally intractable to obtain for +high-dimensional, nonlinear systems. Filters used in practice, such as the +ensemble Kalman filter (EnKF), are biased for nonlinear systems and have +numerous tuning parameters. Here, we present a framework for learning a +parameterized analysis map - the map that takes a forecast distribution and +observations to the filtering distribution - using variational inference. We +show that this methodology can be used to learn gain matrices for filtering +linear and nonlinear dynamical systems, as well as inflation and localization +parameters for an EnKF. Future work will apply this framework to learn new +filtering algorithms. + +
+
+ comment: Workshop on Machine Learning for Earth System Modeling, International + Conference on Machine Learning (ICML) 2024 +
+
+
+
+
+ + ♻ ☆ Interpretable Graph Neural Networks for Tabular Data ECAI 2024 + + +
+ Data in tabular format occurs frequently in real-world applications. +Graph Neural Networks (GNNs) have recently been extended to effectively handle +such data, allowing feature interactions to be captured through representation +learning. However, these approaches essentially produce black-box models, in +the form of deep neural networks, precluding users from following the logic +behind the model predictions. We propose an approach, called IGNNet +(Interpretable Graph Neural Network for tabular data), which constrains the +learning algorithm to produce an interpretable model, where the model shows +exactly how the predictions are computed from the original input features. A +large-scale empirical investigation is presented, showing that IGNNet +performs on par with state-of-the-art machine-learning algorithms that target +tabular data, including XGBoost, Random Forests, and TabNet. At the same time, +the results show that the explanations obtained from IGNNet are aligned with +the true Shapley values of the features without incurring any additional +computational overhead. + +
+
+ comment: Accepted at ECAI 2024 +
+
+
+
+
+ + ♻ ☆ Federated Smoothing Proximal Gradient for Quantile Regression with + Non-Convex Penalties + + +
+ Distributed sensors in the internet-of-things (IoT) generate vast amounts of +sparse data. Analyzing this high-dimensional data and identifying relevant +predictors pose substantial challenges, especially when data is preferred to +remain on the device where it was collected for reasons such as data integrity, +communication bandwidth, and privacy. This paper introduces a federated +quantile regression algorithm to address these challenges. Quantile regression +provides a more comprehensive view of the relationship between variables than +mean regression models. However, traditional approaches face difficulties when +dealing with nonconvex sparse penalties and the inherent non-smoothness of the +loss function. For this purpose, we propose a federated smoothing proximal +gradient (FSPG) algorithm that integrates a smoothing mechanism with the +proximal gradient framework, thereby enhancing both precision and computational +speed. This integration adeptly handles optimization over a network of devices, +each holding local data samples, making it particularly effective in federated +learning scenarios. The FSPG algorithm ensures steady progress and reliable +convergence in each iteration by maintaining or reducing the value of the +objective function. By leveraging nonconvex penalties, such as the minimax +concave penalty (MCP) and smoothly clipped absolute deviation (SCAD), the +proposed method can identify and preserve key predictors within sparse models. +Comprehensive simulations validate the robust theoretical foundations of the +proposed algorithm and demonstrate improved estimation precision and reliable +convergence. + +
+
+
+
+
+ + ♻ ☆ A Practical Solver for Scalar Data Topological Simplification IEEE VIS 2024 + + +
+ This paper presents a practical approach for the optimization of topological +simplification, a central pre-processing step for the analysis and +visualization of scalar data. Given an input scalar field f and a set of +"signal" persistence pairs to maintain, our approach produces an output field g +that is close to f and which optimizes (i) the cancellation of "non-signal" +pairs, while (ii) preserving the "signal" pairs. In contrast to pre-existing +simplification algorithms, our approach is not restricted to persistence pairs +involving extrema and can thus address a larger class of topological features, +in particular saddle pairs in three-dimensional scalar data. Our approach +leverages recent generic persistence optimization frameworks and extends them +with tailored accelerations specific to the problem of topological +simplification. Extensive experiments report substantial accelerations over +these frameworks, thereby making topological simplification optimization +practical for real-life datasets. Our approach enables a direct visualization +and analysis of the topologically simplified data, e.g., via isosurfaces of +simplified topology (fewer components and handles). We apply our approach to +the extraction of prominent filament structures in three-dimensional data. +Specifically, we show that our pre-simplification of the data leads to +practical improvements over standard topological techniques for removing +filament loops. We also show how our approach can be used to repair genus +defects in surface processing. Finally, we provide a C++ implementation for +reproducibility purposes. + +
+
+ comment: 13 pages, 10 figures, IEEE VIS 2024 +
+
+
+
+
+ + ♻ ☆ There is No Silver Bullet: Benchmarking Methods in Predictive + Combinatorial Optimization + + +
+ Predictive combinatorial optimization, where the parameters of combinatorial +optimization (CO) are unknown at the decision-making time, is the precise +modeling of many real-world applications, including energy cost-aware +scheduling and budget allocation on advertising. Tackling such a problem +usually involves a prediction model and a CO solver. These two modules are +integrated into the predictive CO pipeline following one of two design principles: +``Predict-then-Optimize (PtO)'', which learns predictions by supervised +training and subsequently solves CO using the predicted coefficients, or +``Predict-and-Optimize (PnO)'', which directly optimizes towards the +ultimate decision quality and claims to yield better decisions than traditional +PtO approaches. However, a systematic benchmark of both approaches is still lacking, +including the specific design choices at the module level, as well as an +evaluation dataset that covers representative real-world scenarios. To this +end, we develop a modular framework to benchmark 11 existing PtO/PnO methods on +8 problems, including a new industrial dataset for combinatorial advertising +that will be released. Our study shows that PnO approaches are better than PtO +on 7 out of 8 benchmarks, but there is no silver bullet found for the specific +design choices of PnO. A comprehensive categorization of current approaches and +integration of typical scenarios are provided under a unified benchmark. +Therefore, this paper could serve as a comprehensive benchmark for future PnO +approach development and also offer fast prototyping for application-focused +development. + +
+
+
+
+
+ + ♻ ☆ Interpretable Pre-Trained Transformers for Heart Time-Series Data + + +
+ Decoder-only transformers are the backbone of the popular generative +pre-trained transformer (GPT) series of large language models. In this work, we +apply this framework to the analysis of clinical heart time-series data, to +create two pre-trained general purpose cardiac models, termed PPG-PT and +ECG-PT. We place a special emphasis on making both such pre-trained models +fully interpretable. This is achieved firstly through aggregate attention maps +which show that, in order to make predictions, the model focuses on similar +points in previous cardiac cycles and gradually broadens its attention in +deeper layers. Next, we show that tokens with the same value, which occur at +different distinct points in the electrocardiography (ECG) and +photoplethysmography (PPG) cycle, form separate clusters in high dimensional +space. The clusters form according to phase, as the tokens propagate through +the transformer blocks. Finally, we highlight that individual attention heads +respond to specific physiologically relevant features, such as the dicrotic +notch in PPG and the P-wave in ECG. It is also demonstrated that these +pre-trained models are straightforward to fine-tune for tasks such as +classification of atrial fibrillation (AF), and beat detection in +photoplethysmography. For the example of AF, the fine-tuning took 11 minutes of +computer time, and achieved the respective leave-one-subject-out AUCs of 0.99 +and 0.93 for ECG and PPG within the MIMIC Perform AF dataset. In addition, the +fine-tuned beat detector achieved a state-of-the-art F1 score of 98%, as well +as uniquely providing a beat confidence level which acts as a signal quality +estimator. Importantly, the fine-tuned models for AF screening are also fully +explainable, with attention shifting to regions in the context that are +strongly indicative of atrial fibrillation. + +
+
+ comment: 14 pages, 5 figures +
+
+
+
+
+ + ♻ ☆ Convergence of Message Passing Graph Neural Networks with Generic + Aggregation On Large Random Graphs + + +
+ We study the convergence of message passing graph neural networks on random +graph models to their continuous counterpart as the number of nodes tends to +infinity. Until now, this convergence was only known for architectures with +aggregation functions in the form of normalized means, or, equivalently, of an +application of classical operators like the adjacency matrix or the graph +Laplacian. We extend such results to a large class of aggregation functions, +that encompasses all classically used message passing graph neural networks, +such as attention-based message passing, max convolutional message passing, +(degree-normalized) convolutional message passing, or moment-based aggregation +message passing. Under mild assumptions, we give non-asymptotic bounds with +high probability to quantify this convergence. Our main result is based on the +McDiarmid inequality. Interestingly, this result does not apply to the case +where the aggregation is a coordinate-wise maximum. We treat this case +separately and obtain a different convergence rate. + +
+
+
+
+
+ + ♻ ☆ Neural networks can detect model-free static arbitrage strategies + + +
+ In this paper we demonstrate both theoretically and numerically that +neural networks can detect model-free static arbitrage opportunities whenever +the market admits some. Due to the use of neural networks, our method can be +applied to financial markets with a high number of traded securities and +ensures almost immediate execution of the corresponding trading strategies. To +demonstrate its tractability, effectiveness, and robustness we provide examples +using real financial data. From a technical point of view, we prove that a +single neural network can approximately solve a class of convex semi-infinite +programs, which is the key result in order to derive our theoretical results +that neural networks can detect model-free static arbitrage strategies whenever +the financial market admits such opportunities. + +
+
+
+
+
+ + ♻ ☆ Transformers Can Do Bayesian Inference ICLR 2022 + + +
+ Currently, it is hard to reap the benefits of deep learning for Bayesian +methods, which allow the explicit specification of prior knowledge and +accurately capture model uncertainty. We present Prior-Data Fitted Networks +(PFNs). PFNs leverage in-context learning in large-scale machine learning +techniques to approximate a large set of posteriors. The only requirement for +PFNs to work is the ability to sample from a prior distribution over supervised +learning tasks (or functions). Our method restates the objective of posterior +approximation as a supervised classification problem with a set-valued input: +it repeatedly draws a task (or function) from the prior, draws a set of data +points and their labels from it, masks one of the labels and learns to make +probabilistic predictions for it based on the set-valued input of the rest of +the data points. Presented with a set of samples from a new supervised learning +task as input, PFNs make probabilistic predictions for arbitrary other data +points in a single forward propagation, having learned to approximate Bayesian +inference. We demonstrate that PFNs can near-perfectly mimic Gaussian processes +and also enable efficient Bayesian inference for intractable problems, with +over 200-fold speedups in multiple setups compared to current methods. We +obtain strong results in very diverse areas such as Gaussian process +regression, Bayesian neural networks, classification for small tabular data +sets, and few-shot image classification, demonstrating the generality of PFNs. +Code and trained PFNs are released at +https://github.com/automl/TransformersCanDoBayesianInference. + +
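+ A minimal sketch of the prior-data generation step described above, using a toy prior over noisy linear functions purely for illustration (the real PFN prior and the transformer that consumes these batches are not shown): each training example draws a task from the prior, samples a dataset, masks one label, and asks the model to predict it from the remaining labelled points.
+ import numpy as np
+
+ def sample_prior_task(n_points, dim, rng):
+     """Draw one supervised task from a simple prior over functions."""
+     w = rng.normal(size=dim)
+     X = rng.normal(size=(n_points, dim))
+     y = X @ w + 0.1 * rng.normal(size=n_points)
+     return X, y
+
+ def make_prior_fitting_batch(batch_size, n_points, dim, seed=0):
+     """Each example holds out one label; training to predict it from the rest
+     is what later enables in-context, approximately Bayesian prediction."""
+     rng = np.random.default_rng(seed)
+     batch = []
+     for _ in range(batch_size):
+         X, y = sample_prior_task(n_points, dim, rng)
+         hold_out = int(rng.integers(n_points))
+         context = [(X[i], y[i]) for i in range(n_points) if i != hold_out]
+         batch.append((context, X[hold_out], y[hold_out]))   # (context, query, target)
+     return batch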
+
+ comment: Published at ICLR 2022 +
+
+
+
+
+ + ♻ ☆ HyperMono: A Monotonicity-aware Approach to Hyper-Relational Knowledge + Representation + + +
+ In a hyper-relational knowledge graph (HKG), each fact is composed of a main +triple associated with attribute-value qualifiers, which express additional +factual knowledge. The hyper-relational knowledge graph completion (HKGC) task +aims at inferring plausible missing links in a HKG. Most existing approaches to +HKGC focus on enhancing the communication between qualifier pairs and main +triples, while overlooking two important properties that emerge from the +monotonicity of the hyper-relational graphs representation regime. Stage +Reasoning allows for a two-step reasoning process, facilitating the integration +of coarse-grained inference results derived solely from main triples and +fine-grained inference results obtained from hyper-relational facts with +qualifiers. In the initial stage, coarse-grained results provide an upper bound +for correct predictions, which are subsequently refined in the fine-grained +step. More generally, Qualifier Monotonicity implies that by attaching more +qualifier pairs to a main triple, we may only narrow down the answer set, but +never enlarge it. This paper proposes the HyperMono model for hyper-relational +knowledge graph completion, which realizes stage reasoning and qualifier +monotonicity. To implement qualifier monotonicity HyperMono resorts to cone +embeddings. Experiments on three real-world datasets with three different +scenario conditions demonstrate the strong performance of HyperMono when +compared to the SoTA. + +
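+ A schematic sketch of the two properties described above (score_main and satisfies are hypothetical callbacks standing in for the learned model; the cone-embedding realization is not shown): the coarse stage bounds the answer set from above, and each qualifier can only intersect, never enlarge, that set.
+ def coarse_candidates(head, relation, entities, score_main, k=100):
+     """Stage 1: rank tail entities using only the main triple; this coarse set
+     acts as an upper bound on the final answers."""
+     scored = sorted(entities, key=lambda t: score_main(head, relation, t), reverse=True)
+     return set(scored[:k])
+
+ def refine_with_qualifiers(candidates, qualifiers, satisfies):
+     """Stage 2: each (attribute, value) qualifier may only remove candidates,
+     which is the qualifier-monotonicity property."""
+     for attr, value in qualifiers:
+         candidates = {t for t in candidates if satisfies(t, attr, value)}
+     return candidates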
+
+
+
+
+ + ♻ ☆ Surprisingly Strong Performance Prediction with Neural Graph Features ICML 2024 + + +
+ Performance prediction has been a key part of the neural architecture search +(NAS) process, making it possible to speed up NAS algorithms by avoiding +resource-consuming network training. Although many performance predictors +correlate well with ground truth performance, they require training data in the +form of trained networks. Recently, zero-cost proxies have been proposed as an +efficient method to estimate network performance without any training. However, +they are still poorly understood, exhibit biases with network properties, and +their performance is limited. Inspired by the drawbacks of zero-cost proxies, +we propose neural graph features (GRAF), simple-to-compute properties of +architectural graphs. GRAF offers fast and interpretable performance prediction +while outperforming zero-cost proxies and other common encodings. In +combination with other zero-cost proxies, GRAF outperforms most existing +performance predictors at a fraction of the cost. + +
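+ To make the flavour of such features concrete, here is a small sketch computing a few simple properties of an architecture DAG; the exact feature set used by GRAF is not reproduced here and the cell encoding is illustrative only.
+ from collections import Counter
+
+ def graph_features(ops, edges, n_nodes):
+     """Operation histogram, degree statistics, and input-to-output path count
+     for a DAG whose node ids are assumed to be in topological order."""
+     out_deg = Counter(u for u, _ in edges)
+     in_deg = Counter(v for _, v in edges)
+     paths = [0] * n_nodes
+     paths[0] = 1
+     for u, v in sorted(edges):            # counts directed paths from node 0 to node n_nodes - 1
+         paths[v] += paths[u]
+     feats = {f"op_count_{o}": c for o, c in Counter(ops).items()}
+     feats["max_in_degree"] = max(in_deg.values(), default=0)
+     feats["max_out_degree"] = max(out_deg.values(), default=0)
+     feats["num_paths_in_to_out"] = paths[-1]
+     return feats
+
+ # Example: a 4-node cell with one conv and one skip branch (hypothetical encoding).
+ print(graph_features(["input", "conv3x3", "skip", "output"], [(0, 1), (0, 2), (1, 3), (2, 3)], 4))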
+
+ comment: ICML 2024. Code at https://github.com/gabikadlecova/zc_combine , blog + post: https://gabikadlecova.github.io/blog/2024/graf/ +
+
+
+
+
+ + ♻ ☆ When to Accept Automated Predictions and When to Defer to Human + Judgment? + + +
+ Ensuring the reliability and safety of automated decision-making is crucial. +It is well-known that data distribution shifts in machine learning can produce +unreliable outcomes. This paper proposes a new approach for measuring the +reliability of predictions under distribution shifts. We analyze how the +outputs of a trained neural network change using clustering to measure +distances between outputs and class centroids. We propose this distance as a +metric to evaluate the confidence of predictions under distribution shifts. We +assign each prediction to a cluster whose centroid represents the mean softmax +output over all correct predictions of a given class. We then define a safety +threshold for a class as the smallest distance from an incorrect prediction to +the given class centroid. We evaluate the approach on the MNIST and CIFAR-10 +datasets using a Convolutional Neural Network and a Vision Transformer, +respectively. The results show that our approach is consistent across these +data sets and network models, and indicate that the proposed metric can offer +an efficient way of determining when automated predictions are acceptable and +when they should be deferred to human operators given a distribution shift. + +
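+ A sketch of the acceptance rule as described above (the Euclidean distance and the array layout are assumptions): centroids come from correct predictions, per-class thresholds from the nearest incorrect prediction, and any prediction farther than its class threshold is deferred.
+ import numpy as np
+
+ def class_centroids(softmax_outputs, predictions, labels, n_classes):
+     """Centroid = mean softmax output over all correct predictions of a class."""
+     return np.stack([softmax_outputs[(predictions == c) & (labels == c)].mean(axis=0)
+                      for c in range(n_classes)])
+
+ def safety_thresholds(softmax_outputs, predictions, labels, centroids):
+     """Per-class threshold = smallest distance from an incorrect prediction of
+     that class to the class centroid."""
+     thresholds = np.full(len(centroids), np.inf)
+     for c in range(len(centroids)):
+         wrong = (predictions == c) & (labels != c)
+         if wrong.any():
+             thresholds[c] = np.linalg.norm(softmax_outputs[wrong] - centroids[c], axis=1).min()
+     return thresholds
+
+ def accept(softmax_output, predicted_class, centroids, thresholds):
+     d = np.linalg.norm(softmax_output - centroids[predicted_class])
+     return d < thresholds[predicted_class]    # otherwise defer to a human operator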
+
+ comment: 9 pages, 10 figures, 3 tables +
+
+
+
+
+ + ♻ ☆ Can virtual staining for high-throughput screening generalize? + + +
+ The large volume and variety of imaging data from high-throughput screening +(HTS) in the pharmaceutical industry present an excellent resource for training +virtual staining models. However, the potential of models trained under one set +of experimental conditions to generalize to other conditions remains +underexplored. This study systematically investigates whether data from three +cell types (lung, ovarian, and breast) and two phenotypes (toxic and non-toxic +conditions) commonly found in HTS can effectively train virtual staining models +to generalize across three typical HTS distribution shifts: unseen phenotypes, +unseen cell types, and the combination of both. Utilizing a dataset of 772,416 +paired bright-field, cytoplasm, nuclei, and DNA-damage stain images, we +evaluate the generalization capabilities of models across pixel-based, +instance-wise, and biological-feature-based levels. Our findings indicate that +training virtual nuclei and cytoplasm models on non-toxic condition samples not +only generalizes to toxic condition samples but leads to improved performance +across all evaluation levels compared to training on toxic condition samples. +Generalization to unseen cell types shows variability depending on the cell +type; models trained on ovarian or lung cell samples often perform well under +other conditions, while those trained on breast cell samples consistently show +poor generalization. Generalization to the combination of unseen cell types and +phenotypes is good across all levels of evaluation, compared to addressing +unseen cell types alone. This study represents the first large-scale, +data-centric analysis of the generalization capability of virtual staining +models trained on diverse HTS datasets, providing valuable strategies for +experimental training data generation. + +
+
+
+
+
+ + ♻ ☆ Navigating Cultural Chasms: Exploring and Unlocking the Cultural POV of + Text-To-Image Models + + +
+ Text-To-Image (TTI) models, such as DALL-E and StableDiffusion, have +demonstrated remarkable prompt-based image generation capabilities. +Multilingual encoders may have a substantial impact on the cultural agency of +these models, as language is a conduit of culture. In this study, we explore +the cultural perception embedded in TTI models by characterizing culture across +three hierarchical tiers: cultural dimensions, cultural domains, and cultural +concepts. Based on this ontology, we derive prompt templates to unlock the +cultural knowledge in TTI models, and propose a comprehensive suite of +evaluation techniques, including intrinsic evaluations using the CLIP space, +extrinsic evaluations with a Visual-Question-Answer (VQA) model and human +assessments, to evaluate the cultural content of TTI-generated images. To +bolster our research, we introduce the CulText2I dataset, derived from six +diverse TTI models and spanning ten languages. Our experiments provide insights +regarding Do, What, Which and How research questions about the nature of +cultural encoding in TTI models, paving the way for cross-cultural applications +of these models. + +
+
+ comment: Project page: https://venturamor.github.io/CulText2IWeb/ +
+
+
+
+
+ + ♻ ☆ mPLUG-Owl3: Towards Long Image-Sequence Understanding in Multi-Modal + Large Language Models + + +
+ Multi-modal Large Language Models (MLLMs) have demonstrated remarkable +capabilities in executing instructions for a variety of single-image tasks. +Despite this progress, significant challenges remain in modeling long image +sequences. In this work, we introduce the versatile multi-modal large language +model, mPLUG-Owl3, which enhances the capability for long image-sequence +understanding in scenarios that incorporate retrieved image-text knowledge, +interleaved image-text, and lengthy videos. Specifically, we propose novel +hyper attention blocks to efficiently integrate vision and language into a +common language-guided semantic space, thereby facilitating the processing of +extended multi-image scenarios. Extensive experimental results suggest that +mPLUG-Owl3 achieves state-of-the-art performance among models with a similar +size on single-image, multi-image, and video benchmarks. Moreover, we propose a +challenging long visual sequence evaluation named Distractor Resistance to +assess the ability of models to maintain focus amidst distractions. Finally, +with the proposed architecture, mPLUG-Owl3 demonstrates outstanding performance +on ultra-long visual sequence inputs. We hope that mPLUG-Owl3 can contribute to +the development of more efficient and powerful multimodal large language +models. + +
+
+
+
+
+ + ♻ ☆ Clinical information extraction for Low-resource languages with Few-shot + learning using Pre-trained language models and Prompting + + +
+ Automatic extraction of medical information from clinical documents poses +several challenges: high costs of required clinical expertise, limited +interpretability of model predictions, restricted computational resources and +privacy regulations. Recent advances in domain-adaptation and prompting methods +showed promising results with minimal training data using lightweight masked +language models, which are suited for well-established interpretability +methods. We are the first to present a systematic evaluation of these methods in a +low-resource setting, by performing multi-class section classification on +German doctor's letters. We conduct extensive class-wise evaluations supported +by Shapley values, to validate the quality of our small training data set and +to ensure the interpretability of model predictions. We demonstrate that a +lightweight, domain-adapted pretrained model, prompted with just 20 shots, +outperforms a traditional classification model by 30.5% accuracy. Our results +serve as a process-oriented guideline for clinical information extraction +projects working in low-resource settings. + +
+
+ comment: Paper accepted for publication in the journal: Natural Language + Engineering (Cambridge Core) +
+
+
+
+
+ + ♻ ☆ Is Power-Seeking AI an Existential Risk? + + +
+ This report examines what I see as the core argument for concern about +existential risk from misaligned artificial intelligence. I proceed in two +stages. First, I lay out a backdrop picture that informs such concern. On this +picture, intelligent agency is an extremely powerful force, and creating agents +much more intelligent than us is playing with fire -- especially given that if +their objectives are problematic, such agents would plausibly have instrumental +incentives to seek power over humans. Second, I formulate and evaluate a more +specific six-premise argument that creating agents of this kind will lead to +existential catastrophe by 2070. On this argument, by 2070: (1) it will become +possible and financially feasible to build relevantly powerful and agentic AI +systems; (2) there will be strong incentives to do so; (3) it will be much +harder to build aligned (and relevantly powerful/agentic) AI systems than to +build misaligned (and relevantly powerful/agentic) AI systems that are still +superficially attractive to deploy; (4) some such misaligned systems will seek +power over humans in high-impact ways; (5) this problem will scale to the full +disempowerment of humanity; and (6) such disempowerment will constitute an +existential catastrophe. I assign rough subjective credences to the premises in +this argument, and I end up with an overall estimate of ~5% that an existential +catastrophe of this kind will occur by 2070. (May 2022 update: since making +this report public in April 2021, my estimate here has gone up, and is now at +>10%.) + +
+
+ comment: 57 pages, 1 figure. Edited to fix link to audio version, add links to + short version and reviews, and fix a typo in section 2.1.2 +
+
+
+
+
+ + ♻ ☆ The Misclassification Likelihood Matrix: Some Classes Are More Likely To + Be Misclassified Than Others + + +
+ This study introduces the Misclassification Likelihood Matrix (MLM) as a +novel tool for quantifying the reliability of neural network predictions under +distribution shifts. The MLM is obtained by leveraging softmax outputs and +clustering techniques to measure the distances between the predictions of a +trained neural network and class centroids. By analyzing these distances, the +MLM provides a comprehensive view of the model's misclassification tendencies, +enabling decision-makers to identify the most common and critical sources of +errors. The MLM allows for the prioritization of model improvements and the +establishment of decision thresholds based on acceptable risk levels. The +approach is evaluated on the MNIST dataset using a Convolutional Neural Network +(CNN) and a perturbed version of the dataset to simulate distribution shifts. +The results demonstrate the effectiveness of the MLM in assessing the +reliability of predictions and highlight its potential in enhancing the +interpretability and risk mitigation capabilities of neural networks. The +implications of this work extend beyond image classification, with ongoing +applications in autonomous systems, such as self-driving cars, to improve the +safety and reliability of decision-making in complex, real-world environments. + +
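+ One possible way (purely illustrative; the paper's exact construction may differ) to turn the centroid distances described above into a misclassification likelihood matrix, where row i scores how strongly samples of true class i are pulled towards each class centroid, so large off-diagonal entries flag the most likely confusions:
+ import numpy as np
+
+ def misclassification_likelihood_matrix(softmax_outputs, labels, centroids):
+     """centroids: array of shape (n_classes, n_classes) holding each class's
+     mean softmax output; rows of the result are normalised to sum to one."""
+     n_classes = len(centroids)
+     mlm = np.zeros((n_classes, n_classes))
+     for i in range(n_classes):
+         xs = softmax_outputs[labels == i]
+         dists = np.linalg.norm(xs[:, None, :] - centroids[None, :, :], axis=2)
+         mlm[i] = 1.0 / (1e-8 + dists.mean(axis=0))   # closer centroid means higher likelihood
+     return mlm / mlm.sum(axis=1, keepdims=True)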
+
+ comment: 9 pages, 7 figures, 1 table +
+
+
+
+
+ + ♻ ☆ YZS-model: A Predictive Model for Organic Drug Solubility Based on Graph + Convolutional Networks and Transformer-Attention + + +
+ Accurate prediction of drug molecule solubility is crucial for therapeutic +effectiveness and safety. Traditional methods often miss complex molecular +structures, leading to inaccuracies. We introduce the YZS-Model, a deep +learning framework integrating Graph Convolutional Networks (GCN), Transformer +architectures, and Long Short-Term Memory (LSTM) networks to enhance prediction +precision. GCNs excel at capturing intricate molecular topologies by modeling +the relationships between atoms and bonds. Transformers, with their +self-attention mechanisms, effectively identify long-range dependencies within +molecules, capturing global interactions. LSTMs process sequential data, +preserving long-term dependencies and integrating temporal information within +molecular sequences. This multifaceted approach leverages the strengths of each +component, resulting in a model that comprehensively understands and predicts +molecular properties. Trained on 9,943 compounds and tested on an anticancer +dataset, the YZS-Model achieved an $R^2$ of 0.59 and an RMSE of 0.57, +outperforming benchmark models ($R^2$ of 0.52 and RMSE of 0.61). In an +independent test, it demonstrated an RMSE of 1.05, improving accuracy by 45.9%. +The integration of these deep learning techniques allows the YZS-Model to learn +valuable features from complex data without predefined parameters, handle large +datasets efficiently, and adapt to various molecular types. This comprehensive +capability significantly improves predictive accuracy and model +generalizability. Its precision in solubility predictions can expedite drug +development by optimizing candidate selection, reducing costs, and enhancing +efficiency. Our research underscores deep learning's transformative potential +in pharmaceutical science, particularly for solubility prediction and drug +design. + +
+
+ comment: 23 pages, 16 figures, 6 tables +
+
+
+
+
+ + ♻ ☆ etuner: Redundancy-Aware Efficient Continual Learning on Edge Devices + + +
+ Many emerging applications, such as robot-assisted eldercare and object +recognition, generally employ deep neural networks (DNNs) and require +the deployment of DNN models on edge devices. These applications naturally +require i) handling streaming-in inference requests and ii) fine-tuning the +deployed models to adapt to possible deployment scenario changes. Continual +learning (CL) is widely adopted to satisfy these needs. CL is a popular deep +learning paradigm that handles both continuous model fine-tuning and inference +requests that arrive over time. However, an inappropriate model fine-tuning scheme could +involve significant redundancy and consume considerable time and energy, making +it challenging to apply CL on edge devices. In this paper, we propose ETuner, +an efficient edge continual learning framework that optimizes inference +accuracy, fine-tuning execution time, and energy efficiency through both +inter-tuning and intra-tuning optimizations. Experimental results show that, on +average, ETuner reduces overall fine-tuning execution time by 64%, energy +consumption by 56%, and improves average inference accuracy by 1.75% over the +immediate model fine-tuning approach. + +
+
+
+
+
+ + ♻ ☆ Separable Operator Networks + + +
+ Operator learning has become a powerful tool in machine learning for modeling +complex physical systems governed by partial differential equations (PDEs). +Although Deep Operator Networks (DeepONet) show promise, they require extensive +data acquisition. Physics-informed DeepONets (PI-DeepONet) mitigate data +scarcity but suffer from inefficient training processes. We introduce Separable +Operator Networks (SepONet), a novel framework that significantly enhances the +efficiency of physics-informed operator learning. SepONet uses independent +trunk networks to learn basis functions separately for different coordinate +axes, enabling faster and more memory-efficient training via forward-mode +automatic differentiation. We provide a universal approximation theorem for +SepONet proving that it generalizes to arbitrary operator learning problems, +and then validate its performance through comprehensive benchmarking against +PI-DeepONet. Our results demonstrate SepONet's superior performance across +various nonlinear and inseparable PDEs, with SepONet's advantages increasing +with problem complexity, dimension, and scale. For 1D time-dependent PDEs, +SepONet achieves up to $112\times$ faster training and $82\times$ reduction in +GPU memory usage compared to PI-DeepONet, while maintaining comparable +accuracy. For the 2D time-dependent nonlinear diffusion equation, SepONet +efficiently handles the complexity, achieving a 6.44\% mean relative $\ell_{2}$ +test error, while PI-DeepONet fails due to memory constraints. This work paves +the way for extreme-scale learning of continuous mappings between +infinite-dimensional function spaces. Open source code is available at +\url{https://github.com/HewlettPackard/separable-operator-networks}. + +
+
+ comment: SepONet version 2. This revised version polishes writing and open + sources code. The initial version was submitted to arXiv on July 15, 2024 +
+
+
+
+
+ + ♻ ☆ Two-scale Neural Networks for Partial Differential Equations with Small + Parameters + + +
+ We propose a two-scale neural network method for solving partial differential +equations (PDEs) with small parameters using physics-informed neural networks +(PINNs). We directly incorporate the small parameters into the architecture of +neural networks. The proposed method enables solving PDEs with small parameters +in a simple fashion, without adding Fourier features or other computationally +taxing searches of truncation parameters. Various numerical examples +demonstrate reasonable accuracy in capturing features of large derivatives in +the solutions caused by small parameters. + +
+
+
+
+
+ + ♻ ☆ A Comprehensive Survey on EEG-Based Emotion Recognition: A Graph-Based + Perspective + + +
+ Compared to other modalities, electroencephalogram (EEG) based emotion +recognition can intuitively respond to emotional patterns in the human brain +and, therefore, has become one of the most focused tasks in affective +computing. The nature of emotions is a physiological and psychological state +change in response to brain region connectivity, making emotion recognition +focus more on the dependency between brain regions instead of specific brain +regions. A significant trend is the application of graphs to encapsulate such +dependency as dynamic functional connections between nodes across temporal and +spatial dimensions. Concurrently, the neuroscientific underpinnings behind this +dependency endow the application of graphs in this field with a distinctive +significance. However, there is neither a comprehensive review nor a tutorial +for constructing emotion-relevant graphs in EEG-based emotion recognition. In +this paper, we present a comprehensive survey of these studies, delivering a +systematic review of graph-related methods in this field from a methodological +perspective. We propose a unified framework for graph applications in this +field and categorize these methods on this basis. Finally, based on previous +studies, we also present several open challenges and future directions in this +field. + +
+
+
+
+
+ + ♻ ☆ Recent Advances in Predictive Modeling with Electronic Health Records IJCAI 24 + + +
+ The development of electronic health records (EHR) systems has enabled the +collection of a vast amount of digitized patient data. However, utilizing EHR +data for predictive modeling presents several challenges due to its unique +characteristics. With the advancements in machine learning techniques, deep +learning has demonstrated its superiority in various applications, including +healthcare. This survey systematically reviews recent advances in deep +learning-based predictive models using EHR data. Specifically, we begin by +introducing the background of EHR data and providing a mathematical definition +of the predictive modeling task. We then categorize and summarize predictive +deep models from multiple perspectives. Furthermore, we present benchmarks and +toolkits relevant to predictive modeling in healthcare. Finally, we conclude +this survey by discussing open challenges and suggesting promising directions +for future research. + +
+
+ comment: This paper has been accepted by IJCAI 24 Survey Track +
+
+
+
+
+ + ♻ ☆ Rethinking Channel Dependence for Multivariate Time Series Forecasting: + Learning from Leading Indicators ICLR 2024 + + +
+ Recently, channel-independent methods have achieved state-of-the-art +performance in multivariate time series (MTS) forecasting. Despite reducing +overfitting risks, these methods miss potential opportunities in utilizing +channel dependence for accurate predictions. We argue that there exist locally +stationary lead-lag relationships between variates, i.e., some lagged variates +may follow the leading indicators within a short time period. Exploiting such +channel dependence is beneficial since leading indicators offer advance +information that can be used to reduce the forecasting difficulty of the lagged +variates. In this paper, we propose a new method named LIFT that first +efficiently estimates leading indicators and their leading steps at each time +step and then judiciously allows the lagged variates to utilize the advance +information from leading indicators. LIFT serves as a plugin that can be +seamlessly combined with arbitrary time series forecasting methods. +Extensive experiments on six real-world datasets demonstrate that LIFT improves +the state-of-the-art methods by 5.5% in average forecasting performance. Our +code is available at https://github.com/SJTU-Quant/LIFT. + +
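+ As a simple stand-in for the lead-lag estimation step described above (not LIFT's efficient estimator), one can pick, for each target variate, the candidate channel and lead step with the largest absolute lagged correlation:
+ import numpy as np
+
+ def estimate_leading_indicator(target, candidates, max_lead):
+     best = (None, 0, 0.0)                   # (channel name, lead steps, correlation)
+     for name, series in candidates.items():
+         for lead in range(1, max_lead + 1):
+             corr = np.corrcoef(series[:-lead], target[lead:])[0, 1]
+             if abs(corr) > abs(best[2]):
+                 best = (name, lead, corr)
+     return best
+
+ # Toy example in which channel "x" leads "y" by 3 steps.
+ rng = np.random.default_rng(0)
+ x = rng.normal(size=500)
+ y = np.roll(x, 3) + 0.1 * rng.normal(size=500)
+ print(estimate_leading_indicator(y, {"x": x, "noise": rng.normal(size=500)}, max_lead=5))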
+
+ comment: Accepted to ICLR 2024. Code is at https://github.com/SJTU-DMTai/LIFT +
+
+
+
+
+ + ♻ ☆ Simplified Diffusion Schrödinger Bridge + + +
+ This paper introduces a novel theoretical simplification of the Diffusion +Schr\"odinger Bridge (DSB) that facilitates its unification with Score-based +Generative Models (SGMs), addressing the limitations of DSB in complex data +generation and enabling faster convergence and enhanced performance. By +employing SGMs as an initial solution for DSB, our approach capitalizes on the +strengths of both frameworks, ensuring a more efficient training process and +improving the performance of SGM. We also propose a reparameterization +technique that, despite theoretical approximations, practically improves the +network's fitting capabilities. Our extensive experimental evaluations confirm +the effectiveness of the simplified DSB, demonstrating its significant +improvements. We believe the contributions of this work pave the way for +advanced generative modeling. + +
+
+
+
+
+ + ♻ ☆ A Laplacian-based Quantum Graph Neural Network for Semi-Supervised + Learning + + +
+ Laplacian learning method is a well-established technique in classical +graph-based semi-supervised learning, but its potential in the quantum domain +remains largely unexplored. This study investigates the performance of the +Laplacian-based Quantum Semi-Supervised Learning (QSSL) method across four +benchmark datasets -- Iris, Wine, Breast Cancer Wisconsin, and Heart Disease. +Further analysis explores the impact of increasing Qubit counts, revealing that +adding more Qubits to a quantum system doesn't always improve performance. The +effectiveness of additional Qubits depends on the quantum algorithm and how +well it matches the dataset. Additionally, we examine the effects of varying +entangling layers on entanglement entropy and test accuracy. The performance of +Laplacian learning is highly dependent on the number of entangling layers, with +optimal configurations varying across different datasets. Typically, moderate +levels of entanglement offer the best balance between model complexity and +generalization capabilities. These observations highlight the crucial need for +precise hyperparameter tuning tailored to each dataset to achieve optimal +performance in Laplacian learning methods. + +
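+ For reference, a sketch of the classical (non-quantum) Laplacian learning baseline that this line of work builds on, namely the harmonic solution that propagates labelled values to unlabelled nodes through the graph Laplacian; the quantum circuit realization is not shown.
+ import numpy as np
+
+ def laplacian_label_propagation(W, y_labelled, labelled_idx):
+     """Harmonic solution f_u = -L_uu^{-1} L_ul y_l with L = D - W, where W is a
+     symmetric non-negative affinity matrix over all n nodes."""
+     n = W.shape[0]
+     L = np.diag(W.sum(axis=1)) - W
+     l = np.asarray(labelled_idx)
+     u = np.array([i for i in range(n) if i not in set(labelled_idx)])
+     f = np.zeros(n)
+     f[l] = y_labelled
+     f[u] = -np.linalg.solve(L[np.ix_(u, u)], L[np.ix_(u, l)] @ np.asarray(y_labelled))
+     return f            # continuous scores; threshold (e.g. at 0.5) for binary labels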
+
+
+
+
+ + ♻ ☆ DPO: Differential reinforcement learning with application to optimal + configuration search + + +
+ Reinforcement learning (RL) with continuous state and action spaces remains +one of the most challenging problems within the field. Most current learning +methods focus on integral identities such as value functions to derive an +optimal strategy for the learning agent. In this paper, we instead study the +dual form of the original RL formulation to propose the first differential RL +framework that can handle settings with limited training samples and +short-length episodes. Our approach introduces Differential Policy Optimization +(DPO), a pointwise and stage-wise iteration method that optimizes policies +encoded by local-movement operators. We prove a pointwise convergence estimate +for DPO and provide a regret bound comparable with the best current theoretical +derivation. Such a pointwise estimate ensures that the learned policy matches the +optimal path uniformly across different steps. We then apply DPO to a class of +practical RL problems with continuous state and action spaces that search +for optimal configurations with Lagrangian rewards. DPO is easy to implement, +scalable, and shows competitive results on benchmarking experiments against +several popular RL methods. + +
+
+
+
+
+ + ♻ ☆ PeRFlow: Piecewise Rectified Flow as Universal Plug-and-Play Accelerator + + +
+ We present Piecewise Rectified Flow (PeRFlow), a flow-based method for +accelerating diffusion models. PeRFlow divides the sampling process of +generative flows into several time windows and straightens the trajectories in +each interval via the reflow operation, thereby approaching piecewise linear +flows. PeRFlow achieves superior performance in few-step generation. +Moreover, through dedicated parameterizations, the PeRFlow models inherit +knowledge from the pretrained diffusion models. Thus, the training converges +fast and the obtained models show advantageous transfer ability, serving as +universal plug-and-play accelerators that are compatible with various workflows +based on the pre-trained diffusion models. Codes for training and inference are +publicly released. https://github.com/magic-research/piecewise-rectified-flow + +
+
+
+
+
+ + ♻ ☆ Made to Order: Discovering monotonic temporal changes via + self-supervised video ordering ECCV 2024 + + +
+ Our objective is to discover and localize monotonic temporal changes in a +sequence of images. To achieve this, we exploit a simple proxy task of ordering +a shuffled image sequence, with `time' serving as a supervisory signal, since +only changes that are monotonic with time can give rise to the correct +ordering. We also introduce a transformer-based model for ordering of image +sequences of arbitrary length with built-in attribution maps. After training, +the model successfully discovers and localizes monotonic changes while ignoring +cyclic and stochastic ones. We demonstrate applications of the model in +multiple domains covering different scene and object types, discovering both +object-level and environmental changes in unseen sequences. We also demonstrate +that the attention-based attribution maps function as effective prompts for +segmenting the changing regions, and that the learned representations can be +used for downstream applications. Finally, we show that the model achieves the +state-of-the-art on standard benchmarks for image ordering. + +
+
+ comment: ECCV 2024 Oral. Project page: https://charigyang.github.io/order/ +
+
+
+
+
+ + ♻ ☆ PoisonedRAG: Knowledge Corruption Attacks to Retrieval-Augmented + Generation of Large Language Models USENIX Security + + +
+ Large language models (LLMs) have achieved remarkable success due to their +exceptional generative capabilities. Despite their success, they also have +inherent limitations such as a lack of up-to-date knowledge and hallucination. +Retrieval-Augmented Generation (RAG) is a state-of-the-art technique to +mitigate these limitations. The key idea of RAG is to ground the answer +generation of an LLM on external knowledge retrieved from a knowledge database. +Existing studies mainly focus on improving the accuracy or efficiency of RAG, +leaving its security largely unexplored. We aim to bridge the gap in this work. +We find that the knowledge database in a RAG system introduces a new and +practical attack surface. Based on this attack surface, we propose PoisonedRAG, +the first knowledge corruption attack to RAG, where an attacker could inject a +few malicious texts into the knowledge database of a RAG system to induce an +LLM to generate an attacker-chosen target answer for an attacker-chosen target +question. We formulate knowledge corruption attacks as an optimization problem, +whose solution is a set of malicious texts. Depending on the background +knowledge (e.g., black-box and white-box settings) of an attacker on a RAG +system, we propose two solutions to solve the optimization problem, +respectively. Our results show PoisonedRAG could achieve a 90% attack success +rate when injecting five malicious texts for each target question into a +knowledge database with millions of texts. We also evaluate several defenses +and our results show they are insufficient to defend against PoisonedRAG, +highlighting the need for new defenses. + +
+
+ comment: To appear in USENIX Security Symposium 2025. The code is available at + https://github.com/sleeepeer/PoisonedRAG +
+
+
+
+
+ + ♻ ☆ GraNNDis: Efficient Unified Distributed Training Framework for Deep GNNs + on Large Clusters + + +
+ Graph neural networks (GNNs) are one of the rapidly growing fields within +deep learning. While many distributed GNN training frameworks have been +proposed to increase the training throughput, they face three limitations when +applied to multi-server clusters. 1) They suffer from an inter-server +communication bottleneck because they do not consider the inter-/intra-server +bandwidth gap, a representative characteristic of multi-server clusters. 2) +Redundant memory usage and computation hinder the scalability of the +distributed frameworks. 3) Sampling methods, the de facto standard in mini-batch +training, incur unnecessary errors in multi-server clusters. We found that +these limitations can be addressed by exploiting the characteristics of +multi-server clusters. Here, we propose GraNNDis, a fast distributed GNN +training framework for multi-server clusters. Firstly, we present Flexible +Preloading, which preloads the essential vertex dependencies server-wise to +reduce the low-bandwidth inter-server communications. Secondly, we introduce +Cooperative Batching, which enables memory-efficient, less redundant mini-batch +training by utilizing high-bandwidth intra-server communications. Thirdly, we +propose Expansion-aware Sampling, a cluster-aware sampling method, which +samples the edges that affect the system speedup. Because intra-server +dependencies are communicated through fast intra-server links and thus contribute +little to the speedup, it targets only edges that cross server boundaries for +sampling. Lastly, we introduce One-Hop Graph Masking, a computation and +communication structure to realize the above methods in multi-server +environments. We evaluated GraNNDis on multi-server clusters, and it provided +significant speedup over the state-of-the-art distributed GNN training +frameworks. GraNNDis is open-sourced at +https://github.com/AIS-SNU/GraNNDis_Artifact to facilitate its use. + +
+
+
+
+
+ + ♻ ☆ Weyl Calculus and Exactly Solvable Schrödinger Bridges with + Quadratic State Cost + + +
+ The Schrödinger bridge, a stochastic dynamical generalization of optimal mass
+transport, exhibits a learning-control duality. Viewed as a stochastic control
+problem, the Schrödinger bridge finds an optimal control policy that steers a
+given joint state statistics to another while minimizing the total control
+effort subject to controlled diffusion and deadline constraints. Viewed as a
+stochastic learning problem, the Schrödinger bridge finds the most-likely
+distribution-valued trajectory connecting endpoint distributional observations,
+i.e., solves the two-point boundary-constrained maximum likelihood problem over
+the manifold of probability distributions. Recent works have shown that solving
+the Schrödinger bridge problem with state cost requires finding the Markov
+kernel associated with a reaction-diffusion PDE where the state cost appears as
+a state-dependent reaction rate. We explain how ideas from Weyl calculus in
+quantum mechanics, specifically the Weyl operator and the Weyl symbol, can help
+determine such Markov kernels. We illustrate these ideas by explicitly finding
+the Markov kernel for the case of quadratic state cost via Weyl calculus,
+recovering our earlier results but avoiding tedious computation with Hermite
+polynomials.
+
+
+
+
+
+
+ + ♻ ☆ Fuzzy Recurrent Stochastic Configuration Networks for Industrial Data + Analytics + + +
+ This paper presents a novel neuro-fuzzy model, termed fuzzy recurrent +stochastic configuration networks (F-RSCNs), for industrial data analytics. +Unlike the original recurrent stochastic configuration network (RSCN), the +proposed F-RSCN is constructed by multiple sub-reservoirs, and each +sub-reservoir is associated with a Takagi-Sugeno-Kang (TSK) fuzzy rule. Through +this hybrid framework, first, the interpretability of the model is enhanced by +incorporating fuzzy reasoning to embed the prior knowledge into the network. +Then, the parameters of the neuro-fuzzy model are determined by the recurrent +stochastic configuration (RSC) algorithm. This scheme not only ensures the +universal approximation property and fast learning speed of the built model but +also overcomes uncertain problems, such as unknown dynamic orders, arbitrary +structure determination, and the sensitivity of learning parameters in +modelling nonlinear dynamics. Finally, an online update of the output weights +is performed using the projection algorithm, and the convergence analysis of +the learning parameters is given. By integrating TSK fuzzy inference systems +into RSCNs, F-RSCNs have strong fuzzy inference capability and can achieve +sound performance for both learning and generalization. Comprehensive +experiments show that the proposed F-RSCNs outperform other classical +neuro-fuzzy and non-fuzzy models, demonstrating great potential for modelling +complex industrial systems. + +
+
+
+
+
+ + ♻ ☆ Latent Space Symmetry Discovery + + +
+ Equivariant neural networks require explicit knowledge of the symmetry group. +Automatic symmetry discovery methods aim to relax this constraint and learn +invariance and equivariance from data. However, existing symmetry discovery +methods are limited to simple linear symmetries and cannot handle the +complexity of real-world data. We propose a novel generative model, Latent +LieGAN (LaLiGAN), which can discover symmetries of nonlinear group actions. It +learns a mapping from the data space to a latent space where the symmetries +become linear and simultaneously discovers symmetries in the latent space. +Theoretically, we show that our model can express nonlinear symmetries under +some conditions about the group action. Experimentally, we demonstrate that our +method can accurately discover the intrinsic symmetry in high-dimensional +dynamical systems. LaLiGAN also results in a well-structured latent space that +is useful for downstream tasks including equation discovery and long-term +forecasting. + +
+
+
+
+
+ + ♻ ☆ On minimizing the training set fill distance in machine learning + regression + + +
+ For regression tasks one often leverages large datasets for training
+predictive machine learning models. However, using large datasets may not be
+feasible due to computational limitations or high data labelling costs.
+Therefore, suitably selecting small training sets from large pools of
+unlabelled data points is essential to maximize model performance while
+maintaining efficiency. In this work, we study Farthest Point Sampling (FPS), a
+data selection approach that aims to minimize the fill distance of the selected
+set. We derive an upper bound for the maximum expected prediction error,
+conditioned on the locations of the unlabelled data points, that linearly
+depends on the training set fill distance. For empirical validation, we perform
+experiments using two regression models on three datasets. We empirically show
+that selecting a training set by aiming to minimize the fill distance, thereby
+minimizing our derived bound, significantly reduces the maximum prediction
+error of various regression models, outperforming alternative sampling
+approaches by a large margin. Furthermore, we show that selecting training sets
+with FPS can also increase model stability for the specific case of Gaussian
+kernel regression approaches.
+
+
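+ A minimal sketch of Farthest Point Sampling as described above, i.e. the greedy selection
+ that approximately minimizes the fill distance of the chosen set; the Euclidean metric,
+ array shapes, and function name are assumptions made for this illustration.
+```python
+import numpy as np
+
+def farthest_point_sampling(points: np.ndarray, k: int, seed: int = 0) -> np.ndarray:
+    """Greedily pick k points so each new point is farthest from those already chosen.
+
+    points: (n, d) array of unlabelled candidates; returns indices of the selected set.
+    """
+    rng = np.random.default_rng(seed)
+    n = points.shape[0]
+    selected = [int(rng.integers(n))]                       # arbitrary start point
+    dists = np.linalg.norm(points - points[selected[0]], axis=1)
+    for _ in range(k - 1):
+        idx = int(np.argmax(dists))                         # farthest from current selection
+        selected.append(idx)
+        dists = np.minimum(dists, np.linalg.norm(points - points[idx], axis=1))
+    return np.array(selected)
+
+# Example: pick 10 training points from 1,000 random 3-D candidates.
+X = np.random.default_rng(1).normal(size=(1000, 3))
+train_idx = farthest_point_sampling(X, k=10)
+```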
+
+
+
+
+ + ♻ ☆ Deep Reinforcement Learning for Real-Time Ground Delay Program Revision + and Corresponding Flight Delay Assignments + + +
+ This paper explores the optimization of Ground Delay Programs (GDP), a
+prevalent Traffic Management Initiative used in Air Traffic Management (ATM) to
+reconcile capacity and demand discrepancies at airports. Employing
+Reinforcement Learning (RL) to manage the inherent uncertainties in the
+national airspace system, such as weather variability, fluctuating flight
+demands, and airport arrival rates, we developed two RL models: Behavioral
+Cloning (BC) and Conservative Q-Learning (CQL). These models are designed to
+enhance GDP efficiency by utilizing a sophisticated reward function that
+integrates ground and airborne delays and terminal area congestion. We
+constructed a simulated single-airport environment, SAGDP_ENV, which
+incorporates real operational data along with predicted uncertainties to
+facilitate realistic decision-making scenarios. Using data from the whole of
+2019 for Newark Liberty International Airport (EWR), our models aimed to
+preemptively set airport program rates. Despite thorough modeling and
+simulation, initial outcomes indicated that the models struggled to learn
+effectively, potentially attributable to oversimplified environmental
+assumptions. This paper discusses the challenges encountered, evaluates the
+models' performance against actual operational data, and outlines future
+directions to refine RL applications in ATM.
+
+
+
+
+
+
+ + ♻ ☆ A Note on Stability in Asynchronous Stochastic Approximation without + Communication Delays + + +
+ In this paper, we study asynchronous stochastic approximation algorithms +without communication delays. Our main contribution is a stability proof for +these algorithms that extends a method of Borkar and Meyn by accommodating more +general noise conditions. We also derive convergence results from this +stability result and discuss their application in important average-reward +reinforcement learning problems. + +
+
+ comment: Corrected typos and a minor error; parts of this material will be + included in a separate future arXiv preprint +
+
+
+
+
+ + ♻ ☆ Spiking Neural Networks in Vertical Federated Learning: Performance + Trade-offs + + +
+ Federated machine learning enables model training across multiple clients +while maintaining data privacy. Vertical Federated Learning (VFL) specifically +deals with instances where the clients have different feature sets of the same +samples. As federated learning models aim to improve efficiency and +adaptability, innovative neural network architectures like Spiking Neural +Networks (SNNs) are being leveraged to enable fast and accurate processing at +the edge. SNNs, known for their efficiency over Artificial Neural Networks +(ANNs), have not been analyzed for their applicability in VFL, thus far. In +this paper, we investigate the benefits and trade-offs of using SNN models in a +vertical federated learning setting. We implement two different federated +learning architectures -- with model splitting and without model splitting -- +that have different privacy and performance implications. We evaluate the setup +using CIFAR-10 and CIFAR-100 benchmark datasets along with SNN implementations +of VGG9 and ResNET classification models. Comparative evaluations demonstrate +that the accuracy of SNN models is comparable to that of traditional ANNs for +VFL applications, albeit significantly more energy efficient. + +
+
+
+
+
+ + ♻ ☆ DFML: Decentralized Federated Mutual Learning + + +
+ In the realm of real-world devices, centralized servers in Federated Learning +(FL) present challenges including communication bottlenecks and susceptibility +to a single point of failure. Additionally, contemporary devices inherently +exhibit model and data heterogeneity. Existing work lacks a Decentralized FL +(DFL) framework capable of accommodating such heterogeneity without imposing +architectural restrictions or assuming the availability of public data. To +address these issues, we propose a Decentralized Federated Mutual Learning +(DFML) framework that is serverless, supports nonrestrictive heterogeneous +models, and avoids reliance on public data. DFML effectively handles model and +data heterogeneity through mutual learning, which distills knowledge between +clients, and cyclically varying the amount of supervision and distillation +signals. Extensive experimental results demonstrate consistent effectiveness of +DFML in both convergence speed and global accuracy, outperforming prevalent +baselines under various conditions. For example, with the CIFAR-100 dataset and +50 clients, DFML achieves a substantial increase of +17.20% and +19.95% in +global accuracy under Independent and Identically Distributed (IID) and non-IID +data shifts, respectively. + +
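+ The mutual-learning ingredient can be sketched with a generic deep mutual learning step
+ between two peer clients, as below; the paper additionally cycles the amount of supervision
+ versus distillation, which is not shown, and the loss form, temperature, and alpha value
+ here are illustrative assumptions rather than DFML's exact scheme.
+```python
+import torch
+import torch.nn.functional as F
+
+def mutual_distillation_loss(logits_a, logits_b, labels, alpha=0.5, T=2.0):
+    """One mutual-learning step between two peer models (no server, no public data).
+
+    Each model minimizes its task loss plus a KL term pulling it toward its peer's
+    softened predictions; `alpha` trades off supervision against distillation.
+    """
+    ce_a = F.cross_entropy(logits_a, labels)
+    ce_b = F.cross_entropy(logits_b, labels)
+    kl_a = F.kl_div(F.log_softmax(logits_a / T, dim=-1),
+                    F.softmax(logits_b.detach() / T, dim=-1),
+                    reduction="batchmean") * T * T
+    kl_b = F.kl_div(F.log_softmax(logits_b / T, dim=-1),
+                    F.softmax(logits_a.detach() / T, dim=-1),
+                    reduction="batchmean") * T * T
+    return (1 - alpha) * ce_a + alpha * kl_a, (1 - alpha) * ce_b + alpha * kl_b
+
+labels = torch.randint(0, 100, (8,))
+loss_a, loss_b = mutual_distillation_loss(torch.randn(8, 100), torch.randn(8, 100), labels)
+```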
+
+
+
+
+ + ♻ ☆ Training LLMs over Neurally Compressed Text + + +
+ In this paper, we explore the idea of training large language models (LLMs)
+over highly compressed text. While standard subword tokenizers compress text by
+a small factor, neural text compressors can achieve much higher rates of
+compression. If it were possible to train LLMs directly over neurally
+compressed text, this would confer advantages in training and serving
+efficiency, as well as easier handling of long text spans. The main obstacle to
+this goal is that strong compression tends to produce opaque outputs that are
+not well-suited for learning. In particular, we find that text naïvely
+compressed via Arithmetic Coding is not readily learnable by LLMs. To overcome
+this, we propose Equal-Info Windows, a novel compression technique whereby text
+is segmented into blocks that each compress to the same bit length. Using this
+method, we demonstrate effective learning over neurally compressed text that
+improves with scale, and outperforms byte-level baselines by a wide margin on
+perplexity and inference speed benchmarks. While our method delivers worse
+perplexity than subword tokenizers for models trained with the same parameter
+count, it has the benefit of shorter sequence lengths. Shorter sequence lengths
+require fewer autoregressive generation steps, and reduce latency. Finally, we
+provide extensive analysis of the properties that contribute to learnability,
+and offer concrete suggestions for how to further improve the performance of
+high-compression tokenizers.
+
+
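+ A rough sketch of the windowing idea, with zlib standing in for the paper's neural
+ Arithmetic Coding compressor: window boundaries are placed by compressed size rather than
+ by character count, so every window carries roughly the same number of bits. The bit budget
+ and the compressor choice are assumptions made only for illustration.
+```python
+import zlib
+
+def equal_info_windows(text: str, bits_per_window: int = 256):
+    """Split `text` into windows that each compress to roughly the same bit budget.
+
+    Each window is compressed independently; only the boundary-placement idea is shown,
+    not the neural compressor or the tokenization of the compressed bitstream.
+    """
+    windows, start = [], 0
+    for end in range(1, len(text) + 1):
+        compressed_bits = len(zlib.compress(text[start:end].encode("utf-8"))) * 8
+        if compressed_bits >= bits_per_window or end == len(text):
+            windows.append(text[start:end])
+            start = end
+    return windows
+
+print(equal_info_windows("the quick brown fox jumps over the lazy dog " * 20))
+```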
+
+
+
+
+ + ♻ ☆ AI-Powered Energy Algorithmic Trading: Integrating Hidden Markov Models + with Neural Networks + + +
+ In quantitative finance, machine learning methods are essential for alpha
+generation. This study introduces a new approach that combines Hidden Markov
+Models (HMM) and neural networks, integrated with Black-Litterman portfolio
+optimization. During the COVID period (2019-2022), this dual-model approach
+achieved an 83% return with a Sharpe ratio of 0.77. It incorporates two risk
+models to enhance risk management, showing efficiency during volatile periods.
+The methodology was implemented on the QuantConnect platform, which was chosen
+for its robust framework and experimental reproducibility. The system, which
+predicts future price movements, includes a three-year warm-up to ensure proper
+algorithm function. It targets highly liquid, large-cap energy stocks to ensure
+stable and predictable performance while also considering broker payments. The
+dual-model alpha system utilizes log returns to select the optimal state based
+on historical performance. It combines state predictions with neural network
+outputs, which are based on historical data, to generate trading signals. This
+study examined the architecture of the trading system, data pre-processing,
+training, and performance. The full code and backtesting data are available
+under the QuantConnect terms.
+
+
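+ The HMM half of such a dual-model system can be sketched as a regime detector over log
+ returns; the sketch below uses hmmlearn on synthetic prices and is not the paper's
+ QuantConnect implementation, and the number of states and the state-ranking rule are
+ assumptions for illustration.
+```python
+import numpy as np
+from hmmlearn.hmm import GaussianHMM
+
+# Toy regime detection on daily log returns (synthetic prices here).
+rng = np.random.default_rng(0)
+prices = 100 * np.exp(np.cumsum(rng.normal(0, 0.01, size=1000)))
+log_returns = np.diff(np.log(prices)).reshape(-1, 1)
+
+hmm = GaussianHMM(n_components=3, covariance_type="full", n_iter=200, random_state=0)
+hmm.fit(log_returns)
+states = hmm.predict(log_returns)
+
+# Rank hidden states by their historical mean return and treat the latest state as the
+# current regime; a separate neural-network signal would then be combined with this.
+mean_return_per_state = {s: float(log_returns[states == s].mean()) for s in range(3)}
+current_regime = int(states[-1])
+print(mean_return_per_state, current_regime)
+```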
+
+ comment: 14 pages, 4 figures, 2 tables +
+
+
+
+
+ + ♻ ☆ Time series forecasting with high stakes: A field study of the air cargo + industry KDD + + +
+ Time series forecasting in the air cargo industry presents unique challenges
+due to volatile market dynamics and the significant impact of accurate
+forecasts on generated revenue. This paper explores a comprehensive approach to
+demand forecasting at the origin-destination (O&D) level, focusing on the
+development and implementation of machine learning models in decision-making
+for the air cargo industry. We leverage a mixture of experts framework,
+combining statistical and advanced deep learning models to provide reliable
+forecasts for cargo demand over a six-month horizon. The results demonstrate
+that our approach outperforms industry benchmarks, offering actionable insights
+for cargo capacity allocation and strategic decision-making in the air cargo
+industry. While this work is applied in the airline industry, the methodology
+is broadly applicable to any field where forecast-based decision-making in a
+volatile environment is crucial.
+
+
+
+ comment: The 10th Mining and Learning from Time Series Workshop: From
+ Classical Methods to LLMs. SIGKDD, Barcelona, Spain, 6 pages
+
+
+
+
+
+ + ♻ ☆ Offline to Online Learning for Personalized Bandwidth Estimation + + +
+ In this work, we tackle the problem of bandwidth estimation (BWE) for
+real-time communication systems through expert personalization. While expert
+heuristic-based methods have been widely adopted, tailoring these methods for
+each and every end user environment is cumbersome due to the level of domain
+expertise and manual effort required to adjust the carefully tuned heuristic
+parameters. Thus, we propose Merlin, a data-driven solution to BWE that
+harnesses expert demonstrations from prior heuristic-based methods to extract
+an expert BWE policy. The extracted policy can then be finetuned to end user
+network conditions to improve user quality of experience (QoE). In real-world
+videoconferencing calls, Merlin matches our expert's policy with no
+statistically significant movements in terms of objective QoE metrics.
+Additionally, we show that personalizing Merlin's control policy is possible
+through a small number of online data-driven parameter updates.
+
+
+
+ comment: 7 pages, 6 figures, under review. Trimmed content to 6 pages, added + finetuning evaluations, and updated writing to focus on IL + finetuning as + opposed to IL only +
+
+
+
+
+ + ♻ ☆ Clinical translation of machine learning algorithms for seizure + detection in scalp electroencephalography: systematic review + + +
+ Machine learning algorithms for seizure detection have shown considerable
+diagnostic potential, with recent reported accuracies reaching 100%. Yet, only
+a few published algorithms have fully addressed the requirements for successful
+clinical translation. This is, for example, because the properties of training
+data may limit the generalisability of algorithms, algorithm performance may
+vary depending on which electroencephalogram (EEG) acquisition hardware was
+used, or run-time processing costs may be prohibitive to real-time clinical use
+cases. To address these issues in a critical manner, we systematically review
+machine learning algorithms for seizure detection with a focus on clinical
+translatability, assessed by criteria including generalisability, run-time
+costs, explainability, and clinically-relevant performance metrics. For
+non-specialists, the domain-specific knowledge necessary to contextualise model
+development and evaluation is provided. It is our hope that such critical
+evaluation of machine learning algorithms with respect to their potential
+real-world effectiveness can help accelerate clinical translation and identify
+gaps in the current seizure detection literature.
+
+
+
+ comment: 60 pages, LaTeX; Addition of co-authors, keywords alphabetically + sorted, text in figure 1 changed to black, references added ([9],[56] ), + abbreviations defined (CNN, RNN), added section 6.4, corrected the + referencing style, added a sentence about the existence of non-epileptic + attacks, added an explanation about the drawback of the 10-20 system, removed + bold from Figure/Table titles +
+
+
+
+
+ + ♻ ☆ Layer-Specific Optimization: Sensitivity Based Convolution Layers Basis + Search + + +
+ Deep neural network models have complex architectures and are
+overparameterized. The number of parameters can exceed the size of the whole
+dataset, which is highly resource-consuming. This complicates their application
+and limits their usage on different devices. Reducing the number of network
+parameters helps to reduce the size of the model, but, if applied
+thoughtlessly, can degrade the quality of the network. One way to reduce the
+number of model parameters is matrix decomposition, where a matrix is
+represented as a product of smaller matrices. In this paper, we propose a new
+way of applying matrix decomposition with respect to the weights of
+convolutional layers. The essence of the method is to train not all
+convolutions, but only a subset of convolutions (basis convolutions), and
+represent the rest as linear combinations of the basis ones. Experiments on
+models from the ResNet family and the CIFAR-10 dataset demonstrate that basis
+convolutions can not only reduce the size of the model but also accelerate the
+forward and backward passes of the network. Another contribution of this work
+is that we propose a fast method for selecting a subset of network layers in
+which the use of matrix decomposition does not degrade the quality of the final
+model.
+
+
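+ A simplified sketch of expressing a convolutional layer's filters as linear combinations of
+ a small trainable basis; the paper instead trains a subset of existing convolutions as the
+ basis, so the module below is an illustrative reparameterization, with layer sizes and
+ initialization chosen arbitrarily.
+```python
+import torch
+import torch.nn as nn
+
+class BasisConv2d(nn.Module):
+    """Convolution whose filters are linear combinations of a small trainable basis.
+
+    Only the basis filters and mixing coefficients are trained, which can shrink the
+    parameter count relative to a full Conv2d with the same output width.
+    """
+    def __init__(self, in_ch, out_ch, kernel_size, num_basis, padding=1):
+        super().__init__()
+        self.basis = nn.Parameter(torch.randn(num_basis, in_ch, kernel_size, kernel_size) * 0.01)
+        self.coeffs = nn.Parameter(torch.randn(out_ch, num_basis) * 0.1)
+        self.padding = padding
+
+    def forward(self, x):
+        # Mix the basis filters into a full filter bank, then run a standard convolution.
+        weight = torch.einsum("ob,bikl->oikl", self.coeffs, self.basis)
+        return nn.functional.conv2d(x, weight, padding=self.padding)
+
+layer = BasisConv2d(in_ch=64, out_ch=128, kernel_size=3, num_basis=16)
+y = layer(torch.randn(2, 64, 32, 32))   # -> (2, 128, 32, 32)
+```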
+
+ comment: Increase the size of matrix pictures for better UX in PDF view +
+
+
+
+
+ + ♻ ☆ What Matters in Hierarchical Search for Combinatorial Reasoning + Problems? ICLR + 2024 + + +
+ Efficiently tackling combinatorial reasoning problems, particularly the +notorious NP-hard tasks, remains a significant challenge for AI research. +Recent efforts have sought to enhance planning by incorporating hierarchical +high-level search strategies, known as subgoal methods. While promising, their +performance against traditional low-level planners is inconsistent, raising +questions about their application contexts. In this study, we conduct an +in-depth exploration of subgoal-planning methods for combinatorial reasoning. +We identify the attributes pivotal for leveraging the advantages of high-level +search: hard-to-learn value functions, complex action spaces, presence of dead +ends in the environment, or using data collected from diverse experts. We +propose a consistent evaluation methodology to achieve meaningful comparisons +between methods and reevaluate the state-of-the-art algorithms. + +
+
+ comment: Accepted for Generative Models for Decision Making Workshop at ICLR + 2024 +
+
+
+
+
+ + ♻ ☆ Sociodemographic Bias in Language Models: A Survey and Forward Path + + +
+ Sociodemographic bias in language models (LMs) has the potential for harm +when deployed in real-world settings. This paper presents a comprehensive +survey of the past decade of research on sociodemographic bias in LMs, +organized into a typology that facilitates examining the different aims: types +of bias, quantifying bias, and debiasing techniques. We track the evolution of +the latter two questions, then identify current trends and their limitations, +as well as emerging techniques. To guide future research towards more effective +and reliable solutions, and to help authors situate their work within this +broad landscape, we conclude with a checklist of open questions. + +
+
+ comment: 23 pages, 3 figures
+
+
+
+
+
+ + ♻ ☆ Retrieval-enhanced Knowledge Editing in Language Models for Multi-Hop + Question Answering CIKM 2024 + + +
+ Large Language Models (LLMs) have shown proficiency in question-answering +tasks but often struggle to integrate real-time knowledge, leading to +potentially outdated or inaccurate responses. This problem becomes even more +challenging when dealing with multi-hop questions, since they require LLMs to +update and integrate multiple knowledge pieces relevant to the questions. To +tackle the problem, we propose the Retrieval-Augmented model Editing (RAE) +framework for multi-hop question answering. RAE first retrieves edited facts +and then refines the language model through in-context learning. Specifically, +our retrieval approach, based on mutual information maximization, leverages the +reasoning abilities of LLMs to identify chain facts that traditional +similarity-based searches might miss. In addition, our framework includes a +pruning strategy to eliminate redundant information from the retrieved facts, +which enhances the editing accuracy and mitigates the hallucination problem. +Our framework is supported by theoretical justification for its fact retrieval +efficacy. Finally, comprehensive evaluation across various LLMs validates RAE's +ability in providing accurate answers with updated knowledge. Our code is +available at: https://github.com/sycny/RAE. + +
+
+ comment: Accepted by CIKM 2024 +
+
+
+
+
+
+
+
+ + Multimedia 5 + +
+
+
+ + ☆ Detecting Audio-Visual Deepfakes with Fine-Grained Inconsistencies BMVC 2024 + + +
+ Existing methods on audio-visual deepfake detection mainly focus on +high-level features for modeling inconsistencies between audio and visual data. +As a result, these approaches usually overlook finer audio-visual artifacts, +which are inherent to deepfakes. Herein, we propose the introduction of +fine-grained mechanisms for detecting subtle artifacts in both spatial and +temporal domains. First, we introduce a local audio-visual model capable of +capturing small spatial regions that are prone to inconsistencies with audio. +For that purpose, a fine-grained mechanism based on a spatially-local distance +coupled with an attention module is adopted. Second, we introduce a +temporally-local pseudo-fake augmentation to include samples incorporating +subtle temporal inconsistencies in our training set. Experiments on the DFDC +and the FakeAVCeleb datasets demonstrate the superiority of the proposed method +in terms of generalization as compared to the state-of-the-art under both +in-dataset and cross-dataset settings. + +
+
+ comment: Accepted in BMVC 2024 +
+
+
+
+
+ + ☆ ViMo: Generating Motions from Casual Videos + + +
+ Although humans have the innate ability to imagine multiple possible actions
+from videos, it remains an extraordinary challenge for computers due to the
+intricate camera movements and montages. Most existing motion generation
+methods predominantly rely on manually collected motion datasets, usually
+tediously sourced from motion capture (Mocap) systems or Multi-View cameras,
+unavoidably resulting in a limited size that severely undermines their
+generalizability. Inspired by recent advances in diffusion models, we probe a
+simple and effective way to capture motions from videos and propose a novel
+Video-to-Motion-Generation framework (ViMo) which could leverage the immense
+trove of untapped video content to produce abundant and diverse 3D human
+motions. Distinct from prior work, our videos can be more casual, including
+complicated camera movements and occlusions. Striking experimental results
+demonstrate the proposed model could generate natural motions even for videos
+where rapid movements, varying perspectives, or frequent occlusions might
+exist. We also show this work could enable three important downstream
+applications, such as generating dancing motions according to arbitrary music
+and source video style. Extensive experimental results prove that our model
+offers an effective and scalable way to generate diverse and realistic
+motions. Code and demos will be made public soon.
+
+
+
+
+
+
+ + ♻ ☆ PointPCA: Point Cloud Objective Quality Assessment Using PCA-Based + Descriptors + + +
+ Point clouds denote a prominent solution for the representation of 3D +photo-realistic content in immersive applications. Similarly to other imaging +modalities, quality predictions for point cloud contents are vital for a wide +range of applications, enabling trade-off optimizations between data quality +and data size in every processing step from acquisition to rendering. In this +work, we focus on use cases that consider human end-users consuming point cloud +contents and, hence, we concentrate on visual quality metrics. In particular, +we propose a set of perceptually relevant descriptors based on Principal +Component Analysis (PCA) decomposition, which is applied to both geometry and +texture data for full-reference point cloud quality assessment. Statistical +features are derived from these descriptors to characterize local shape and +appearance properties for both a reference and a distorted point cloud. The +extracted statistical features are subsequently compared to provide +corresponding predictions of visual quality for the distorted point cloud. As +part of our method, a learning-based approach is proposed to fuse these +individual predictors to a unified perceptual score. We validate the accuracy +of the individual predictors, as well as the unified quality scores obtained +after regression against subjectively annotated datasets, showing that our +metric outperforms state-of-the-art solutions. Insights regarding design +decisions are provided through exploratory studies, evaluating the performance +of our metric under different parameter configurations, attribute domains, +color spaces, and regression models. A software implementation of the proposed +metric is made available at the following link: +https://github.com/cwi-dis/pointpca. + +
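+ The general flavour of PCA-based geometry descriptors can be sketched as below: the
+ eigenvalues of each point's local neighbourhood covariance summarize local shape. This is
+ not the paper's exact descriptor set or regression model; the neighbourhood size and
+ normalization are assumptions for illustration.
+```python
+import numpy as np
+from scipy.spatial import cKDTree
+
+def local_pca_descriptors(points: np.ndarray, k: int = 16) -> np.ndarray:
+    """Per-point eigenvalue descriptors from a PCA of each point's k-nearest neighbourhood.
+
+    Returns normalized, sorted covariance eigenvalues per point; shape features such as
+    linearity, planarity, and sphericity can be derived from these ratios.
+    """
+    tree = cKDTree(points)
+    _, idx = tree.query(points, k=k)
+    descriptors = np.zeros((points.shape[0], 3))
+    for i, neighbours in enumerate(idx):
+        cov = np.cov(points[neighbours].T)
+        eigvals = np.sort(np.linalg.eigvalsh(cov))[::-1]   # lambda1 >= lambda2 >= lambda3
+        descriptors[i] = eigvals / max(eigvals.sum(), 1e-12)
+    return descriptors
+
+pts = np.random.default_rng(0).uniform(size=(500, 3))
+desc = local_pca_descriptors(pts)   # compare reference vs. distorted descriptors downstream
+```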
+
+ comment: 32 pages, 6 figures, 6 tables +
+
+
+
+
+ + ♻ ☆ Generative AI for Immersive Communication: The Next Frontier in + Internet-of-Senses Through 6G + + +
+ Over the past two decades, the Internet-of-Things (IoT) has become a
+transformative concept, and as we approach 2030, a new paradigm known as the
+Internet of Senses (IoS) is emerging. Unlike conventional Virtual Reality (VR),
+IoS seeks to provide multi-sensory experiences, acknowledging that in our
+physical reality, our perception extends far beyond just sight and sound; it
+encompasses a range of senses. This article explores the existing technologies
+driving immersive multi-sensory media, delving into their capabilities and
+potential applications. This exploration includes a comparative analysis
+between conventional immersive media streaming and a proposed use case that
+leverages semantic communication empowered by generative Artificial
+Intelligence (AI). The focal point of this analysis is the substantial
+reduction in bandwidth consumption by 99.93% in the proposed scheme. Through
+this comparison, we aim to underscore the practical applications of generative
+AI for immersive media. We concurrently address major challenges in this
+field, such as temporal synchronization of multiple media, ensuring high
+throughput, minimizing the End-to-End (E2E) latency, and robustness to low
+bandwidth, while outlining future trajectories.
+
+
+
+
+
+
+ + ♻ ☆ Improving Audio Generation with Visual Enhanced Captions + + +
+ Generative models have shown significant achievements in audio generation
+tasks. However, existing models struggle with complex and detailed prompts,
+leading to potential performance degradation. We hypothesize that this problem
+stems from the simplicity and scarcity of the training data. This work aims to
+create a large-scale audio dataset with rich captions for improving audio
+generation models. We first develop an automated pipeline to generate detailed
+captions by transforming predicted visual captions, audio captions, and tagging
+labels into comprehensive descriptions using a Large Language Model (LLM). The
+resulting dataset, Sound-VECaps, comprises 1.66M high-quality audio-caption
+pairs with enriched details, including the order of audio events, the places
+where they occurred, and environment information. We then demonstrate that
+training the text-to-audio generation models with Sound-VECaps significantly
+improves the performance on complex prompts. Furthermore, we conduct ablation
+studies of the models on several downstream audio-language tasks, showing the
+potential of Sound-VECaps in advancing audio-text representation learning. Our
+dataset and models are available online.
+
+
+
+ comment: 5 pages with 1 appendix +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computation and Language 88 + +
+
+
+ + ☆ LOLgorithm: Integrating Semantic,Syntactic and Contextual Elements for + Humor Classification + + +
+ This paper explores humor detection through a linguistic lens, prioritizing +syntactic, semantic, and contextual features over computational methods in +Natural Language Processing. We categorize features into syntactic, semantic, +and contextual dimensions, including lexicons, structural statistics, Word2Vec, +WordNet, and phonetic style. Our proposed model, Colbert, utilizes BERT +embeddings and parallel hidden layers to capture sentence congruity. By +combining syntactic, semantic, and contextual features, we train Colbert for +humor detection. Feature engineering examines essential syntactic and semantic +features alongside BERT embeddings. SHAP interpretations and decision trees +identify influential features, revealing that a holistic approach improves +humor detection accuracy on unseen data. Integrating linguistic cues from +different dimensions enhances the model's ability to understand humor +complexity beyond traditional computational methods. + +
+
+
+
+
+ + ☆ FastFiD: Improve Inference Efficiency of Open Domain Question Answering + via Sentence Selection ACL 2024 + + +
+ Open Domain Question Answering (ODQA) has been advancing rapidly in recent
+times, driven by significant developments in dense passage retrieval and
+pretrained language models. Current models typically incorporate the FiD
+framework, which is composed of a neural retriever alongside an encoder-decoder
+neural reader. In the answer generation process, the retriever will retrieve
+numerous passages (around 100 for instance), each of which is then individually
+encoded by the encoder. Subsequently, the decoder makes predictions based on
+these encoded passages. Nevertheless, this framework can be relatively
+time-consuming, particularly due to the extensive length of the gathered
+passages. To address this, we introduce FastFiD in this paper, a novel approach
+that executes sentence selection on the encoded passages. This aids in
+retaining valuable sentences while reducing the context length required for
+generating answers. Experiments on three commonly used datasets (Natural
+Questions, TriviaQA and ASQA) demonstrate that our method can enhance the
+inference speed by 2.3X-5.7X, while simultaneously maintaining the model's
+performance. Moreover, an in-depth analysis of the model's attention reveals
+that the selected sentences indeed hold a substantial contribution towards the
+final answer. The codes are publicly available at
+https://github.com/thunlp/FastFiD.
+
+
+
+ comment: ACL 2024 Main Conference +
+
+
+
+
+ + ☆ Animate, or Inanimate, That is the Question for Large Language Models + + +
+ The cognitive essence of humans is deeply intertwined with the concept of +animacy, which plays an essential role in shaping their memory, vision, and +multi-layered language understanding. Although animacy appears in language via +nuanced constraints on verbs and adjectives, it is also learned and refined +through extralinguistic information. Similarly, we assume that the LLMs' +limited abilities to understand natural language when processing animacy are +motivated by the fact that these models are trained exclusively on text. + Hence, the question this paper aims to answer arises: can LLMs, in their +digital wisdom, process animacy in a similar way to what humans would do? We +then propose a systematic analysis via prompting approaches. In particular, we +probe different LLMs by prompting them using animate, inanimate, usual, and +stranger contexts. Results reveal that, although LLMs have been trained +predominantly on textual data, they exhibit human-like behavior when faced with +typical animate and inanimate entities in alignment with earlier studies. +Hence, LLMs can adapt to understand unconventional situations by recognizing +oddities as animated without needing to interface with unspoken cognitive +triggers humans rely on to break down animations. + +
+
+
+
+
+ + ☆ VisualAgentBench: Towards Large Multimodal Models as Visual Foundation + Agents + + +
+ Large Multimodal Models (LMMs) have ushered in a new era in artificial
+intelligence, merging capabilities in both language and vision to form highly
+capable Visual Foundation Agents. These agents are postulated to excel across a
+myriad of tasks, potentially approaching general artificial intelligence.
+However, existing benchmarks fail to sufficiently challenge or showcase the
+full potential of LMMs in complex, real-world environments. To address this
+gap, we introduce VisualAgentBench (VAB), a comprehensive and pioneering
+benchmark specifically designed to train and evaluate LMMs as visual foundation
+agents across diverse scenarios, including Embodied, Graphical User Interface,
+and Visual Design, with tasks formulated to probe the depth of LMMs'
+understanding and interaction capabilities. Through rigorous testing across
+nine proprietary LMM APIs and eight open models, we demonstrate the
+considerable yet still developing agent capabilities of these models.
+Additionally, VAB provides a trajectory training set constructed through
+hybrid methods including Program-based Solvers, LMM Agent Bootstrapping, and
+Human Demonstrations, promoting substantial performance improvements in LMMs
+through behavior cloning. Our work not only aims to benchmark existing models
+but also provides a solid foundation for future development into visual
+foundation agents. Code, train & test data, and part of the fine-tuned open
+LMMs are available at https://github.com/THUDM/VisualAgentBench.
+
+
+
+
+
+
+ + ☆ Long-Form Answers to Visual Questions from Blind and Low Vision People + + +
+ Vision language models can now generate long-form answers to questions about +images - long-form visual question answers (LFVQA). We contribute VizWiz-LF, a +dataset of long-form answers to visual questions posed by blind and low vision +(BLV) users. VizWiz-LF contains 4.2k long-form answers to 600 visual questions, +collected from human expert describers and six VQA models. We develop and +annotate functional roles of sentences of LFVQA and demonstrate that long-form +answers contain information beyond the question answer such as explanations and +suggestions. We further conduct automatic and human evaluations with BLV and +sighted people to evaluate long-form answers. BLV people perceive both +human-written and generated long-form answers to be plausible, but generated +answers often hallucinate incorrect visual details, especially for unanswerable +visual questions (e.g., blurry or irrelevant images). To reduce hallucinations, +we evaluate the ability of VQA models to abstain from answering unanswerable +questions across multiple prompting strategies. + +
+
+ comment: COLM 2024 +
+
+
+
+
+ + ☆ The AI Scientist: Towards Fully Automated Open-Ended Scientific + Discovery + + +
+ One of the grand challenges of artificial general intelligence is developing +agents capable of conducting scientific research and discovering new knowledge. +While frontier models have already been used as aids to human scientists, e.g. +for brainstorming ideas, writing code, or prediction tasks, they still conduct +only a small part of the scientific process. This paper presents the first +comprehensive framework for fully automatic scientific discovery, enabling +frontier large language models to perform research independently and +communicate their findings. We introduce The AI Scientist, which generates +novel research ideas, writes code, executes experiments, visualizes results, +describes its findings by writing a full scientific paper, and then runs a +simulated review process for evaluation. In principle, this process can be +repeated to iteratively develop ideas in an open-ended fashion, acting like the +human scientific community. We demonstrate its versatility by applying it to +three distinct subfields of machine learning: diffusion modeling, +transformer-based language modeling, and learning dynamics. Each idea is +implemented and developed into a full paper at a cost of less than $15 per +paper. To evaluate the generated papers, we design and validate an automated +reviewer, which we show achieves near-human performance in evaluating paper +scores. The AI Scientist can produce papers that exceed the acceptance +threshold at a top machine learning conference as judged by our automated +reviewer. This approach signifies the beginning of a new era in scientific +discovery in machine learning: bringing the transformative benefits of AI +agents to the entire research process of AI itself, and taking us closer to a +world where endless affordable creativity and innovation can be unleashed on +the world's most challenging problems. Our code is open-sourced at +https://github.com/SakanaAI/AI-Scientist + +
+
+
+
+
+ + ☆ Synthetic Patient-Physician Dialogue Generation from Clinical Notes + Using LLM + + +
+ Medical dialogue systems (MDS) enhance patient-physician communication,
+improve healthcare accessibility, and reduce costs. However, acquiring suitable
+data to train these systems poses significant challenges. Privacy concerns
+prevent the use of real conversations, necessitating synthetic alternatives.
+Synthetic dialogue generation from publicly available clinical notes offers a
+promising solution to this issue, providing realistic data while safeguarding
+privacy. Our approach, SynDial, uses a single LLM iteratively with zero-shot
+prompting and a feedback loop to generate and refine high-quality synthetic
+dialogues. The feedback consists of weighted evaluation scores for similarity
+and extractiveness. The iterative process ensures dialogues meet predefined
+thresholds, achieving superior extractiveness as a result of the feedback loop.
+Additionally, evaluation shows that the generated dialogues excel on the
+factuality metric compared to the baselines and have diversity scores
+comparable to GPT-4.
+
+
+
+
+
+
+ + ☆ MovieSum: An Abstractive Summarization Dataset for Movie Screenplays ACL 2024 + + +
+ Movie screenplay summarization is challenging, as it requires an +understanding of long input contexts and various elements unique to movies. +Large language models have shown significant advancements in document +summarization, but they often struggle with processing long input contexts. +Furthermore, while television transcripts have received attention in recent +studies, movie screenplay summarization remains underexplored. To stimulate +research in this area, we present a new dataset, MovieSum, for abstractive +summarization of movie screenplays. This dataset comprises 2200 movie +screenplays accompanied by their Wikipedia plot summaries. We manually +formatted the movie screenplays to represent their structural elements. +Compared to existing datasets, MovieSum possesses several distinctive features: +(1) It includes movie screenplays, which are longer than scripts of TV +episodes. (2) It is twice the size of previous movie screenplay datasets. (3) +It provides metadata with IMDb IDs to facilitate access to additional external +knowledge. We also show the results of recently released large language models +applied to summarization on our dataset to provide a detailed baseline. + +
+
+ comment: ACL 2024 Findings +
+
+
+
+
+ + ☆ Review-driven Personalized Preference Reasoning with Large Language + Models for Recommendation + + +
+ Recent advancements in Large Language Models (LLMs) have demonstrated +exceptional performance across a wide range of tasks, generating significant +interest in their application to recommendation systems. However, existing +methods have not fully capitalized on the potential of LLMs, often constrained +by limited input information or failing to fully utilize their advanced +reasoning capabilities. To address these limitations, we introduce EXP3RT, a +novel LLM-based recommender designed to leverage rich preference information +contained in user and item reviews. EXP3RT is basically fine-tuned through +distillation from a teacher LLM to perform three key tasks in order: EXP3RT +first extracts and encapsulates essential subjective preferences from raw +reviews, aggregates and summarizes them according to specific criteria to +create user and item profiles. It then generates detailed step-by-step +reasoning followed by predicted rating, i.e., reasoning-enhanced rating +prediction, by considering both subjective and objective information from +user/item profiles and item descriptions. This personalized preference +reasoning from EXP3RT enhances rating prediction accuracy and also provides +faithful and reasonable explanations for recommendation. Extensive experiments +show that EXP3RT outperforms existing methods on both rating prediction and +candidate item reranking for top-k recommendation, while significantly +enhancing the explainability of recommendation systems. + +
+
+
+
+
+ + ☆ FuxiTranyu: A Multilingual Large Language Model Trained with Balanced + Data + + +
+ Large language models (LLMs) have demonstrated prowess in a wide range of +tasks. However, many LLMs exhibit significant performance discrepancies between +high- and low-resource languages. To mitigate this challenge, we present +FuxiTranyu, an open-source multilingual LLM, which is designed to satisfy the +need of the research community for balanced and high-performing multilingual +capabilities. FuxiTranyu-8B, the base model with 8 billion parameters, is +trained from scratch on a meticulously balanced multilingual data repository +that contains 600 billion tokens covering 43 natural languages and 16 +programming languages. In addition to the base model, we also develop two +instruction-tuned models: FuxiTranyu-8B-SFT that is fine-tuned on a diverse +multilingual instruction dataset, and FuxiTranyu-8B-DPO that is further refined +with DPO on a preference dataset for enhanced alignment ability. Extensive +experiments on a wide range of multilingual benchmarks demonstrate the +competitive performance of FuxiTranyu against existing multilingual LLMs, e.g., +BLOOM-7B, PolyLM-13B, Llama-2-Chat-7B and Mistral-7B-Instruct. Interpretability +analyses at both the neuron and representation level suggest that FuxiTranyu is +able to learn consistent multilingual representations across different +languages. To promote further research into multilingual LLMs and their working +mechanisms, we release both the base and instruction-tuned FuxiTranyu models +together with 58 pretraining checkpoints at HuggingFace and Github. + +
+
+
+
+
+ + ☆ Anchored Preference Optimization and Contrastive Revisions: Addressing + Underspecification in Alignment + + +
+ Large Language Models (LLMs) are often aligned using contrastive alignment +objectives and preference pair datasets. The interaction between model, paired +data, and objective makes alignment a complicated procedure, sometimes +producing subpar results. We study this and find that (i) preference data gives +a better learning signal when the underlying responses are contrastive, and +(ii) alignment objectives lead to better performance when they specify more +control over the model during training. Based on these insights, we introduce +Contrastive Learning from AI Revisions (CLAIR), a data-creation method which +leads to more contrastive preference pairs, and Anchored Preference +Optimization (APO), a controllable and more stable alignment objective. We +align Llama-3-8B-Instruct using various comparable datasets and alignment +objectives and measure MixEval-Hard scores, which correlate highly with human +judgments. The CLAIR preferences lead to the strongest performance out of all +datasets, and APO consistently outperforms less controllable objectives. Our +best model, trained on 32K CLAIR preferences with APO, improves +Llama-3-8B-Instruct by 7.65%, closing the gap with GPT4-turbo by 45%. Our code +is available at https://github.com/ContextualAI/CLAIR_and_APO. + +
+
+
+
+
+ + ☆ Context-aware Visual Storytelling with Visual Prefix Tuning and + Contrastive Learning + + +
+ Visual storytelling systems generate multi-sentence stories from image +sequences. In this task, capturing contextual information and bridging visual +variation bring additional challenges. We propose a simple yet effective +framework that leverages the generalization capabilities of pretrained +foundation models, only training a lightweight vision-language mapping network +to connect modalities, while incorporating context to enhance coherence. We +introduce a multimodal contrastive objective that also improves visual +relevance and story informativeness. Extensive experimental results, across +both automatic metrics and human evaluations, demonstrate that the stories +generated by our framework are diverse, coherent, informative, and interesting. + +
+
+ comment: 18 pages, 12 figures, accepted by INLG 2024 +
+
+
+
+
+ + ☆ FLEURS-R: A Restored Multilingual Speech Corpus for Generation Tasks + + +
+ This paper introduces FLEURS-R, a speech restoration applied version of the +Few-shot Learning Evaluation of Universal Representations of Speech (FLEURS) +corpus. FLEURS-R maintains an N-way parallel speech corpus in 102 languages as +FLEURS, with improved audio quality and fidelity by applying the speech +restoration model Miipher. The aim of FLEURS-R is to advance speech technology +in more languages and catalyze research including text-to-speech (TTS) and +other speech generation tasks in low-resource languages. Comprehensive +evaluations with the restored speech and TTS baseline models trained from the +new corpus show that the new corpus obtained significantly improved speech +quality while maintaining the semantic contents of the speech. The corpus is +publicly released via Hugging Face. + +
+
+
+
+
+ + ☆ On Effects of Steering Latent Representation for Large Language Model + Unlearning + + +
+ Representation Misdirection for Unlearning (RMU), which steers model +representation in the intermediate layer to a target random representation, is +an effective method for large language model (LLM) unlearning. Despite its high +performance, the underlying cause and explanation remain underexplored. In this +paper, we first theoretically demonstrate that steering forget representations +in the intermediate layer reduces token confidence, causing LLMs to generate +wrong or nonsense responses. Second, we investigate how the coefficient +influences the alignment of forget-sample representations with the random +direction and hint at the optimal coefficient values for effective unlearning +across different network layers. Third, we show that RMU unlearned models are +robust against adversarial jailbreak attacks. Last, our empirical analysis +shows that RMU is less effective when applied to the middle and later layers in +LLMs. To resolve this drawback, we propose Adaptive RMU -- a simple yet +effective alternative method that makes unlearning effective with most layers. +Extensive experiments demonstrate that Adaptive RMU significantly improves the +unlearning performance compared to prior art while incurring no additional +computational cost. + +
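+ Based on the description above, the steering objective can be sketched roughly as follows:
+ forget-sample activations in a chosen intermediate layer are pushed toward a scaled fixed
+ random direction while retain-sample activations are kept close to those of the frozen
+ original model. The coefficient values, layer choice, and loss form here are placeholders,
+ not the paper's exact formulation.
+```python
+import torch
+import torch.nn.functional as F
+
+def rmu_style_loss(h_forget, h_retain, h_retain_frozen, random_dir, c=6.0, alpha=100.0):
+    """Illustrative steering loss: misdirect forget activations, preserve retain activations.
+
+    h_* are intermediate-layer activations of the model being unlearned; h_retain_frozen
+    comes from a frozen copy of the original model. Constants c and alpha are placeholders.
+    """
+    target = c * random_dir.expand_as(h_forget)          # scaled fixed random direction
+    forget_term = F.mse_loss(h_forget, target)
+    retain_term = F.mse_loss(h_retain, h_retain_frozen)  # keep retain behaviour intact
+    return forget_term + alpha * retain_term
+
+hidden = 4096
+u = F.normalize(torch.rand(hidden), dim=0)               # fixed random unit direction
+loss = rmu_style_loss(torch.randn(8, hidden), torch.randn(8, hidden),
+                      torch.randn(8, hidden), u)
+```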
+
+ comment: 15 pages, 5 figures, 8 tables +
+
+
+
+
+ + ☆ Mutual Reasoning Makes Smaller LLMs Stronger Problem-Solvers + + +
+ This paper introduces rStar, a self-play mutual reasoning approach that
+significantly improves reasoning capabilities of small language models (SLMs)
+without fine-tuning or superior models. rStar decouples reasoning into a
+self-play mutual generation-discrimination process. First, a target SLM
+augments the Monte Carlo Tree Search (MCTS) with a rich set of human-like
+reasoning actions to construct higher quality reasoning trajectories. Next,
+another SLM, with capabilities similar to the target SLM, acts as a
+discriminator to verify each trajectory generated by the target SLM. The
+mutually agreed reasoning trajectories are considered mutually consistent and
+are thus more likely to be correct. Extensive experiments across five SLMs
+demonstrate rStar can effectively solve diverse reasoning problems, including
+GSM8K, GSM-Hard, MATH, SVAMP, and StrategyQA. Remarkably, rStar boosts GSM8K
+accuracy from 12.51% to 63.91% for LLaMA2-7B, from 36.46% to 81.88% for
+Mistral-7B, from 74.53% to 91.13% for LLaMA3-8B-Instruct. Code will be
+available at https://github.com/zhentingqi/rStar.
+
+
+
+
+
+
+ + ☆ Improving Structural Diversity of Blackbox LLMs via + Chain-of-Specification Prompting + + +
+ The capability to generate diverse text is a key challenge facing large +language models (LLMs). Thus far, diversity has been studied via metrics such +as $n$-gram diversity or diversity of BERT embeddings. However, for these kinds +of diversity, the user has little control over the dimensions along which +diversity is considered. For example, in the poetry domain, one might desire +diversity in terms of rhyme and meter, whereas in the code domain, one might +desire diversity in terms of the kinds of expressions used to solve a problem. +We propose a diversity metric called structural diversity, where the user +provides a mapping from generated text to features capturing the kinds of +diversity that they care about. In addition, we propose a novel strategy called +chain-of-specification (CoS) prompting for improving diversity by first having +the LLM generate a specification encoding one instance of structural features, +and then prompting the LLM to generate text that satisfies these features; +notably, our strategy works with blackbox LLMs. In our experiments, we show +that for structural diversity in the poetry and code domains, CoS significantly +improves diversity compared to several baselines. + +
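+ A minimal sketch of how such a structural diversity score might be computed once the user
+ supplies their feature mapping; the exact metric in the paper may be defined differently
+ (this version simply counts distinct feature tuples), and the toy poetry features below are
+ assumptions for illustration.
+```python
+def structural_diversity(generations, feature_fn):
+    """Fraction of distinct structural feature tuples among a set of generations.
+
+    `feature_fn` is the user-supplied mapping from text to the features they care about
+    (e.g., rhyme and meter for poems); 1.0 means every sample is structurally distinct.
+    """
+    signatures = [feature_fn(g) for g in generations]
+    return len(set(signatures)) / len(signatures)
+
+# Toy feature map for the poetry example: (line count, whether lines 1 and 2 rhyme crudely).
+def poem_features(poem: str):
+    lines = [l.strip() for l in poem.strip().splitlines() if l.strip()]
+    rhyme = len(lines) >= 2 and lines[0][-2:] == lines[1][-2:]
+    return (len(lines), rhyme)
+
+poems = ["roses are red\nviolets are blue", "the sea\nthe sky\nthe land", "cats sit\ndogs knit"]
+print(structural_diversity(poems, poem_features))   # 1.0: all three are structurally distinct
+```
+ Chain-of-specification prompting would then ask the model for a feature specification
+ first, and for text satisfying that specification second, aiming to raise this score.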
+
+
+
+
+ + ☆ LipidBERT: A Lipid Language Model Pre-trained on METiS de novo Lipid + Library + + +
+ In this study, we generate and maintain a database of 10 million virtual
+lipids through METiS's in-house de novo lipid generation algorithms and lipid
+virtual screening techniques. These virtual lipids serve as a corpus for
+pre-training, lipid representation learning, and downstream task knowledge
+transfer, culminating in state-of-the-art LNP property prediction performance.
+We propose LipidBERT, a BERT-like model pre-trained with the Masked Language
+Model (MLM) and various secondary tasks. Additionally, we compare the
+performance of embeddings generated by LipidBERT and PhatGPT, our GPT-like
+lipid generation model, on downstream tasks. The proposed bilingual LipidBERT
+model operates in two languages: the language of ionizable lipid pre-training,
+using in-house dry-lab lipid structures, and the language of LNP fine-tuning,
+utilizing in-house LNP wet-lab data. This dual capability positions LipidBERT
+as a key AI-based filter for future screening tasks, including new versions of
+METiS de novo lipid libraries and, more importantly, candidates for in vivo
+testing of organ-targeting LNPs. To the best of our knowledge, this is the
+first successful demonstration of the capability of a pre-trained language
+model on virtual lipids and its effectiveness in downstream tasks using
+wet-lab data. This work showcases the clever utilization of METiS's in-house
+de novo lipid library as well as the power of dry-wet lab integration.
+
+
+
+
+
+
+ + ☆ Med42-v2: A Suite of Clinical LLMs + + +
+ Med42-v2 introduces a suite of clinical large language models (LLMs) designed
+to address the limitations of generic models in healthcare settings. These
+models are built on Llama3 architecture and fine-tuned using specialized
+clinical data. They underwent multi-stage preference alignment to effectively
+respond to natural prompts. While generic models are often preference-aligned
+to avoid answering clinical queries as a precaution, Med42-v2 is specifically
+trained to overcome this limitation, enabling its use in clinical settings.
+Med42-v2 models demonstrate superior performance compared to the original
+Llama3 models in both 8B and 70B parameter configurations and GPT-4 across
+various medical benchmarks. These LLMs are developed to understand clinical
+queries, perform reasoning tasks, and provide valuable assistance in clinical
+environments. The models are now publicly available at
+https://huggingface.co/m42-health.
+
+
+
+
+
+
+ + ☆ Utilize Transformers for translating Wikipedia category names + + +
+ On Wikipedia, articles are categorized to aid readers in navigating content
+efficiently. The manual creation of new categories can be laborious and
+time-intensive. To tackle this issue, we built language models to translate
+Wikipedia categories from English to Vietnamese with a dataset containing
+15,000 English-Vietnamese category pairs. Subsequently, small to medium-scale
+Transformer pre-trained models with a sequence-to-sequence architecture were
+fine-tuned for category translation. The experiments revealed that
+OPUS-MT-en-vi surpassed other models, attaining the highest performance with a
+BLEU score of 0.73, despite its smaller model storage. We hope our work offers
+an alternative solution for translation tasks under limited computing
+resources.
+
+
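+ For reference, inference with the pretrained OPUS-MT English-Vietnamese checkpoint looks
+ roughly like the sketch below; the Helsinki-NLP/opus-mt-en-vi checkpoint name and the
+ example categories are assumptions, and the paper's fine-tuning on the 15,000 category
+ pairs is not shown.
+```python
+from transformers import MarianMTModel, MarianTokenizer
+
+# Load the public English-Vietnamese OPUS-MT checkpoint (fine-tuning would start from here).
+model_name = "Helsinki-NLP/opus-mt-en-vi"
+tokenizer = MarianTokenizer.from_pretrained(model_name)
+model = MarianMTModel.from_pretrained(model_name)
+
+categories = ["Category:Vietnamese cuisine", "Category:Rivers of Vietnam"]
+batch = tokenizer(categories, return_tensors="pt", padding=True)
+generated = model.generate(**batch, max_length=64)
+print(tokenizer.batch_decode(generated, skip_special_tokens=True))
+```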
+
+ comment: 5 pages, 1 figure +
+
+
+
+
+ + ☆ How ChatGPT Changed the Media's Narratives on AI: A Semi-Automated + Narrative Analysis Through Frame Semantics + + +
+ The recent explosion of attention to AI is arguably one of the biggest in the
+technology's media coverage. To investigate the effects it has on the
+discourse, we perform a mixed-method frame semantics-based analysis on a
+dataset of more than 49,000 sentences collected from 5846 news articles that
+mention AI. The dataset covers the twelve-month period centred around the
+launch of OpenAI's chatbot ChatGPT and is collected from the most visited
+open-access English-language news publishers. Our findings indicate that during
+the half year succeeding the launch, media attention rose tenfold, from
+already historically high levels. During this period, discourse has become
+increasingly centred around experts and political leaders, and AI has become
+more closely associated with dangers and risks. A deeper review of the data
+also suggests a qualitative shift in the types of threats AI is thought to
+represent, as well as the anthropomorphic qualities ascribed to it.
+
+
+
+ comment: 18 pages, 6 figures and 2 appendices (5 pages) +
+
+
+
+
+ + ☆ Building Decision Making Models Through Language Model Regime + + +
+ We propose a novel approach for decision making problems leveraging the
+generalization capabilities of large language models (LLMs). Traditional
+methods such as expert systems, planning algorithms, and reinforcement learning
+often exhibit limited generalization, typically requiring the training of new
+models for each unique task. In contrast, LLMs demonstrate remarkable success
+in generalizing across varied language tasks, inspiring a new strategy for
+training decision making models. Our approach, referred to as "Learning then
+Using" (LTU), entails a two-stage process. Initially, the learning phase
+develops a robust foundational decision making model by integrating diverse
+knowledge from various domains and decision making contexts. The subsequent
+using phase refines this foundation model for specific decision making
+scenarios. Distinct from other studies that employ LLMs for decision making
+through supervised learning, our LTU method embraces a versatile training
+methodology that combines broad pre-training with targeted fine-tuning.
+Experiments in e-commerce domains such as advertising and search optimization
+have shown that the LTU approach outperforms traditional supervised learning
+regimes in decision making capabilities and generalization. The LTU approach
+is the first practical training architecture for both single-step and
+multi-step decision making tasks combined with LLMs, which can be applied
+beyond game and robot domains. It provides a robust and adaptable framework
+for decision making, enhancing the effectiveness and flexibility of various
+systems in tackling diverse challenges.
+
+
+
+
+
+
+ + ☆ An Investigation Into Explainable Audio Hate Speech Detection SIGDIAL 2024 + + +
+ Research on hate speech has predominantly revolved around detection and +interpretation from textual inputs, leaving verbal content largely unexplored. +While there has been limited exploration into hate speech detection within +verbal acoustic speech inputs, the aspect of interpretability has been +overlooked. Therefore, we introduce a new task of explainable audio hate speech +detection. Specifically, we aim to identify the precise time intervals, +referred to as audio frame-level rationales, which serve as evidence for hate +speech classification. Towards this end, we propose two different approaches: +cascading and End-to-End (E2E). The cascading approach initially converts audio +to transcripts, identifies hate speech within these transcripts, and +subsequently locates the corresponding audio time frames. Conversely, the E2E +approach processes audio utterances directly, which allows it to pinpoint hate +speech within specific time frames. Additionally, due to the lack of +explainable audio hate speech datasets that include audio frame-level +rationales, we curated a synthetic audio dataset to train our models. We +further validated these models on actual human speech utterances and found that +the E2E approach outperforms the cascading method in terms of the audio frame +Intersection over Union (IoU) metric. Furthermore, we observed that including +frame-level rationales significantly enhances hate speech detection accuracy +for the E2E approach. + \textbf{Disclaimer} The reader may encounter content of an offensive or +hateful nature. However, given the nature of the work, this cannot be avoided. + +
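+
+ As a minimal sketch of the frame-level IoU idea (assuming a single predicted
+and a single gold time interval; the paper's aggregation over multiple
+rationale segments may differ):
+
+```python
+def interval_iou(pred, gold):
+    """IoU between two time intervals given as (start, end) in seconds."""
+    inter = max(0.0, min(pred[1], gold[1]) - max(pred[0], gold[0]))
+    union = (pred[1] - pred[0]) + (gold[1] - gold[0]) - inter
+    return inter / union if union > 0 else 0.0
+
+# Example: predicted hate-speech rationale vs. annotated gold span
+print(interval_iou((1.2, 3.0), (1.5, 3.4)))  # ~0.68
+```
+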
+
+ comment: Accepted to SIGDIAL 2024 +
+
+
+
+
+ + ☆ On Tables with Numbers, with Numbers + + +
+ This paper is a critical reflection on the epistemic culture of contemporary +computational linguistics, framed in the context of its growing obsession with +tables with numbers. We argue against tables with numbers on the basis of their +epistemic irrelevance, their environmental impact, their role in enabling and +exacerbating social inequalities, and their deep ties to commercial +applications and profit-driven research. We substantiate our arguments with +empirical evidence drawn from a meta-analysis of computational linguistics +research over the last decade. + +
+
+
+
+
+ + ☆ Quantum Algorithms for Compositional Text Processing + + +
+ Quantum computing and AI have found a fruitful intersection in the field of +natural language processing. We focus on the recently proposed DisCoCirc +framework for natural language, and propose a quantum adaptation, QDisCoCirc. +This is motivated by a compositional approach to rendering AI interpretable: +the behavior of the whole can be understood in terms of the behavior of parts, +and the way they are put together. For the model-native primitive operation of +text similarity, we derive quantum algorithms for fault-tolerant quantum +computers to solve the task of question-answering within QDisCoCirc, and show +that this is BQP-hard; note that we do not consider the complexity of +question-answering in other natural language processing models. Assuming +widely-held conjectures, implementing the proposed model classically would +require super-polynomial resources. Therefore, it could provide a meaningful +demonstration of the power of practical quantum processors. The model +construction builds on previous work in compositional quantum natural language +processing. Word embeddings are encoded as parameterized quantum circuits, and +compositionality here means that the quantum circuits compose according to the +linguistic structure of the text. We outline a method for evaluating the model +on near-term quantum processors, and elsewhere we report on a recent +implementation of this on quantum hardware. In addition, we adapt a quantum +algorithm for the closest vector problem to obtain a Grover-like speedup in the +fault-tolerant regime for our model. This provides an unconditional quadratic +speedup over any classical algorithm in certain circumstances, which we will +verify empirically in future work. + +
+
+ comment: In Proceedings QPL 2024, arXiv:2408.05113 +
+
+
+
+
+ + ☆ DiagESC: Dialogue Synthesis for Integrating Depression Diagnosis into + Emotional Support Conversation SIGDIAL 2024 + + +
+ Dialogue systems for mental health care aim to provide appropriate support to +individuals experiencing mental distress. While extensive research has been +conducted to deliver adequate emotional support, existing studies cannot +identify individuals who require professional medical intervention and cannot +offer suitable guidance. We introduce the Diagnostic Emotional Support +Conversation task for an advanced mental health management system. We develop +the DESC dataset to assess depression symptoms while maintaining user +experience by utilizing task-specific utterance generation prompts and a strict +filtering algorithm. Evaluations by professional psychological counselors +indicate that DESC has a superior ability to diagnose depression than existing +data. Additionally, conversational quality evaluation reveals that DESC +maintains fluent, consistent, and coherent dialogues. + +
+
+ comment: Accepted by SIGDIAL 2024 +
+
+
+
+
+ + ☆ Enhancing Dialogue Speech Recognition with Robust Contextual Awareness + via Noise Representation Learning SIGDIAL2024 + + +
+ Recent dialogue systems rely on turn-based spoken interactions, requiring +accurate Automatic Speech Recognition (ASR). Errors in ASR can significantly +impact downstream dialogue tasks. To address this, using dialogue context from +user and agent interactions for transcribing subsequent utterances has been +proposed. This method incorporates the transcription of the user's speech and +the agent's response as model input, using the accumulated context generated by +each turn. However, this context is susceptible to ASR errors because it is +generated by the ASR model in an auto-regressive fashion. Such noisy context +can further degrade the benefits of context input, resulting in suboptimal ASR +performance. In this paper, we introduce Context Noise Representation Learning +(CNRL) to enhance robustness against noisy context, ultimately improving +dialogue speech recognition accuracy. To maximize the advantage of context +awareness, our approach includes decoder pre-training using text-based dialogue +data and noise representation learning for a context encoder. Based on the +evaluation of speech dialogues, our method shows superior results compared to +baselines. Furthermore, the strength of our approach is highlighted in noisy +environments where user speech is barely audible due to real-world noise, +relying on contextual information to transcribe the input accurately. + +
+
+ comment: 11 pages, 2 figures, Accepted to SIGDIAL2024 +
+
+
+
+
+ + ☆ ARPA: A Novel Hybrid Model for Advancing Visual Word Disambiguation + Using Large Language Models and Transformers + + +
+ In the rapidly evolving fields of natural language processing and computer +vision, Visual Word Sense Disambiguation (VWSD) stands as a critical, yet +challenging task. The quest for models that can seamlessly integrate and +interpret multimodal data is more pressing than ever. Imagine a system that can +understand language with the depth and nuance of human cognition, while +simultaneously interpreting the rich visual context of the world around it. + We present ARPA, an architecture that fuses the unparalleled contextual +understanding of large language models with the advanced feature extraction +capabilities of transformers, which then pass through a custom Graph Neural +Network (GNN) layer to learn intricate relationships and subtle nuances within +the data. This innovative architecture not only sets a new benchmark in visual +word disambiguation but also introduces a versatile framework poised to +transform how linguistic and visual data interact by harnessing the synergistic +strengths of its components, ensuring robust performance even in the most +complex disambiguation scenarios. Through a series of experiments and +comparative analysis, we reveal the substantial advantages of our model, +underscoring its potential to redefine standards in the field. Beyond its +architectural prowess, our architecture excels through experimental +enrichments, including sophisticated data augmentation and multi-modal training +techniques. + ARPA's introduction marks a significant milestone in visual word +disambiguation, offering a compelling solution that bridges the gap between +linguistic and visual modalities. We invite researchers and practitioners to +explore the capabilities of our model, envisioning a future where such hybrid +models drive unprecedented advancements in artificial intelligence. + +
+
+
+
+
+ + ☆ Controlling Surprisal in Music Generation via Information Content Curve + Matching + + +
+ In recent years, the quality and public interest in music generation systems +have grown, encouraging research into various ways to control these systems. We +propose a novel method for controlling surprisal in music generation using +sequence models. To achieve this goal, we define a metric called Instantaneous +Information Content (IIC). The IIC serves as a proxy function for the perceived +musical surprisal (as estimated from a probabilistic model) and can be +calculated at any point within a music piece. This enables the comparison of +surprisal across different musical content even if the musical events occur in +irregular time intervals. We use beam search to generate musical material whose +IIC curve closely approximates a given target IIC. We experimentally show that +the IIC correlates with harmonic and rhythmic complexity and note density. The +correlation decreases with the length of the musical context used for +estimating the IIC. Finally, we conduct a qualitative user study to test if +human listeners can identify the IIC curves that have been used as targets when +generating the respective musical material. We provide code for creating IIC +interpolations and IIC visualizations on https://github.com/muthissar/iic. + +
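+
+ A rough sketch of the underlying quantity (per-event information content from
+a sequence model's probabilities; the decay-weighted aggregation below is an
+assumption, not the paper's exact IIC definition):
+
+```python
+import math
+
+def information_content(probs):
+    """Per-event information content, -log2 p(event | context)."""
+    return [-math.log2(p) for p in probs]
+
+def iic(probs, onsets, t, decay=1.0):
+    """Illustrative instantaneous IC at time t: exponentially decayed sum of
+    the information content of events with onset <= t."""
+    return sum(ic * math.exp(-decay * (t - ti))
+               for ic, ti in zip(information_content(probs), onsets) if ti <= t)
+
+# Three note events with model probabilities and onset times (seconds)
+print(iic([0.5, 0.1, 0.25], [0.0, 0.5, 1.0], t=1.0))
+```
+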
+
+ comment: 8 pages, 4 figures, 2 tables, accepted at the 25th Int. Society for + Music Information Retrieval Conf., San Francisco, USA, 2024 +
+
+
+
+
+ + ☆ The Language of Trauma: Modeling Traumatic Event Descriptions Across + Domains with Explainable AI + + +
+ Psychological trauma can manifest following various distressing events and is +captured in diverse online contexts. However, studies traditionally focus on a +single aspect of trauma, often neglecting the transferability of findings +across different scenarios. We address this gap by training language models +with progressing complexity on trauma-related datasets, including +genocide-related court data, a Reddit dataset on post-traumatic stress disorder +(PTSD), counseling conversations, and Incel forum posts. Our results show that +the fine-tuned RoBERTa model excels in predicting traumatic events across +domains, slightly outperforming large language models like GPT-4. Additionally, +SLALOM-feature scores and conceptual explanations effectively differentiate and +cluster trauma-related language, highlighting different trauma aspects and +identifying sexual abuse and experiences related to death as a common traumatic +event across all datasets. This transferability is crucial as it allows for the +development of tools to enhance trauma detection and intervention in diverse +populations and settings. + +
+
+
+
+
+ + ☆ ConvKGYarn: Spinning Configurable and Scalable Conversational Knowledge + Graph QA datasets with Large Language Models + + +
+ The rapid advancement of Large Language Models (LLMs) and conversational +assistants necessitates dynamic, scalable, and configurable conversational +datasets for training and evaluation. These datasets must accommodate diverse +user interaction modes, including text and voice, each presenting unique +modeling challenges. Knowledge Graphs (KGs), with their structured and evolving +nature, offer an ideal foundation for current and precise knowledge. Although +human-curated KG-based conversational datasets exist, they struggle to keep +pace with the rapidly changing user information needs. We present ConvKGYarn, a +scalable method for generating up-to-date and configurable conversational KGQA +datasets. Qualitative psychometric analyses confirm our method can generate +high-quality datasets rivaling a popular conversational KGQA dataset while +offering it at scale and covering a wide range of human-interaction +configurations. We showcase its utility by testing LLMs on diverse +conversations - exploring model behavior on conversational KGQA sets with +different configurations grounded in the same KG fact set. Our results +highlight the ability of ConvKGYarn to improve KGQA foundations and evaluate +parametric knowledge of LLMs, thus offering a robust solution to the constantly +evolving landscape of conversational assistants. + +
+
+
+
+
+ + ☆ A New Pipeline For Generating Instruction Dataset via RAG and Self + Fine-Tuning SC + + +
+ With the rapid development of large language models in recent years, there
+has been an increasing demand for domain-specific Agents that can cater to the
+unique needs of enterprises and organizations. Unlike general models, which
+strive for broad coverage, these specialized Agents rely on focused datasets
+tailored to their intended applications. This research proposes a pipeline that
+leverages the power of LLMs and a Retrieval-Augmented Generation (RAG)
+framework to construct high-quality instruction datasets for fine-tuning on
+specific domains using custom document collections. By ingesting
+domain-specific documents, the pipeline generates relevant and contextually
+appropriate instructions, thus effectively creating a comprehensive dataset for
+fine-tuning LLMs on the target domain. This approach overcomes the limitations
+of traditional dataset creation methods, which often rely on manual curation or
+web-scraping techniques that may introduce noise and irrelevant data. Notably,
+our pipeline offers a dynamic solution that can quickly adapt to updates or
+modifications in the domain-specific document collection, eliminating the need
+for complete retraining. Additionally, it addresses the challenge of data
+scarcity by enabling the generation of instruction datasets from a limited set
+of initial documents, rendering it suitable for unpopular or specialized
+domains where comprehensive datasets are scarce. As a case study, we apply this
+approach to the domain of psychiatry, a field requiring specialized knowledge
+and sensitive handling of patient information. The resulting fine-tuned LLM
+demonstrates the viability of the proposed approach and underscores its
+potential for widespread adoption across various industries and domains
+where tailored, accurate, and contextually relevant language models are
+indispensable.
+
+ 
+
+ comment: 5 pages, SCA 2024: The 7th IEEE International Workshop on Smart + Computing & Applications +
+
+
+
+
+ + ☆ AdTEC: A Unified Benchmark for Evaluating Text Quality in Search Engine + Advertising + + +
+ With the increasing fluency of ad texts automatically created by natural
+language generation technology, there is high demand for verifying the quality
+of these creatives in real-world settings. We propose AdTEC, the first public
+benchmark to evaluate ad texts in multiple aspects from the perspective of
+practical advertising operations. Our contributions are: (i) defining five
+tasks for evaluating the quality of ad texts and building a dataset based on
+the actual operational experience of advertising agencies, which is typically
+kept in-house; (ii) validating the performance of existing pre-trained language
+models (PLMs) and human evaluators on the dataset; and (iii) analyzing the
+characteristics of the benchmark and the challenges it poses. The results show
+that while PLMs have already reached a practical usage level in several tasks,
+humans still outperform them in certain domains, implying that there is
+significant room for improvement in those areas.
+
+ 
+
+
+
+
+ + ☆ GlyphPattern: An Abstract Pattern Recognition for Vision-Language Models + + +
+ Vision-Language Models (VLMs) building upon the foundation of powerful large +language models have made rapid progress in reasoning across visual and textual +data. While VLMs perform well on vision tasks that they are trained on, our +results highlight key challenges in abstract pattern recognition. We present +GlyphPattern, a 954 item dataset that pairs 318 human-written descriptions of +visual patterns from 40 writing systems with three visual presentation styles. + GlyphPattern evaluates abstract pattern recognition in VLMs, requiring models +to understand and judge natural language descriptions of visual patterns. +GlyphPattern patterns are drawn from a large-scale cognitive science +investigation of human writing systems; as a result, they are rich in spatial +reference and compositionality. Our experiments show that GlyphPattern is +challenging for state-of-the-art VLMs (GPT-4o achieves only 55% accuracy), with +marginal gains from few-shot prompting. Our detailed error analysis reveals +challenges at multiple levels, including visual processing, natural language +understanding, and pattern generalization. + +
+
+
+
+
+ + ☆ Creating Arabic LLM Prompts at Scale + + +
+ The debut of ChatGPT and Bard has popularized instruction-following text
+generation using LLMs, where a user can interrogate an LLM using natural
+language requests and obtain natural language answers that match their
+requests. Training LLMs to respond in this manner requires a large number of
+worked-out examples of user requests (i.e., prompts) with corresponding gold
+responses. In this paper, we introduce two methods for creating such prompts
+for Arabic cheaply and quickly. The first method entails automatically
+translating existing prompt datasets from English, such as PromptSource and
+Super-NaturalInstructions, and then using machine translation quality
+estimation to retain only high-quality translations. The second method involves
+creating natural language prompts on top of existing Arabic NLP datasets. Using
+these two methods, we were able to create more than 67.4 million Arabic prompts
+covering a variety of tasks, including summarization, headline generation,
+grammar checking, open/closed question answering, and creative writing. We
+show that fine-tuning an open 7-billion-parameter large language model, namely
+base Qwen2 7B, enables it to outperform a state-of-the-art 70-billion-parameter
+instruction-tuned model, namely Llama3 70B, in handling Arabic prompts.
+
+ 
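+
+ A sketch of the filtering step in the first method (the quality-estimation
+scorer and the threshold are placeholders; the abstract does not name a
+specific QE model or cut-off):
+
+```python
+def filter_translated_prompts(pairs, qe_score, threshold=0.8):
+    """Keep machine-translated prompts whose quality-estimation score passes
+    a threshold.
+
+    pairs     : list of (english_prompt, arabic_translation) tuples
+    qe_score  : callable returning a reference-free quality estimate in [0, 1]
+                (placeholder for a real QE model)
+    threshold : minimum acceptable score (illustrative value)
+    """
+    return [(en, ar) for en, ar in pairs if qe_score(en, ar) >= threshold]
+```
+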
+
+
+
+
+ + ☆ Chain-of-Strategy Planning with LLMs: Aligning the Generation of + Psychotherapy Dialogue with Strategy in Motivational Interviewing + + +
+ Recent advancements in large language models (LLMs) have shown promise in +generating psychotherapeutic dialogues, especially in Motivational Interviewing +(MI). However, how to employ strategies, a set of motivational interviewing +(MI) skills, to generate therapeutic-adherent conversations with explainability +is underexplored. We propose an approach called strategy-aware dialogue +generation with Chain-of-Strategy (CoS) planning, which first predicts MI +strategies as reasoning and utilizes these strategies to guide the subsequent +dialogue generation. It brings the potential for controllable and explainable +generation in psychotherapy by aligning the generated MI dialogues with +therapeutic strategies. Extensive experiments including automatic and human +evaluations are conducted to validate the effectiveness of the MI strategy. Our +findings demonstrate the potential of LLMs in producing strategically aligned +dialogues and suggest directions for practical applications in +psychotherapeutic settings. + +
+
+
+
+
+ + ☆ Hierarchical in-Context Reinforcement Learning with Hindsight Modular + Reflections for Planning + + +
+ Large Language Models (LLMs) have demonstrated remarkable abilities in
+various language tasks, making them promising candidates for decision-making in
+robotics. Inspired by Hierarchical Reinforcement Learning (HRL), we propose
+Hierarchical in-Context Reinforcement Learning (HCRL), a novel framework in
+which an LLM-based high-level policy decomposes complex tasks into sub-tasks
+on the fly. The sub-tasks, defined by goals, are assigned to a low-level
+policy to complete. Once the LLM agent determines that a goal is achieved, a
+new goal is proposed. To improve the agent's performance in multi-episode
+execution, we propose Hindsight Modular Reflection (HMR): instead of
+reflecting on the full trajectory, we replace the task objective with
+intermediate goals and let the agent reflect on shorter trajectories to improve
+reflection efficiency. We evaluate the decision-making ability of the proposed
+HCRL in three benchmark environments--ALFWorld, Webshop, and HotpotQA. Results
+show that HCRL achieves 9%, 42%, and 10% performance improvements over strong
+in-context learning baselines within 5 episodes of execution.
+
+ 
+
+
+
+
+ + ☆ Does Liking Yellow Imply Driving a School Bus? Semantic Leakage in + Language Models + + +
+ Despite their wide adoption, the biases and unintended behaviors of language +models remain poorly understood. In this paper, we identify and characterize a +phenomenon never discussed before, which we call semantic leakage, where models +leak irrelevant information from the prompt into the generation in unexpected +ways. We propose an evaluation setting to detect semantic leakage both by +humans and automatically, curate a diverse test suite for diagnosing this +behavior, and measure significant semantic leakage in 13 flagship models. We +also show that models exhibit semantic leakage in languages besides English and +across different settings and generation scenarios. This discovery highlights +yet another type of bias in language models that affects their generation +patterns and behavior. + +
+
+
+
+
+ + ☆ Cross-Lingual Conversational Speech Summarization with Large Language + Models + + +
+ Cross-lingual conversational speech summarization is an important problem, +but suffers from a dearth of resources. While transcriptions exist for a number +of languages, translated conversational speech is rare and datasets containing +summaries are non-existent. We build upon the existing Fisher and Callhome +Spanish-English Speech Translation corpus by supplementing the translations +with summaries. The summaries are generated using GPT-4 from the reference +translations and are treated as ground truth. The task is to generate similar +summaries in the presence of transcription and translation errors. We build a +baseline cascade-based system using open-source speech recognition and machine +translation models. We test a range of LLMs for summarization and analyze the +impact of transcription and translation errors. Adapting the Mistral-7B model +for this task performs significantly better than off-the-shelf models and +matches the performance of GPT-4. + +
+
+
+
+
+ + ☆ TOGGL: Transcribing Overlapping Speech with Staggered Labeling + + +
+ Transcribing the speech of multiple overlapping speakers typically requires +separating the audio into multiple streams and recognizing each one +independently. More recent work jointly separates and transcribes, but requires +a separate decoding component for each speaker. We propose the TOGGL model to +simultaneously transcribe the speech of multiple speakers. The TOGGL model uses +special output tokens to attribute the speech to each speaker with only a +single decoder. Our approach generalizes beyond two speakers, even when trained +only on two-speaker data. We demonstrate superior performance compared to +competing approaches on a conversational speech dataset. Our approach also +improves performance on single-speaker audio. + +
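+
+ A heavily simplified illustration of serializing overlapping-speaker
+transcripts into one decoder target with special speaker tokens (the
+interleaving-by-onset scheme and token names here are assumptions, not the
+paper's exact staggered labeling):
+
+```python
+def stagger_labels(utterances):
+    """Serialize (start_time, speaker_id, text) utterances into a single
+    target sequence, prefixing each utterance with a speaker token."""
+    tokens = []
+    for _, speaker, text in sorted(utterances):
+        tokens.append(f"<spk{speaker}>")
+        tokens.extend(text.split())
+    return " ".join(tokens)
+
+print(stagger_labels([(0.0, 1, "hello there"), (0.4, 2, "hi")]))
+# -> "<spk1> hello there <spk2> hi"
+```
+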
+
+ comment: 5 pages +
+
+
+
+
+ + ☆ Towards Autonomous Agents: Adaptive-planning, Reasoning, and Acting in + Language Models + + +
+ We propose a novel in-context learning algorithm for building autonomous +decision-making language agents. The language agent continuously attempts to +solve the same task by self-correcting each time the task fails. Our selected +language agent demonstrates the ability to solve tasks in a text-based game +environment. Our results show that the gemma-2-9b-it language model, using our +proposed method, can successfully complete two of six tasks that failed in the +first attempt. This highlights the effectiveness of our approach in enhancing +the problem-solving capabilities of a single language model through +self-correction, paving the way for more advanced autonomous agents. The code +is publicly available at +https://github.com/YenCheHsiao/AutonomousLLMAgentwithAdaptingPlanning. + +
+
+
+
+
+ + ☆ Evaluating Language Models for Efficient Code Generation + + +
+ We introduce Differential Performance Evaluation (DPE), a framework designed +to reliably evaluate Large Language Models (LLMs) for efficient code +generation. Traditional coding benchmarks often fail to provide reliable +insights into code efficiency, due to their reliance on simplistic test inputs +and the absence of effective compound metrics. DPE addresses these issues by +focusing on efficiency-demanding programming tasks and establishing an +insightful compound metric for performance evaluation. DPE operates in two +phases: To curate efficiency datasets, it selects efficiency-demanding tasks +from existing coding benchmarks and generates computationally expensive inputs +to stress the efficiency of LLM solutions. To assess the code efficiency, DPE +profiles the new solution and compares it globally against a set of reference +solutions that exhibit distinct efficiency levels, where the matched level +defines its efficiency score. As a proof of concept, we use DPE to create +EvalPerf, a benchmark with 121 performance-challenging coding tasks. Our +comprehensive evaluation draws interesting findings on the efficiency impact of +model sizes, instruction tuning, and prompting. For example, while the scaling +law fails to account for code efficiency, general instruction tuning benefits +both code correctness and efficiency. We also evaluate the evaluation by +examining the effectiveness of DPE, showing that EvalPerf is reliable and +convenient to use even across platforms. + +
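+
+ A simplified sketch of matching a candidate solution against reference
+solutions of distinct efficiency levels (wall-clock timing stands in for the
+profiling DPE actually performs, and the scoring rule is illustrative):
+
+```python
+import time
+
+def efficiency_level(candidate, references, test_input):
+    """Return the highest reference tier the candidate keeps up with.
+    `references` is ordered from least to most efficient solution."""
+    def run_time(fn):
+        start = time.perf_counter()
+        fn(test_input)
+        return time.perf_counter() - start
+
+    t_candidate = run_time(candidate)
+    level = 0
+    for i, ref in enumerate(references, start=1):
+        if t_candidate <= run_time(ref):
+            level = i
+    return level
+```
+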
+
+
+
+
+ + ☆ Evaluating Language Models on Entity Disambiguation in Tables + + +
+ Tables are crucial containers of information, but understanding their meaning
+may be challenging. Indeed, recently, there has been a focus on Semantic Table
+Interpretation (STI), i.e., the task that involves the semantic annotation of
+tabular data to disambiguate their meaning. Over the years, there has been a
+surge in interest in data-driven approaches based on deep learning that have
+increasingly been combined with heuristic-based approaches. More recently,
+the advent of Large Language Models (LLMs) has led to a new category of
+approaches for table annotation. The interest in this research field,
+characterised by multiple challenges, has led to a proliferation of approaches
+employing different techniques. However, these approaches have not been
+consistently evaluated on common ground, making evaluation and comparison
+difficult. This work proposes an extensive evaluation of four state-of-the-art
+(SOTA) approaches - Alligator (formerly s-elBat), Dagobah, TURL, and
+TableLlama; the first two belong to the family of heuristic-based algorithms,
+while the others are respectively encoder-only and decoder-only LLMs. The
+primary objective is to measure the ability of these approaches to solve the
+entity disambiguation task, with the ultimate aim of charting new research
+paths in the field.
+
+ 
+
+
+
+
+ + ☆ Global-to-Local Support Spectrums for Language Model Explainability + + +
+ Existing sample-based methods, like influence functions and representer +points, measure the importance of a training point by approximating the effect +of its removal from training. As such, they are skewed towards outliers and +points that are very close to the decision boundaries. The explanations +provided by these methods are often static and not specific enough for +different test points. In this paper, we propose a method to generate an +explanation in the form of support spectrums which are based on two main ideas: +the support sets and a global-to-local importance measure. The support set is +the set of training points, in the predicted class, that ``lie in between'' the +test point and training points in the other classes. They indicate how well the +test point can be distinguished from the points not in the predicted class. The +global-to-local importance measure is obtained by decoupling existing methods +into the global and local components which are then used to select the points +in the support set. Using this method, we are able to generate explanations +that are tailored to specific test points. In the experiments, we show the +effectiveness of the method in image classification and text generation tasks. + +
+
+
+
+
+ + ♻ ☆ On the Impact of Calibration Data in Post-training Quantization and + Pruning ACL 2024 + + +
+ Quantization and pruning form the foundation of compression for neural +networks, enabling efficient inference for large language models (LLMs). +Recently, various quantization and pruning techniques have demonstrated +remarkable performance in a post-training setting. They rely upon calibration +data, a small set of unlabeled examples that are used to generate layer +activations. However, no prior work has systematically investigated how the +calibration data impacts the effectiveness of model compression methods. In +this paper, we present the first extensive empirical study on the effect of +calibration data upon LLM performance. We trial a variety of quantization and +pruning methods, datasets, tasks, and models. Surprisingly, we find substantial +variations in downstream task performance, contrasting existing work that +suggests a greater level of robustness to the calibration data. Finally, we +make a series of recommendations for the effective use of calibration data in +LLM quantization and pruning. + +
+
+ comment: ACL 2024 +
+
+
+
+
+ + ♻ ☆ Formal-LLM: Integrating Formal Language and Natural Language for + Controllable LLM-based Agents + + +
+ Recent advancements on Large Language Models (LLMs) enable AI Agents to +automatically generate and execute multi-step plans to solve complex tasks. +However, since LLM's content generation process is hardly controllable, current +LLM-based agents frequently generate invalid or non-executable plans, which +jeopardizes the performance of the generated plans and corrupts users' trust in +LLM-based agents. In response, this paper proposes a novel "Formal-LLM" +framework for LLM-based agents by integrating the expressiveness of natural +language and the precision of formal language. Specifically, the framework +allows agent developers to express their requirements or constraints for the +planning process as an automaton. A stack-based LLM plan generation process is +then conducted under the supervision of the automaton to ensure that the +generated plan satisfies the constraints, making the planning process +controllable. We conduct experiments on both benchmark tasks and practical +real-life tasks, and our framework achieves over 50% overall performance +increase, which validates the feasibility and effectiveness of employing +Formal-LLM to guide the plan generation of agents, preventing the agents from +generating invalid and unsuccessful plans. Further, more controllable LLM-based +agents can facilitate the broader utilization of LLM in application scenarios +where high validity of planning is essential. The source code of this work is +available at https://github.com/agiresearch/Formal-LLM. + +
+
+
+
+
+ + ♻ ☆ Benchmarking Cognitive Biases in Large Language Models as Evaluators + + +
+ Large Language Models (LLMs) have recently been shown to be effective as +automatic evaluators with simple prompting and in-context learning. In this +work, we assemble 15 LLMs of four different size ranges and evaluate their +output responses by preference ranking from the other LLMs as evaluators, such +as System Star is better than System Square. We then evaluate the quality of +ranking outputs introducing the Cognitive Bias Benchmark for LLMs as Evaluators +(CoBBLEr), a benchmark to measure six different cognitive biases in LLM +evaluation outputs, such as the Egocentric bias where a model prefers to rank +its own outputs highly in evaluation. We find that LLMs are biased text quality +evaluators, exhibiting strong indications on our bias benchmark (average of 40% +of comparisons across all models) within each of their evaluations that +question their robustness as evaluators. Furthermore, we examine the +correlation between human and machine preferences and calculate the average +Rank-Biased Overlap (RBO) score to be 49.6%, indicating that machine +preferences are misaligned with humans. According to our findings, LLMs may +still be unable to be utilized for automatic annotation aligned with human +preferences. Our project page is at: https://minnesotanlp.github.io/cobbler. + +
+
+ comment: Published in 2024. 29 pages, 9 figures, 14 tables
+
+
+
+
+
+ + ♻ ☆ Moderating Illicit Online Image Promotion for Unsafe User-Generated + Content Games Using Large Vision-Language Models USENIX Security + + +
+ Online user generated content games (UGCGs) are increasingly popular among +children and adolescents for social interaction and more creative online +entertainment. However, they pose a heightened risk of exposure to explicit +content, raising growing concerns for the online safety of children and +adolescents. Despite these concerns, few studies have addressed the issue of +illicit image-based promotions of unsafe UGCGs on social media, which can +inadvertently attract young users. This challenge arises from the difficulty of +obtaining comprehensive training data for UGCG images and the unique nature of +these images, which differ from traditional unsafe content. In this work, we +take the first step towards studying the threat of illicit promotions of unsafe +UGCGs. We collect a real-world dataset comprising 2,924 images that display +diverse sexually explicit and violent content used to promote UGCGs by their +game creators. Our in-depth studies reveal a new understanding of this problem +and the urgent need for automatically flagging illicit UGCG promotions. We +additionally create a cutting-edge system, UGCG-Guard, designed to aid social +media platforms in effectively identifying images used for illicit UGCG +promotions. This system leverages recently introduced large vision-language +models (VLMs) and employs a novel conditional prompting strategy for zero-shot +domain adaptation, along with chain-of-thought (CoT) reasoning for contextual +identification. UGCG-Guard achieves outstanding results, with an accuracy rate +of 94% in detecting these images used for the illicit promotion of such games +in real-world scenarios. + +
+
+ comment: To Appear in the 33rd USENIX Security Symposium, August 14-16, 2024 +
+
+
+
+
+ + ♻ ☆ RAGGED: Towards Informed Design of Retrieval Augmented Generation + Systems + + +
+ Retrieval-augmented generation (RAG) can significantly improve the +performance of language models (LMs) by providing additional context for tasks +such as document-based question answering (DBQA). However, the effectiveness of +RAG is highly dependent on its configuration. To systematically find the +optimal configuration, we introduce RAGGED, a framework for analyzing RAG +configurations across various DBQA tasks. Using the framework, we discover +distinct LM behaviors in response to varying context quantities, context +qualities, and retrievers. For instance, while some models are robust to noisy +contexts, monotonically performing better with more contexts, others are more +noise-sensitive and can effectively use only a few contexts before declining in +performance. This framework also provides a deeper analysis of these +differences by evaluating the LMs' sensitivity to signal and noise under +specific context quality conditions. Using RAGGED, researchers and +practitioners can derive actionable insights about how to optimally configure +their RAG systems for their specific question-answering tasks. + +
+
+
+
+
+ + ♻ ☆ Across Platforms and Languages: Dutch Influencers and Legal Disclosures + on Instagram, YouTube and TikTok + + +
+ Content monetization on social media fuels a growing influencer economy.
+Influencer marketing remains largely undisclosed or inappropriately disclosed
+on social media. Non-disclosure issues have become a priority for national and
+supranational authorities worldwide, which are starting to impose increasingly
+harsh sanctions. This paper proposes a transparent methodology for measuring
+whether and how influencers comply with disclosure requirements based on legal
+standards. We introduce a novel distinction between disclosures that are
+legally sufficient (green) and legally insufficient (yellow). We apply this
+methodology to an original dataset reflecting the content of 150 Dutch
+influencers publicly registered with the Dutch Media Authority under
+recently introduced registration obligations. The dataset consists of 292,315
+posts and is multilingual (English and Dutch) and cross-platform (Instagram,
+YouTube and TikTok). We find that influencer marketing remains generally
+underdisclosed on social media, and that bigger influencers are not necessarily
+more compliant with disclosure standards.
+
+ 
+
+ comment: Accept for publication at the 16th International Conference on + Advances in Social Networks Analysis and Mining - ASONAM-2024 +
+
+
+
+
+ + ♻ ☆ Semisupervised Neural Proto-Language Reconstruction ACL 2024 + + +
+ Existing work implementing comparative reconstruction of ancestral languages +(proto-languages) has usually required full supervision. However, historical +reconstruction models are only of practical value if they can be trained with a +limited amount of labeled data. We propose a semisupervised historical +reconstruction task in which the model is trained on only a small amount of +labeled data (cognate sets with proto-forms) and a large amount of unlabeled +data (cognate sets without proto-forms). We propose a neural architecture for +comparative reconstruction (DPD-BiReconstructor) incorporating an essential +insight from linguists' comparative method: that reconstructed words should not +only be reconstructable from their daughter words, but also deterministically +transformable back into their daughter words. We show that this architecture is +able to leverage unlabeled cognate sets to outperform strong semisupervised +baselines on this novel task. + +
+
+ comment: Accepted to ACL 2024; v2: correct typo +
+
+
+
+
+ + ♻ ☆ Private Fine-tuning of Large Language Models with Zeroth-order + Optimization + + +
+ Differentially private stochastic gradient descent (DP-SGD) allows models to
+be trained in a privacy-preserving manner, but has proven difficult to scale to
+the era of foundation models. We introduce DP-ZO, a private fine-tuning
+framework for large language models that privatizes zeroth-order optimization
+methods. A key insight into the design of our method is that the direction of
+the gradient in the zeroth-order optimization we use is random and the only
+information from training data is the step size, i.e., a scalar. Therefore, we
+only need to privatize the scalar step size, which is memory-efficient. DP-ZO
+provides a strong privacy-utility trade-off across different tasks and model
+sizes that is comparable to DP-SGD in $(\varepsilon,\delta)$-DP. Notably,
+DP-ZO possesses significant advantages over DP-SGD in memory efficiency, and
+obtains higher utility in $\varepsilon$-DP when using the Laplace mechanism.
+
+ 
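+
+ A minimal sketch of the idea of privatizing only the scalar in a zeroth-order
+update (the two-point estimator, clipping bound, and Laplace noise scale are
+illustrative; see the paper for the exact mechanism and privacy accounting):
+
+```python
+import numpy as np
+
+def dp_zo_step(params, loss_fn, lr=1e-3, eps=1e-3, clip=1.0, noise=0.1, rng=None):
+    """One differentially private zeroth-order update. Only the scalar loss
+    difference depends on the training data, so only it is clipped and noised;
+    the random direction z is data-independent."""
+    rng = rng or np.random.default_rng()
+    z = rng.standard_normal(params.shape)                    # random direction
+    delta = (loss_fn(params + eps * z) - loss_fn(params - eps * z)) / (2 * eps)
+    delta = float(np.clip(delta, -clip, clip))               # bound sensitivity
+    delta += rng.laplace(scale=noise)                        # privatize the scalar
+    return params - lr * delta * z
+```
+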
+
+
+
+
+ + ♻ ☆ Fakes of Varying Shades: How Warning Affects Human Perception and + Engagement Regarding LLM Hallucinations + + +
+ The widespread adoption and transformative effects of large language models +(LLMs) have sparked concerns regarding their capacity to produce inaccurate and +fictitious content, referred to as `hallucinations'. Given the potential risks +associated with hallucinations, humans should be able to identify them. This +research aims to understand the human perception of LLM hallucinations by +systematically varying the degree of hallucination (genuine, minor +hallucination, major hallucination) and examining its interaction with warning +(i.e., a warning of potential inaccuracies: absent vs. present). Participants +(N=419) from Prolific rated the perceived accuracy and engaged with content +(e.g., like, dislike, share) in a Q/A format. Participants ranked content as +truthful in the order of genuine, minor hallucination, and major hallucination, +and user engagement behaviors mirrored this pattern. More importantly, we +observed that warning improved the detection of hallucination without +significantly affecting the perceived truthfulness of genuine content. We +conclude by offering insights for future tools to aid human detection of +hallucinations. All survey materials, demographic questions, and post-session +questions are available at: +https://github.com/MahjabinNahar/fakes-of-varying-shades-survey-materials + +
+
+ comment: Accepted at COLM 2024 +
+
+
+
+
+ + ♻ ☆ XMainframe: A Large Language Model for Mainframe Modernization + + +
+ Mainframe operating systems, despite their inception in the 1940s, continue +to support critical sectors like finance and government. However, these systems +are often viewed as outdated, requiring extensive maintenance and +modernization. Addressing this challenge necessitates innovative tools that can +understand and interact with legacy codebases. To this end, we introduce +XMainframe, a state-of-the-art large language model (LLM) specifically designed +with knowledge of mainframe legacy systems and COBOL codebases. Our solution +involves the creation of an extensive data collection pipeline to produce +high-quality training datasets, enhancing XMainframe's performance in this +specialized domain. Additionally, we present MainframeBench, a comprehensive +benchmark for assessing mainframe knowledge, including multiple-choice +questions, question answering, and COBOL code summarization. Our empirical +evaluations demonstrate that XMainframe consistently outperforms existing +state-of-the-art LLMs across these tasks. Specifically, XMainframe achieves 30% +higher accuracy than DeepSeek-Coder on multiple-choice questions, doubles the +BLEU score of Mixtral-Instruct 8x7B on question answering, and scores six times +higher than GPT-3.5 on COBOL summarization. Our work highlights the potential +of XMainframe to drive significant advancements in managing and modernizing +legacy systems, thereby enhancing productivity and saving time for software +developers. + +
+
+
+
+
+ + ♻ ☆ MC-GPT: Empowering Vision-and-Language Navigation with Memory Map and + Reasoning Chains + + +
+ In the Vision-and-Language Navigation (VLN) task, the agent is required to +navigate to a destination following a natural language instruction. While +learning-based approaches have been a major solution to the task, they suffer +from high training costs and lack of interpretability. Recently, Large Language +Models (LLMs) have emerged as a promising tool for VLN due to their strong +generalization capabilities. However, existing LLM-based methods face +limitations in memory construction and diversity of navigation strategies. To +address these challenges, we propose a suite of techniques. Firstly, we +introduce a method to maintain a topological map that stores navigation +history, retaining information about viewpoints, objects, and their spatial +relationships. This map also serves as a global action space. Additionally, we +present a Navigation Chain of Thoughts module, leveraging human navigation +examples to enrich navigation strategy diversity. Finally, we establish a +pipeline that integrates navigational memory and strategies with perception and +action prediction modules. Experimental results on the REVERIE and R2R datasets +show that our method effectively enhances the navigation ability of the LLM and +improves the interpretability of navigation reasoning. + +
+
+
+
+
+ + ♻ ☆ It's Morphing Time: Unleashing the Potential of Multiple LLMs via + Multi-objective Optimization + + +
+ In this paper, we introduce a novel approach for large language model merging
+via black-box multi-objective optimization algorithms. The goal of model
+merging is to combine multiple models, each excelling in different tasks, into
+a single model that outperforms any of the individual source models. However,
+model merging faces two significant challenges: First, existing methods rely
+heavily on human intuition and customized strategies to tackle multiple tasks.
+Second, it is difficult to find a good model merging configuration within a
+limited number of evaluations. To address these challenges, we propose a
+multi-objective optimization based model merging method named MM-MO. The
+proposed method can automatically search merging configurations for multiple
+tasks with multi-objective optimization algorithms. Moreover, to obtain
+high-quality model merging configurations within a limited number of
+evaluation iterations, we have made several improvements to multi-objective
+Bayesian optimization specifically for model merging scenarios. First, we
+introduced a weak-to-strong method to improve the acquisition strategy.
+Second, we employed Fisher information to select configurations, further
+increasing the chances of discovering superior model merging configurations.
+Third, we designed a sparsity metric as an additional optimization objective
+to enhance the model's generalization performance across different tasks. We
+conducted comprehensive experiments against other mainstream model merging
+methods, demonstrating that our method consistently outperforms them.
+Moreover, performance improvements are observed even on tasks not explicitly
+targeted as optimization objectives, indicating that our method enhances the
+overall potential of the model. ...
+
+ 
+
+
+
+
+ + ♻ ☆ Strong and weak alignment of large language models with human values + + +
+ Minimizing negative impacts of Artificial Intelligence (AI) systems on human
+societies without human supervision requires them to be able to align with
+human values. However, most current work only addresses this issue from a
+technical point of view, e.g., improving current methods relying on
+reinforcement learning from human feedback, neglecting what alignment means
+and what is required for it to occur. Here, we propose to distinguish strong
+and weak value alignment. Strong alignment requires cognitive abilities (either
+human-like or different from humans) such as understanding and reasoning about
+agents' intentions and their ability to causally produce desired effects. We
+argue that this is required for AI systems like large language models (LLMs) to
+be able to recognize situations presenting a risk that human values may be
+flouted. To illustrate this distinction, we present a series of prompts showing
+ChatGPT's, Gemini's and Copilot's failures to recognize some of these
+situations. We moreover analyze word embeddings to show that the nearest
+neighbors of some human values in LLMs differ from humans' semantic
+representations. We then propose a new thought experiment that we call "the
+Chinese room with a word transition dictionary", as an extension of John
+Searle's famous proposal. We finally mention current promising research
+directions towards weak alignment, which could produce statistically
+satisfying answers in a number of common situations, though so far without
+ensuring any truth value.
+
+ 
+
+ comment: Accepted for publication in Scientific Reports, special issue on AI
+ alignment
+
+
+
+
+
+ + ♻ ☆ XLAVS-R: Cross-Lingual Audio-Visual Speech Representation Learning for + Noise-Robust Speech Perception ACL2024 + + +
+ Speech recognition and translation systems perform poorly on noisy inputs, +which are frequent in realistic environments. Augmenting these systems with +visual signals has the potential to improve robustness to noise. However, +audio-visual (AV) data is only available in limited amounts and for fewer +languages than audio-only resources. To address this gap, we present XLAVS-R, a +cross-lingual audio-visual speech representation model for noise-robust speech +recognition and translation in over 100 languages. It is designed to maximize +the benefits of limited multilingual AV pre-training data, by building on top +of audio-only multilingual pre-training and simplifying existing pre-training +schemes. Extensive evaluation on the MuAViC benchmark shows the strength of +XLAVS-R on downstream audio-visual speech recognition and translation tasks, +where it outperforms the previous state of the art by up to 18.5% WER and 4.7 +BLEU given noisy AV inputs, and enables strong zero-shot audio-visual ability +with audio-only fine-tuning. + +
+
+ comment: ACL2024 +
+
+
+
+
+ + ♻ ☆ A Survey on LoRA of Large Language Models + + +
+ Low-Rank Adaptation~(LoRA), which updates the dense neural network layers
+with pluggable low-rank matrices, is one of the best-performing
+parameter-efficient fine-tuning paradigms. Furthermore, it has significant
+advantages in cross-task generalization and privacy preservation. Hence, LoRA
+has gained much attention recently, and the volume of related literature has
+grown exponentially. It is necessary to conduct a comprehensive overview of the
+current progress on LoRA. This survey categorizes and reviews the progress from
+the perspectives of (1) downstream adaptation improving variants that improve
+LoRA's performance on downstream tasks; (2) cross-task generalization methods
+that mix multiple LoRA plugins to achieve cross-task generalization; (3)
+efficiency-improving methods that boost the computation-efficiency of LoRA; (4)
+data privacy-preserving methods that use LoRA in federated learning; and (5)
+applications. This survey also discusses the future directions in this
+field. Finally, we provide a Github
+page~\footnote{\href{https://github.com/ZJU-LLMs/Awesome-LoRAs.git}{https://github.com/ZJU-LLMs/Awesome-LoRAs.git}}
+for readers to check the updates and initiate discussions on this survey paper.
+
+ 
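+
+ For readers new to the mechanism being surveyed, a generic LoRA layer looks
+roughly like this (a minimal sketch, not tied to any specific library covered
+by the survey):
+
+```python
+import torch
+import torch.nn as nn
+
+class LoRALinear(nn.Module):
+    """Frozen dense weight W plus a pluggable low-rank update B @ A."""
+    def __init__(self, in_features, out_features, r=8, alpha=16):
+        super().__init__()
+        self.weight = nn.Parameter(torch.randn(out_features, in_features) * 0.02,
+                                   requires_grad=False)           # frozen base weight
+        self.lora_A = nn.Parameter(torch.randn(r, in_features) * 0.02)
+        self.lora_B = nn.Parameter(torch.zeros(out_features, r))  # zero init: update starts at 0
+        self.scaling = alpha / r
+
+    def forward(self, x):
+        base = x @ self.weight.T
+        update = (x @ self.lora_A.T) @ self.lora_B.T * self.scaling
+        return base + update
+
+y = LoRALinear(64, 32)(torch.randn(4, 64))  # example forward pass, output shape (4, 32)
+```
+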
+
+
+
+
+ + ♻ ☆ ExpNote: Black-box Large Language Models are Better Task Solvers with + Experience Notebook EMNLP 2023 + + +
+ Black-box Large Language Models (LLMs) have shown great power in solving
+various tasks and are considered general problem solvers. However, LLMs still
+fail at many specific tasks even when they understand the task instruction. In
+this paper, we focus on the problem of boosting the ability of black-box LLMs
+to solve downstream tasks. We propose ExpNote, an automated framework to help
+LLMs better adapt to unfamiliar tasks by reflecting on and noting experiences
+from training data and retrieving them from external memory during testing. We
+evaluate ExpNote on multiple tasks, and the experimental results demonstrate
+that the proposed method significantly improves the performance of black-box
+LLMs. The data and code are available at
+https://github.com/forangel2014/ExpNote
+
+ 
+
+ comment: EMNLP 2023 findings +
+
+
+
+
+ + ♻ ☆ ConspEmoLLM: Conspiracy Theory Detection Using an Emotion-Based Large + Language Model + + +
+ The internet has brought both benefits and harms to society. A prime example +of the latter is misinformation, including conspiracy theories, which flood the +web. Recent advances in natural language processing, particularly the emergence +of large language models (LLMs), have improved the prospects of accurate +misinformation detection. However, most LLM-based approaches to conspiracy +theory detection focus only on binary classification and fail to account for +the important relationship between misinformation and affective features (i.e., +sentiment and emotions). Driven by a comprehensive analysis of conspiracy text +that reveals its distinctive affective features, we propose ConspEmoLLM, the +first open-source LLM that integrates affective information and is able to +perform diverse tasks relating to conspiracy theories. These tasks include not +only conspiracy theory detection, but also classification of theory type and +detection of related discussion (e.g., opinions towards theories). ConspEmoLLM +is fine-tuned based on an emotion-oriented LLM using our novel ConDID dataset, +which includes five tasks to support LLM instruction tuning and evaluation. We +demonstrate that when applied to these tasks, ConspEmoLLM largely outperforms +several open-source general domain LLMs and ChatGPT, as well as an LLM that has +been fine-tuned using ConDID, but which does not use affective features. This +project will be released on https://github.com/lzw108/ConspEmoLLM/. + +
+
+ comment: Work in progress +
+
+
+
+
+ + ♻ ☆ Tamil Language Computing: the Present and the Future + + +
+ This paper delves into the text processing aspects of Language Computing, +which enables computers to understand, interpret, and generate human language. +Focusing on tasks such as speech recognition, machine translation, sentiment +analysis, text summarization, and language modelling, language computing +integrates disciplines including linguistics, computer science, and cognitive +psychology to create meaningful human-computer interactions. Recent +advancements in deep learning have made computers more accessible and capable +of independent learning and adaptation. In examining the landscape of language +computing, the paper emphasises foundational work like encoding, where Tamil +transitioned from ASCII to Unicode, enhancing digital communication. It +discusses the development of computational resources, including raw data, +dictionaries, glossaries, annotated data, and computational grammars, necessary +for effective language processing. The challenges of linguistic annotation, the +creation of treebanks, and the training of large language models are also +covered, emphasising the need for high-quality, annotated data and advanced +language models. The paper underscores the importance of building practical +applications for languages like Tamil to address everyday communication needs, +highlighting gaps in current technology. It calls for increased research +collaboration, digitization of historical texts, and fostering digital usage to +ensure the comprehensive development of Tamil language processing, ultimately +enhancing global communication and access to digital services. + +
+
+ comment: 11 pages, This is the write-up of the address delivered at the 30th + Annual Sessions of the Jaffna Science Association, held from March 29-31, + 2023, at the University of Jaffna, Sri Lanka +
+
+
+
+
+ + ♻ ☆ Representation Learning with Conditional Information Flow Maximization ACL 2024 + + +
+ This paper proposes an information-theoretic representation learning
+framework, named conditional information flow maximization, to extract
+noise-invariant sufficient representations for the input data and target task.
+It encourages the learned representations to have good feature uniformity and
+sufficient predictive ability, which can enhance the generalization of
+pre-trained language models (PLMs) for the target task. Firstly, an information
+flow maximization principle is proposed to learn more sufficient
+representations for the input and target by simultaneously maximizing both
+input-representation and representation-label mutual information. Unlike the
+information bottleneck, we handle the input-representation information in an
+opposite way to avoid the over-compression issue of latent representations.
+In addition, to mitigate the negative effect of potential redundant features
+from the input, we design a conditional information minimization principle to
+eliminate negative redundant features while preserving noise-invariant
+features. Experiments on 13 language understanding benchmarks demonstrate that
+our method effectively improves the performance of PLMs for classification and
+regression. Extensive experiments show that the learned representations are
+more sufficient, robust and transferable.
+
+ 
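+
+ Read literally, the two principles above amount to an objective of roughly
+the following shape (a hedged reconstruction with an assumed trade-off weight
+$\beta$; it is not the paper's exact formula):
+
+```latex
+% Information flow maximization with a conditional redundancy penalty (sketch)
+\max_{p(z \mid x)} \; I(X;Z) + I(Z;Y) \;-\; \beta \, I(X;Z \mid Y)
+```
+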
+
+ comment: 16 pages, accepted to ACL 2024 (main conference), the code is + available at https://github.com/zerohd4869/CIFM +
+
+
+
+
+ + ♻ ☆ Unsupervised Parsing by Searching for Frequent Word Sequences among + Sentences with Equivalent Predicate-Argument Structures + + +
+ Unsupervised constituency parsing focuses on identifying word sequences that
+form a syntactic unit (i.e., constituents) in target sentences. Linguists
+identify constituents by evaluating a set of Predicate-Argument Structure
+(PAS)-equivalent sentences in which a constituent appears more frequently than
+non-constituents (i.e., the constituent corresponds to a frequent word sequence
+within the sentence set). However, such frequency information is unavailable to
+previous parsing methods, which identify constituents by observing sentences
+with diverse PAS. In this study, we empirically show that constituents
+correspond to frequent word sequences in the PAS-equivalent sentence set. We
+propose a frequency-based parser, span-overlap, that (1) computes the
+span-overlap score as the word sequence's frequency in the PAS-equivalent
+sentence set and (2) identifies the constituent structure by finding the
+constituent tree with the maximum span-overlap score. The parser achieves
+state-of-the-art parsing accuracy, outperforming existing unsupervised parsers
+in eight out of ten languages. Additionally, we discover a multilingual
+phenomenon: participant-denoting constituents tend to have higher span-overlap
+scores than equal-length event-denoting constituents, meaning that the former
+tend to appear more frequently in the PAS-equivalent sentence set than the
+latter. This phenomenon indicates a statistical difference between the two
+constituent types, laying the foundation for future labeled unsupervised
+parsing research.
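The scoring idea is concrete enough to illustrate with a toy sketch. The code below is a hypothetical illustration of the principle, not the authors' parser: a span's score is the frequency of its word sequence across a set of PAS-equivalent sentences, and the returned parse is the binary tree whose spans maximize the summed score. The example sentences are invented.

```python
# Toy sketch of frequency-based span scoring over PAS-equivalent sentences.
from functools import lru_cache

def span_score(words, i, j, sentence_set):
    """Frequency of words[i:j] as a contiguous subsequence in the sentence set."""
    target, n = words[i:j], j - i
    return sum(sent[k:k + n] == target
               for sent in sentence_set
               for k in range(len(sent) - n + 1))

def best_tree(words, sentence_set):
    @lru_cache(maxsize=None)
    def parse(i, j):
        score = span_score(words, i, j, sentence_set)
        if j - i == 1:
            return score, words[i]
        best = None
        for k in range(i + 1, j):           # try every binary split point
            ls, lt = parse(i, k)
            rs, rt = parse(k, j)
            cand = (score + ls + rs, (lt, rt))
            if best is None or cand[0] > best[0]:
                best = cand
        return best
    return parse(0, len(words))

# Invented PAS-equivalent set: "the dog" recurs, nudging it into one constituent.
equivalents = [tuple("the dog chased a cat".split()),
               tuple("a cat was chased by the dog".split()),
               tuple("the dog it was that chased a cat".split())]
print(best_tree(tuple("the dog chased a cat".split()), equivalents))
```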
+
+
+
+
+ + ♻ ☆ Text Generation: A Systematic Literature Review of Tasks, Evaluation, + and Challenges + + +
+ Text generation has become more accessible than ever, and the increasing +interest in these systems, especially those using large language models, has +spurred an increasing number of related publications. We provide a systematic +literature review comprising 244 selected papers between 2017 and 2024. This +review categorizes works in text generation into five main tasks: open-ended +text generation, summarization, translation, paraphrasing, and question +answering. For each task, we review their relevant characteristics, sub-tasks, +and specific challenges (e.g., missing datasets for multi-document +summarization, coherence in story generation, and complex reasoning for +question answering). Additionally, we assess current approaches for evaluating +text generation systems and ascertain problems with current metrics. Our +investigation shows nine prominent challenges common to all tasks and sub-tasks +in recent text generation publications: bias, reasoning, hallucinations, +misuse, privacy, interpretability, transparency, datasets, and computing. We +provide a detailed analysis of these challenges, their potential solutions, and +which gaps still require further engagement from the community. This systematic +literature review targets two main audiences: early career researchers in +natural language processing looking for an overview of the field and promising +research directions, as well as experienced researchers seeking a detailed view +of tasks, evaluation methodologies, open challenges, and recent mitigation +strategies. + +
+
+ comment: 35 pages, 2 figures, 2 tables, Under review +
+
+
+
+
+ + ♻ ☆ Protecting Copyrighted Material with Unique Identifiers in Large + Language Model Training + + +
+ A major public concern regarding the training of large language models (LLMs)
+is whether they abuse copyrighted online text. Previous membership inference
+methods may be misled by similar examples in vast amounts of training data.
+Additionally, these methods are often too complex for general users to
+understand and use, making them centralized and lacking in transparency and
+trustworthiness. To address these issues, we propose an alternative
+\textit{insert-and-detection} methodology, advocating that web users and
+content platforms employ \textbf{\textit{unique identifiers}} for reliable and
+independent membership inference. Users and platforms can create their own
+identifiers, embed them in copyrighted text, and independently detect them in
+future LLMs. As an initial demonstration, we introduce \textit{ghost
+sentences}, a primitive form of unique identifiers, consisting primarily of
+passphrases made up of random words. By embedding a ghost sentence in a few
+copyrighted texts, users can detect its membership using a perplexity test and
+a \textit{user-friendly} last-$k$ words test. The perplexity test is based on
+the fact that LLMs trained on natural language should exhibit high perplexity
+when encountering unnatural passphrases. As the repetition increases, users can
+leverage the verbatim memorization ability of LLMs to perform a last-$k$ words
+test by chatting with LLMs without writing any code. Both tests offer rigorous
+statistical guarantees for membership inference. For LLaMA-13B, a perplexity
+test on 30 ghost sentences with an average of 7 repetitions in 148K examples
+yields a 0.891 ROC AUC. For the last-$k$ words test with OpenLLaMA-3B, 11 out
+of 16 users, with an average of 24 examples each, successfully identify their
+data from 1.8M examples.
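The perplexity side of this test can be illustrated with a short, hypothetical sketch: it only computes the perplexity an LLM assigns to a passphrase and leaves out the statistical decision rule described in the paper. The checkpoint name, the example passphrase, and the idea of comparing against fresh random passphrases are illustrative assumptions, not the paper's settings.

```python
# Hypothetical sketch of the perplexity check behind a ghost-sentence
# membership test: a passphrase of random words should look very surprising
# to an LLM unless the model memorized it during training.
import math
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

def passphrase_perplexity(model, tokenizer, passphrase: str) -> float:
    """Perplexity the model assigns to the passphrase (exp of mean token NLL)."""
    enc = tokenizer(passphrase, return_tensors="pt")
    with torch.no_grad():
        out = model(**enc, labels=enc["input_ids"])
    return math.exp(out.loss.item())

if __name__ == "__main__":
    name = "gpt2"  # placeholder standing in for the model under audit
    tok = AutoTokenizer.from_pretrained(name)
    lm = AutoModelForCausalLM.from_pretrained(name)
    ghost = "quartz lantern pelican verdict mosaic thimble"  # example passphrase
    ppl = passphrase_perplexity(lm, tok, ghost)
    # A suspiciously low perplexity, relative to fresh random passphrases,
    # would be evidence that the passphrase appeared in the training data.
    print(f"perplexity = {ppl:.1f}")
```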
+
+ comment: Preprint, work in progress +
+
+
+
+
+ + ♻ ☆ reCSE: Portable Reshaping Features for Sentence Embedding in + Self-supervised Contrastive Learning + + +
+ We propose reCSE, a self-supervised contrastive learning framework for
+sentence representation based on feature reshaping. Unlike current advanced
+models that use discrete data augmentation methods, our framework reshapes the
+input features of the original sentence, aggregates the global information of
+each token in the sentence, and alleviates two problems common to current
+advanced models: representation polarity and a linear increase in GPU memory
+consumption. In addition, reCSE achieves competitive performance on semantic
+similarity tasks. Our experiments also show that the proposed feature reshaping
+method has strong universality: it can be transplanted to other self-supervised
+contrastive learning frameworks and enhance their representation ability, even
+achieving state-of-the-art performance. Our code is available at
+https://github.com/heavenhellchen/reCSE.
+
+
+
+
+ + ♻ ☆ Ethos and Pathos in Online Group Discussions: Corpora for Polarisation + Issues in Social Media + + +
+ Growing polarisation in society has caught the attention of the scientific
+community as well as news media, which devote special issues to this
+phenomenon. At the same time, the digitalisation of social interactions
+requires revising concepts from social science regarding the establishment of
+trust, which is a key feature of all human interactions, and group
+polarisation, as well as developing new computational tools to process large
+quantities of available data. Existing methods seem insufficient to tackle the
+problem fully; thus, we propose to approach it by investigating the rhetorical
+strategies employed by individuals in polarising discussions online. To this
+end, we develop multi-topic and multi-platform corpora with manual annotation
+of appeals to ethos and pathos, two of the modes of persuasion in Aristotelian
+rhetoric. The corpora can be employed for training language models to advance
+the study of communication strategies online on a large scale. With the use of
+computational methods, our corpora allow investigation of recurring patterns in
+polarising exchanges across discussion topics and media platforms, and support
+both quantitative and qualitative analyses of the language structures leading
+to and engaged in polarisation.
+
+
+
+
+ + ♻ ☆ FineRadScore: A Radiology Report Line-by-Line Evaluation Technique + Generating Corrections with Severity Scores + + +
+ The current gold standard for evaluating generated chest x-ray (CXR) reports +is through radiologist annotations. However, this process can be extremely +time-consuming and costly, especially when evaluating large numbers of reports. +In this work, we present FineRadScore, a Large Language Model (LLM)-based +automated evaluation metric for generated CXR reports. Given a candidate report +and a ground-truth report, FineRadScore gives the minimum number of +line-by-line corrections required to go from the candidate to the ground-truth +report. Additionally, FineRadScore provides an error severity rating with each +correction and generates comments explaining why the correction was needed. We +demonstrate that FineRadScore's corrections and error severity scores align +with radiologist opinions. We also show that, when used to judge the quality of +the report as a whole, FineRadScore aligns with radiologists as well as current +state-of-the-art automated CXR evaluation metrics. Finally, we analyze +FineRadScore's shortcomings to provide suggestions for future improvements. + +
+
+
+
+
+ + ♻ ☆ MMICT: Boosting Multi-Modal Fine-Tuning with In-Context Examples + + +
+ Although In-Context Learning (ICL) brings remarkable performance gains to +Large Language Models (LLMs), the improvements remain lower than fine-tuning on +downstream tasks. This paper introduces Multi-Modal In-Context Tuning (MMICT), +a novel multi-modal fine-tuning paradigm that boosts multi-modal fine-tuning by +fully leveraging the promising ICL capability of multi-modal LLMs (MM-LLMs). We +propose the Multi-Modal Hub (M-Hub), a unified module that captures various +multi-modal features according to different inputs and objectives. Based on +M-Hub, MMICT enables MM-LLMs to learn from in-context visual-guided textual +features and subsequently generate outputs conditioned on the textual-guided +visual features. Moreover, leveraging the flexibility of M-Hub, we design a +variety of in-context demonstrations. Extensive experiments on a diverse range +of downstream multi-modal tasks demonstrate that MMICT significantly +outperforms traditional fine-tuning strategy and the vanilla ICT method that +directly takes the concatenation of all information from different modalities +as input. Our implementation is available at: +https://github.com/KDEGroup/MMICT. + +
+
+ comment: TOMM 2024 +
+
+
+
+
+ + ♻ ☆ LLMLingua-2: Data Distillation for Efficient and Faithful Task-Agnostic + Prompt Compression ACL 2024 + + +
+ This paper focuses on task-agnostic prompt compression for better
+generalizability and efficiency. Considering the redundancy in natural
+language, existing approaches compress prompts by removing tokens or lexical
+units according to their information entropy obtained from a causal language
+model such as LLaMa-7B. The challenge is that information entropy may be a
+suboptimal compression metric: (i) it only leverages unidirectional context and
+may fail to capture all essential information needed for prompt compression;
+(ii) it is not aligned with the prompt compression objective.
+
+ To address these issues, we propose a data distillation procedure to derive
+knowledge from an LLM to compress prompts without losing crucial information,
+and in the meantime introduce an extractive text compression dataset. We
+formulate prompt compression as a token classification problem to guarantee the
+faithfulness of the compressed prompt to the original one, and use a
+Transformer encoder as the base architecture to capture all essential
+information for prompt compression from the full bidirectional context. Our
+approach leads to lower latency by explicitly learning the compression
+objective with smaller models such as XLM-RoBERTa-large and mBERT.
+
+ We evaluate our method on both in-domain and out-of-domain datasets,
+including MeetingBank, LongBench, ZeroScrolls, GSM8K, and BBH. Despite its
+small size, our model shows significant performance gains over strong baselines
+and demonstrates robust generalization ability across different LLMs.
+Additionally, our model is 3x-6x faster than existing prompt compression
+methods, while accelerating the end-to-end latency by 1.6x-2.9x with
+compression ratios of 2x-5x. Our code is available at
+https://aka.ms/LLMLingua-2.
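Since the abstract frames compression as token classification over a bidirectional encoder, the plumbing can be sketched as below. This is a hedged illustration, not the released LLMLingua-2 model: the backbone name, the two-label head, and the keep-ratio heuristic are assumptions, and the untrained head here would need fine-tuning on the distilled compression dataset before its output is useful.

```python
# Sketch: prompt compression cast as per-token keep/drop classification.
import torch
from transformers import AutoModelForTokenClassification, AutoTokenizer

def compress(prompt: str, model, tok, keep_ratio: float = 0.5) -> str:
    enc = tok(prompt, return_tensors="pt", truncation=True)
    with torch.no_grad():
        logits = model(**enc).logits               # [1, seq_len, 2]
    keep_prob = logits.softmax(-1)[0, :, 1]        # probability of the "keep" class
    k = max(1, int(keep_ratio * keep_prob.numel()))
    kept = torch.topk(keep_prob, k).indices.sort().values  # preserve original order
    return tok.decode(enc["input_ids"][0, kept], skip_special_tokens=True)

name = "xlm-roberta-base"  # placeholder backbone, not the released checkpoint
tok = AutoTokenizer.from_pretrained(name)
model = AutoModelForTokenClassification.from_pretrained(name, num_labels=2)
print(compress("Please summarize the key decisions from the meeting transcript below ...",
               model, tok))
```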
+
+ comment: Accepted at Findings of ACL 2024 +
+
+
+
+
+ + ♻ ☆ LongLLMLingua: Accelerating and Enhancing LLMs in Long Context Scenarios + via Prompt Compression ACL 2024 + + +
+ In long context scenarios, large language models (LLMs) face three main +challenges: higher computational cost, performance reduction, and position +bias. Research indicates that LLM performance hinges on the density and +position of key information in the input prompt. Inspired by these findings, we +propose LongLLMLingua for prompt compression towards improving LLMs' perception +of the key information to simultaneously address the three challenges. Our +extensive evaluation across various long context scenarios demonstrates that +LongLLMLingua not only enhances performance but also significantly reduces +costs and latency. For instance, in the NaturalQuestions benchmark, +LongLLMLingua boosts performance by up to 21.4% with around 4x fewer tokens in +GPT-3.5-Turbo, leading to substantial cost savings. It achieves a 94.0% cost +reduction in the LooGLE benchmark. Moreover, when compressing prompts of about +10k tokens at ratios of 2x-6x, LongLLMLingua can accelerate end-to-end latency +by 1.4x-2.6x. Our code is available at https://aka.ms/LongLLMLingua. + +
+
+ comment: Accepted at ACL 2024 +
+
+
+
+
+ + ♻ ☆ How would Stance Detection Techniques Evolve after the Launch of + ChatGPT? + + +
+ Stance detection refers to the task of extracting the standpoint (Favor,
+Against, or Neither) towards a target in given texts. Such research has gained
+increasing attention with the proliferation of social media content. The
+conventional framework for handling stance detection is to convert it into a
+text classification task. Deep learning models have already replaced rule-based
+models and traditional machine learning models in solving such problems.
+Current deep neural networks face two main challenges: insufficient labeled
+data and information in social media posts, and the unexplainable nature of
+deep learning models. A new pre-trained language model, ChatGPT, was launched
+on Nov 30, 2022. For stance detection tasks, our experiments show that ChatGPT
+can achieve SOTA or similar performance on commonly used datasets, including
+SemEval-2016 and P-Stance. At the same time, ChatGPT can provide explanations
+for its own predictions, which is beyond the capability of any existing model.
+The explanations for the cases where it cannot provide classification results
+are especially useful. ChatGPT has the potential to be the best AI model for
+stance detection tasks in NLP, or at least to change the research paradigm of
+this field. ChatGPT also opens up the possibility of building explanatory AI
+for stance detection.
+
+
+
+
+ + ♻ ☆ UNER: A Unified Prediction Head for Named Entity Recognition in + Visually-rich Documents + + +
+ The recognition of named entities in visually-rich documents (VrD-NER) plays +a critical role in various real-world scenarios and applications. However, the +research in VrD-NER faces three major challenges: complex document layouts, +incorrect reading orders, and unsuitable task formulations. To address these +challenges, we propose a query-aware entity extraction head, namely UNER, to +collaborate with existing multi-modal document transformers to develop more +robust VrD-NER models. The UNER head considers the VrD-NER task as a +combination of sequence labeling and reading order prediction, effectively +addressing the issues of discontinuous entities in documents. Experimental +evaluations on diverse datasets demonstrate the effectiveness of UNER in +improving entity extraction performance. Moreover, the UNER head enables a +supervised pre-training stage on various VrD-NER datasets to enhance the +document transformer backbones and exhibits substantial knowledge transfer from +the pre-training stage to the fine-tuning stage. By incorporating universal +layout understanding, a pre-trained UNER-based model demonstrates significant +advantages in few-shot and cross-linguistic scenarios and exhibits zero-shot +entity extraction abilities. + +
+
+ comment: accepted by ACM Multimedia 2024 +
+
+
+
+
+ + ♻ ☆ Efficiently and Effectively: A Two-stage Approach to Balance Plaintext + and Encrypted Text for Traffic Classification + + +
+ Encrypted traffic classification is the task of identifying the application
+or service associated with encrypted network traffic. One effective approach
+for this task is to use deep learning methods to encode the raw traffic bytes
+directly and automatically extract features for classification (byte-based
+models). However, current byte-based models input raw traffic bytes, whether
+plaintext or encrypted text, for automated feature extraction, neglecting the
+distinct impacts of plaintext and encrypted text on downstream tasks.
+Additionally, these models primarily focus on improving classification
+accuracy, with little emphasis on model efficiency. In this paper, for the
+first time, we analyze the impact of plaintext and encrypted text on the
+model's effectiveness and efficiency. Based on our observations and findings,
+we propose a two-phase approach to balance the trade-off between plaintext and
+encrypted text in traffic classification. Specifically, Stage one is to
+Determine whether the Plain text is enough to be accurately Classified (DPC)
+using the proposed DPC Selector. This stage quickly identifies samples that can
+be classified using plaintext, leveraging explicit byte features in plaintext
+to enhance the model's efficiency. Stage two adaptively makes the final
+classification based on the result from stage one. This stage incorporates
+encrypted text information for samples that cannot be classified using
+plaintext alone, ensuring the model's effectiveness on traffic classification
+tasks. Experiments on two datasets demonstrate that our proposed model achieves
+state-of-the-art results in both effectiveness and efficiency.
+
+
+
+
+ + ♻ ☆ AI-native Memory: A Pathway from LLMs Towards AGI + + +
+ Large language models (LLMs) have shown the world sparks of artificial
+general intelligence (AGI). One opinion, especially from some startups working
+on LLMs, argues that an LLM with nearly unlimited context length can realize
+AGI. However, they might be too optimistic about the long-context capability of
+(existing) LLMs: (1) recent literature has shown that their effective context
+length is significantly smaller than their claimed context length; and (2) our
+reasoning-in-a-haystack experiments further demonstrate that simultaneously
+finding the relevant information from a long context and conducting (simple)
+reasoning is nearly impossible. In this paper, we envision a pathway from LLMs
+to AGI through the integration of \emph{memory}. We believe that AGI should be
+a system where LLMs serve as core processors. In addition to raw data, the
+memory in this system would store a large number of important conclusions
+derived from reasoning processes. Compared with retrieval-augmented generation
+(RAG), which merely processes raw data, this approach not only brings
+semantically related information closer together, but also simplifies complex
+inferences at the time of querying. As an intermediate stage, the memory will
+likely be in the form of natural language descriptions, which can be directly
+consumed by users too. Ultimately, every agent/person should have its own large
+personal model, a deep neural network model (thus \emph{AI-native}) that
+parameterizes and compresses all types of memory, even those that cannot be
+described in natural language. Finally, we discuss the significant potential of
+AI-native memory as the transformative infrastructure for (proactive)
+engagement, personalization, distribution, and social interaction in the AGI
+era, as well as the privacy and security challenges it incurs, together with
+preliminary solutions.
+
+
+
+
+ + ♻ ☆ Universal Approximation Theory: The basic theory for large language + models + + +
+ Language models have emerged as a critical area of focus in artificial
+intelligence, particularly with the introduction of groundbreaking innovations
+like ChatGPT. Large-scale Transformer networks have quickly become the leading
+approach for advancing natural language processing algorithms. Built on the
+Transformer architecture, these models enable interactions that closely mimic
+human communication and, equipped with extensive knowledge, can even assist in
+guiding human tasks. Despite their impressive capabilities and growing
+complexity, a key question remains: the theoretical foundations of large
+language models (LLMs). What makes the Transformer so effective for powering
+intelligent language applications, such as translation and coding? What
+underlies LLMs' ability for In-Context Learning (ICL)? How does the LoRA scheme
+enhance the fine-tuning of LLMs? And what supports the practicality of pruning
+LLMs? To address these critical questions and explore the technological
+strategies within LLMs, we leverage the Universal Approximation Theory (UAT) to
+offer a theoretical backdrop, shedding light on the mechanisms that underpin
+these advancements.
+
+
+
+
+ + ♻ ☆ Decoding Speculative Decoding + + +
+ Speculative Decoding is a widely used technique to speed up inference for +Large Language Models (LLMs) without sacrificing quality. When performing +inference, speculative decoding uses a smaller draft model to generate +speculative tokens and then uses the target LLM to verify those draft tokens. +The speedup provided by speculative decoding heavily depends on the choice of +the draft model. In this work, we perform a detailed study comprising over 350 +experiments with LLaMA-65B and OPT-66B using speculative decoding and delineate +the factors that affect the performance gain provided by speculative decoding. +Our experiments indicate that the performance of speculative decoding depends +heavily on the latency of the draft model, and the draft model's capability in +language modeling does not correlate strongly with its performance in +speculative decoding. Based on these insights we explore a new design space for +draft models and design hardware-efficient draft models for speculative +decoding. Our newly designed draft model for LLaMA-65B can provide 111% higher +throughput than existing draft models and can generalize further to the LLaMA-2 +model family and supervised fine-tuned models. + +
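The draft-then-verify loop described here is standard enough to sketch. The code below is a hypothetical, greedy-decoding toy version and not the paper's implementation: the model names, the value of k, and the acceptance rule (keep the longest prefix on which draft and target agree) are illustrative assumptions.

```python
# Toy greedy speculative decoding: a small draft model proposes k tokens,
# the target model verifies them in one forward pass.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

@torch.no_grad()
def speculative_generate(target, draft, tok, prompt, max_new=32, k=4):
    ids = tok(prompt, return_tensors="pt").input_ids
    start = ids.shape[1]
    while ids.shape[1] - start < max_new:
        # 1) the small draft model proposes up to k greedy tokens
        proposal = draft.generate(ids, max_new_tokens=k, do_sample=False,
                                  pad_token_id=tok.eos_token_id)
        drafted = proposal[:, ids.shape[1]:]
        # 2) the target model scores the whole proposal in a single forward pass
        logits = target(proposal).logits
        verified = logits[:, ids.shape[1] - 1: proposal.shape[1] - 1, :].argmax(-1)
        # 3) keep the longest prefix on which draft and target agree
        n_accept = int((verified == drafted).long().cumprod(-1).sum())
        if n_accept == 0:
            ids = torch.cat([ids, verified[:, :1]], dim=-1)  # fall back to target's token
        else:
            ids = torch.cat([ids, drafted[:, :n_accept]], dim=-1)
    return tok.decode(ids[0, start:], skip_special_tokens=True)

# Placeholder checkpoints sharing a tokenizer; any compatible causal LM pair works.
tok = AutoTokenizer.from_pretrained("gpt2")
target = AutoModelForCausalLM.from_pretrained("gpt2-medium")
draft = AutoModelForCausalLM.from_pretrained("gpt2")
print(speculative_generate(target, draft, tok, "Speculative decoding works by"))
```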
+
+
+
+
+ + ♻ ☆ SELF-GUIDE: Better Task-Specific Instruction Following via + Self-Synthetic Finetuning + + +
+ Large language models (LLMs) hold the promise of solving diverse tasks when
+provided with appropriate natural language prompts. However, prompting often
+leads models to make predictions with lower accuracy compared to finetuning a
+model with ample training data. On the other hand, while finetuning LLMs on
+task-specific data generally improves their performance, abundant annotated
+datasets are not available for all tasks. Previous work has explored generating
+task-specific data from state-of-the-art LLMs and using this data to finetune
+smaller models, but this approach requires access to a language model other
+than the one being trained, which introduces cost, scalability challenges, and
+legal hurdles associated with continuously relying on more powerful LLMs. In
+response to these challenges, we propose SELF-GUIDE, a multi-stage mechanism in
+which we synthesize task-specific input-output pairs from the student LLM, then
+use these input-output pairs to finetune the student LLM itself. In our
+empirical evaluation on the Natural Instructions V2 benchmark, we find that
+SELF-GUIDE improves the performance of the LLM by a substantial margin.
+Specifically, we report an absolute improvement of approximately 15% for
+classification tasks and 18% for generation tasks in the benchmark's metrics.
+This sheds light on the promise of self-synthesized data guiding LLMs towards
+becoming task-specific experts without any external learning signals.
+
+ comment: Accepted by COLM 2024 +
+
+
+
+
+ + ♻ ☆ Enhancing Systematic Decompositional Natural Language Inference Using + Informal Logic + + +
+ Recent language models enable new opportunities for structured reasoning with +text, such as the construction of intuitive, proof-like textual entailment +trees without relying on brittle formal logic. However, progress in this +direction has been hampered by a long-standing lack of a clear protocol for +determining what valid compositional entailment is. This absence causes noisy +datasets and limited performance gains by modern neuro-symbolic engines. To +address these problems, we formulate a consistent and theoretically grounded +approach to annotating decompositional entailment and evaluate its impact on +LLM-based textual inference. We find that our new dataset, RDTE (Recognizing +Decompositional Textual Entailment), has a substantially higher internal +consistency (+9%) than prior decompositional entailment datasets. We also find +that training an RDTE-oriented entailment classifier via knowledge distillation +and employing it in an entailment tree reasoning engine significantly improves +both accuracy and proof quality, illustrating the practical benefit of this +advance for textual inference. + +
+
+
+
+
+ + ♻ ☆ NELLIE: A Neuro-Symbolic Inference Engine for Grounded, Compositional, + and Explainable Reasoning IJCAI 2024 + + +
+ Our goal is a modern approach to answering questions via systematic reasoning +where answers are supported by human interpretable proof trees grounded in an +NL corpus of authoritative facts. Such a system would help alleviate the +challenges of interpretability and hallucination with modern LMs, and the lack +of grounding of current explanation methods (e.g., Chain-of-Thought). This +paper proposes a new take on Prolog-based inference engines, where we replace +handcrafted rules with a combination of neural language modeling, guided +generation, and semiparametric dense retrieval. Our implementation, NELLIE, is +the first system to demonstrate fully interpretable, end-to-end grounded QA as +entailment tree proof search, going beyond earlier work explaining +known-to-be-true facts from text. In experiments, NELLIE outperforms a +similar-sized state-of-the-art reasoner [Tafjord et al., 2022] while producing +knowledge-grounded explanations. We also find NELLIE can exploit both +semi-structured and NL text corpora to guide reasoning. Together these suggest +a new way to jointly reap the benefits of both modern neural methods and +traditional symbolic reasoning. + +
+
+ comment: Published at IJCAI 2024 +
+
+
+
+
+ + ♻ ☆ Artificial Intelligence in Extracting Diagnostic Data from Dental + Records + + +
+ This research addresses the issue of missing structured data in dental
+records by extracting diagnostic information from unstructured text. The
+complexity of the updated periodontology classification system has increased
+the incidence of incomplete or missing structured diagnoses. To tackle this, we
+use advanced AI and NLP methods, leveraging GPT-4 to generate synthetic notes
+for fine-tuning a RoBERTa model. This significantly enhances the model's
+ability to understand medical and dental language. We evaluated the model using
+120 randomly selected clinical notes from two datasets, demonstrating its
+improved diagnostic extraction accuracy. The results showed high accuracy in
+diagnosing periodontal status, stage, and grade, with Site 1 scoring 0.99 and
+Site 2 scoring 0.98. In the subtype category, Site 2 achieved perfect scores,
+outperforming Site 1. This method enhances extraction accuracy and broadens its
+use across dental contexts. The study underscores AI and NLP's transformative
+impact on healthcare delivery and management. Integrating AI and NLP
+technologies enhances documentation and simplifies administrative tasks by
+precisely extracting complex clinical information. This approach effectively
+addresses challenges in dental diagnostics. Using synthetic training data from
+LLMs optimizes the training process, improving accuracy and efficiency in
+identifying periodontal diagnoses from clinical notes. This innovative method
+holds promise for broader healthcare applications, potentially improving
+patient care quality.
+
+ comment: 11 pages, 2 tables, 3 figures, under review +
+
+
+
+
+ + ♻ ☆ Occam's Razor and Bender and Koller's Octopus + + +
+ We discuss the teaching of the debate surrounding Bender and Koller's
+prominent ACL 2020 paper, "Climbing towards NLU: on meaning, form, and
+understanding in the age of data" \cite{bender2020climbing}. We present what we
+understand to be the main contentions of the paper, and then recommend that
+students engage with the natural counter-arguments to the claims in the paper.
+We attach teaching materials that we use to facilitate teaching this topic to
+undergraduate students.
+
+
+
+
+ + ♻ ☆ Large Language Model for Mental Health: A Systematic Review + + +
+ Large language models (LLMs) have attracted significant attention for +potential applications in digital health, while their application in mental +health is subject to ongoing debate. This systematic review aims to evaluate +the usage of LLMs in mental health, focusing on their strengths and limitations +in early screening, digital interventions, and clinical applications. Adhering +to PRISMA guidelines, we searched PubMed, IEEE Xplore, Scopus, JMIR, and ACM +using keywords: 'mental health OR mental illness OR mental disorder OR +psychiatry' AND 'large language models'. We included articles published between +January 1, 2017, and April 30, 2024, excluding non-English articles. 30 +articles were evaluated, which included research on mental health conditions +and suicidal ideation detection through text (n=15), usage of LLMs for mental +health conversational agents (CAs) (n=7), and other applications and +evaluations of LLMs in mental health (n=18). LLMs exhibit substantial +effectiveness in detecting mental health issues and providing accessible, +de-stigmatized eHealth services. However, the current risks associated with the +clinical use might surpass their benefits. The study identifies several +significant issues: the lack of multilingual datasets annotated by experts, +concerns about the accuracy and reliability of the content generated, +challenges in interpretability due to the 'black box' nature of LLMs, and +persistent ethical dilemmas. These include the lack of a clear ethical +framework, concerns about data privacy, and the potential for over-reliance on +LLMs by both therapists and patients, which could compromise traditional +medical practice. Despite these issues, the rapid development of LLMs +underscores their potential as new clinical aids, emphasizing the need for +continued research and development in this area. + +
+
+
+
+
+ + ♻ ☆ Fine-grained Hallucination Detection and Editing for Language Models + + +
+ Large language models (LMs) are prone to generate factual errors, which are +often called hallucinations. In this paper, we introduce a comprehensive +taxonomy of hallucinations and argue that hallucinations manifest in diverse +forms, each requiring varying degrees of careful assessments to verify +factuality. We propose a novel task of automatic fine-grained hallucination +detection and construct a new evaluation benchmark, FavaBench, that includes +about one thousand fine-grained human judgments on three LM outputs across +various domains. Our analysis reveals that ChatGPT and Llama2-Chat (70B, 7B) +exhibit diverse types of hallucinations in the majority of their outputs in +information-seeking scenarios. We train FAVA, a retrieval-augmented LM by +carefully creating synthetic data to detect and correct fine-grained +hallucinations. On our benchmark, our automatic and human evaluations show that +FAVA significantly outperforms ChatGPT and GPT-4 on fine-grained hallucination +detection, and edits suggested by FAVA improve the factuality of LM-generated +text. + +
+
+ comment: Our code, data, and demo are available at + https://fine-grained-hallucination.github.io. Published as a conference paper + at COLM 2024 +
+
+
+
+
+ + ♻ ☆ Techniques for supercharging academic writing with generative AI + + +
+ Academic writing is an indispensable yet laborious part of the research +enterprise. This Perspective maps out principles and methods for using +generative artificial intelligence (AI), specifically large language models +(LLMs), to elevate the quality and efficiency of academic writing. We introduce +a human-AI collaborative framework that delineates the rationale (why), process +(how), and nature (what) of AI engagement in writing. The framework pinpoints +both short-term and long-term reasons for engagement and their underlying +mechanisms (e.g., cognitive offloading and imaginative stimulation). It reveals +the role of AI throughout the writing process, conceptualized through a +two-stage model for human-AI collaborative writing, and the nature of AI +assistance in writing, represented through a model of writing-assistance types +and levels. Building on this framework, we describe effective prompting +techniques for incorporating AI into the writing routine (outlining, drafting, +and editing) as well as strategies for maintaining rigorous scholarship, +adhering to varied journal policies, and avoiding overreliance on AI. +Ultimately, the prudent integration of AI into academic writing can ease the +communication burden, empower authors, accelerate discovery, and promote +diversity in science. + +
+
+ comment: 14 pages, 2 figures, 1 table, 1 box +
+
+
+
+
+ + ♻ ☆ Balancing Speciality and Versatility: a Coarse to Fine Framework for + Supervised Fine-tuning Large Language Model ACL 2024 + + +
+ Aligned Large Language Models (LLMs) showcase remarkable versatility, capable +of handling diverse real-world tasks. Meanwhile, aligned LLMs are also expected +to exhibit speciality, excelling in specific applications. However, fine-tuning +with extra data, a common practice to gain speciality, often leads to +catastrophic forgetting (CF) of previously acquired versatility, hindering the +model's performance across diverse tasks. In response to this challenge, we +propose CoFiTune, a coarse to fine framework in an attempt to strike the +balance between speciality and versatility. At the coarse-grained level, an +empirical tree-search algorithm is utilized to pinpoint and update specific +modules that are crucial for speciality, while keeping other parameters frozen; +at the fine-grained level, a soft-masking mechanism regulates the update to the +LLMs, mitigating the CF issue without harming speciality. In an overall +evaluation of both speciality and versatility, CoFiTune consistently +outperforms baseline methods across diverse tasks and model scales. Compared to +the full-parameter SFT, CoFiTune leads to about 14% versatility improvement and +marginal speciality loss on a 13B model. Lastly, based on further analysis, we +provide a speculative insight into the information forwarding process in LLMs, +which helps explain the effectiveness of the proposed method. The code is +available at https://github.com/rattlesnakey/CoFiTune. + +
+
+ comment: 43 pages, 10 figures, accepted by ACL 2024 +
+
+
+
+
+ + ♻ ☆ Commonsense-T2I Challenge: Can Text-to-Image Generation Models + Understand Commonsense? + + +
+ We present a novel task and benchmark for evaluating the ability of
+text-to-image (T2I) generation models to produce images that align with
+commonsense in real life, which we call Commonsense-T2I. Given two adversarial
+text prompts containing an identical set of action words with minor
+differences, such as "a lightbulb without electricity" vs. "a lightbulb with
+electricity", we evaluate whether T2I models can conduct visual-commonsense
+reasoning, e.g. produce images that fit "the lightbulb is unlit" vs. "the
+lightbulb is lit" correspondingly. Commonsense-T2I presents an adversarial
+challenge, providing pairwise text prompts along with expected outputs. The
+dataset is carefully hand-curated by experts and annotated with fine-grained
+labels, such as commonsense type and likelihood of the expected outputs, to
+assist in analyzing model behavior. We benchmark a variety of state-of-the-art
+(sota) T2I models and surprisingly find that there is still a large gap
+between image synthesis and real-life photos: even the DALL-E 3 model could
+only achieve 48.92% on Commonsense-T2I, and the Stable Diffusion XL model only
+achieves 24.92% accuracy. Our experiments show that GPT-enriched prompts cannot
+solve this challenge, and we include a detailed analysis of possible reasons
+for this deficiency. We aim for Commonsense-T2I to serve as a high-quality
+evaluation benchmark for T2I commonsense checking, fostering advancements in
+real-life image generation.
+
+ comment: COLM 2024, Project Url: https://zeyofu.github.io/CommonsenseT2I/ +
+
+
+
+
+ + ♻ ☆ Intrinsic Self-correction for Enhanced Morality: An Analysis of Internal + Mechanisms and the Superficial Hypothesis + + +
+ Large Language Models (LLMs) are capable of producing content that +perpetuates stereotypes, discrimination, and toxicity. The recently proposed +moral self-correction is a computationally efficient method for reducing +harmful content in the responses of LLMs. However, the process of how injecting +self-correction instructions can modify the behavior of LLMs remains +under-explored. In this paper, we explore the effectiveness of moral +self-correction by answering three research questions: (1) In what scenarios +does moral self-correction work? (2) What are the internal mechanisms of LLMs, +e.g., hidden states, that are influenced by moral self-correction instructions? +(3) Is intrinsic moral self-correction actually superficial? We argue that +self-correction can help LLMs find a shortcut to more morally correct output, +rather than truly reducing the immorality stored in hidden states. Through +empirical investigation with tasks of language generation and multi-choice +question answering, we conclude: (i) LLMs exhibit good performance across both +tasks, and self-correction instructions are particularly beneficial when the +correct answer is already top-ranked; (ii) The morality levels in intermediate +hidden states are strong indicators as to whether one instruction would be more +effective than another; (iii) Based on our analysis of intermediate hidden +states and task case studies of self-correction behaviors, we are first to +propose the hypothesis that intrinsic moral self-correction is in fact +superficial. + +
+
+
+
+
+
+
+
+ + Computer Vision and Pattern Recognition 123 + +
+
+
+ + ☆ Moo-ving Beyond Tradition: Revolutionizing Cattle Behavioural + Phenotyping with Pose Estimation Techniques + + +
+ The cattle industry has been a major contributor to the economy of many +countries, including the US and Canada. The integration of Artificial +Intelligence (AI) has revolutionized this sector, mirroring its transformative +impact across all industries by enabling scalable and automated monitoring and +intervention practices. AI has also introduced tools and methods that automate +many tasks previously performed by human labor with the help of computer +vision, including health inspections. Among these methods, pose estimation has +a special place; pose estimation is the process of finding the position of +joints in an image of animals. Analyzing the pose of animal subjects enables +precise identification and tracking of the animal's movement and the movements +of its body parts. By summarizing the video and imagery data into movement and +joint location using pose estimation and then analyzing this information, we +can address the scalability challenge in cattle management, focusing on health +monitoring, behavioural phenotyping and welfare concerns. Our study reviews +recent advancements in pose estimation methodologies, their applicability in +improving the cattle industry, existing challenges, and gaps in this field. +Furthermore, we propose an initiative to enhance open science frameworks within +this field of study by launching a platform designed to connect industry and +academia. + +
+
+
+
+
+ + ☆ HeLiMOS: A Dataset for Moving Object Segmentation in 3D Point Clouds + From Heterogeneous LiDAR Sensors IROS + + +
+ Moving object segmentation (MOS) using a 3D light detection and ranging
+(LiDAR) sensor is crucial for scene understanding and identification of moving
+objects. Despite the availability of various types of 3D LiDAR sensors on the
+market, MOS research still predominantly focuses on 3D point clouds from
+mechanically spinning omnidirectional LiDAR sensors. Thus, we are, for example,
+lacking a dataset with MOS labels for point clouds from solid-state LiDAR
+sensors, which have irregular scanning patterns. In this paper, we present a
+labeled dataset, called \textit{HeLiMOS}, that enables testing MOS approaches
+on four heterogeneous LiDAR sensors, including two solid-state LiDAR sensors.
+Furthermore, we introduce a novel automatic labeling method to substantially
+reduce the labeling effort required from human annotators. To this end, our
+framework exploits an instance-aware static map building approach and
+tracking-based false label filtering. Finally, we provide experimental results
+regarding the performance of commonly used state-of-the-art MOS approaches on
+HeLiMOS that suggest a new direction for sensor-agnostic MOS, which works
+regardless of the type of LiDAR sensor used to capture 3D point clouds.
+Our dataset is available at https://sites.google.com/view/helimos.
+
+ comment: Proc. IEEE/RSJ Int. Conf. Intell. Robot. Syst. (IROS) 2024 +
+
+
+
+
+ + ☆ VisualAgentBench: Towards Large Multimodal Models as Visual Foundation + Agents + + +
+ Large Multimodal Models (LMMs) have ushered in a new era in artificial
+intelligence, merging capabilities in both language and vision to form highly
+capable Visual Foundation Agents. These agents are postulated to excel across a
+myriad of tasks, potentially approaching general artificial intelligence.
+However, existing benchmarks fail to sufficiently challenge or showcase the
+full potential of LMMs in complex, real-world environments. To address this
+gap, we introduce VisualAgentBench (VAB), a comprehensive and pioneering
+benchmark specifically designed to train and evaluate LMMs as visual foundation
+agents across diverse scenarios, including Embodied, Graphical User Interface,
+and Visual Design, with tasks formulated to probe the depth of LMMs'
+understanding and interaction capabilities. Through rigorous testing across
+nine proprietary LMM APIs and eight open models, we demonstrate the
+considerable yet still developing agent capabilities of these models.
+Additionally, VAB provides a trajectory training set constructed through
+hybrid methods, including Program-based Solvers, LMM Agent Bootstrapping, and
+Human Demonstrations, promoting substantial performance improvements in LMMs
+through behavior cloning. Our work not only aims to benchmark existing models
+but also provides a solid foundation for future development into visual
+foundation agents. Code, train \& test data, and part of fine-tuned open LMMs
+are available at \url{https://github.com/THUDM/VisualAgentBench}.
+
+
+
+
+ + ☆ EqNIO: Subequivariant Neural Inertial Odometry + + +
+ Presently, neural networks are widely employed to accurately estimate 2D +displacements and associated uncertainties from Inertial Measurement Unit (IMU) +data that can be integrated into stochastic filter networks like the Extended +Kalman Filter (EKF) as measurements and uncertainties for the update step in +the filter. However, such neural approaches overlook symmetry which is a +crucial inductive bias for model generalization. This oversight is notable +because (i) physical laws adhere to symmetry principles when considering the +gravity axis, meaning there exists the same transformation for both the +physical entity and the resulting trajectory, and (ii) displacements should +remain equivariant to frame transformations when the inertial frame changes. To +address this, we propose a subequivariant framework by: (i) deriving +fundamental layers such as linear and nonlinear layers for a subequivariant +network, designed to handle sequences of vectors and scalars, (ii) employing +the subequivariant network to predict an equivariant frame for the sequence of +inertial measurements. This predicted frame can then be utilized for extracting +invariant features through projection, which are integrated with arbitrary +network architectures, (iii) transforming the invariant output by frame +transformation to obtain equivariant displacements and covariances. We +demonstrate the effectiveness and generalization of our Equivariant Framework +on a filter-based approach with TLIO architecture for TLIO and Aria datasets, +and an end-to-end deep learning approach with RONIN architecture for RONIN, +RIDI and OxIOD datasets. + +
+
+ comment: 26 pages +
+
+
+
+
+ + ☆ From SAM to SAM 2: Exploring Improvements in Meta's Segment Anything + Model + + +
+ The Segment Anything Model (SAM), introduced to the computer vision community +by Meta in April 2023, is a groundbreaking tool that allows automated +segmentation of objects in images based on prompts such as text, clicks, or +bounding boxes. SAM excels in zero-shot performance, segmenting unseen objects +without additional training, stimulated by a large dataset of over one billion +image masks. SAM 2 expands this functionality to video, leveraging memory from +preceding and subsequent frames to generate accurate segmentation across entire +videos, enabling near real-time performance. This comparison shows how SAM has +evolved to meet the growing need for precise and efficient segmentation in +various applications. The study suggests that future advancements in models +like SAM will be crucial for improving computer vision technology. + +
+
+
+
+
+ + ☆ Long-Form Answers to Visual Questions from Blind and Low Vision People + + +
+ Vision language models can now generate long-form answers to questions about +images - long-form visual question answers (LFVQA). We contribute VizWiz-LF, a +dataset of long-form answers to visual questions posed by blind and low vision +(BLV) users. VizWiz-LF contains 4.2k long-form answers to 600 visual questions, +collected from human expert describers and six VQA models. We develop and +annotate functional roles of sentences of LFVQA and demonstrate that long-form +answers contain information beyond the question answer such as explanations and +suggestions. We further conduct automatic and human evaluations with BLV and +sighted people to evaluate long-form answers. BLV people perceive both +human-written and generated long-form answers to be plausible, but generated +answers often hallucinate incorrect visual details, especially for unanswerable +visual questions (e.g., blurry or irrelevant images). To reduce hallucinations, +we evaluate the ability of VQA models to abstain from answering unanswerable +questions across multiple prompting strategies. + +
+
+ comment: COLM 2024 +
+
+
+
+
+ + ☆ Finding Patterns in Ambiguity: Interpretable Stress Testing in the + Decision~Boundary CVPR + + +
+ The increasing use of deep learning across various domains highlights the
+importance of understanding the decision-making processes of these black-box
+models. Recent research focusing on the decision boundaries of deep classifiers
+relies on synthetic instances generated in areas of low confidence, uncovering
+samples that challenge both models and humans. We propose a novel approach to
+enhance the interpretability of deep binary classifiers by selecting
+representative samples (prototypes) from the decision boundary and applying
+post-model explanation algorithms. We evaluate the effectiveness of our
+approach through 2D visualizations and GradientSHAP analysis. Our experiments
+demonstrate the potential of the proposed method, revealing distinct and
+compact clusters and diverse prototypes that capture essential features that
+lead to low-confidence decisions. By offering a more aggregated view of deep
+classifiers' decision boundaries, our work contributes to the responsible
+development and deployment of reliable machine learning systems.
+
+ comment: To be published in the Responsible Generative AI workshop at CVPR +
+
+
+
+
+ + ☆ Mipmap-GS: Let Gaussians Deform with Scale-specific Mipmap for + Anti-aliasing Rendering + + +
+ 3D Gaussian Splatting (3DGS) has attracted great attention in novel view
+synthesis because of its superior rendering efficiency and high fidelity.
+However, the trained Gaussians suffer from severe zooming degradation due to
+the non-adjustable representation derived from single-scale training. Though
+some methods attempt to tackle this problem via post-processing techniques such
+as selective rendering or filtering of primitives, scale-specific information
+is still not embedded in the Gaussians. In this paper, we propose a unified
+optimization method to make Gaussians adaptive for arbitrary scales by
+self-adjusting the primitive properties (e.g., color, shape and size) and
+distribution (e.g., position). Inspired by the mipmap technique, we design
+pseudo ground-truth for the target scale and propose a scale-consistency
+guidance loss to inject scale information into 3D Gaussians. Our method is a
+plug-in module, applicable to any 3DGS model to solve zoom-in and zoom-out
+aliasing. Extensive experiments demonstrate the effectiveness of our method.
+Notably, our method outperforms 3DGS in PSNR by an average of 9.25 dB for
+zoom-in and 10.40 dB for zoom-out on the NeRF Synthetic dataset.
+
+ comment: 9 pages +
+
+
+
+
+ + ☆ Context-aware Visual Storytelling with Visual Prefix Tuning and + Contrastive Learning + + +
+ Visual storytelling systems generate multi-sentence stories from image +sequences. In this task, capturing contextual information and bridging visual +variation bring additional challenges. We propose a simple yet effective +framework that leverages the generalization capabilities of pretrained +foundation models, only training a lightweight vision-language mapping network +to connect modalities, while incorporating context to enhance coherence. We +introduce a multimodal contrastive objective that also improves visual +relevance and story informativeness. Extensive experimental results, across +both automatic metrics and human evaluations, demonstrate that the stories +generated by our framework are diverse, coherent, informative, and interesting. + +
+
+ comment: 18 pages, 12 figures, accepted by INLG 2024 +
+
+
+
+
+ + ☆ Rethinking Video with a Universal Event-Based Representation + + +
+ Traditionally, video is structured as a sequence of discrete image frames. +Recently, however, a novel video sensing paradigm has emerged which eschews +video frames entirely. These "event" sensors aim to mimic the human vision +system with asynchronous sensing, where each pixel has an independent, sparse +data stream. While these cameras enable high-speed and high-dynamic-range +sensing, researchers often revert to a framed representation of the event data +for existing applications, or build bespoke applications for a particular +camera's event data type. At the same time, classical video systems have +significant computational redundancy at the application layer, since pixel +samples are repeated across frames in the uncompressed domain. + To address the shortcomings of existing systems, I introduce Address, +Decimation, {\Delta}t Event Representation (AD{\Delta}ER, pronounced "adder"), +a novel intermediate video representation and system framework. The framework +transcodes a variety of framed and event camera sources into a single +event-based representation, which supports source-modeled lossy compression and +backward compatibility with traditional frame-based applications. I demonstrate +that AD{\Delta}ER achieves state-of-the-art application speed and compression +performance for scenes with high temporal redundancy. Crucially, I describe how +AD{\Delta}ER unlocks an entirely new control mechanism for computer vision: +application speed can correlate with both the scene content and the level of +lossy compression. Finally, I discuss the implications for event-based video on +large-scale video surveillance and resource-constrained sensing. + +
+
+ comment: 137 pages. PhD dissertation at the University of North Carolina, + Chapel Hill +
+
+
+
+
+ + ☆ Latent Disentanglement for Low Light Image Enhancement + + +
+ Many learning-based low-light image enhancement (LLIE) algorithms are based +on the Retinex theory. However, the Retinex-based decomposition techniques in +such models introduce corruptions which limit their enhancement performance. In +this paper, we propose a Latent Disentangle-based Enhancement Network (LDE-Net) +for low light vision tasks. The latent disentanglement module disentangles the +input image in latent space such that no corruption remains in the disentangled +Content and Illumination components. For LLIE task, we design a Content-Aware +Embedding (CAE) module that utilizes Content features to direct the enhancement +of the Illumination component. For downstream tasks (e.g. nighttime UAV +tracking and low-light object detection), we develop an effective light-weight +enhancer based on the latent disentanglement framework. Comprehensive +quantitative and qualitative experiments demonstrate that our LDE-Net +significantly outperforms state-of-the-art methods on various LLIE benchmarks. +In addition, the great results obtained by applying our framework on the +downstream tasks also demonstrate the usefulness of our latent disentanglement +design. + +
+
+
+
+
+ + ☆ 3D Reconstruction of Protein Structures from Multi-view AFM Images using + Neural Radiance Fields (NeRFs) + + +
+ Recent advancements in deep learning for predicting 3D protein structures
+have shown promise, particularly when leveraging inputs like protein sequences
+and Cryo-Electron microscopy (Cryo-EM) images. However, these techniques often
+fall short when predicting the structures of protein complexes (PCs), which
+involve multiple proteins. In our study, we investigate using atomic force
+microscopy (AFM) combined with deep learning to predict the 3D structures of
+PCs. AFM generates height maps that depict the PCs in various random
+orientations, providing rich information for training a neural network to
+predict the 3D structures. We then employ the pre-trained UpFusion model (which
+utilizes a conditional diffusion model for synthesizing novel views) to train
+an instance-specific NeRF model for 3D reconstruction. The performance of
+UpFusion is evaluated through zero-shot predictions of 3D protein structures
+using AFM images. The challenge, however, lies in the time-intensive and
+impractical nature of collecting actual AFM images. To address this, we use a
+virtual AFM imaging process that transforms a `PDB' protein file into
+multi-view 2D virtual AFM images via volume rendering techniques. We
+extensively validate the UpFusion architecture using both virtual and actual
+multi-view AFM images. Our results include a comparison of structures predicted
+with varying numbers of views and different sets of views. This novel approach
+holds significant potential for enhancing the accuracy of protein complex
+structure predictions with further fine-tuning of the UpFusion network.
+
+
+
+
+ + ☆ Correlation Weighted Prototype-based Self-Supervised One-Shot + Segmentation of Medical Images ICPR 2024 + + +
+ Medical image segmentation is one of the domains where sufficient annotated
+data is not available. This necessitates the application of low-data frameworks
+like few-shot learning. Contemporary prototype-based frameworks often do not
+account for the variation in features within the support and query images,
+giving rise to a large variance in prototype alignment. In this work, we adopt
+a prototype-based self-supervised one-way one-shot learning framework using
+pseudo-labels generated from superpixels to learn the semantic segmentation
+task itself. We use a correlation-based probability score to generate a dynamic
+prototype for each query pixel from the bag of prototypes obtained from the
+support feature map. This weighting scheme helps to give higher weight to
+contextually related prototypes. We also propose a quadrant masking strategy in
+the downstream segmentation task by utilizing prior domain information to
+discard unwanted false positives. We present extensive experiments and
+evaluations on abdominal CT and MR datasets to show that the proposed simple
+but potent framework performs on par with state-of-the-art methods.
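The correlation-weighted dynamic prototype can be sketched compactly. The code below is a hedged illustration under assumed shapes and a made-up temperature, not the authors' implementation: each query pixel builds its own prototype as a softmax-weighted mixture of the support prototypes, so contextually related prototypes get higher weight.

```python
# Sketch: per-query-pixel prototypes from correlation-based weights.
import torch
import torch.nn.functional as F

def dynamic_prototypes(query_feats, support_protos, temperature=0.1):
    """
    query_feats:    [N, C]  features of N query pixels
    support_protos: [P, C]  bag of P prototypes from the support feature map
    returns:        [N, C]  one correlation-weighted prototype per query pixel
    """
    q = F.normalize(query_feats, dim=-1)
    p = F.normalize(support_protos, dim=-1)
    corr = q @ p.t() / temperature      # [N, P] cosine-similarity scores
    weights = corr.softmax(dim=-1)      # correlation-based probabilities
    return weights @ support_protos     # weighted mixture per query pixel

# Toy usage with random tensors standing in for real features.
protos = torch.randn(16, 64)     # e.g., superpixel prototypes from the support image
queries = torch.randn(1024, 64)  # flattened query feature map
print(dynamic_prototypes(queries, protos).shape)  # torch.Size([1024, 64])
```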
+
+ comment: Accepted to ICPR 2024 +
+
+
+
+
+ + ☆ FruitNeRF: A Unified Neural Radiance Field based Fruit Counting + Framework + + +
+ We introduce FruitNeRF, a unified novel fruit counting framework that +leverages state-of-the-art view synthesis methods to count any fruit type +directly in 3D. Our framework takes an unordered set of posed images captured +by a monocular camera and segments fruit in each image. To make our system +independent of the fruit type, we employ a foundation model that generates +binary segmentation masks for any fruit. Utilizing both modalities, RGB and +semantic, we train a semantic neural radiance field. Through uniform volume +sampling of the implicit Fruit Field, we obtain fruit-only point clouds. By +applying cascaded clustering on the extracted point cloud, our approach +achieves precise fruit counts. The use of neural radiance fields provides +significant advantages over conventional methods such as object tracking or +optical flow, as the counting itself is lifted into 3D. Our method prevents +double counting fruit and avoids counting irrelevant fruit. We evaluate our +methodology using both real-world and synthetic datasets. The real-world +dataset consists of three apple trees with manually counted ground truths and a +benchmark apple dataset with one row and ground truth fruit location, while the +synthetic dataset comprises various fruit types including apple, plum, lemon, +pear, peach, and mango. Additionally, we assess the performance of fruit +counting using the foundation model compared to a U-Net. + +
+
+ comment: Project Page: https://meyerls.github.io/fruit_nerf/ +
+
+
+
+
+ + ☆ Zero-shot 3D Segmentation of Abdominal Organs in CT Scans Using Segment + Anything Model 2: Adapting Video Tracking Capabilities for 3D Medical Imaging + + +
+ Purpose: This study aimed to evaluate the zero-shot performance of Segment +Anything Model 2 (SAM 2) in 3D segmentation of abdominal organs in CT scans, +leveraging its video tracking capabilities for volumetric medical imaging. +Materials and Methods: Using a subset of the TotalSegmentator CT dataset +(n=123) from 8 different institutions, we assessed SAM 2's ability to segment 8 +abdominal organs. Segmentation was initiated from three different Z-coordinate +levels (caudal, mid, and cranial levels) of each organ. Performance was +measured using the Dice similarity coefficient (DSC). We also analyzed organ +volumes to contextualize the results. Results: As a zero-shot approach, larger +organs with clear boundaries demonstrated high segmentation performance, with +mean(median) DSCs as follows: liver 0.821(0.898), left kidney 0.870(0.921), +right kidney 0.862(0.935), and spleen 0.891(0.932). Smaller or less defined +structures showed lower performance: gallbladder 0.531(0.590), pancreas +0.361(0.359), and adrenal glands 0.203-0.308(0.109-0.231). Significant +differences in DSC were observed depending on the starting initial slice of +segmentation for different organs. A moderate positive correlation was observed +between volume size and DSCs (Spearman's rs = 0.731, P <.001 at caudal-level). +DSCs exhibited high variability within organs, ranging from near 0 to almost +1.0, indicating substantial inconsistency in segmentation performance between +scans. Conclusion: SAM 2 demonstrated promising zero-shot performance in +segmenting certain abdominal organs in CT scans, particularly larger organs +with clear boundaries. The model's ability to segment previously unseen targets +without additional training highlights its potential for cross-domain +generalization in medical imaging. However, improvements are needed for smaller +and less defined structures. + +
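Since every result above is reported as a Dice similarity coefficient, a small reference implementation of the metric may be helpful; the function and array names are illustrative and not taken from the study.

```python
import numpy as np

def dice_similarity_coefficient(pred: np.ndarray, gt: np.ndarray, eps: float = 1e-8) -> float:
    """Dice similarity coefficient (DSC) between two binary masks of any shape."""
    pred, gt = pred.astype(bool), gt.astype(bool)
    intersection = np.logical_and(pred, gt).sum()
    return (2.0 * intersection + eps) / (pred.sum() + gt.sum() + eps)

# Toy example on two overlapping 3D (volumetric) masks
pred = np.zeros((4, 4, 4), dtype=bool)
gt = np.zeros((4, 4, 4), dtype=bool)
pred[1:3, 1:3, 1:3] = True
gt[1:4, 1:3, 1:3] = True
print(f"DSC = {dice_similarity_coefficient(pred, gt):.3f}")   # 0.800
```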
+
+ comment: 16 pages, 6 figures (including 1 supplemental figure), 3 tables +
+
+
+
+
+ + ☆ Blind-Match: Efficient Homomorphic Encryption-Based 1:N Matching for + Privacy-Preserving Biometric Identification CIKM 2024 + + +
+ We present Blind-Match, a novel biometric identification system that +leverages homomorphic encryption (HE) for efficient and privacy-preserving 1:N +matching. Blind-Match introduces a HE-optimized cosine similarity computation +method, where the key idea is to divide the feature vector into smaller parts +for processing rather than computing the entire vector at once. By optimizing +the number of these parts, Blind-Match minimizes execution time while ensuring +data privacy through HE. Blind-Match achieves superior performance compared to +state-of-the-art methods across various biometric datasets. On the LFW face +dataset, Blind-Match attains a 99.63% Rank-1 accuracy with a 128-dimensional +feature vector, demonstrating its robustness in face recognition tasks. For +fingerprint identification, Blind-Match achieves a remarkable 99.55% Rank-1 +accuracy on the PolyU dataset, even with a compact 16-dimensional feature +vector, significantly outperforming the state-of-the-art method, Blind-Touch, +which achieves only 59.17%. Furthermore, Blind-Match showcases practical +efficiency in large-scale biometric identification scenarios, such as Naver +Cloud's FaceSign, by processing 6,144 biometric samples in 0.74 seconds using a +128-dimensional feature vector. + +
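The core computational idea, splitting the feature vector into parts and accumulating partial inner products, can be sketched in plaintext as below; the real system evaluates each part under homomorphic encryption, which is omitted here, and the chunk count and all names are illustrative assumptions.

```python
import numpy as np

def chunked_cosine_similarity(query: np.ndarray, gallery: np.ndarray, num_parts: int = 4) -> np.ndarray:
    """Cosine similarity of one query against N gallery vectors, accumulated
    over feature-vector chunks (each chunk would be processed homomorphically
    in Blind-Match; shown here in plaintext only)."""
    query = query / np.linalg.norm(query)
    gallery = gallery / np.linalg.norm(gallery, axis=1, keepdims=True)
    scores = np.zeros(gallery.shape[0])
    for q_part, g_part in zip(np.array_split(query, num_parts),
                              np.array_split(gallery, num_parts, axis=1)):
        scores += g_part @ q_part      # partial inner products, accumulated per chunk
    return scores

rng = np.random.default_rng(0)
query = rng.normal(size=128)            # 128-dimensional face feature
gallery = rng.normal(size=(6144, 128))  # 1:N gallery
print("best match index:", int(np.argmax(chunked_cosine_similarity(query, gallery))))
```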
+
+ comment: Accepted to CIKM 2024 (Applied Research Track) +
+
+
+
+
+ + ☆ ACCELERATION: Sequentially-scanning DECT Imaging Using High Temporal + Resolution Image Reconstruction And Temporal Extrapolation + + +
+ Dual-energy computed tomography (DECT) has been widely used to obtain +quantitative elemental composition of imaged subjects for personalized and +precise medical diagnosis. Compared with existing high-end DECT leveraging +advanced X-ray source and/or detector technologies, the use of the +sequentially-scanning data acquisition scheme to implement DECT may make a +broader impact on clinical practice because this scheme requires no specialized +hardware designs. However, since the concentration of iodinated contrast agent +in the imaged subject varies over time, sequentially-scanned data sets acquired +at two tube potentials are temporally inconsistent. As existing material +decomposition approaches for DECT assume that the data sets acquired at two +tube potentials are temporally consistent, the violation of this assumption +results in inaccurate quantification of iodine concentration. In this +work, we developed a technique to achieve sequentially-scanning DECT imaging +using high temporal resolution image reconstruction and temporal extrapolation, +ACCELERATION in short, to address the technical challenge induced by temporal +inconsistency of sequentially-scanned data sets and improve iodine +quantification accuracy in sequentially-scanning DECT. ACCELERATION has been +validated and evaluated using numerical simulation data sets generated from +clinical human subject exams. Results demonstrated the improvement of iodine +quantification accuracy using ACCELERATION. + +
+
+
+
+
+ + ☆ OmniCLIP: Adapting CLIP for Video Recognition with Spatial-Temporal + Omni-Scale Feature Learning ECAI-2024 + + +
+ Recent Vision-Language Models (VLMs), \textit{e.g.} CLIP, have made great +progress in video recognition. Despite the improvement brought by the strong +visual backbone in extracting spatial features, CLIP still falls short in +capturing and integrating spatial-temporal features, which are essential for +video recognition. In this paper, we propose OmniCLIP, a framework that adapts +CLIP for video recognition by focusing on learning comprehensive features +encompassing spatial, temporal, and dynamic spatial-temporal scales, which we +refer to as omni-scale features. This is achieved through the design of +spatial-temporal blocks that include parallel temporal adapters (PTA), enabling +efficient temporal modeling. Additionally, we introduce a self-prompt generator +(SPG) module to capture dynamic object spatial features. The synergy between +PTA and SPG allows OmniCLIP to discern varying spatial information across +frames and assess object scales over time. We have conducted extensive +experiments in supervised video recognition, few-shot video recognition, and +zero-shot recognition tasks. The results demonstrate the effectiveness of our +method, especially with OmniCLIP achieving a top-1 accuracy of 74.30\% on +HMDB51 in a 16-shot setting, surpassing the recent MotionPrompt approach even +with full training data. The code is available at +\url{https://github.com/XiaoBuL/OmniCLIP}. + +
+
+ comment: ECAI-2024 +
+
+
+
+
+ + ☆ Novel View Synthesis from a Single Image with Pretrained Diffusion + Guidance + + +
+ Recent 3D novel view synthesis (NVS) methods are limited to +single-object-centric scenes generated from new viewpoints and struggle with +complex environments. They often require extensive 3D data for training, +lacking generalization beyond training distribution. Conversely, 3D-free +methods can generate text-controlled views of complex, in-the-wild scenes using +a pretrained stable diffusion model without tedious fine-tuning, but lack +camera control. In this paper, we introduce HawkI++, a method capable of +generating camera-controlled viewpoints from a single input image. HawkI++ +excels in handling complex and diverse scenes without additional 3D data or +extensive training. It leverages widely available pretrained NVS models for +weak guidance, integrating this knowledge into a 3D-free view synthesis +approach to achieve the desired results efficiently. Our experimental results +demonstrate that HawkI++ outperforms existing models in both qualitative and +quantitative evaluations, providing high-fidelity and consistent novel view +synthesis at desired camera angles across a wide variety of scenes. + +
+
+ comment: 6 pages, 7 figures +
+
+
+
+
+ + ☆ Palantir: Towards Efficient Super Resolution for Ultra-high-definition + Live Streaming + + +
+ Neural enhancement through super-resolution deep neural networks opens up new +possibilities for ultra-high-definition live streaming over existing encoding +and networking infrastructure. Yet, the heavy SR DNN inference overhead leads +to severe deployment challenges. To reduce the overhead, existing systems +propose to apply DNN-based SR only on selected anchor frames while upscaling +non-anchor frames via the lightweight reusing-based SR approach. However, +frame-level scheduling is coarse-grained and fails to deliver optimal +efficiency. In this work, we propose Palantir, the first neural-enhanced UHD +live streaming system with fine-grained patch-level scheduling. In the +presented solutions, two novel techniques are incorporated to make good +scheduling decisions for inference overhead optimization and reduce the +scheduling latency. Firstly, under the guidance of our pioneering and +theoretical analysis, Palantir constructs a directed acyclic graph (DAG) for +lightweight yet accurate quality estimation under any possible anchor patch +set. Secondly, to further optimize the scheduling latency, Palantir improves +parallelizability by refactoring the computation subprocedure of the estimation +process into a sparse matrix-matrix multiplication operation. The evaluation +results suggest that Palantir incurs a negligible scheduling latency accounting +for less than 5.7% of the end-to-end latency requirement. When compared to the +state-of-the-art real-time frame-level scheduling strategy, Palantir reduces +the energy overhead of SR-integrated mobile clients by 38.1% at most (and 22.4% +on average) and the monetary costs of cloud-based SR by 80.1% at most (and +38.4% on average). + +
+
+
+
+
+ + ☆ Efficient and Scalable Point Cloud Generation with Sparse Point-Voxel + Diffusion Models + + +
+ We propose a novel point cloud U-Net diffusion architecture for 3D generative +modeling capable of generating high-quality and diverse 3D shapes while +maintaining fast generation times. Our network employs a dual-branch +architecture, combining the high-resolution representations of points with the +computational efficiency of sparse voxels. Our fastest variant outperforms all +non-diffusion generative approaches on unconditional shape generation, the most +popular benchmark for evaluating point cloud generative models, while our +largest model achieves state-of-the-art results among diffusion methods, with a +runtime approximately 70% of the previous state-of-the-art PVD. Beyond +unconditional generation, we perform extensive evaluations, including +conditional generation on all categories of ShapeNet, demonstrating the +scalability of our model to larger datasets, and implicit generation, which +allows our network to produce high-quality point clouds in fewer timesteps, +further decreasing the generation time. Finally, we evaluate the architecture's +performance in point cloud completion and super-resolution. Our model excels in +all tasks, establishing it as a state-of-the-art diffusion U-Net for point +cloud generative modeling. The code is publicly available at +https://github.com/JohnRomanelis/SPVD.git. + +
+
+
+
+
+ + ☆ MR3D-Net: Dynamic Multi-Resolution 3D Sparse Voxel Grid Fusion for + LiDAR-Based Collective Perception SC 2024 + + +
+ The safe operation of automated vehicles depends on their ability to perceive +the environment comprehensively. However, occlusion, sensor range, and +environmental factors limit their perception capabilities. To overcome these +limitations, collective perception enables vehicles to exchange information. +However, fusing this exchanged information is a challenging task. Early fusion +approaches require large amounts of bandwidth, while intermediate fusion +approaches face interchangeability issues. Late fusion of shared detections is +currently the only feasible approach. However, it often results in inferior +performance due to information loss. To address this issue, we propose +MR3D-Net, a dynamic multi-resolution 3D sparse voxel grid fusion backbone +architecture for LiDAR-based collective perception. We show that sparse voxel +grids at varying resolutions provide a meaningful and compact environment +representation that can adapt to the communication bandwidth. MR3D-Net achieves +state-of-the-art performance on the OPV2V 3D object detection benchmark while +reducing the required bandwidth by up to 94% compared to early fusion. Code is +available at https://github.com/ekut-es/MR3D-Net + +
+
+ comment: Accepted at IEEE ITSC 2024 +
+
+
+
+
+ + ☆ DPDETR: Decoupled Position Detection Transformer for Infrared-Visible + Object Detection + + +
+ Infrared-visible object detection aims to achieve robust object detection by +leveraging the complementary information of infrared and visible image pairs. +However, the commonly existing modality misalignment problem presents two +challenges: fusing misaligned complementary features is difficult, and +current methods cannot accurately locate objects in both modalities under +misalignment conditions. In this paper, we propose a Decoupled Position +Detection Transformer (DPDETR) to address these problems. Specifically, we +explicitly formulate the object category, visible modality position, and +infrared modality position to enable the network to learn the intrinsic +relationships and output accurate positions of objects in both modalities. To +fuse misaligned object features accurately, we propose a Decoupled Position +Multispectral Cross-attention module that adaptively samples and aggregates +multispectral complementary features with the constraint of infrared and +visible reference positions. Additionally, we design a query-decoupled +Multispectral Decoder structure to address the optimization gap among the three +kinds of object information in our task and propose a Decoupled Position +Contrastive DeNoising Training strategy to enhance the DPDETR's ability to learn +decoupled positions. Experiments on DroneVehicle and KAIST datasets demonstrate +significant improvements compared to other state-of-the-art methods. The code +will be released at https://github.com/gjj45/DPDETR. + +
+
+
+
+
+ + ☆ RISurConv: Rotation Invariant Surface Attention-Augmented Convolutions + for 3D Point Cloud Classification and Segmentation ECCV 2024 + + +
+ Despite the progress on 3D point cloud deep learning, most prior works focus +on learning features that are invariant to translation and point permutation, +and very limited effort has been devoted to rotation invariance. +Several recent studies achieve rotation invariance at the cost of lower +accuracies. In this work, we close this gap by proposing a novel yet effective +rotation invariant architecture for 3D point cloud classification and +segmentation. Instead of traditional pointwise operations, we construct local +triangle surfaces to capture more detailed surface structure, based on which we +can extract highly expressive rotation invariant surface properties which are +then integrated into an attention-augmented convolution operator named +RISurConv to generate refined attention features via self-attention layers. +Based on RISurConv we build an effective neural network for 3D point cloud +analysis that is invariant to arbitrary rotations while maintaining high +accuracy. We verify the performance on various benchmarks with superior results, +surpassing the previous state-of-the-art by a large margin. We achieve +an overall accuracy of 96.0% (+4.7%) on ModelNet40, 93.1% (+12.8%) on +ScanObjectNN, and class accuracies of 91.5% (+3.6%), 82.7% (+5.1%), and 78.5% +(+9.2%) on the three categories of the FG3D dataset for the fine-grained +classification task. Additionally, we achieve 81.5% (+1.0%) mIoU on ShapeNet +for the segmentation task. Code is available here: +https://github.com/cszyzhang/RISurConv + +
+
+ comment: ECCV 2024 (oral) +
+
+
+
+
+ + ☆ Towards Robust Monocular Depth Estimation in Non-Lambertian Surfaces + + +
+ In the field of monocular depth estimation (MDE), many models with excellent +zero-shot performance in general scenes have emerged recently. However, these methods +often fail in predicting non-Lambertian surfaces, such as transparent or mirror +(ToM) surfaces, due to the unique reflective properties of these regions. +Previous methods utilize externally provided ToM masks and aim to obtain +correct depth maps through direct in-painting of RGB images. These methods +highly depend on the accuracy of additional input masks, and the use of random +colors during in-painting makes them insufficiently robust. We are committed to +incrementally enabling the baseline model to directly learn the uniqueness of +non-Lambertian surface regions for depth estimation through a well-designed +training framework. Therefore, we propose non-Lambertian surface regional +guidance, which constrains the predictions of the MDE model from the gradient +domain to enhance its robustness. Noting the significant impact of lighting on +this task, we employ the random tone-mapping augmentation during training to +ensure the network can predict correct results for varying lighting inputs. +Additionally, we propose an optional novel lighting fusion module, which uses +Variational Autoencoders to fuse multiple images and obtain the most +advantageous input RGB image for depth estimation when multi-exposure images +are available. Our method achieves accuracy improvements of 33.39% and 5.21% in +zero-shot testing on the Booster and Mirror3D dataset for non-Lambertian +surfaces, respectively, compared to Depth Anything V2. The state-of-the-art +performance of 90.75 in delta1.05 within the ToM regions on the TRICKY2024 +competition test set demonstrates the effectiveness of our approach. + +
+
+
+
+
+ + ☆ Towards Adversarial Robustness via Debiased High-Confidence Logit + Alignment + + +
+ Despite the significant advances that deep neural networks (DNNs) have +achieved in various visual tasks, they still exhibit vulnerability to +adversarial examples, leading to serious security concerns. Recent adversarial +training techniques have utilized inverse adversarial attacks to generate +high-confidence examples, aiming to align the distributions of adversarial +examples with the high-confidence regions of their corresponding classes. +However, in this paper, our investigation reveals that high-confidence outputs +under inverse adversarial attacks are correlated with biased feature +activation. Specifically, training with inverse adversarial examples causes the +model's attention to shift towards background features, introducing a spurious +correlation bias. To address this bias, we propose Debiased High-Confidence +Adversarial Training (DHAT), a novel approach that not only aligns the logits +of adversarial examples with debiased high-confidence logits obtained from +inverse adversarial examples, but also restores the model's attention to its +normal state by enhancing foreground logit orthogonality. Extensive experiments +demonstrate that DHAT achieves state-of-the-art performance and exhibits robust +generalization capabilities across various vision datasets. Additionally, DHAT +can seamlessly integrate with existing advanced adversarial training techniques +for improving the performance. + +
+
+
+
+
+ + ☆ Five Pitfalls When Assessing Synthetic Medical Images with Reference + Metrics MICCAI 2024 + + +
+ Reference metrics have been developed to objectively and quantitatively +compare two images. Especially for evaluating the quality of reconstructed or +compressed images, these metrics have proven very useful. Extensive tests of +such metrics on benchmarks of artificially distorted natural images have +revealed which metrics best correlate with human perception of quality. Direct +transfer of these metrics to the evaluation of generative models in medical +imaging, however, can easily lead to pitfalls, because assumptions about image +content, image data format and image interpretation are often very different. +Also, the correlation of reference metrics and human perception of quality can +vary strongly for different kinds of distortions, and commonly used metrics, +such as SSIM, PSNR and MAE, are not the best choice for all situations. We +selected five pitfalls that showcase unexpected and probably undesired +reference metric scores and discuss strategies to avoid them. + +
+
+ comment: 10 pages, 5 figures, accepted at Deep Generative Models workshop @ + MICCAI 2024 +
+
+
+
+
+ + ☆ CogVideoX: Text-to-Video Diffusion Models with An Expert Transformer + + +
+ We introduce CogVideoX, a large-scale diffusion transformer model designed +for generating videos based on text prompts. To efficiently model video data, we +propose to leverage a 3D Variational Autoencoder (VAE) to compress videos along +both spatial and temporal dimensions. To improve the text-video alignment, we +propose an expert transformer with the expert adaptive LayerNorm to facilitate +the deep fusion between the two modalities. By employing a progressive training +technique, CogVideoX is adept at producing coherent, long-duration videos +characterized by significant motions. In addition, we develop an effective +text-video data processing pipeline that includes various data preprocessing +strategies and a video captioning method. It significantly helps enhance the +performance of CogVideoX, improving both generation quality and semantic +alignment. Results show that CogVideoX demonstrates state-of-the-art +performance across both multiple machine metrics and human evaluations. The +model weights of both the 3D Causal VAE and CogVideoX are publicly available at +https://github.com/THUDM/CogVideo. + +
+
+
+
+
+ + ☆ A-BDD: Leveraging Data Augmentations for Safe Autonomous Driving in + Adverse Weather and Lighting + + +
+ High-autonomy vehicle functions rely on machine learning (ML) algorithms to +understand the environment. Despite displaying remarkable performance in fair +weather scenarios, perception algorithms are heavily affected by adverse +weather and lighting conditions. To overcome these difficulties, ML engineers +mainly rely on comprehensive real-world datasets. However, the difficulties in +real-world data collection for critical areas of the operational design domain +(ODD) often mean synthetic data is required for perception training and safety +validation. Thus, we present A-BDD, a large set of over 60,000 synthetically +augmented images based on BDD100K that are equipped with semantic segmentation +and bounding box annotations (inherited from the BDD100K dataset). The dataset +contains augmented data for rain, fog, overcast and sunglare/shadow with +varying intensity levels. We further introduce novel strategies utilizing +feature-based image quality metrics like FID and CMMD, which help identify +useful augmented and real-world data for ML training and testing. By conducting +experiments on A-BDD, we provide evidence that data augmentations can play a +pivotal role in closing performance gaps in adverse weather and lighting +conditions. + +
+
+
+
+
+ + ☆ ControlNeXt: Powerful and Efficient Control for Image and Video + Generation + + +
+ Diffusion models have demonstrated remarkable and robust abilities in both +image and video generation. To achieve greater control over generated results, +researchers introduce additional architectures, such as ControlNet, Adapters +and ReferenceNet, to integrate conditioning controls. However, current +controllable generation methods often require substantial additional +computational resources, especially for video generation, and face challenges +in training or exhibit weak control. In this paper, we propose ControlNeXt: a +powerful and efficient method for controllable image and video generation. We +first design a more straightforward and efficient architecture, replacing heavy +additional branches with minimal additional cost compared to the base model. +Such a concise structure also allows our method to seamlessly integrate with +other LoRA weights, enabling style alteration without the need for additional +training. As for training, we reduce up to 90% of learnable parameters compared +to the alternatives. Furthermore, we propose another method called Cross +Normalization (CN) as a replacement for 'Zero-Convolution' to achieve fast and +stable training convergence. We have conducted various experiments with +different base models across images and videos, demonstrating the robustness of +our method. + +
+
+ comment: controllable generation +
+
+
+
+
+ + ☆ Parallel transport on matrix manifolds and Exponential Action + + +
+ We express parallel transport for several common matrix Lie groups with a +family of pseudo-Riemannian metrics in terms of matrix exponential and +exponential actions. The expression for parallel transport is preserved by +taking the quotient under certain scenarios. In particular, for a Stiefel +manifold of orthogonal matrices of size $n\times d$, we give an expression for +parallel transport along a geodesic from time zero to $t$, that could be +computed with time complexity of $O(nd^2)$ for small $t$, and of $O(td^3)$ for +large $t$, contributing a step toward a long-standing open problem in matrix +manifolds. A similar result holds for flag manifolds with the canonical metric. +We also show the parallel transport formulas for the generalized linear group, +and the special orthogonal group under these metrics. + +
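A quick way to see why "exponential actions" matter computationally: applying the matrix exponential of a large generator to a tall, skinny matrix can be done without ever forming the full exponential. The SciPy snippet below contrasts the two routes on an arbitrary skew-symmetric generator; it is only an illustration of the cost difference, not the paper's transport formula.

```python
import numpy as np
from scipy.linalg import expm
from scipy.sparse.linalg import expm_multiply

rng = np.random.default_rng(0)
n, d = 300, 8
A = rng.normal(size=(n, n))
A = 0.1 * (A - A.T)              # skew-symmetric generator (toy tangent direction)
X = rng.normal(size=(n, d))      # tall-skinny matrix, e.g. a frame to be transported

full = expm(A) @ X               # forms the full n-by-n exponential: O(n^3)
action = expm_multiply(A, X)     # exponential action exp(A) @ X without forming expm(A)
print(np.allclose(full, action, rtol=1e-6, atol=1e-9))   # True
```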
+
+
+
+
+ + ☆ BooW-VTON: Boosting In-the-Wild Virtual Try-On via Mask-Free Pseudo Data + Training + + +
+ Image-based virtual try-on is an increasingly popular and important task to +generate realistic try-on images of a specific person. Existing methods always +employ an accurate mask to remove the original garment in the source image, +thus achieving realistic synthesized images in simple and conventional try-on +scenarios based on a powerful diffusion model. Therefore, acquiring a suitable mask +is vital to the try-on performance of these methods. However, obtaining precise +inpainting masks, especially for complex wild try-on data containing diverse +foreground occlusions and person poses, is not easy, as Figure 1-Top shows. This +difficulty often results in poor performance in more practical and challenging +real-life scenarios, such as the selfie scene shown in Figure 1-Bottom. To this +end, we propose a novel training paradigm combined with an efficient data +augmentation method to acquire large-scale unpaired training data from wild +scenarios, thereby significantly facilitating the try-on performance of our +model without the need for additional inpainting masks. Besides, a try-on +localization loss is designed to localize a more accurate try-on area to obtain +more reasonable try-on results. It is noted that our method only needs the +reference cloth image, source pose image and source person image as input, +which is more cost-effective and user-friendly compared to existing methods. +Extensive qualitative and quantitative experiments have demonstrated superior +performance in wild scenarios with such a low-demand input. + +
+
+
+
+
+ + ☆ ARPA: A Novel Hybrid Model for Advancing Visual Word Disambiguation + Using Large Language Models and Transformers + + +
+ In the rapidly evolving fields of natural language processing and computer +vision, Visual Word Sense Disambiguation (VWSD) stands as a critical, yet +challenging task. The quest for models that can seamlessly integrate and +interpret multimodal data is more pressing than ever. Imagine a system that can +understand language with the depth and nuance of human cognition, while +simultaneously interpreting the rich visual context of the world around it. + We present ARPA, an architecture that fuses the unparalleled contextual +understanding of large language models with the advanced feature extraction +capabilities of transformers, which then pass through a custom Graph Neural +Network (GNN) layer to learn intricate relationships and subtle nuances within +the data. This innovative architecture not only sets a new benchmark in visual +word disambiguation but also introduces a versatile framework poised to +transform how linguistic and visual data interact by harnessing the synergistic +strengths of its components, ensuring robust performance even in the most +complex disambiguation scenarios. Through a series of experiments and +comparative analysis, we reveal the substantial advantages of our model, +underscoring its potential to redefine standards in the field. Beyond its +architectural prowess, our architecture excels through experimental +enrichments, including sophisticated data augmentation and multi-modal training +techniques. + ARPA's introduction marks a significant milestone in visual word +disambiguation, offering a compelling solution that bridges the gap between +linguistic and visual modalities. We invite researchers and practitioners to +explore the capabilities of our model, envisioning a future where such hybrid +models drive unprecedented advancements in artificial intelligence. + +
+
+
+
+
+ + ☆ Layer-Specific Optimization: Sensitivity Based Convolution Layers Basis + Search + + +
+ Deep neural network models have a complex architecture and are +overparameterized. The number of parameters often exceeds the size of the whole dataset, +which is highly resource-consuming. This complicates their application and +limits their usage on different devices. Reduction in the number of network +parameters helps to reduce the size of the model, but at the same time, +if applied thoughtlessly, can lead to a deterioration in the quality of the +network. One way to reduce the number of model parameters is matrix +decomposition, where a matrix is represented as a product of smaller matrices. +In this paper, we propose a new way of applying the matrix decomposition with +respect to the weights of convolutional layers. The essence of the method is to +train not all convolutions, but only a subset of the convolutions (basis +convolutions), and represent the rest as linear combinations of the basis ones. +Experiments on models from the ResNet family and the CIFAR-10 dataset +demonstrate that basis convolutions can not only reduce the size of the model +but also accelerate the forward and backward passes of the network. Another +contribution of this work is that we propose a fast method for selecting a +subset of network layers in which the use of matrix decomposition does not +degrade the quality of the final model. + +
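The idea of expressing most filters as linear combinations of a small set of trainable basis filters can be sketched as follows. This is a simplified illustration under our own assumptions (a shared 3x3 basis bank with per-layer mixing coefficients), not the authors' exact construction.

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

class BasisConv2d(nn.Module):
    """Convolution whose kernels are linear combinations of shared basis kernels.

    Only the small basis bank and the mixing coefficients are trainable, instead
    of a full independent weight tensor for every convolution.
    """
    def __init__(self, basis: nn.Parameter, in_channels: int, out_channels: int):
        super().__init__()
        self.basis = basis                                  # (B, k, k) shared basis kernels
        num_basis = basis.shape[0]
        self.coeff = nn.Parameter(0.01 * torch.randn(out_channels, in_channels, num_basis))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # weight[o, i] = sum_b coeff[o, i, b] * basis[b]
        weight = torch.einsum('oib,bkl->oikl', self.coeff, self.basis)
        return F.conv2d(x, weight, padding=weight.shape[-1] // 2)

basis_bank = nn.Parameter(0.1 * torch.randn(8, 3, 3))       # 8 shared 3x3 basis kernels
layer = BasisConv2d(basis_bank, in_channels=3, out_channels=16)
print(layer(torch.randn(1, 3, 32, 32)).shape)                # torch.Size([1, 16, 32, 32])
```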
+
+ comment: A revived draft of an unpublished (and never-to-be-published) + article. For the sake of history, memory, and old times +
+
+
+
+
+ + ☆ ClickAttention: Click Region Similarity Guided Interactive Segmentation + + +
+ Interactive segmentation algorithms based on click points have garnered +significant attention from researchers in recent years. However, existing +studies typically use sparse click maps as model inputs to segment specific +target objects, which primarily affect local regions and have limited abilities +to focus on the whole target object, leading to an increased number of clicks. In +addition, most existing algorithms cannot balance well between high +performance and efficiency. To address this issue, we propose a click attention +algorithm that expands the influence range of positive clicks based on the +similarity between positively-clicked regions and the whole input. We also +propose a discriminative affinity loss to reduce the attention coupling between +positive and negative click regions to avoid an accuracy decrease caused by +mutual interference between positive and negative clicks. Extensive experiments +demonstrate that our approach is superior to existing methods and achieves +cutting-edge performance with fewer parameters. An interactive demo and all +reproducible codes will be released at +https://github.com/hahamyt/ClickAttention. + +
+
+
+
+
+ + ☆ HeadGAP: Few-shot 3D Head Avatar via Generalizable Gaussian Priors + + +
+ In this paper, we present a novel 3D head avatar creation approach capable of +generalizing from few-shot in-the-wild data with high-fidelity and animatable +robustness. Given the underconstrained nature of this problem, incorporating +prior knowledge is essential. Therefore, we propose a framework comprising +prior learning and avatar creation phases. The prior learning phase leverages +3D head priors derived from a large-scale multi-view dynamic dataset, and the +avatar creation phase applies these priors for few-shot personalization. Our +approach effectively captures these priors by utilizing a Gaussian +Splatting-based auto-decoder network with part-based dynamic modeling. Our +method employs identity-shared encoding with personalized latent codes for +individual identities to learn the attributes of Gaussian primitives. During +the avatar creation phase, we achieve fast head avatar personalization by +leveraging inversion and fine-tuning strategies. Extensive experiments +demonstrate that our model effectively exploits head priors and successfully +generalizes them to few-shot personalization, achieving photo-realistic +rendering quality, multi-view consistency, and stable animation. + +
+
+ comment: Project page: https://headgap.github.io/ +
+
+
+
+
+ + ☆ Uncertainty-Informed Volume Visualization using Implicit Neural + Representation IEEE VIS 2024 + + +
+ The increasing adoption of Deep Neural Networks (DNNs) has led to their +application in many challenging scientific visualization tasks. While advanced +DNNs offer impressive generalization capabilities, understanding factors such +as model prediction quality, robustness, and uncertainty is crucial. These +insights can enable domain scientists to make informed decisions about their +data. However, DNNs inherently lack ability to estimate prediction uncertainty, +necessitating new research to construct robust uncertainty-aware visualization +techniques tailored for various visualization tasks. In this work, we propose +uncertainty-aware implicit neural representations to model scalar field data +sets effectively and comprehensively study the efficacy and benefits of +estimated uncertainty information for volume visualization tasks. We evaluate +the effectiveness of two principled deep uncertainty estimation techniques: (1) +Deep Ensemble and (2) Monte Carlo Dropout (MCDropout). These techniques enable +uncertainty-informed volume visualization in scalar field data sets. Our +extensive exploration across multiple data sets demonstrates that +uncertainty-aware models produce informative volume visualization results. +Moreover, integrating prediction uncertainty enhances the trustworthiness of +our DNN model, making it suitable for robustly analyzing and visualizing +real-world scientific volumetric data sets. + +
+
+ comment: To appear in IEEE Workshop on Uncertainty Visualization in + conjunction with IEEE VIS 2024, Florida, USA +
+
+
+
+
+ + ☆ A Sharpness Based Loss Function for Removing Out-of-Focus Blur SP + + +
+ The success of modern Deep Neural Network (DNN) approaches can be attributed +to the use of complex optimization criteria beyond standard losses such as mean +absolute error (MAE) or mean squared error (MSE). In this work, we propose a +novel method of utilising a no-reference sharpness metric Q introduced by Zhu +and Milanfar for removing out-of-focus blur from images. We also introduce a +novel dataset of real-world out-of-focus images for assessing restoration +models. Our fine-tuned method produces images with a 7.5% increase in +perceptual quality (LPIPS) as compared to a standard model trained only on MAE. +Furthermore, we observe a 6.7% increase in Q (reflecting sharper restorations) +and a 7.25% increase in PSNR over most state-of-the-art (SOTA) methods. + +
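To make the idea of augmenting a pixel loss with a sharpness term concrete, here is a rough sketch. The gradient-magnitude proxy below is a stand-in introduced only for illustration; the paper uses the no-reference metric Q of Zhu and Milanfar, whose computation is not reproduced here, and the weighting factor is an assumed placeholder.

```python
import torch
import torch.nn.functional as F

def sharpness_proxy(img: torch.Tensor) -> torch.Tensor:
    """Crude gradient-magnitude stand-in for a no-reference sharpness score;
    only shows where such a term plugs into the objective."""
    dx = img[..., :, 1:] - img[..., :, :-1]
    dy = img[..., 1:, :] - img[..., :-1, :]
    return dx.abs().mean() + dy.abs().mean()

def sharpness_aware_loss(pred: torch.Tensor, target: torch.Tensor, lam: float = 0.1) -> torch.Tensor:
    """MAE plus a penalty term that rewards sharper restorations."""
    return F.l1_loss(pred, target) - lam * sharpness_proxy(pred)

pred = torch.rand(2, 3, 64, 64, requires_grad=True)   # toy restored images
target = torch.rand(2, 3, 64, 64)                     # toy ground truth
loss = sharpness_aware_loss(pred, target)
loss.backward()
print(float(loss))
```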
+
+ comment: 6 pages, IEEE MMSP +
+
+
+
+
+ + ☆ DEEPTalk: Dynamic Emotion Embedding for Probabilistic Speech-Driven 3D + Face Animation + + +
+ Speech-driven 3D facial animation has garnered lots of attention thanks to +its broad range of applications. Despite recent advancements in achieving +realistic lip motion, current methods fail to capture the nuanced emotional +undertones conveyed through speech and produce monotonous facial motion. These +limitations result in blunt and repetitive facial animations, reducing user +engagement and hindering their applicability. To address these challenges, we +introduce DEEPTalk, a novel approach that generates diverse and emotionally +rich 3D facial expressions directly from speech inputs. To achieve this, we +first train DEE (Dynamic Emotion Embedding), which employs probabilistic +contrastive learning to forge a joint emotion embedding space for both speech +and facial motion. This probabilistic framework captures the uncertainty in +interpreting emotions from speech and facial motion, enabling the derivation of +emotion vectors from its multifaceted space. Moreover, to generate dynamic +facial motion, we design TH-VQVAE (Temporally Hierarchical VQ-VAE) as an +expressive and robust motion prior overcoming limitations of VAEs and VQ-VAEs. +Utilizing these strong priors, we develop DEEPTalk, a talking head generator +that non-autoregressively predicts codebook indices to create dynamic facial +motion, incorporating a novel emotion consistency loss. Extensive experiments +on various datasets demonstrate the effectiveness of our approach in creating +diverse, emotionally expressive talking faces that maintain accurate lip-sync. +Source code will be made publicly available soon. + +
+
+ comment: First two authors contributed equally +
+
+
+
+
+ + ☆ An Analysis for Image-to-Image Translation and Style Transfer + + +
+ With the development of generative technologies in deep learning, a large +number of image-to-image translation and style transfer models have emerged at +an explosive rate in recent years. These two technologies have made significant +progress and can generate realistic images. However, many communities tend to +confuse the two, because both generate the desired image based on the input +image and both cover the two definitions of content and style. In fact, there +are indeed significant differences between the two, and there is currently a +lack of clear explanations to distinguish the two technologies, which is not +conducive to the advancement of technology. We hope to serve the entire +community by introducing the differences and connections between image-to-image +translation and style transfer. The entire discussion process involves the +concepts, forms, training modes, evaluation processes, and visualization +results of the two technologies. Finally, we conclude that image-to-image +translation divides images by domain, and the types of images in the domain are +limited, and the scope involved is small, but the conversion ability is strong +and can achieve strong semantic changes. Style transfer divides image types by +single image, and the scope involved is large, but the transfer ability is +limited, and it transfers more texture and color of the image. + +
+
+
+
+
+ + ☆ Diffuse-UDA: Addressing Unsupervised Domain Adaptation in Medical Image + Segmentation with Appearance and Structure Aligned Diffusion Models + + +
+ The scarcity and complexity of voxel-level annotations in 3D medical imaging +present significant challenges, particularly due to the domain gap between +labeled datasets from well-resourced centers and unlabeled datasets from +less-resourced centers. This disparity affects the fairness of artificial +intelligence algorithms in healthcare. We introduce Diffuse-UDA, a novel method +leveraging diffusion models to tackle Unsupervised Domain Adaptation (UDA) in +medical image segmentation. Diffuse-UDA generates high-quality image-mask pairs +with target domain characteristics and various structures, thereby enhancing +UDA tasks. Initially, pseudo labels for target domain samples are generated. +Subsequently, a specially tailored diffusion model, incorporating deformable +augmentations, is trained on image-label or image-pseudo-label pairs from both +domains. Finally, source domain labels guide the diffusion model to generate +image-label pairs for the target domain. Comprehensive evaluations on several +benchmarks demonstrate that Diffuse-UDA outperforms leading UDA and +semi-supervised strategies, achieving performance close to or even surpassing +the theoretical upper bound of models trained directly on target domain data. +Diffuse-UDA offers a pathway to advance the development and deployment of AI +systems in medical imaging, addressing disparities between healthcare +environments. This approach enables the exploration of innovative AI-driven +diagnostic tools, improves outcomes, saves time, and reduces human error. + +
+
+
+
+
+ + ☆ Unseen No More: Unlocking the Potential of CLIP for Generative Zero-shot + HOI Detection ACM MM 2024 + + +
+ A zero-shot human-object interaction (HOI) detector is capable of generalizing +to HOI categories not encountered during training. Inspired by the +impressive zero-shot capabilities offered by CLIP, the latest methods strive to +leverage CLIP embeddings for improving zero-shot HOI detection. However, these +embedding-based methods train the classifier on seen classes only, inevitably +resulting in seen-unseen confusion for the model during inference. Besides, we +find that using prompt-tuning and adapters further increases the gap between +seen and unseen accuracy. To tackle this challenge, we present the first +generation-based model using CLIP for zero-shot HOI detection, coined HOIGen. +It unlocks the potential of CLIP for feature generation instead of +feature extraction only. To achieve it, we develop a CLIP-injected feature +generator in accordance with the generation of human, object and union +features. Then, we extract realistic features of seen samples and mix them with +synthetic features together, allowing the model to train on seen and unseen +classes jointly. To enrich the HOI scores, we construct a generative prototype +bank in a pairwise HOI recognition branch, and a multi-knowledge prototype bank +in an image-wise HOI recognition branch, respectively. Extensive experiments on +the HICO-DET benchmark demonstrate that our HOIGen achieves superior performance for +both seen and unseen classes under various zero-shot settings, compared with +other top-performing methods. Code is available at: +https://github.com/soberguo/HOIGen + +
+
+ comment: Accepted by ACM MM 2024 +
+
+
+
+
+ + ☆ Freehand Sketch Generation from Mechanical Components ACM MM + + +
+ Drawing freehand sketches of mechanical components on multimedia devices for +AI-based engineering modeling has become a new trend. However, its development +is being impeded because existing works cannot produce suitable sketches for +data-driven research. These works either generate sketches lacking a freehand +style or utilize generative models not originally designed for this task, +resulting in poor effectiveness. To address this issue, we design a two-stage +generative framework mimicking the human sketching behavior pattern, called +MSFormer, which is the first to produce humanoid freehand sketches +tailored for mechanical components. The first stage employs Open CASCADE +technology to obtain multi-view contour sketches from mechanical components, +filtering perturbing signals for the ensuing generation process. Meanwhile, we +design a view selector to simulate viewpoint selection tasks during human +sketching for picking out information-rich sketches. The second stage +translates contour sketches into freehand sketches by a transformer-based +generator. To retain essential modeling features as much as possible and +rationalize stroke distribution, we introduce a novel edge-constraint stroke +initialization. Furthermore, we utilize a CLIP vision encoder and a new loss +function incorporating the Hausdorff distance to enhance the generalizability +and robustness of the model. Extensive experiments demonstrate that our +approach achieves state-of-the-art performance for generating freehand sketches +in the mechanical domain. Project page: https://mcfreeskegen.github.io . + +
+
+ comment: Published at ACM Multimedia (ACM MM) 2024 +
+
+
+
+
+ + ☆ Target Detection of Safety Protective Gear Using the Improved YOLOv5 + + +
+ In high-risk railway construction, personal protective equipment monitoring +is critical but challenging due to small and frequently obstructed targets. We +propose YOLO-EA, an innovative model that enhances safety measure detection by +integrating ECA into its backbone's convolutional layers, improving discernment +of minuscule objects like hardhats. YOLO-EA further refines target recognition +under occlusion by replacing GIoU with EIoU loss. YOLO-EA's effectiveness was +empirically substantiated using a dataset derived from real-world railway +construction site surveillance footage. It outperforms YOLOv5, achieving 98.9% +precision and 94.7% recall, up 2.5% and 0.5% respectively, while maintaining +real-time performance at 70.774 fps. This highly efficient and precise YOLO-EA +holds great promise for practical application in intricate construction +scenarios, enforcing stringent safety compliance during complex railway +construction projects. + +
+
+
+
+
+ + ☆ Boosting Adverse Weather Crowd Counting via Multi-queue Contrastive + Learning + + +
+ Currently, most crowd counting methods have outstanding performance under +normal weather conditions. However, they often struggle to maintain their +performance in extreme and adverse weather conditions due to significant +differences in the domain and a lack of adverse weather images for training. To +address this issue and enhance the model's robustness in adverse weather, we +propose a two-stage crowd counting method. Specifically, in the first stage, we +introduce a multi-queue MoCo contrastive learning strategy to tackle the +problem of weather class imbalance. This strategy facilitates the learning of +weather-aware representations by the model. In the second stage, we propose to +refine the representations under the guidance of contrastive learning, enabling +the conversion of the weather-aware representations to the normal weather +domain. While significantly improving the robustness, our method only +marginally increases the weight of the model. In addition, we also create a new +synthetic adverse weather dataset. Extensive experimental results show that our +method achieves competitive performance. + +
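The first-stage multi-queue contrastive strategy can be pictured with a small MoCo-style sketch; maintaining one negative queue per weather class keeps rare classes represented among the negatives despite imbalance. The queue size, temperature, and the uniform use of all queues as negatives are assumptions of this illustration, not details from the paper.

```python
import torch
import torch.nn.functional as F

class MultiQueueMoCo:
    """Contrastive loss with one fixed-size negative queue per weather class."""
    def __init__(self, num_classes: int, dim: int, queue_size: int = 1024):
        self.queues = [F.normalize(torch.randn(queue_size, dim), dim=1)
                       for _ in range(num_classes)]

    def loss(self, query: torch.Tensor, key: torch.Tensor, temperature: float = 0.07) -> torch.Tensor:
        query, key = F.normalize(query, dim=1), F.normalize(key, dim=1)
        pos = (query * key).sum(dim=1, keepdim=True)             # (N, 1) positive logits
        negatives = torch.cat(self.queues, dim=0)                 # negatives drawn from every queue
        logits = torch.cat([pos, query @ negatives.t()], dim=1) / temperature
        labels = torch.zeros(query.shape[0], dtype=torch.long)    # positives sit in column 0
        return F.cross_entropy(logits, labels)

    def enqueue(self, keys: torch.Tensor, weather_id: int) -> None:
        keys = F.normalize(keys.detach(), dim=1)
        q = torch.cat([keys, self.queues[weather_id]], dim=0)
        self.queues[weather_id] = q[: self.queues[weather_id].shape[0]]

moco = MultiQueueMoCo(num_classes=4, dim=128)
q, k = torch.randn(8, 128), torch.randn(8, 128)
print(float(moco.loss(q, k)))
moco.enqueue(k, weather_id=2)
```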
+
+ comment: 11 pages, 7 figures +
+
+
+
+
+ + ☆ Probabilistic Vision-Language Representation for Weakly Supervised + Temporal Action Localization ACM MM 2024 + + +
+ Weakly supervised temporal action localization (WTAL) aims to detect action +instances in untrimmed videos using only video-level annotations. Since many +existing works optimize WTAL models based on action classification labels, they +encounter the task discrepancy problem (i.e., localization-by-classification). +To tackle this issue, recent studies have attempted to utilize action category +names as auxiliary semantic knowledge through vision-language pre-training +(VLP). However, there are still areas where existing research falls short. +Previous approaches primarily focused on leveraging textual information from +language models but overlooked the alignment of dynamic human action and VLP +knowledge in a joint space. Furthermore, the deterministic representation +employed in previous studies struggles to capture fine-grained human motions. +To address these problems, we propose a novel framework that aligns human +action knowledge and VLP knowledge in a probabilistic embedding space. +Moreover, we propose intra- and inter-distribution contrastive learning to +enhance the probabilistic embedding space based on statistical similarities. +Extensive experiments and ablation studies reveal that our method significantly +outperforms all previous state-of-the-art methods. Code is available at +https://github.com/sejong-rcv/PVLR. + +
+
+ comment: Accepted to ACM MM 2024 +
+
+
+
+
+ + ☆ A Simple Task-aware Contrastive Local Descriptor Selection Strategy for + Few-shot Learning between inter class and intra class ICANN 2024 + + +
+ Few-shot image classification aims to classify novel classes with few labeled +samples. Recent research indicates that deep local descriptors have better +representational capabilities. These studies recognize the impact of background +noise on classification performance. They typically filter query descriptors +using all local descriptors in the support classes or engage in bidirectional +selection between local descriptors in support and query sets. However, they +ignore the fact that background features may be useful for the classification +performance of specific tasks. This paper proposes a novel task-aware +contrastive local descriptor selection network (TCDSNet). First, we calculate +the contrastive discriminative score for each local descriptor in the support +class, and select discriminative local descriptors to form a support descriptor +subset. Finally, we leverage support descriptor subsets to adaptively select +discriminative query descriptors for specific tasks. Extensive experiments +demonstrate that our method outperforms state-of-the-art methods on both +general and fine-grained datasets. + +
+
+ comment: Submitted to ICANN 2024 +
+
+
+
+
+ + ☆ Optimizing Vision Transformers with Data-Free Knowledge Transfer + + +
+ The groundbreaking performance of transformers in Natural Language Processing +(NLP) tasks has led to their replacement of traditional Convolutional Neural +Networks (CNNs), owing to the efficiency and accuracy achieved through the +self-attention mechanism. This success has inspired researchers to explore the +use of transformers in computer vision tasks to attain enhanced long-term +semantic awareness. Vision transformers (ViTs) have excelled in various +computer vision tasks due to their superior ability to capture long-distance +dependencies using the self-attention mechanism. Contemporary ViTs like Data +Efficient Transformers (DeiT) can effectively learn both global semantic +information and local texture information from images, achieving performance +comparable to traditional CNNs. However, their impressive performance comes +with a high computational cost due to a very large number of parameters, +hindering their deployment on devices with limited resources like smartphones, +cameras, drones, etc. Additionally, ViTs require a large amount of data for +training to achieve performance comparable to benchmark CNN models. Therefore, +we identified two key challenges in deploying ViTs on smaller form factor +devices: the high computational requirements of large models and the need for +extensive training data. As a solution to these challenges, we propose +compressing large ViT models using Knowledge Distillation (KD), which is +implemented data-free to circumvent limitations related to data availability. +Additionally, we conducted experiments on object detection within the same +environment in addition to classification tasks. Based on our analysis, we +found that data-free knowledge distillation is an effective method to overcome +both issues, enabling the deployment of ViTs on resource-constrained +devices. + +
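For reference, the basic distillation step on generated rather than real images looks like the following. The toy teacher/student modules, the temperature value, and the random stand-in for synthetic images are placeholders; the generator or model-inversion procedure a data-free pipeline relies on is not shown.

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

def distillation_step(teacher: nn.Module, student: nn.Module,
                      synthetic_batch: torch.Tensor, T: float = 4.0) -> torch.Tensor:
    """One data-free KD step: match the student's softened predictions to the
    teacher's on generated (not real) images."""
    with torch.no_grad():
        t_logits = teacher(synthetic_batch)
    s_logits = student(synthetic_batch)
    return F.kl_div(F.log_softmax(s_logits / T, dim=1),
                    F.softmax(t_logits / T, dim=1),
                    reduction="batchmean") * (T * T)

# Toy stand-ins for a large ViT teacher and a compact student
teacher = nn.Sequential(nn.Flatten(), nn.Linear(3 * 32 * 32, 10))
student = nn.Sequential(nn.Flatten(), nn.Linear(3 * 32 * 32, 10))
synthetic = torch.randn(16, 3, 32, 32)   # would come from a generator/inversion in practice
print(float(distillation_step(teacher, student, synthetic)))
```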
+
+
+
+
+ + ☆ MV2DFusion: Leveraging Modality-Specific Object Semantics for + Multi-Modal 3D Detection + + +
+ The rise of autonomous vehicles has significantly increased the demand for +robust 3D object detection systems. While cameras and LiDAR sensors each offer +unique advantages--cameras provide rich texture information and LiDAR offers +precise 3D spatial data--relying on a single modality often leads to +performance limitations. This paper introduces MV2DFusion, a multi-modal +detection framework that integrates the strengths of both worlds through an +advanced query-based fusion mechanism. By introducing an image query generator +to align with image-specific attributes and a point cloud query generator, +MV2DFusion effectively combines modality-specific object semantics without +biasing toward one single modality. Then the sparse fusion process can be +accomplished based on the valuable object semantics, ensuring efficient and +accurate object detection across various scenarios. Our framework's flexibility +allows it to integrate with any image and point cloud-based detectors, +showcasing its adaptability and potential for future advancements. Extensive +evaluations on the nuScenes and Argoverse2 datasets demonstrate that MV2DFusion +achieves state-of-the-art performance, particularly excelling in long-range +detection scenarios. + +
+
+
+
+
+ + ☆ Spb3DTracker: A Robust LiDAR-Based Person Tracker for Noisy Environmen + + +
+ Person detection and tracking (PDT) has seen significant advancements with 2D +camera-based systems in the autonomous vehicle field, leading to widespread +adoption of these algorithms. However, growing privacy concerns have recently +emerged as a major issue, prompting a shift towards LiDAR-based PDT as a viable +alternative. Within this domain, "Tracking-by-Detection" (TBD) has become a +prominent methodology. Despite its effectiveness, LiDAR-based PDT has not yet +achieved the same level of performance as camera-based PDT. This paper examines +key components of the LiDAR-based PDT framework, including detection +post-processing, data association, motion modeling, and lifecycle management. +Building upon these insights, we introduce SpbTrack, a robust person tracker +designed for diverse environments. Our method achieves superior performance on +noisy datasets and state-of-the-art results on KITTI Dataset benchmarks and +custom office indoor dataset among LiDAR-based trackers. Project page at +anonymous. + +
+
+ comment: 17 pages, 5 figures +
+
+
+
+
+ + ☆ UniPortrait: A Unified Framework for Identity-Preserving Single- and + Multi-Human Image Personalization + + +
+ This paper presents UniPortrait, an innovative human image personalization +framework that unifies single- and multi-ID customization with high face +fidelity, extensive facial editability, free-form input description, and +diverse layout generation. UniPortrait consists of only two plug-and-play +modules: an ID embedding module and an ID routing module. The ID embedding +module extracts versatile editable facial features with a decoupling strategy +for each ID and embeds them into the context space of diffusion models. The ID +routing module then combines and distributes these embeddings adaptively to +their respective regions within the synthesized image, achieving the +customization of single and multiple IDs. With a carefully designed two-stage +training scheme, UniPortrait achieves superior performance in both single- and +multi-ID customization. Quantitative and qualitative experiments demonstrate +the advantages of our method over existing approaches as well as its good +scalability, e.g., the universal compatibility with existing generative control +tools. The project page is at +https://aigcdesigngroup.github.io/UniPortrait-Page/ . + +
+
+ comment: Tech report; Project page: + https://aigcdesigngroup.github.io/UniPortrait-Page/ +
+
+
+
+
+ + ☆ Deep Geometric Moments Promote Shape Consistency in Text-to-3D + Generation + + +
+ To address the data scarcity associated with 3D assets, 2D-lifting techniques +such as Score Distillation Sampling (SDS) have become a widely adopted practice +in text-to-3D generation pipelines. However, the diffusion models used in these +techniques are prone to viewpoint bias and thus lead to geometric +inconsistencies such as the Janus problem. To counter this, we introduce MT3D, +a text-to-3D generative model that leverages a high-fidelity 3D object to +overcome viewpoint bias and explicitly infuse geometric understanding into the +generation pipeline. Firstly, we employ depth maps derived from a high-quality +3D model as control signals to guarantee that the generated 2D images preserve +the fundamental shape and structure, thereby reducing the inherent viewpoint +bias. Next, we utilize deep geometric moments to ensure geometric consistency +in the 3D representation explicitly. By incorporating geometric details from a +3D asset, MT3D enables the creation of diverse and geometrically consistent +objects, thereby improving the quality and usability of our 3D representations. + +
+
+ comment: 9 pages, 8 figures +
+
+
+
+
+ + ☆ Multi-scale Contrastive Adaptor Learning for Segmenting Anything in + Underperformed Scenes + + +
+ Foundational vision models, such as the Segment Anything Model (SAM), have +achieved significant breakthroughs through extensive pre-training on +large-scale visual datasets. Despite their general success, these models may +fall short in specialized tasks with limited data, and fine-tuning such +large-scale models is often not feasible. Current strategies involve +incorporating adaptors into the pre-trained SAM to facilitate downstream task +performance with minimal model adjustment. However, these strategies can be +hampered by suboptimal learning approaches for the adaptors. In this paper, we +introduce a novel Multi-scale Contrastive Adaptor learning method named +MCA-SAM, which enhances adaptor performance through a meticulously designed +contrastive learning framework at both token and sample levels. Our Token-level +Contrastive adaptor (TC-adaptor) focuses on refining local representations by +improving the discriminability of patch tokens, while the Sample-level +Contrastive adaptor (SC-adaptor) amplifies global understanding across +different samples. Together, these adaptors synergistically enhance feature +comparison within and across samples, bolstering the model's representational +strength and its ability to adapt to new tasks. Empirical results demonstrate +that MCA-SAM sets new benchmarks, outperforming existing methods in three +challenging domains: camouflage object detection, shadow segmentation, and +polyp segmentation. Specifically, MCA-SAM exhibits substantial relative +performance enhancements, achieving a 20.0% improvement in MAE on the COD10K +dataset, a 6.0% improvement in MAE on the CAMO dataset, a 15.4% improvement in +BER on the ISTD dataset, and a 7.9% improvement in mDice on the Kvasir-SEG +dataset. + +
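A rough sketch of the two ingredients described above: a lightweight bottleneck adaptor attached to a frozen backbone block, and a sample-level contrastive (InfoNCE) objective across augmented views. Dimensions, the pooling choice, and the temperature are assumptions for exposition, not the MCA-SAM code.

```python
# Illustrative sketch of an adaptor bottleneck plus a sample-level contrastive
# objective; names and dimensions are assumptions, not the MCA-SAM implementation.
import torch
import torch.nn as nn
import torch.nn.functional as F

class Adaptor(nn.Module):
    """Lightweight bottleneck inserted after a frozen transformer block."""
    def __init__(self, dim=768, hidden=64):
        super().__init__()
        self.down = nn.Linear(dim, hidden)
        self.up = nn.Linear(hidden, dim)

    def forward(self, x):                          # x: (B, N_tokens, dim)
        return x + self.up(F.gelu(self.down(x)))   # residual adaptation

def sample_level_contrastive(feats_a, feats_b, temperature=0.1):
    """InfoNCE between two augmented views, pooled to one vector per sample."""
    za = F.normalize(feats_a.mean(dim=1), dim=-1)  # (B, dim)
    zb = F.normalize(feats_b.mean(dim=1), dim=-1)
    logits = za @ zb.t() / temperature             # (B, B) similarity matrix
    targets = torch.arange(za.size(0), device=za.device)
    return 0.5 * (F.cross_entropy(logits, targets) + F.cross_entropy(logits.t(), targets))

if __name__ == "__main__":
    adaptor = Adaptor()
    view1, view2 = torch.randn(4, 196, 768), torch.randn(4, 196, 768)
    print(float(sample_level_contrastive(adaptor(view1), adaptor(view2))))
```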
+
+
+
+
+ + ☆ A Simple Early Exiting Framework for Accelerated Sampling in Diffusion + Models ICML 2024 + + +
+ Diffusion models have shown remarkable performance in generation problems
+over various domains including images, videos, text, and audio. A practical
+bottleneck of diffusion models is their sampling speed, due to the repeated
+evaluation of score estimation networks during inference. In this work, we
+propose a novel framework capable of adaptively allocating the compute
+required for score estimation, thereby reducing the overall sampling time of
+diffusion models. We observe that the amount of computation required for score
+estimation may vary along the time step for which the score is estimated. Based
+on this observation, we propose an early-exiting scheme, where we skip a subset
+of parameters in the score estimation network during inference, based on a
+time-dependent exit schedule. Using diffusion models for image synthesis, we
+show that our method can significantly improve the sampling throughput of
+diffusion models without compromising image quality. Furthermore, we
+demonstrate that our method seamlessly integrates with various types of solvers
+for faster sampling, capitalizing on their compatibility to enhance overall
+efficiency. The source code and our experiments are available at
+\url{https://github.com/taehong-moon/ee-diffusion}
+
+
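A minimal sketch of the early-exiting idea: the score network runs only a prefix of its blocks, with the depth chosen by a time-dependent exit schedule. The schedule and the toy architecture below are assumptions; the paper's network and schedule will differ.

```python
# Time-dependent early exiting in a score network: deeper blocks are skipped at
# timesteps where the schedule allows it. Schedule and architecture are illustrative.
import torch
import torch.nn as nn

class EarlyExitScoreNet(nn.Module):
    def __init__(self, dim=128, num_blocks=8, total_steps=1000):
        super().__init__()
        self.blocks = nn.ModuleList(
            nn.Sequential(nn.Linear(dim, dim), nn.SiLU()) for _ in range(num_blocks)
        )
        self.out = nn.Linear(dim, dim)
        self.total_steps = total_steps

    def exit_depth(self, t):
        # Hypothetical schedule: spend more compute at small t (late, detail-heavy
        # reverse steps) and exit earlier at large t (early, noisy steps).
        frac = 1.0 - t / self.total_steps
        return max(1, int(round(frac * len(self.blocks))))

    def forward(self, x, t):
        depth = self.exit_depth(t)
        for block in self.blocks[:depth]:   # remaining blocks are skipped
            x = block(x)
        return self.out(x)

if __name__ == "__main__":
    net = EarlyExitScoreNet()
    x = torch.randn(4, 128)
    print(net.exit_depth(900), net.exit_depth(50))  # fewer blocks at large t, more at small t
    print(net(x, t=900).shape)
```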
+
+ comment: ICML 2024 +
+
+
+
+
+ + ☆ Image Denoising Using Green Channel Prior + + +
+ Image denoising is an appealing and challenging task, in that noise +statistics of real-world observations may vary with local image contents and +different image channels. Specifically, the green channel usually has twice the +sampling rate in raw data. To handle noise variances and leverage such +channel-wise prior information, we propose a simple and effective green channel +prior-based image denoising (GCP-ID) method, which integrates GCP into the +classic patch-based denoising framework. Briefly, we exploit the green channel +to guide the search for similar patches, which aims to improve the patch +grouping quality and encourage sparsity in the transform domain. The grouped +image patches are then reformulated into RGGB arrays to explicitly characterize +the density of green samples. Furthermore, to enhance the adaptivity of GCP-ID +to various image contents, we cast the noise estimation problem into a +classification task and train an effective estimator based on convolutional +neural networks (CNNs). Experiments on real-world datasets demonstrate the +competitive performance of the proposed GCP-ID method for image and video +denoising applications in both raw and sRGB spaces. Our code is available at +https://github.com/ZhaomingKong/GCP-ID. + +
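The green-channel-guided patch grouping can be sketched as follows: similar patches are searched by comparing green-channel blocks only, and the matched full-color patches are grouped for collaborative denoising. Patch size, search window, and group size below are illustrative assumptions.

```python
# Rough sketch of green-channel-prior patch grouping for denoising.
import numpy as np

def group_patches_gcp(img, ref_yx, patch=8, search=16, k=16):
    """img: (H, W, 3) float array; ref_yx: top-left corner of the reference patch."""
    h, w, _ = img.shape
    g = img[..., 1]                                   # green channel as the search guide
    ry, rx = ref_yx
    ref = g[ry:ry + patch, rx:rx + patch]
    candidates = []
    for y in range(max(0, ry - search), min(h - patch, ry + search) + 1):
        for x in range(max(0, rx - search), min(w - patch, rx + search) + 1):
            d = np.sum((g[y:y + patch, x:x + patch] - ref) ** 2)
            candidates.append((d, y, x))
    candidates.sort(key=lambda c: c[0])               # most similar first
    group = np.stack([img[y:y + patch, x:x + patch] for _, y, x in candidates[:k]])
    return group                                      # (k, patch, patch, 3) patch group

if __name__ == "__main__":
    rng = np.random.default_rng(0)
    noisy = rng.normal(0.5, 0.1, size=(64, 64, 3))
    print(group_patches_gcp(noisy, (20, 20)).shape)   # (16, 8, 8, 3)
```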
+
+ comment: arXiv admin note: text overlap with arXiv:2402.08235 +
+
+
+
+
+ + ☆ PAFormer: Part Aware Transformer for Person Re-identification + + +
+ Within the domain of person re-identification (ReID), partial ReID methods
+are considered mainstream, aiming to measure feature distances through
+comparisons of body parts between samples. However, in practice, previous
+methods often lack sufficient awareness of the anatomical aspects of body
+parts, resulting in the failure to capture features of the same body parts
+across different samples. To address this issue, we introduce \textbf{Part
+Aware Transformer (PAFormer)}, a pose estimation-based ReID model that can
+perform precise part-to-part comparison. In order to inject part awareness
+into pose tokens, we introduce learnable parameters called `pose tokens',
+which estimate the correlation between each body part and partial regions of
+the image. Notably, at the inference phase, PAFormer operates without
+additional modules related to body part localization, which are commonly used
+in previous ReID methodologies leveraging pose estimation models. Additionally,
+leveraging the enhanced awareness of body parts, PAFormer suggests the use of a
+learning-based visibility predictor to estimate the degree of occlusion for
+each body part. Also, we introduce a teacher-forcing technique using ground
+truth visibility scores, which enables PAFormer to be trained only with visible
+parts. Extensive experiments show that our method outperforms existing
+approaches on well-known ReID benchmark datasets.
+
+
+
+ comment: 34 pages, 8 figures +
+
+
+
+
+ + ☆ Deep Multimodal Collaborative Learning for Polyp Re-Identification + + +
+ Colonoscopic Polyp Re-Identification aims to match the same polyp from a
+large gallery with images from different views taken using different cameras
+and plays an important role in the prevention and treatment of colorectal
+cancer in computer-aided diagnosis. However, traditional methods for object
+ReID that directly adopt CNN models trained on the ImageNet dataset usually
+produce unsatisfactory retrieval performance on colonoscopic datasets due to
+the large domain gap. Worse still, these solutions typically learn unimodal
+representations on the basis of visual samples, which fails to explore
+complementary information from different modalities. To address this challenge,
+we propose a novel Deep Multimodal Collaborative Learning framework named DMCL
+for polyp re-identification, which can effectively encourage modality
+collaboration and reinforce generalization capability in medical scenarios. On
+this basis, a dynamic multimodal feature fusion strategy is introduced to
+leverage the optimized multimodal representations for multimodal fusion via
+end-to-end training. Experiments on the standard benchmarks show the benefits
+of the multimodal setting over state-of-the-art unimodal ReID models,
+especially when combined with the specialized multimodal fusion strategy.
+
+
+
+ comment: Work in progress. arXiv admin note: text overlap with + arXiv:2307.10625 +
+
+
+
+
+ + ☆ Weakly Supervised Video Anomaly Detection and Localization with + Spatio-Temporal Prompts + + +
+ The current weakly supervised video anomaly detection (WSVAD) task aims to
+achieve frame-level anomalous event detection with only coarse video-level
+annotations available. Existing works typically involve extracting global
+features from full-resolution video frames and training frame-level classifiers
+to detect anomalies in the temporal dimension. However, most anomalous events
+tend to occur in localized spatial regions rather than the entire video frames,
+which implies that existing frame-level feature-based works may be misled by
+the dominant background information and lack the interpretation of the detected
+anomalies. To address this dilemma, this paper introduces a novel method called
+STPrompt that learns spatio-temporal prompt embeddings for weakly supervised
+video anomaly detection and localization (WSVADL) based on pre-trained
+vision-language models (VLMs). Our proposed method employs a two-stream network
+structure, with one stream focusing on the temporal dimension and the other
+primarily on the spatial dimension. By leveraging the learned knowledge from
+pre-trained VLMs and incorporating natural motion priors from raw videos, our
+model learns prompt embeddings that are aligned with spatio-temporal regions of
+videos (e.g., patches of individual frames) to identify specific local regions
+of anomalies, enabling accurate video anomaly detection while mitigating the
+influence of background information. Without relying on detailed
+spatio-temporal annotations or auxiliary object detection/tracking, our method
+achieves state-of-the-art performance on three public benchmarks for the WSVADL
+task.
+
+
+
+ comment: Accepted by ACMMM2024 +
+
+
+
+
+ + ☆ Classifier Guidance Enhances Diffusion-based Adversarial Purification by + Preserving Predictive Information ECAI 2024 + + +
+ Adversarial purification is one of the promising approaches to defend neural
+networks against adversarial attacks. Recently, methods utilizing diffusion
+probabilistic models have achieved great success for adversarial purification
+in image classification tasks. However, such methods fall into the dilemma of
+balancing the needs for noise removal and information preservation. This paper
+points out that existing adversarial purification methods based on diffusion
+models gradually lose sample information during the core denoising process,
+causing occasional label shift in subsequent classification tasks. As a remedy,
+we suggest suppressing such information loss by introducing guidance from the
+classifier confidence. Specifically, we propose the Classifier-cOnfidence
+gUided Purification (COUP) algorithm, which purifies adversarial examples while
+keeping them away from the classifier decision boundary. Experimental results
+show that COUP can achieve better adversarial robustness under strong attack
+methods.
+
+
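A hedged sketch of classifier-confidence guidance during purification: each denoising step is nudged along the gradient of the classifier's confidence so the purified sample stays away from the decision boundary. The denoiser stub, single noise level, and guidance scale below are placeholders rather than the COUP algorithm's exact schedule.

```python
# Classifier-confidence guided purification, heavily simplified for illustration.
import torch
import torch.nn as nn
import torch.nn.functional as F

def purify(x_adv, denoiser, classifier, steps=10, guidance_scale=0.5, noise_level=0.1):
    x = x_adv + noise_level * torch.randn_like(x_adv)    # forward-diffuse the input
    for _ in range(steps):
        x = x.detach().requires_grad_(True)
        logits = classifier(x)
        confidence = F.softmax(logits, dim=-1).max(dim=-1).values.sum()
        grad = torch.autograd.grad(confidence, x)[0]      # direction of higher confidence
        with torch.no_grad():
            x = denoiser(x) + guidance_scale * grad       # denoise + confidence guidance
    return x.detach()

if __name__ == "__main__":
    denoiser = nn.Sequential(nn.Linear(32, 32))           # stand-in modules
    classifier = nn.Sequential(nn.Linear(32, 10))
    x_adv = torch.randn(4, 32)
    print(purify(x_adv, denoiser, classifier).shape)
```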
+
+ comment: Accepted by ECAI 2024 +
+
+
+
+
+ + ☆ HcNet: Image Modeling with Heat Conduction Equation + + +
+ Foundation models, such as CNNs and ViTs, have powered the development of +image modeling. However, general guidance to model architecture design is still +missing. The design of many modern model architectures, such as residual +structures, multiplicative gating signal, and feed-forward networks, can be +interpreted in terms of the heat conduction equation. This finding inspired us +to model images by the heat conduction equation, where the essential idea is to +conceptualize image features as temperatures and model their information +interaction as the diffusion of thermal energy. We can take advantage of the +rich knowledge in the heat conduction equation to guide us in designing new and +more interpretable models. As an example, we propose Heat Conduction Layer and +Refine Approximation Layer inspired by solving the heat conduction equation +using Finite Difference Method and Fourier series, respectively. This paper +does not aim to present a state-of-the-art model; instead, it seeks to +integrate the overall architectural design of the model into the heat +conduction theory framework. Nevertheless, our Heat Conduction Network (HcNet) +still shows competitive performance. Code available at +\url{https://github.com/ZheminZhang1/HcNet}. + +
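One way to read the heat-conduction layer is as an explicit finite-difference diffusion step on feature maps, with features playing the role of temperature and a learnable per-channel diffusivity. The sketch below is an illustrative reading of that idea, not the HcNet code.

```python
# Toy "heat conduction layer": one explicit finite-difference step of
# u_{t+1} = u_t + alpha * Laplacian(u_t) applied channel-wise to feature maps.
import torch
import torch.nn as nn
import torch.nn.functional as F

class HeatConductionLayer(nn.Module):
    def __init__(self, channels):
        super().__init__()
        lap = torch.tensor([[0., 1., 0.], [1., -4., 1.], [0., 1., 0.]])
        self.register_buffer("kernel", lap.expand(channels, 1, 3, 3).clone())
        self.alpha = nn.Parameter(torch.full((1, channels, 1, 1), 0.1))  # learnable diffusivity

    def forward(self, x):                        # x: (B, C, H, W), features as "temperature"
        lap = F.conv2d(F.pad(x, (1, 1, 1, 1), mode="replicate"),
                       self.kernel, groups=x.size(1))
        return x + self.alpha * lap              # one explicit diffusion step

if __name__ == "__main__":
    layer = HeatConductionLayer(16)
    print(layer(torch.randn(2, 16, 32, 32)).shape)
```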
+
+
+
+
+ + ☆ GlyphPattern: An Abstract Pattern Recognition for Vision-Language Models + + +
+ Vision-Language Models (VLMs) building upon the foundation of powerful large +language models have made rapid progress in reasoning across visual and textual +data. While VLMs perform well on vision tasks that they are trained on, our +results highlight key challenges in abstract pattern recognition. We present +GlyphPattern, a 954 item dataset that pairs 318 human-written descriptions of +visual patterns from 40 writing systems with three visual presentation styles. + GlyphPattern evaluates abstract pattern recognition in VLMs, requiring models +to understand and judge natural language descriptions of visual patterns. +GlyphPattern patterns are drawn from a large-scale cognitive science +investigation of human writing systems; as a result, they are rich in spatial +reference and compositionality. Our experiments show that GlyphPattern is +challenging for state-of-the-art VLMs (GPT-4o achieves only 55% accuracy), with +marginal gains from few-shot prompting. Our detailed error analysis reveals +challenges at multiple levels, including visual processing, natural language +understanding, and pattern generalization. + +
+
+
+
+
+ + ☆ Polyp SAM 2: Advancing Zero shot Polyp Segmentation in Colorectal Cancer + Detection + + +
+ Polyp segmentation plays a crucial role in the early detection and diagnosis
+of colorectal cancer. However, obtaining accurate segmentations often requires
+labor-intensive annotations and specialized models. Recently, Meta AI Research
+released a general Segment Anything Model 2 (SAM 2), which has demonstrated
+promising performance in several segmentation tasks. In this work, we evaluate
+the performance of SAM 2 in segmenting polyps under various prompted settings.
+We hope this report will provide insights to advance the field of polyp
+segmentation and promote more interesting work in the future. This project is
+publicly available at https://github.com/sajjad-sh33/Polyp-SAM-2.
+
+
+
+
+
+
+ + ☆ CMAB: A First National-Scale Multi-Attribute Building Dataset Derived + from Open Source Data and GeoAI + + +
+ Rapidly acquiring three-dimensional (3D) building data, including geometric
+attributes like rooftop, height, and structure, as well as indicative
+attributes like function, quality, and age, is essential for accurate urban
+analysis, simulations, and policy updates. Existing large-scale building
+datasets lack accuracy, extensibility, and indicative attributes. This paper
+presents a geospatial artificial intelligence (GeoAI) framework for large-scale
+building modeling, introducing the first Multi-Attribute Building dataset
+(CMAB) in China at a national scale. The dataset covers 3,667 natural cities
+with a total rooftop area of 21.3 billion square meters, achieving an F1-score
+of 89.93% in rooftop extraction through OCRNet. We trained bootstrap
+aggregated XGBoost models with city administrative classifications,
+incorporating building features such as morphology, location, and function.
+Using multi-source data, including billions of high-resolution Google Earth
+images and 60 million street view images (SVI), we generated rooftop, height,
+function, age, and quality attributes for each building. Accuracy was validated
+through model benchmarks, existing similar products, and manual SVI validation.
+The results support urban planning and sustainable development.
+
+
+
+ comment: 43 pages, 20 figures +
+
+
+
+
+ + ☆ Enhancing 3D Transformer Segmentation Model for Medical Image with + Token-level Representation Learning + + +
+ In the field of medical images, although various works find that the Swin
+Transformer has promising effectiveness on pixel-wise dense prediction, whether
+pre-training these models without using extra datasets can further boost
+performance for downstream semantic segmentation remains unexplored.
+Applications of previous representation learning methods are hindered by the
+limited number of 3D volumes and high computational cost. In addition, most of
+the pretext tasks designed specifically for Transformers are not applicable to
+the hierarchical structure of the Swin Transformer. Thus, this work proposes a
+token-level representation learning loss that maximizes agreement between
+token embeddings from different augmented views individually, instead of
+volume-level global features. Moreover, we identify a potential representation
+collapse exclusively caused by this new loss. To prevent collapse, we invent a
+simple "rotate-and-restore" mechanism, which rotates and flips one augmented
+view of the input volume, and later restores the order of tokens in the feature
+maps. We also modify the contrastive loss to address the discrimination between
+tokens at the same position but from different volumes. We test our
+pre-training scheme on two public medical segmentation datasets, and the
+results on the downstream segmentation task show that our method yields greater
+improvement than other state-of-the-art pre-training methods.
+
+
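A compact sketch of a token-level agreement loss: embeddings of the token at the same index in two augmented views are treated as positives, with all other tokens in the batch as negatives; the "rotate-and-restore" step is emulated by permuting and then un-permuting token order. Temperature and shapes are assumptions.

```python
# Token-level contrastive agreement between two augmented views of the same volume.
import torch
import torch.nn.functional as F

def token_level_contrastive(tokens_a, tokens_b, temperature=0.1):
    # tokens_*: (B, N, D) token embeddings from two views of the same volumes
    b, n, d = tokens_a.shape
    za = F.normalize(tokens_a.reshape(b * n, d), dim=-1)
    zb = F.normalize(tokens_b.reshape(b * n, d), dim=-1)
    logits = za @ zb.t() / temperature           # (B*N, B*N) similarity matrix
    targets = torch.arange(b * n)                # positives: the same token index
    return F.cross_entropy(logits, targets)

if __name__ == "__main__":
    view = torch.randn(2, 64, 96)
    perm = torch.randperm(64)                    # "rotate": shuffle one view's token order
    rotated = view[:, perm]
    restored = rotated[:, torch.argsort(perm)]   # "restore" before computing the loss
    print(float(token_level_contrastive(view, restored)))
```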
+
+
+
+
+ + ☆ Benchmarking tree species classification from proximally-sensed laser + scanning data: introducing the FOR-species20K dataset + + +
+ Proximally-sensed laser scanning offers significant potential for automated +forest data capture, but challenges remain in automatically identifying tree +species without additional ground data. Deep learning (DL) shows promise for +automation, yet progress is slowed by the lack of large, diverse, openly +available labeled datasets of single tree point clouds. This has impacted the +robustness of DL models and the ability to establish best practices for species +classification. + To overcome these challenges, the FOR-species20K benchmark dataset was +created, comprising over 20,000 tree point clouds from 33 species, captured +using terrestrial (TLS), mobile (MLS), and drone laser scanning (ULS) across +various European forests, with some data from other regions. This dataset +enables the benchmarking of DL models for tree species classification, +including both point cloud-based (PointNet++, MinkNet, MLP-Mixer, DGCNNs) and +multi-view image-based methods (SimpleView, DetailView, YOLOv5). + 2D image-based models generally performed better (average OA = 0.77) than 3D +point cloud-based models (average OA = 0.72), with consistent results across +different scanning platforms and sensors. The top model, DetailView, was +particularly robust, handling data imbalances well and generalizing effectively +across tree sizes. + The FOR-species20K dataset, available at https://zenodo.org/records/13255198, +is a key resource for developing and benchmarking DL models for tree species +classification using laser scanning data, providing a foundation for future +advancements in the field. + +
+
+
+
+
+ + ☆ Prompt Recovery for Image Generation Models: A Comparative Study of + Discrete Optimizers + + +
+ Recovering natural language prompts for image generation models, solely based
+on the generated images, is a difficult discrete optimization problem. In this
+work, we present the first head-to-head comparison of recent discrete
+optimization techniques for the problem of prompt inversion. We evaluate Greedy
+Coordinate Gradients (GCG), PEZ, Random Search, AutoDAN and BLIP2's image
+captioner across various evaluation metrics related to the quality of inverted
+prompts and the quality of the images generated by the inverted prompts. We
+find that focusing on the CLIP similarity between the inverted prompts and the
+ground truth image acts as a poor proxy for the similarity between the ground
+truth image and the image generated by the inverted prompts. While the discrete
+optimizers effectively minimize their objectives, simply using responses from a
+well-trained captioner often leads to generated images that more closely
+resemble those produced by the original prompts.
+
+
+
+ comment: 9 Pages, 4 Figures +
+
+
+
+
+ + ☆ What Color Scheme is More Effective in Assisting Readers to Locate + Information in a Color-Coded Article? + + +
+ Color coding, a technique assigning specific colors to cluster information +types, has proven advantages in aiding human cognitive activities, especially +reading and comprehension. The rise of Large Language Models (LLMs) has +streamlined document coding, enabling simple automatic text labeling with +various schemes. This has the potential to make color-coding more accessible +and benefit more users. However, the impact of color choice on information +seeking is understudied. We conducted a user study assessing various color +schemes' effectiveness in LLM-coded text documents, standardizing contrast +ratios to approximately 5.55:1 across schemes. Participants performed timed +information-seeking tasks in color-coded scholarly abstracts. Results showed +non-analogous and yellow-inclusive color schemes improved performance, with the +latter also being more preferred by participants. These findings can inform +better color scheme choices for text annotation. As LLMs advance document +coding, we advocate for more research focusing on the "color" aspect of +color-coding techniques. + +
+
+
+
+
+ + ☆ Generalization Enhancement Strategies to Enable Cross-year Cropland + Mapping with Convolutional Neural Networks Trained Using Historical Samples + + +
+ The accuracy of mapping agricultural fields across large areas is steadily
+improving with high-resolution satellite imagery and deep learning (DL) models,
+even in regions where fields are small and geometrically irregular. However,
+developing effective DL models often requires large, expensive label datasets,
+typically available only for specific years or locations. This limits the
+ability to create annual maps essential for agricultural monitoring, as domain
+shifts occur between years and regions due to changes in farming practices and
+environmental conditions. The challenge is to design a model flexible enough to
+account for these shifts without needing yearly labels. While domain adaptation
+techniques or semi-supervised training are common solutions, we explored
+enhancing the model's generalization power. Our results indicate that a
+holistic approach is essential, combining methods to improve generalization.
+Specifically, using an area-based loss function, such as the Tversky-focal loss
+(TFL), significantly improved predictions across multiple years. Different
+augmentation techniques helped to encode different types of invariance; in
+particular, photometric augmentations encoded invariance to brightness changes,
+though they increased false positives. The combination of photometric
+augmentation, TFL loss, and MC-dropout produced the best results, although
+dropout alone led to more false negatives in subsequent year predictions.
+Additionally, the choice of input normalization had a significant impact, with
+the best results obtained when statistics were calculated either locally or
+across the entire dataset over all bands (lab and gab). We developed a workflow
+that enabled a U-Net model to generate effective multi-year crop maps over
+large areas. Our code, available at:
+https://github.com/agroimpacts/cnn-generalization-enhancement, will be
+regularly updated with improvements.
+
+
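A plausible form of the Tversky-focal loss (TFL) mentioned above is sketched below for binary cropland masks: the Tversky index weights false negatives and false positives asymmetrically, and a focal exponent emphasizes hard examples. The alpha, beta, and gamma values are assumptions, not the paper's settings.

```python
# Illustrative Tversky-focal loss for binary segmentation.
import torch

def tversky_focal_loss(probs, targets, alpha=0.7, beta=0.3, gamma=0.75, eps=1e-7):
    """probs, targets: (B, H, W) tensors with values in [0, 1]."""
    p = probs.reshape(probs.size(0), -1)
    t = targets.reshape(targets.size(0), -1)
    tp = (p * t).sum(dim=1)
    fn = ((1 - p) * t).sum(dim=1)
    fp = (p * (1 - t)).sum(dim=1)
    tversky = (tp + eps) / (tp + alpha * fn + beta * fp + eps)  # asymmetric FN/FP weighting
    return ((1.0 - tversky) ** gamma).mean()                    # focal modulation

if __name__ == "__main__":
    probs = torch.rand(2, 64, 64)
    targets = (torch.rand(2, 64, 64) > 0.5).float()
    print(float(tversky_focal_loss(probs, targets)))
```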
+
+
+
+
+ + ☆ InfLocNet: Enhanced Lung Infection Localization and Disease Detection + from Chest X-Ray Images Using Lightweight Deep Learning + + +
+ In recent years, the integration of deep learning techniques into medical +imaging has revolutionized the diagnosis and treatment of lung diseases, +particularly in the context of COVID-19 and pneumonia. This paper presents a +novel, lightweight deep learning based segmentation-classification network +designed to enhance the detection and localization of lung infections using +chest X-ray images. By leveraging the power of transfer learning with +pre-trained VGG-16 weights, our model achieves robust performance even with +limited training data. The architecture incorporates refined skip connections +within the UNet++ framework, reducing semantic gaps and improving precision in +segmentation tasks. Additionally, a classification module is integrated at the +end of the encoder block, enabling simultaneous classification and +segmentation. This dual functionality enhances the model's versatility, +providing comprehensive diagnostic insights while optimizing computational +efficiency. Experimental results demonstrate that our proposed lightweight +network outperforms existing methods in terms of accuracy and computational +requirements, making it a viable solution for real-time and resource +constrained medical imaging applications. Furthermore, the streamlined design +facilitates easier hyperparameter tuning and deployment on edge devices. This +work underscores the potential of advanced deep learning architectures in +improving clinical outcomes through precise and efficient medical image +analysis. Our model achieved remarkable results with an Intersection over Union +(IoU) of 93.59% and a Dice Similarity Coefficient (DSC) of 97.61% in lung area +segmentation, and an IoU of 97.67% and a DSC of 87.61% for infection region +localization. Additionally, it demonstrated high accuracy of 93.86% and +sensitivity of 89.55% in detecting chest diseases, highlighting its efficacy +and reliability. + +
+
+
+
+
+ + ☆ Advanced Vision Transformers and Open-Set Learning for Robust Mosquito + Classification: A Novel Approach to Entomological Studies + + +
+ Mosquito-related diseases pose a significant threat to global public health, +necessitating efficient and accurate mosquito classification for effective +surveillance and control. This work presents an innovative approach to mosquito +classification by leveraging state-of-the-art vision transformers and open-set +learning techniques. A novel framework has been introduced that integrates +Transformer-based deep learning models with comprehensive data augmentation and +preprocessing methods, enabling robust and precise identification of ten +mosquito species. The Swin Transformer model achieves the best performance for +traditional closed-set learning with 99.80\% accuracy and 0.998 F1 score. The +lightweight MobileViT technique attains an almost similar accuracy of 98.90\% +with significantly reduced parameters and model complexities. Next, the applied +deep learning models' adaptability and generalizability in a static environment +have been enhanced by using new classes of data samples during the inference +stage that have not been included in the training set. The proposed framework's +ability to handle unseen classes like insects similar to mosquitoes, even +humans, through open-set learning further enhances its practical applicability +employing the OpenMax technique and Weibull distribution. The traditional CNN +model, Xception, outperforms the latest transformer with higher accuracy and F1 +score for open-set learning. The study's findings highlight the transformative +potential of advanced deep-learning architectures in entomology, providing a +strong groundwork for future research and development in mosquito surveillance +and vector control. The implications of this work extend beyond mosquito +classification, offering valuable insights for broader ecological and +environmental monitoring applications. + +
+
+ comment: 23 pages, 15 figures +
+
+
+
+
+ + ☆ S-SAM: SVD-based Fine-Tuning of Segment Anything Model for Medical Image + Segmentation MICCAI 2024 + + +
+ Medical image segmentation has been traditionally approached by training or
+fine-tuning the entire model to cater to any new modality or dataset. However,
+this approach often requires tuning a large number of parameters during
+training. With the introduction of the Segment Anything Model (SAM) for
+prompted segmentation of natural images, many efforts have been made towards
+adapting it efficiently for medical imaging, thus reducing the training time
+and resources. However, these methods still require expert annotations for
+every image in the form of point prompts or bounding box prompts during
+training and inference, making it tedious to employ them in practice. In this
+paper, we propose an adaptation technique, called S-SAM, that trains only
+parameters equal to 0.4% of SAM's parameters and at the same time simply uses
+the label names as prompts for producing precise masks. This not only makes
+tuning SAM more efficient than existing adaptation methods but also removes the
+burden of providing expert prompts. We call this modified version S-SAM and
+evaluate it on five different modalities including endoscopic images, x-ray,
+ultrasound, CT, and histology images. Our experiments show that S-SAM
+outperforms state-of-the-art methods as well as existing SAM adaptation methods
+while tuning significantly fewer parameters. We release the code for S-SAM at
+https://github.com/JayParanjape/SVDSAM.
+
+
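The parameter-efficiency argument can be illustrated with an SVD-style tuning sketch: a frozen linear weight is factorized as U diag(s) V^T and only the singular values are trained. Whether S-SAM tunes exactly this parameter set is an assumption here; the sketch only shows the order-of-magnitude reduction in trainable parameters.

```python
# SVD-based tuning of a frozen linear layer: only the singular values are trainable.
import torch
import torch.nn as nn
import torch.nn.functional as F

class SVDTunedLinear(nn.Module):
    def __init__(self, linear: nn.Linear):
        super().__init__()
        u, s, vh = torch.linalg.svd(linear.weight.data, full_matrices=False)
        self.register_buffer("u", u)              # frozen factors
        self.register_buffer("vh", vh)
        self.s = nn.Parameter(s.clone())          # trainable singular values
        self.bias = nn.Parameter(linear.bias.data.clone(), requires_grad=False)

    def forward(self, x):
        weight = self.u @ torch.diag(self.s) @ self.vh
        return F.linear(x, weight, self.bias)

if __name__ == "__main__":
    base = nn.Linear(768, 768)
    tuned = SVDTunedLinear(base)
    trainable = sum(p.numel() for p in tuned.parameters() if p.requires_grad)
    total = sum(p.numel() for p in base.parameters())
    print(trainable, total, f"{100 * trainable / total:.2f}% trainable")
```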
+
+ comment: Accepted in MICCAI 2024 +
+
+
+
+
+ + ☆ HAT: History-Augmented Anchor Transformer for Online Temporal Action + Localization ECCV 2024 + + +
+ Online video understanding often relies on individual frames, leading to +frame-by-frame predictions. Recent advancements such as Online Temporal Action +Localization (OnTAL), extend this approach to instance-level predictions. +However, existing methods mainly focus on short-term context, neglecting +historical information. To address this, we introduce the History-Augmented +Anchor Transformer (HAT) Framework for OnTAL. By integrating historical +context, our framework enhances the synergy between long-term and short-term +information, improving the quality of anchor features crucial for +classification and localization. We evaluate our model on both procedural +egocentric (PREGO) datasets (EGTEA and EPIC) and standard non-PREGO OnTAL +datasets (THUMOS and MUSES). Results show that our model outperforms +state-of-the-art approaches significantly on PREGO datasets and achieves +comparable or slightly superior performance on non-PREGO datasets, underscoring +the importance of leveraging long-term history, especially in procedural and +egocentric action scenarios. Code is available at: +https://github.com/sakibreza/ECCV24-HAT/ + +
+
+ comment: Accepted to ECCV 2024 +
+
+
+
+
+ + ☆ Wavelet based inpainting detection + + +
+ With the advancement in image editing tools, manipulating digital images has
+become alarmingly easy. Inpainting, which is used to remove objects or fill in
+parts of an image, serves as a powerful tool for both image restoration and
+forgery. This paper introduces a novel approach for detecting image inpainting
+forgeries by combining the dual-tree complex wavelet transform (DT-CWT) with
+hierarchical feature segmentation and noise inconsistency analysis. The DT-CWT
+offers several advantages for this task, including inherent shift-invariance,
+which makes it robust to minor manipulations during the inpainting process, and
+directional selectivity, which helps capture subtle artifacts introduced by
+inpainting in specific frequency bands and orientations. By first applying
+color image segmentation and then analyzing the noise inconsistency obtained
+via the DT-CWT for each segment, we can identify patterns indicative of
+inpainting forgeries. The proposed method is evaluated on a benchmark dataset
+created for this purpose and is compared with existing forgery detection
+techniques. Our approach demonstrates superior results compared with the state
+of the art in detecting inpainted images.
+
+
+
+
+
+
+ + ♻ ☆ OpenIns3D: Snap and Lookup for 3D Open-vocabulary Instance Segmentation ECCV 2024 + + +
+ In this work, we introduce OpenIns3D, a new 3D-input-only framework for 3D +open-vocabulary scene understanding. The OpenIns3D framework employs a +"Mask-Snap-Lookup" scheme. The "Mask" module learns class-agnostic mask +proposals in 3D point clouds, the "Snap" module generates synthetic scene-level +images at multiple scales and leverages 2D vision-language models to extract +interesting objects, and the "Lookup" module searches through the outcomes of +"Snap" to assign category names to the proposed masks. This approach, yet +simple, achieves state-of-the-art performance across a wide range of 3D +open-vocabulary tasks, including recognition, object detection, and instance +segmentation, on both indoor and outdoor datasets. Moreover, OpenIns3D +facilitates effortless switching between different 2D detectors without +requiring retraining. When integrated with powerful 2D open-world models, it +achieves excellent results in scene understanding tasks. Furthermore, when +combined with LLM-powered 2D models, OpenIns3D exhibits an impressive +capability to comprehend and process highly complex text queries that demand +intricate reasoning and real-world knowledge. Project page: +https://zheninghuang.github.io/OpenIns3D/ + +
+
+ comment: ECCV 2024. Project page: https://zheninghuang.github.io/OpenIns3D/ +
+
+
+
+
+ + ♻ ☆ Toward a Surgeon-in-the-Loop Ophthalmic Robotic Apprentice using + Reinforcement and Imitation Learning IROS'24 + + +
+ Robot-assisted surgical systems have demonstrated significant potential in +enhancing surgical precision and minimizing human errors. However, existing +systems cannot accommodate individual surgeons' unique preferences and +requirements. Additionally, they primarily focus on general surgeries (e.g., +laparoscopy) and are unsuitable for highly precise microsurgeries, such as +ophthalmic procedures. Thus, we propose an image-guided approach for +surgeon-centered autonomous agents that can adapt to the individual surgeon's +skill level and preferred surgical techniques during ophthalmic cataract +surgery. Our approach trains reinforcement and imitation learning agents +simultaneously using curriculum learning approaches guided by image data to +perform all tasks of the incision phase of cataract surgery. By integrating the +surgeon's actions and preferences into the training process, our approach +enables the robot to implicitly learn and adapt to the individual surgeon's +unique techniques through surgeon-in-the-loop demonstrations. This results in a +more intuitive and personalized surgical experience for the surgeon while +ensuring consistent performance for the autonomous robotic apprentice. We +define and evaluate the effectiveness of our approach in a simulated +environment using our proposed metrics and highlight the trade-off between a +generic agent and a surgeon-centered adapted agent. Finally, our approach has +the potential to extend to other ophthalmic and microsurgical procedures, +opening the door to a new generation of surgeon-in-the-loop autonomous surgical +robots. We provide an open-source simulation framework for future development +and reproducibility at +https://github.com/amrgomaaelhady/CataractAdaptSurgRobot. + +
+
+ comment: Accepted at IROS'24 +
+
+
+
+
+ + ♻ ☆ CT evaluation of 2D and 3D holistic deep learning methods for the + volumetric segmentation of airway lesions + + +
+ This research embarked on a comparative exploration of the holistic +segmentation capabilities of Convolutional Neural Networks (CNNs) in both 2D +and 3D formats, focusing on cystic fibrosis (CF) lesions. The study utilized +data from two CF reference centers, covering five major CF structural changes. +Initially, it compared the 2D and 3D models, highlighting the 3D model's +superior capability in capturing complex features like mucus plugs and +consolidations. To improve the 2D model's performance, a loss adapted to fine +structures segmentation was implemented and evaluated, significantly enhancing +its accuracy, though not surpassing the 3D model's performance. The models +underwent further validation through external evaluation against pulmonary +function tests (PFTs), confirming the robustness of the findings. Moreover, +this study went beyond comparing metrics; it also included comprehensive +assessments of the models' interpretability and reliability, providing valuable +insights for their clinical application. + +
+
+ comment: 6 pages, 3 figures, 2 tables, IEEE International Symposium on + Biomedical Imaging (ISBI) 2024 +
+
+
+
+
+ + ♻ ☆ ControlNet-XS: Rethinking the Control of Text-to-Image Diffusion Models + as Feedback-Control Systems + + +
+ The field of image synthesis has made tremendous strides forward in recent
+years. Besides defining the desired output image with text prompts, an
+intuitive approach is to additionally use spatial guidance in the form of an
+image, such as a depth map. In state-of-the-art approaches, this guidance is
+realized by a separate controlling model that controls a pre-trained image
+generation network, such as a latent diffusion model. Understanding this
+process from a control system perspective shows that it forms a
+feedback-control system, where the control module receives a feedback signal
+from the generation process and sends a corrective signal back. When analysing
+existing systems, we observe that the feedback signals are temporally sparse
+and have a small number of bits. As a consequence, there can be long delays
+between newly generated features and the respective corrective signals for
+these features. It is known that this delay is the most unwanted aspect of any
+control system. In this work, we take an existing controlling network
+(ControlNet) and change the communication between the controlling network and
+the generation process to be of high frequency and with large bandwidth. By
+doing so, we are able to considerably improve the quality of the generated
+images, as well as the fidelity of the control. Also, the controlling network
+needs noticeably fewer parameters and hence is about twice as fast during
+inference and training time. Another benefit of small-sized models is that they
+help to democratise our field and are likely easier to understand. We call our
+proposed network ControlNet-XS. When comparing with the state-of-the-art
+approaches, we outperform them for pixel-level guidance, such as depth,
+canny-edges, and semantic segmentation, and are on a par for loose
+keypoint-guidance of human poses. All code and pre-trained models will be made
+publicly available.
+
+
+
+
+
+
+ + ♻ ☆ Discover-then-Name: Task-Agnostic Concept Bottlenecks via Automated + Concept Discovery ECCV + + +
+ Concept Bottleneck Models (CBMs) have recently been proposed to address the +'black-box' problem of deep neural networks, by first mapping images to a +human-understandable concept space and then linearly combining concepts for +classification. Such models typically require first coming up with a set of +concepts relevant to the task and then aligning the representations of a +feature extractor to map to these concepts. However, even with powerful +foundational feature extractors like CLIP, there are no guarantees that the +specified concepts are detectable. In this work, we leverage recent advances in +mechanistic interpretability and propose a novel CBM approach -- called +Discover-then-Name-CBM (DN-CBM) -- that inverts the typical paradigm: instead +of pre-selecting concepts based on the downstream classification task, we use +sparse autoencoders to first discover concepts learnt by the model, and then +name them and train linear probes for classification. Our concept extraction +strategy is efficient, since it is agnostic to the downstream task, and uses +concepts already known to the model. We perform a comprehensive evaluation +across multiple datasets and CLIP architectures and show that our method yields +semantically meaningful concepts, assigns appropriate names to them that make +them easy to interpret, and yields performant and interpretable CBMs. Code +available at https://github.com/neuroexplicit-saar/discover-then-name. + +
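A minimal sketch of the discover-then-name pipeline: a sparse autoencoder is fit on frozen feature vectors to discover concept directions, and a linear probe is then trained on the concept activations. Sizes, the L1 sparsity penalty, and the absence of a full training loop are simplifications, not the DN-CBM recipe.

```python
# Concept discovery via a sparse autoencoder, followed by a linear probe.
import torch
import torch.nn as nn
import torch.nn.functional as F

class SparseAutoencoder(nn.Module):
    def __init__(self, feat_dim=512, n_concepts=2048):
        super().__init__()
        self.encoder = nn.Linear(feat_dim, n_concepts)
        self.decoder = nn.Linear(n_concepts, feat_dim)

    def forward(self, feats):
        codes = F.relu(self.encoder(feats))     # non-negative concept activations
        recon = self.decoder(codes)
        return codes, recon

def sae_loss(feats, codes, recon, l1_weight=1e-3):
    # Reconstruction error plus an L1 sparsity penalty on the concept codes.
    return F.mse_loss(recon, feats) + l1_weight * codes.abs().mean()

if __name__ == "__main__":
    feats = torch.randn(256, 512)               # e.g. frozen CLIP image features
    labels = torch.randint(0, 10, (256,))
    sae = SparseAutoencoder()
    codes, recon = sae(feats)
    print(float(sae_loss(feats, codes, recon)))
    probe = nn.Linear(2048, 10)                 # linear classifier over discovered concepts
    print(float(F.cross_entropy(probe(codes.detach()), labels)))
```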
+
+ comment: 40 pages, 21 figures, 6 tables, European Conference on Computer + Vision (ECCV) 2024 +
+
+
+
+
+ + ♻ ☆ A secure and private ensemble matcher using multi-vault obfuscated + templates + + +
+ Generative AI has revolutionized modern machine learning by providing
+unprecedented realism, diversity, and efficiency in data generation. This
+technology holds immense potential for biometrics, including for securing
+sensitive and personally identifiable information. Given the irrevocability of
+biometric samples and mounting privacy concerns, biometric template security
+and secure matching are among the most sought-after features of modern
+biometric systems. This paper proposes a novel obfuscation method using
+Generative AI to enhance biometric template security. Our approach utilizes
+synthetic facial images generated by a Generative Adversarial Network (GAN) as
+"random chaff points" within a secure vault system. Our method creates n
+sub-templates from the original template, each obfuscated with m GAN chaff
+points. During verification, the s closest vectors to the biometric query are
+retrieved from each vault and combined to generate hash values, which are then
+compared with the stored hash value. Thus, our method safeguards user
+identities during the training and deployment phases by employing the
+GAN-generated synthetic images. Our protocol was tested using the AT&T, GT, and
+LFW face datasets, achieving ROC areas under the curve of 0.99, 0.99, and 0.90,
+respectively. Our results demonstrate that the proposed method can maintain
+high accuracy and reasonable computational complexity comparable to that of
+unprotected template methods while significantly enhancing security and
+privacy, underscoring the potential of Generative AI in developing proactive
+defensive strategies for biometric systems.
+
+
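A toy enactment of the vault scheme: each sub-template is hidden among chaff vectors (plain random vectors stand in for GAN-generated face embeddings), and at verification the closest vault entries to the query are hashed and compared with the enrolled hash. The values of n, m, s and the hashing details are assumptions.

```python
# Multi-vault obfuscated template matching, reduced to a runnable toy.
import hashlib
import numpy as np

def make_vaults(template, n=4, m=50, rng=None):
    rng = rng or np.random.default_rng(0)
    subs = np.array_split(template, n)                     # n sub-templates
    vaults, enrolled_hash = [], hashlib.sha256()
    for sub in subs:
        chaff = rng.normal(size=(m, sub.size))             # stand-in for GAN chaff points
        vault = np.vstack([sub[None, :], chaff])
        vaults.append(vault[rng.permutation(len(vault))])  # hide the genuine vector
        enrolled_hash.update(np.round(sub, 3).tobytes())
    return vaults, enrolled_hash.hexdigest()

def verify(query, vaults, enrolled_hash, s=1):
    h = hashlib.sha256()
    subs = np.array_split(query, len(vaults))
    for sub, vault in zip(subs, vaults):
        dists = np.linalg.norm(vault - sub[None, :], axis=1)
        closest = vault[np.argsort(dists)[:s]]             # s closest vault entries
        h.update(np.round(closest[0], 3).tobytes())
    return h.hexdigest() == enrolled_hash

if __name__ == "__main__":
    rng = np.random.default_rng(1)
    template = rng.normal(size=128)
    vaults, enrolled = make_vaults(template, rng=rng)
    print(verify(template + rng.normal(scale=1e-4, size=128), vaults, enrolled))  # True
    print(verify(rng.normal(size=128), vaults, enrolled))   # False (almost surely)
```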
+
+ comment: This paper has been accepted in IJCB 2024 Special Session, Generative + AI for Futuristic Biometrics +
+
+
+
+
+ + ♻ ☆ LPGen: Enhancing High-Fidelity Landscape Painting Generation through + Diffusion Model + + +
+ Generating landscape paintings expands the possibilities of artistic
+creativity and imagination. Traditional landscape painting methods involve
+using ink or colored ink on rice paper, which requires substantial time and
+effort. These methods are susceptible to errors and inconsistencies and lack
+precise control over lines and colors. This paper presents LPGen, a
+high-fidelity, controllable model for landscape painting generation,
+introducing a novel multi-modal framework that integrates image prompts into
+the diffusion model. We extract the edges and contours of the target landscape
+image by computing Canny edges. These, along with natural language text prompts
+and drawing style references, are fed into the latent diffusion model as
+conditions. We implement a decoupled cross-attention strategy to ensure
+compatibility between image and text prompts, facilitating multi-modal image
+generation. A decoder generates the final image. Quantitative and qualitative
+analyses demonstrate that our method outperforms existing approaches in
+landscape painting generation and exceeds the current state of the art. The
+LPGen network effectively controls the composition and color of landscape
+paintings, generates more accurate images, and supports further research in
+deep learning-based landscape painting generation.
+
+
+
+
+
+
+ + ♻ ☆ MC-GPT: Empowering Vision-and-Language Navigation with Memory Map and + Reasoning Chains + + +
+ In the Vision-and-Language Navigation (VLN) task, the agent is required to +navigate to a destination following a natural language instruction. While +learning-based approaches have been a major solution to the task, they suffer +from high training costs and lack of interpretability. Recently, Large Language +Models (LLMs) have emerged as a promising tool for VLN due to their strong +generalization capabilities. However, existing LLM-based methods face +limitations in memory construction and diversity of navigation strategies. To +address these challenges, we propose a suite of techniques. Firstly, we +introduce a method to maintain a topological map that stores navigation +history, retaining information about viewpoints, objects, and their spatial +relationships. This map also serves as a global action space. Additionally, we +present a Navigation Chain of Thoughts module, leveraging human navigation +examples to enrich navigation strategy diversity. Finally, we establish a +pipeline that integrates navigational memory and strategies with perception and +action prediction modules. Experimental results on the REVERIE and R2R datasets +show that our method effectively enhances the navigation ability of the LLM and +improves the interpretability of navigation reasoning. + +
+
+
+
+
+ + ♻ ☆ ViscoNet: Bridging and Harmonizing Visual and Textual Conditioning for + ControlNet + + +
+ This paper introduces ViscoNet, a novel one-branch-adapter architecture for
+concurrent spatial and visual conditioning. Our lightweight model requires a
+number of trainable parameters and a dataset size multiple orders of magnitude
+smaller than those of the current state-of-the-art IP-Adapter. However, our
+method successfully preserves the generative power of the frozen text-to-image
+(T2I) backbone. Notably, it excels in addressing mode collapse, a pervasive
+issue previously overlooked. Our novel architecture demonstrates outstanding
+capabilities in achieving a harmonious visual-text balance, unlocking
+unparalleled versatility in various human image generation tasks, including
+pose re-targeting, virtual try-on, stylization, person re-identification, and
+textile transfer. Demo and code are available from the project page
+https://soon-yau.github.io/visconet/ .
+
+
+
+
+
+
+ + ♻ ☆ MLAAN: Scaling Supervised Local Learning with Multilaminar Leap + Augmented Auxiliary Network + + +
+ Deep neural networks (DNNs) typically employ an end-to-end (E2E) training +paradigm which presents several challenges, including high GPU memory +consumption, inefficiency, and difficulties in model parallelization during +training. Recent research has sought to address these issues, with one +promising approach being local learning. This method involves partitioning the +backbone network into gradient-isolated modules and manually designing +auxiliary networks to train these local modules. Existing methods often neglect +the interaction of information between local modules, leading to myopic issues +and a performance gap compared to E2E training. To address these limitations, +we propose the Multilaminar Leap Augmented Auxiliary Network (MLAAN). +Specifically, MLAAN comprises Multilaminar Local Modules (MLM) and Leap +Augmented Modules (LAM). MLM captures both local and global features through +independent and cascaded auxiliary networks, alleviating performance issues +caused by insufficient global features. However, overly simplistic auxiliary +networks can impede MLM's ability to capture global information. To address +this, we further design LAM, an enhanced auxiliary network that uses the +Exponential Moving Average (EMA) method to facilitate information exchange +between local modules, thereby mitigating the shortsightedness resulting from +inadequate interaction. The synergy between MLM and LAM has demonstrated +excellent performance. Our experiments on the CIFAR-10, STL-10, SVHN, and +ImageNet datasets show that MLAAN can be seamlessly integrated into existing +local learning frameworks, significantly enhancing their performance and even +surpassing end-to-end (E2E) training methods, while also reducing GPU memory +consumption. + +
+
+
+
+
+ + ♻ ☆ Momentum Auxiliary Network for Supervised Local Learning ECCV2024 + + +
+ Deep neural networks conventionally employ end-to-end backpropagation for
+their training process, which lacks biological credibility and triggers a
+locking dilemma during network parameter updates, leading to significant GPU
+memory use. Supervised local learning instead segments the network into
+multiple local blocks updated by independent auxiliary networks. However, these
+methods cannot replace end-to-end training due to lower accuracy, as gradients
+only propagate within their local block, creating a lack of information
+exchange between blocks. To address this issue and establish information
+transfer across blocks, we propose a Momentum Auxiliary Network (MAN) that
+establishes a dynamic interaction mechanism. The MAN leverages an exponential
+moving average (EMA) of the parameters from adjacent local blocks to enhance
+information flow. This auxiliary network, updated through EMA, helps bridge the
+informational gap between blocks. Nevertheless, we observe that directly
+applying EMA parameters has certain limitations due to feature discrepancies
+among local blocks. To overcome this, we introduce learnable biases, further
+boosting performance. We have validated our method on four image classification
+datasets (CIFAR-10, STL-10, SVHN, ImageNet), attaining superior performance and
+substantial memory savings. Notably, our method can reduce GPU memory usage by
+more than 45\% on the ImageNet dataset compared to end-to-end training, while
+achieving higher performance. The Momentum Auxiliary Network thus offers a new
+perspective for supervised local learning. Our code is available at:
+https://github.com/JunhaoSu0/MAN.
+
+
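A schematic of the momentum auxiliary idea: the auxiliary head attached to local block k keeps an EMA copy of the parameters of block k+1, so gradient-isolated blocks still receive information from their successor, and a small learnable bias compensates for feature discrepancies. The architecture and momentum value are illustrative assumptions.

```python
# Momentum auxiliary head: EMA copy of the next local block plus a learnable bias.
import copy
import torch
import torch.nn as nn

class MomentumAuxiliary(nn.Module):
    def __init__(self, next_block: nn.Module, dim, num_classes, momentum=0.999):
        super().__init__()
        self.ema_block = copy.deepcopy(next_block)
        for p in self.ema_block.parameters():
            p.requires_grad_(False)                  # updated only via EMA, not SGD
        self.bias = nn.Parameter(torch.zeros(dim))   # learnable correction term
        self.head = nn.Linear(dim, num_classes)
        self.momentum = momentum

    @torch.no_grad()
    def update_from(self, next_block: nn.Module):
        # Pull the successor block's parameters into the EMA copy.
        for ema_p, p in zip(self.ema_block.parameters(), next_block.parameters()):
            ema_p.mul_(self.momentum).add_(p, alpha=1 - self.momentum)

    def forward(self, local_features):
        x = self.ema_block(local_features) + self.bias
        return self.head(x)                          # local supervision signal

if __name__ == "__main__":
    block_k1 = nn.Sequential(nn.Linear(128, 128), nn.ReLU())
    aux = MomentumAuxiliary(block_k1, dim=128, num_classes=10)
    logits = aux(torch.randn(8, 128))
    aux.update_from(block_k1)
    print(logits.shape)
```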
+
+ comment: Accepted by ECCV2024(Oral) +
+
+
+
+
+ + ♻ ☆ MIMONet: Multi-Input Multi-Output On-Device Deep Learning ICRA 2025 + + +
+ Future intelligent robots are expected to process multiple inputs +simultaneously (such as image and audio data) and generate multiple outputs +accordingly (such as gender and emotion), similar to humans. Recent research +has shown that multi-input single-output (MISO) deep neural networks (DNN) +outperform traditional single-input single-output (SISO) models, representing a +significant step towards this goal. In this paper, we propose MIMONet, a novel +on-device multi-input multi-output (MIMO) DNN framework that achieves high +accuracy and on-device efficiency in terms of critical performance metrics such +as latency, energy, and memory usage. Leveraging existing SISO model +compression techniques, MIMONet develops a new deep-compression method that is +specifically tailored to MIMO models. This new method explores unique yet +non-trivial properties of the MIMO model, resulting in boosted accuracy and +on-device efficiency. Extensive experiments on three embedded platforms +commonly used in robotic systems, as well as a case study using the TurtleBot3 +robot, demonstrate that MIMONet achieves higher accuracy and superior on-device +efficiency compared to state-of-the-art SISO and MISO models, as well as a +baseline MIMO model we constructed. Our evaluation highlights the real-world +applicability of MIMONet and its potential to significantly enhance the +performance of intelligent robotic systems. + +
+
+ comment: Submitted to ICRA 2025 +
+
+
+
+
+ + ♻ ☆ Towards Highly Realistic Artistic Style Transfer via Stable Diffusion + with Step-aware and Layer-aware Prompt IJCAI2024 + + +
+ Artistic style transfer aims to transfer the learned artistic style onto an
+arbitrary content image, generating artistic stylized images. Existing
+generative adversarial network-based methods fail to generate highly realistic
+stylized images and always introduce obvious artifacts and disharmonious
+patterns. Recently, large-scale pre-trained diffusion models opened up a new
+way for generating highly realistic artistic stylized images. However,
+diffusion model-based methods generally fail to preserve the content structure
+of input content images well, introducing some undesired content structure and
+style patterns. To address the above problems, we propose a novel pre-trained
+diffusion-based artistic style transfer method, called LSAST, which can
+generate highly realistic artistic stylized images while preserving the content
+structure of input content images well, without bringing obvious artifacts and
+disharmonious style patterns. Specifically, we introduce a Step-aware and
+Layer-aware Prompt Space, a set of learnable prompts, which can learn the style
+information from the collection of artworks and dynamically adjust the input
+images' content structure and style pattern. To train our prompt space, we
+propose a novel inversion method, called Step-aware and Layer-aware Prompt
+Inversion, which allows the prompt space to learn the style information of the
+artwork collection. In addition, we inject a pre-trained conditional branch of
+ControlNet into our LSAST, which further improves our framework's ability to
+maintain content structure. Extensive experiments demonstrate that our proposed
+method can generate more highly realistic artistic stylized images than the
+state-of-the-art artistic style transfer methods.
+
+
+
+ comment: Accepted by IJCAI2024 +
+
+
+
+
+ + ♻ ☆ Learning Invariant Causal Mechanism from Vision-Language Models + + +
+ Large-scale pre-trained vision-language models such as CLIP have been widely +applied to a variety of downstream scenarios. In real-world applications, the +CLIP model is often utilized in more diverse scenarios than those encountered +during its training, a challenge known as the out-of-distribution (OOD) +problem. However, our experiments reveal that CLIP performs unsatisfactorily in +certain domains. Through a causal analysis, we find that CLIP's current +prediction process cannot guarantee a low OOD risk. The lowest OOD risk can be +achieved when the prediction process is based on invariant causal mechanisms, +i.e., predicting solely based on invariant latent factors. However, theoretical +analysis indicates that CLIP does not identify these invariant latent factors. +Therefore, we propose the Invariant Causal Mechanism for CLIP (CLIP-ICM), a +framework that first identifies invariant latent factors using interventional +data and then performs invariant predictions across various domains. Our method +is simple yet effective, without significant computational overhead. +Experimental results demonstrate that CLIP-ICM significantly improves CLIP's +performance in OOD scenarios. + +
+
+
+
+
+ + ♻ ☆ Helios: An extremely low power event-based gesture recognition for + always-on smart eyewear ECCV + + +
+ This paper introduces Helios, the first extremely low-power, real-time,
+event-based hand gesture recognition system designed for all-day use on smart
+eyewear. As augmented reality (AR) evolves, current smart glasses like the Meta
+Ray-Bans prioritize visual and wearable comfort at the expense of
+functionality. Existing human-machine interfaces (HMIs) in these devices, such
+as capacitive touch and voice controls, present limitations in ergonomics,
+privacy and power consumption. Helios addresses these challenges by leveraging
+natural hand interactions for a more intuitive and comfortable user experience.
+Our system utilizes an extremely low-power and compact 3mmx4mm/20mW event
+camera to perform natural hand-based gesture recognition for always-on smart
+eyewear. The camera's output is processed by a convolutional neural network
+(CNN) running on an NXP Nano UltraLite compute platform, consuming less than
+350mW. Helios can recognize seven classes of gestures, including subtle
+microgestures like swipes and pinches, with 91% accuracy. We also demonstrate
+real-time performance across 20 users at a remarkably low latency of 60ms. Our
+user testing results align with the positive feedback we received during our
+recent successful demo at AWE-USA-2024.
+
+
+
+ comment: Accepted at ECCV-Integrating Computer Vision in Smart Eyewear, 2024. + 18 pages, 10 figures. First three authors contributed equally to this paper +
+
+
+
+
+ + ♻ ☆ IN-Sight: Interactive Navigation through Sight IROS 2024 + + +
+ Current visual navigation systems often treat the environment as static, +lacking the ability to adaptively interact with obstacles. This limitation +leads to navigation failure when encountering unavoidable obstructions. In +response, we introduce IN-Sight, a novel approach to self-supervised path +planning, enabling more effective navigation strategies through interaction +with obstacles. Utilizing RGB-D observations, IN-Sight calculates +traversability scores and incorporates them into a semantic map, facilitating +long-range path planning in complex, maze-like environments. To precisely +navigate around obstacles, IN-Sight employs a local planner, trained +imperatively on a differentiable costmap using representation learning +techniques. The entire framework undergoes end-to-end training within the +state-of-the-art photorealistic Intel SPEAR Simulator. We validate the +effectiveness of IN-Sight through extensive benchmarking in a variety of +simulated scenarios and ablation studies. Moreover, we demonstrate the system's +real-world applicability with zero-shot sim-to-real transfer, deploying our +planner on the legged robot platform ANYmal, showcasing its practical potential +for interactive navigation in real environments. + +
+
+ comment: The 2024 IEEE/RSJ International Conference on Intelligent Robots and + Systems (IROS 2024) +
+
+
+
+
+ + ♻ ☆ Integrating Present and Past in Unsupervised Continual Learning + + +
+ We formulate a unifying framework for unsupervised continual learning (UCL), +which disentangles learning objectives that are specific to the present and the +past data, encompassing stability, plasticity, and cross-task consolidation. +The framework reveals that many existing UCL approaches overlook cross-task +consolidation and try to balance plasticity and stability in a shared embedding +space. This results in worse performance due to a lack of within-task data +diversity and reduced effectiveness in learning the current task. Our method, +Osiris, which explicitly optimizes all three objectives on separate embedding +spaces, achieves state-of-the-art performance on all benchmarks, including two +novel benchmarks proposed in this paper featuring semantically structured task +sequences. Compared to standard benchmarks, these two structured benchmarks +more closely resemble visual signals received by humans and animals when +navigating real-world environments. Finally, we show some preliminary evidence +that continual models can benefit from such realistic learning scenarios. + +
+
+ comment: CoLLAs 2024 (Oral) +
+
+
+
+
+ + ♻ ☆ Weakly Supervised LiDAR Semantic Segmentation via Scatter Image + Annotation + + +
+ Weakly supervised LiDAR semantic segmentation has made significant strides +with limited labeled data. However, most existing methods focus on the network +training under weak supervision, while efficient annotation strategies remain +largely unexplored. To tackle this gap, we implement LiDAR semantic +segmentation using scatter image annotation, effectively integrating an +efficient annotation strategy with network training. Specifically, we propose +employing scatter images to annotate LiDAR point clouds, combining a +pre-trained optical flow estimation network with a foundation image +segmentation model to rapidly propagate manual annotations into dense labels +for both images and point clouds. Moreover, we propose ScatterNet, a network +that includes three pivotal strategies to reduce the performance gap caused by +such annotations. Firstly, it utilizes dense semantic labels as supervision for +the image branch, alleviating the modality imbalance between point clouds and +images. Secondly, an intermediate fusion branch is proposed to obtain +multimodal texture and structural features. Lastly, a perception consistency +loss is introduced to determine which information needs to be fused and which +needs to be discarded during the fusion process. Extensive experiments on the +nuScenes and SemanticKITTI datasets have demonstrated that our method requires +less than 0.02% of the labeled points to achieve over 95% of the performance of +fully-supervised methods. Notably, our labeled points are only 5% of those used +in the most advanced weakly supervised methods. + +
+
+
+
+
+ + ♻ ☆ CSWin-UNet: Transformer UNet with Cross-Shaped Windows for Medical Image + Segmentation + + +
+ Deep learning, especially convolutional neural networks (CNNs) and
+Transformer architectures, has become the focus of extensive research in
+medical image segmentation, achieving impressive results. However, CNNs come
+with inductive biases that limit their effectiveness in more complex, varied
+segmentation scenarios. Conversely, while Transformer-based methods excel at
+capturing global and long-range semantic details, they suffer from high
+computational demands. In this study, we propose CSWin-UNet, a novel U-shaped
+segmentation method that incorporates the CSWin self-attention mechanism into
+the UNet to facilitate horizontal and vertical stripe self-attention. This
+method significantly enhances both computational efficiency and receptive
+field interactions. Additionally, our innovative decoder utilizes a
+content-aware reassembly operator that strategically reassembles features,
+guided by predicted kernels, for precise image resolution restoration. Our
+extensive empirical evaluations on diverse datasets, including synapse
+multi-organ CT, cardiac MRI, and skin lesions, demonstrate that CSWin-UNet
+maintains low model complexity while delivering high segmentation accuracy.
+
+
+
+
+
+
+ + ♻ ☆ DINOv2 Rocks Geological Image Analysis: Classification, Segmentation, + and Interpretability + + +
+ This study investigates the interpretability, classification, and +segmentation of CT-scan images of rock samples, with a particular focus on the +application of DINOv2 within Geosciences. We compared various segmentation +techniques to evaluate their efficacy, efficiency, and adaptability in +geological image analysis. The methods assessed include the Otsu thresholding +method, clustering techniques (K-means and fuzzy C-means), a supervised machine +learning approach (Random Forest), and deep learning methods (UNet and DINOv2). +We tested these methods using ten binary sandstone datasets and three +multi-class calcite datasets. To begin, we provide a thorough interpretability +analysis of DINOv2's features in the geoscientific context, discussing its +suitability and inherent ability to process CT-scanned rock data. In terms of +classification, the out-of-the-box DINOv2 demonstrates an impressive capability +to perfectly classify rock images, even when the CT scans are out of its +original training set. Regarding segmentation, thresholding and unsupervised +methods, while fast, perform poorly despite image preprocessing, whereas +supervised methods show better results. We underscore the computational demands +of deep learning but highlight its minimal intervention, superior +generalization, and performance without additional image preprocessing. +Additionally, we observe a lack of correlation between a network's depth or the +number of parameters and its performance. Our results show that a LoRA +fine-tuned DINOv2 excels in out-of-distribution segmentation and significantly +outperforms other methods in multi-class segmentation. By systematically +comparing these methods, we identify the most efficient strategy for meticulous +and laborious segmentation tasks. DINOv2 proves advantageous, achieving +segmentations that could be described as "better than ground-truth" against +relatively small training sets. + +
+
+ comment: Minor typos fixing, link to the code, small changes +
+
+
+
+
+ + ♻ ☆ CARMIL: Context-Aware Regularization on Multiple Instance Learning + models for Whole Slide Images + + +
+ Multiple Instance Learning (MIL) models have proven effective for cancer +prognosis from Whole Slide Images. However, the original MIL formulation +incorrectly assumes the patches of the same image to be independent, leading to +a loss of spatial context as information flows through the network. +Incorporating contextual knowledge into predictions is particularly important +given the inclination for cancerous cells to form clusters and the presence of +spatial indicators for tumors. State-of-the-art methods often use attention +mechanisms eventually combined with graphs to capture spatial knowledge. In +this paper, we take a novel and transversal approach, addressing this issue +through the lens of regularization. We propose Context-Aware Regularization for +Multiple Instance Learning (CARMIL), a versatile regularization scheme designed +to seamlessly integrate spatial knowledge into any MIL model. Additionally, we +present a new and generic metric to quantify the Context-Awareness of any MIL +model when applied to Whole Slide Images, resolving a previously unexplored gap +in the field. The efficacy of our framework is evaluated for two survival +analysis tasks on glioblastoma (TCGA GBM) and colon cancer data (TCGA COAD). + +
+
+
+
+
+ + ♻ ☆ SeiT++: Masked Token Modeling Improves Storage-efficient Training ECCV 2024 + + +
+ Recent advancements in Deep Neural Network (DNN) models have significantly +improved performance across computer vision tasks. However, achieving highly +generalizable and high-performing vision models requires expansive datasets, +resulting in significant storage requirements. This storage challenge is a +critical bottleneck for scaling up models. A recent breakthrough by SeiT +proposed the use of Vector-Quantized (VQ) feature vectors (i.e., tokens) as +network inputs for vision classification. This approach achieved 90% of the +performance of a model trained on full-pixel images with only 1% of the +storage. While SeiT needs labeled data, its potential in scenarios beyond fully +supervised learning remains largely untapped. In this paper, we extend SeiT by +integrating Masked Token Modeling (MTM) for self-supervised pre-training. +Recognizing that self-supervised approaches often demand more data due to the +lack of labels, we introduce TokenAdapt and ColorAdapt. These methods +facilitate comprehensive token-friendly data augmentation, effectively +addressing the increased data requirements of self-supervised learning. We +evaluate our approach across various scenarios, including storage-efficient +ImageNet-1k classification, fine-grained classification, ADE-20k semantic +segmentation, and robustness benchmarks. Experimental results demonstrate +consistent performance improvement in diverse experiments, validating the +effectiveness of our method. Code is available at +https://github.com/naver-ai/seit. + +
+
+ comment: Accepted to ECCV 2024. First two authors contributed equally +
+
+
+
+
+ + ♻ ☆ Confidence Self-Calibration for Multi-Label Class-Incremental Learning ECCV + + +
+ The partial label challenge in Multi-Label Class-Incremental Learning
+(MLCIL) arises when only the new classes are labeled during training, while
+past and future labels remain unavailable. This issue leads to a proliferation
+of false-positive errors due to erroneously high-confidence multi-label
+predictions, exacerbating catastrophic forgetting within the disjoint label
+space. In this paper, we aim to refine multi-label confidence calibration in
+MLCIL and propose a Confidence Self-Calibration (CSC) approach. Firstly, for
+label relationship calibration, we introduce a class-incremental graph
+convolutional network that bridges the isolated label spaces by constructing a
+learnable, dynamically extended label relationship graph. Then, for confidence
+calibration, we present a max-entropy regularization for each multi-label
+increment, facilitating confidence self-calibration through the penalization
+of over-confident output distributions. Our approach attains new
+state-of-the-art results in MLCIL tasks on both the MS-COCO and PASCAL VOC
+datasets, with the calibration of label confidences confirmed through our
+methodology.
+
+
+
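+ For readers who want a concrete picture of the max-entropy regularization
+described above, here is a minimal sketch assuming a standard multi-label head
+with per-class sigmoid outputs: a penalty term rewards higher binary entropy,
+discouraging over-confident predictions. The weight `lambda_ent` and the
+function names are illustrative, not the authors' implementation.
+
+    import torch
+    import torch.nn.functional as F
+
+    def max_entropy_penalty(logits: torch.Tensor) -> torch.Tensor:
+        # Negative mean binary entropy of the sigmoid outputs; minimizing this
+        # term therefore maximizes entropy (i.e., reduces over-confidence).
+        p = torch.sigmoid(logits).clamp(1e-6, 1 - 1e-6)
+        entropy = -(p * p.log() + (1 - p) * (1 - p).log())
+        return -entropy.mean()
+
+    def mlcil_loss(logits, targets, lambda_ent=0.1):
+        # Multi-label BCE on the classes labeled in the current increment,
+        # plus the max-entropy term penalizing over-confident distributions.
+        bce = F.binary_cross_entropy_with_logits(logits, targets)
+        return bce + lambda_ent * max_entropy_penalty(logits)
+
+    logits = torch.randn(4, 20)
+    targets = torch.randint(0, 2, (4, 20)).float()
+    print(mlcil_loss(logits, targets))
+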
+ comment: Accepted at the European Conference on Computer Vision (ECCV) 2024 +
+
+
+
+
+ + ♻ ☆ Control-A-Video: Controllable Text-to-Video Diffusion Models with Motion + Prior and Reward Feedback Learning + + +
+ Recent advances in text-to-image (T2I) diffusion models have enabled
+impressive image generation capabilities guided by text prompts. However,
+extending these techniques to video generation remains challenging, with
+existing text-to-video (T2V) methods often struggling to produce high-quality
+and motion-consistent videos. In this work, we introduce Control-A-Video, a
+controllable T2V diffusion model that can generate videos conditioned on text
+prompts and reference control maps like edge and depth maps. To tackle video
+quality and motion consistency issues, we propose novel strategies to
+incorporate content prior and motion prior into the diffusion-based generation
+process. Specifically, we employ a first-frame condition scheme to transfer
+video generation from the image domain. Additionally, we introduce
+residual-based and optical flow-based noise initialization to infuse motion
+priors from reference videos, promoting relevance among frame latents for
+reduced flickering. Furthermore, we present a Spatio-Temporal Reward Feedback
+Learning (ST-ReFL) algorithm that optimizes the video diffusion model using
+multiple reward models for video quality and motion consistency, leading to
+superior outputs. Comprehensive experiments demonstrate that our framework
+generates higher-quality, more consistent videos compared to existing
+state-of-the-art methods in controllable text-to-video generation.
+
+
+
+
+
+
+ + ♻ ☆ A Deep Learning Method for Simultaneous Denoising and Missing Wedge + Reconstruction in Cryogenic Electron Tomography + + +
+ Cryogenic electron tomography is a technique for imaging biological samples
+in 3D. A microscope collects a series of 2D projections of the sample, and the
+goal is to reconstruct the 3D density of the sample, called the tomogram.
+Reconstruction is difficult as the 2D projections are noisy and cannot be
+recorded from all directions, resulting in a missing wedge of information.
+Tomograms conventionally reconstructed with filtered back-projection suffer
+from noise and strong artifacts due to the missing wedge. Here, we propose a
+deep-learning approach for simultaneous denoising and missing wedge
+reconstruction called DeepDeWedge. The algorithm requires no ground truth data
+and is based on fitting a neural network to the 2D projections using a
+self-supervised loss. DeepDeWedge is simpler than current state-of-the-art
+approaches for denoising and missing wedge reconstruction, performs
+competitively, and produces more thoroughly denoised tomograms with higher
+overall contrast.
+
+
+
+
+
+
+ + ♻ ☆ On-the-fly Point Feature Representation for Point Clouds Analysis ACM MM 2024 + + +
+ Point cloud analysis is challenging due to its unordered, sparse and
+irregular nature. Prior works attempt to capture local relationships by
+convolution operations or attention mechanisms, exploiting geometric
+information from coordinates implicitly. These methods, however, are
+insufficient to describe the explicit local geometry, e.g., curvature and
+orientation. In this paper, we propose On-the-fly Point Feature Representation
+(OPFR), which captures abundant geometric information explicitly through the
+Curve Feature Generator module. This is inspired by the Point Feature
+Histogram (PFH) from the computer vision community. However, the utilization
+of vanilla PFH encounters great difficulties when applied to large datasets
+and dense point clouds, as it demands considerable time for feature
+generation. In contrast, we introduce the Local Reference Constructor module,
+which approximates the local coordinate systems based on triangle sets. Owing
+to this, our OPFR only requires an extra 1.56 ms for inference (65x faster
+than vanilla PFH) and 0.012M additional parameters, and it can serve as a
+versatile plug-and-play module for various backbones, particularly the
+MLP-based and Transformer-based backbones examined in this study.
+Additionally, we introduce the novel Hierarchical Sampling module aimed at
+enhancing the quality of triangle sets, thereby ensuring the robustness of the
+obtained geometric features. Our proposed method improves overall accuracy
+(OA) on ModelNet40 from 90.7% to 94.5% (+3.8%) for classification, and OA on
+S3DIS Area-5 from 86.4% to 90.0% (+3.6%) for semantic segmentation, building
+upon the PointNet++ backbone. When integrated with the Point Transformer
+backbone, we achieve state-of-the-art results on both tasks: 94.8% OA on
+ModelNet40 and 91.7% OA on S3DIS Area-5.
+
+
+
+ comment: Accepted by ACM MM 2024 +
+
+
+
+
+ + ♻ ☆ StreetSurfaceVis: a dataset of crowdsourced street-level imagery with + semi-automated annotations of road surface type and quality + + +
+ Road unevenness significantly impacts the safety and comfort of various +traffic participants, especially vulnerable road users such as cyclists and +wheelchair users. This paper introduces StreetSurfaceVis, a novel dataset +comprising 9,122 street-level images collected from a crowdsourcing platform +and manually annotated by road surface type and quality. The dataset is +intended to train models for comprehensive surface assessments of road +networks. Existing open datasets are constrained by limited geospatial coverage +and camera setups, typically excluding cycleways and footways. By crafting a +heterogeneous dataset, we aim to fill this gap and enable robust models that +maintain high accuracy across diverse image sources. However, the frequency +distribution of road surface types and qualities is highly imbalanced. We +address the challenge of ensuring sufficient images per class while reducing +manual annotation by proposing a sampling strategy that incorporates various +external label prediction resources. More precisely, we estimate the impact of +(1) enriching the image data with OpenStreetMap tags, (2) iterative training +and application of a custom surface type classification model, (3) amplifying +underrepresented classes through prompt-based classification with GPT-4o or +similarity search using image embeddings. We show that utilizing a combination +of these strategies effectively reduces manual annotation workload while +ensuring sufficient class representation. + +
+
+ comment: 11 pages, 2 figures +
+
+
+
+
+ + ♻ ☆ Computer Vision for Primate Behavior Analysis in the Wild + + +
+ Advances in computer vision as well as increasingly widespread video-based +behavioral monitoring have great potential for transforming how we study animal +cognition and behavior. However, there is still a fairly large gap between the +exciting prospects and what can actually be achieved in practice today, +especially in videos from the wild. With this perspective paper, we want to +contribute towards closing this gap, by guiding behavioral scientists in what +can be expected from current methods and steering computer vision researchers +towards problems that are relevant to advance research in animal behavior. We +start with a survey of the state-of-the-art methods for computer vision +problems that are directly relevant to the video-based study of animal +behavior, including object detection, multi-individual tracking, individual +identification, and (inter)action recognition. We then review methods for +effort-efficient learning, which is one of the biggest challenges from a +practical perspective. Finally, we close with an outlook into the future of the +emerging field of computer vision for animal behavior, where we argue that the +field should develop approaches to unify detection, tracking, identification +and (inter)action recognition in a single, video-based framework. + +
+
+
+
+
+ + ♻ ☆ TDANet: Target-Directed Attention Network For Object-Goal Visual + Navigation With Zero-Shot Ability + + +
+ The generalization of the end-to-end deep reinforcement learning (DRL) for +object-goal visual navigation is a long-standing challenge since object classes +and placements vary in new test environments. Learning domain-independent +visual representation is critical for enabling the trained DRL agent with the +ability to generalize to unseen scenes and objects. In this letter, a +target-directed attention network (TDANet) is proposed to learn the end-to-end +object-goal visual navigation policy with zero-shot ability. TDANet features a +novel target attention (TA) module that learns both the spatial and semantic +relationships among objects to help TDANet focus on the most relevant observed +objects to the target. With the Siamese architecture (SA) design, TDANet +distinguishes the difference between the current and target states and +generates the domain-independent visual representation. To evaluate the +navigation performance of TDANet, extensive experiments are conducted in the +AI2-THOR embodied AI environment. The simulation results demonstrate a strong +generalization ability of TDANet to unseen scenes and target objects, with +higher navigation success rate (SR) and success weighted by length (SPL) than +other state-of-the-art models. TDANet is finally deployed on a wheeled robot in +real scenes, demonstrating satisfactory generalization of TDANet to the real +world. + +
+
+
+
+
+ + ♻ ☆ Contextual Object Detection with Multimodal Large Language Models + + +
+ Recent Multimodal Large Language Models (MLLMs) are remarkable in +vision-language tasks, such as image captioning and question answering, but +lack the essential perception ability, i.e., object detection. In this work, we +address this limitation by introducing a novel research problem of contextual +object detection -- understanding visible objects within different human-AI +interactive contexts. Three representative scenarios are investigated, +including the language cloze test, visual captioning, and question answering. +Moreover, we present ContextDET, a unified multimodal model that is capable of +end-to-end differentiable modeling of visual-language contexts, so as to +locate, identify, and associate visual objects with language inputs for +human-AI interaction. Our ContextDET involves three key submodels: (i) a visual +encoder for extracting visual representations, (ii) a pre-trained LLM for +multimodal context decoding, and (iii) a visual decoder for predicting bounding +boxes given contextual object words. The new generate-then-detect framework +enables us to detect object words within human vocabulary. Extensive +experiments show the advantages of ContextDET on our proposed CODE benchmark, +open-vocabulary detection, and referring image segmentation. Github: +https://github.com/yuhangzang/ContextDET. + +
+
+ comment: IJCV 2024 +
+
+
+
+
+ + ♻ ☆ LEGO: Learning and Graph-Optimized Modular Tracker for Online + Multi-Object Tracking with Point Clouds + + +
+ Online multi-object tracking (MOT) plays a pivotal role in autonomous
+systems. The state-of-the-art approaches usually employ a
+tracking-by-detection paradigm, in which data association plays a critical
+role. This paper proposes a learning and graph-optimized (LEGO) modular
+tracker to improve data association performance over the existing literature.
+The proposed LEGO tracker integrates graph optimization and self-attention
+mechanisms, which efficiently formulate the association score map,
+facilitating the accurate and efficient matching of objects across time
+frames. To further enhance the state update process, a Kalman filter is added
+to ensure consistent tracking by incorporating temporal coherence in the
+object states. Our proposed method, utilizing LiDAR alone, has shown
+exceptional performance compared to other online tracking approaches,
+including LiDAR-based and LiDAR-camera fusion-based methods. LEGO ranked 1st
+at the time of submitting results to the KITTI object tracking evaluation
+ranking board and remained 2nd at the time of submitting this paper, among all
+online trackers in the KITTI MOT benchmark for cars.
+
+
+
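+ As one way to picture the Kalman filtering step mentioned above, the sketch
+below implements a textbook constant-velocity Kalman filter for a 1D state
+(position, velocity) updated with associated detections; it is a generic
+illustration, not the LEGO tracker's actual state model or noise parameters.
+
+    import numpy as np
+
+    dt = 0.1
+    F_mat = np.array([[1.0, dt], [0.0, 1.0]])   # constant-velocity transition
+    H = np.array([[1.0, 0.0]])                  # we observe position only
+    Q = 1e-3 * np.eye(2)                        # process noise covariance
+    R = np.array([[1e-2]])                      # measurement noise covariance
+
+    def kalman_step(x, P, z):
+        x_pred = F_mat @ x                      # predict state
+        P_pred = F_mat @ P @ F_mat.T + Q
+        y = z - H @ x_pred                      # innovation from detection z
+        S = H @ P_pred @ H.T + R
+        K = P_pred @ H.T @ np.linalg.inv(S)     # Kalman gain
+        return x_pred + K @ y, (np.eye(2) - K @ H) @ P_pred
+
+    x, P = np.array([0.0, 0.0]), np.eye(2)
+    for z in [np.array([0.11]), np.array([0.22]), np.array([0.35])]:
+        x, P = kalman_step(x, P, z)             # temporally coherent estimate
+    print(x)
+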
+
+
+
+ + ♻ ☆ A Diagnostic Model for Acute Lymphoblastic Leukemia Using Metaheuristics + and Deep Learning Methods + + +
+ Acute lymphoblastic leukemia (ALL) severity is determined by the presence
+and ratios of blast cells (abnormal white blood cells) in both bone marrow and
+peripheral blood. Manual diagnosis of this disease is a tedious and
+time-consuming operation, making it difficult for professionals to accurately
+examine blast cell characteristics. To address this difficulty, researchers
+use deep learning and machine learning. In this paper, a ResNet-based feature
+extractor is utilized to detect ALL, along with a variety of feature selectors
+and classifiers. To obtain the best results, a variety of transfer learning
+models, including the ResNet, VGG, EfficientNet, and DenseNet families, are
+used as deep feature extractors. Following extraction, different feature
+selectors are used, including the genetic algorithm, PCA, ANOVA, Random
+Forest, Univariate, Mutual information, Lasso, XGB, Variance, and Binary ant
+colony. After feature selection, a variety of classifiers are used, with MLP
+outperforming the others. The recommended technique is used to categorize ALL
+and HEM in the selected dataset, C-NMC 2019. This technique achieved an
+impressive 90.71% accuracy and 95.76% sensitivity for the relevant
+classifications, and its metrics on this dataset outperformed those of other
+methods.
+
+
+
+
+
+
+ + ♻ ☆ MultiHateClip: A Multilingual Benchmark Dataset for Hateful Video + Detection on YouTube and Bilibili + + +
+ Hate speech is a pressing issue in modern society, with significant effects
+both online and offline. Recent research in hate speech detection has
+primarily centered on text-based media, largely overlooking multimodal content
+such as videos. Existing studies on hateful video datasets have predominantly
+focused on English content within a Western context and have been limited to
+binary labels (hateful or non-hateful), lacking detailed contextual
+information. This study presents MultiHateClip, a novel multilingual dataset
+created through hate lexicons and human annotation. It aims to enhance the
+detection of hateful videos on platforms such as YouTube and Bilibili,
+including content in both English and Chinese. Comprising 2,000 videos
+annotated for hatefulness, offensiveness, and normalcy, this dataset provides
+a cross-cultural perspective on gender-based hate speech. Through a detailed
+examination of human annotation results, we discuss the differences between
+Chinese and English hateful videos and underscore the importance of different
+modalities in hateful and offensive video analysis. Evaluations of
+state-of-the-art video classification models, such as VLM, GPT-4V and Qwen-VL,
+on MultiHateClip highlight the existing challenges in accurately
+distinguishing between hateful and offensive content and the urgent need for
+models that are both multimodally and culturally nuanced. MultiHateClip
+represents a foundational advance in enhancing hateful video detection by
+underscoring the necessity of a multimodal and culturally sensitive approach
+in combating online hate speech.
+
+
+
+ comment: 10 pages, 3 figures, ACM Multimedia 2024 +
+
+
+
+
+ + ♻ ☆ A Billion-scale Foundation Model for Remote Sensing Images + + +
+ As the potential of foundation models in visual tasks has garnered
+significant attention, pretraining these models before downstream tasks has
+become a crucial step. The three key factors in pretraining foundation models
+are the pretraining method, the size of the pretraining dataset, and the
+number of model parameters. Recently, research in the remote sensing field has
+focused primarily on the pretraining method and the size of the dataset, with
+limited emphasis on the number of model parameters. This paper addresses this
+gap by examining the effect of increasing the number of model parameters on
+the performance of foundation models in downstream tasks such as rotated
+object detection and semantic segmentation. We pretrained foundation models
+with varying numbers of parameters, including 86M, 605.26M, 1.3B, and 2.4B, to
+determine whether performance in downstream tasks improved with an increase in
+parameters. To the best of our knowledge, this is the first billion-scale
+foundation model in the remote sensing field. Furthermore, we propose an
+effective method for scaling up and fine-tuning a vision transformer in the
+remote sensing field. To evaluate general performance in downstream tasks, we
+employed the DOTA v2.0 and DIOR-R benchmark datasets for rotated object
+detection, and the Potsdam and LoveDA datasets for semantic segmentation.
+Experimental results demonstrated that, across all benchmark datasets and
+downstream tasks, the performance of the foundation models and data efficiency
+improved as the number of parameters increased. Moreover, our models achieve
+state-of-the-art performance on several datasets, including DIOR-R, Potsdam,
+and LoveDA.
+
+
+
+ comment: This manuscript is the accepted version for IEEE Journal of Selected + Topics in Applied Earth Observations and Remote Sensing (IEEE J-STARS) +
+
+
+
+
+ + ♻ ☆ UniAV: Unified Audio-Visual Perception for Multi-Task Video Event + Localization + + +
+ Video localization tasks aim to temporally locate specific instances in +videos, including temporal action localization (TAL), sound event detection +(SED) and audio-visual event localization (AVEL). Existing methods +over-specialize on each task, overlooking the fact that these instances often +occur in the same video to form the complete video content. In this work, we +present UniAV, a Unified Audio-Visual perception network, to achieve joint +learning of TAL, SED and AVEL tasks for the first time. UniAV can leverage +diverse data available in task-specific datasets, allowing the model to learn +and share mutually beneficial knowledge across tasks and modalities. To tackle +the challenges posed by substantial variations in datasets +(size/domain/duration) and distinct task characteristics, we propose to +uniformly encode visual and audio modalities of all videos to derive generic +representations, while also designing task-specific experts to capture unique +knowledge for each task. Besides, we develop a unified language-aware +classifier by utilizing a pre-trained text encoder, enabling the model to +flexibly detect various types of instances and previously unseen ones by simply +changing prompts during inference. UniAV outperforms its single-task +counterparts by a large margin with fewer parameters, achieving on-par or +superior performances compared to state-of-the-art task-specific methods across +ActivityNet 1.3, DESED and UnAV-100 benchmarks. + +
+
+ comment: This work has been submitted to the IEEE for possible publication. + Copyright may be transferred without notice, after which this version may no + longer be accessible +
+
+
+
+
+ + ♻ ☆ CoFiI2P: Coarse-to-Fine Correspondences for Image-to-Point Cloud + Registration + + +
+ Image-to-point cloud (I2P) registration is a fundamental task for robots
+and autonomous vehicles to achieve cross-modality data fusion and
+localization. Current I2P registration methods primarily focus on estimating
+correspondences at the point or pixel level, often neglecting global
+alignment. As a result, I2P matching can easily converge to a local optimum if
+it lacks high-level guidance from global constraints. To improve the success
+rate and general robustness, this paper introduces CoFiI2P, a novel I2P
+registration network that extracts correspondences in a coarse-to-fine manner.
+First, the image and point cloud data are processed through a two-stream
+encoder-decoder network for hierarchical feature extraction. Second, a
+coarse-to-fine matching module is designed to leverage these features and
+establish robust feature correspondences. Specifically, in the coarse matching
+phase, a novel I2P transformer module is employed to capture both homogeneous
+and heterogeneous global information from the image and point cloud data. This
+enables the estimation of coarse super-point/super-pixel matching pairs with
+discriminative descriptors. In the fine matching module, point/pixel pairs are
+established with the guidance of super-point/super-pixel correspondences.
+Finally, based on the matching pairs, the transformation matrix is estimated
+with the EPnP-RANSAC algorithm. Experiments conducted on the KITTI Odometry
+dataset demonstrate that CoFiI2P achieves impressive results, with a relative
+rotation error (RRE) of 1.14 degrees and a relative translation error (RTE) of
+0.29 meters, while maintaining real-time speed. Additional experiments on the
+nuScenes dataset confirm our method's generalizability. The project page is
+available at https://whu-usi3dv.github.io/CoFiI2P.
+
+
+
+ comment: Submitted to IEEE RA-L (under review); project page is available at: + https://whu-usi3dv.github.io/CoFiI2P +
+
+
+
+
+ + ♻ ☆ Boosting Multimodal Large Language Models with Visual Tokens Withdrawal + for Rapid Inference + + +
+ Multimodal large language models (MLLMs) demand considerable computations
+for inference due to the extensive parameters and the additional input tokens
+needed for visual information representation. Herein, we introduce Visual
+Tokens Withdrawal (VTW), a plug-and-play module to boost MLLMs for rapid
+inference. Our approach is inspired by two intriguing phenomena we have
+observed: (1) the attention sink phenomenon that is prevalent in LLMs also
+persists in MLLMs, suggesting that initial tokens and nearest tokens receive
+the majority of attention, while middle vision tokens garner minimal attention
+in deep layers; (2) the presence of information migration, which implies that
+visual information is transferred to subsequent text tokens within the first
+few layers of MLLMs. As per our findings, we conclude that vision tokens are
+unnecessary in the deep layers of MLLMs. Thus, we strategically withdraw them
+at a certain layer, enabling only text tokens to engage in subsequent layers.
+To pinpoint the ideal layer for VTW, we initially analyze a limited set of
+tiny datasets and choose the first layer that meets the Kullback-Leibler
+divergence criterion. Our VTW approach can cut computational overhead by over
+40% across diverse multimodal tasks while maintaining performance. Our code is
+released at https://github.com/lzhxmu/VTW.
+
+
+
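+ A minimal sketch of the withdrawal idea described above, assuming a generic
+decoder built from a list of PyTorch blocks and a contiguous span of vision
+tokens at a known offset: after a chosen layer, the vision tokens are removed
+from the sequence so that deeper layers attend over text tokens only. Names
+such as `withdraw_layer` are illustrative, not the released VTW API.
+
+    import torch
+    import torch.nn as nn
+
+    def forward_with_vtw(layers, hidden, vis_start, vis_len, withdraw_layer):
+        # hidden: (batch, seq, dim); vision tokens occupy
+        # positions [vis_start, vis_start + vis_len).
+        for i, layer in enumerate(layers):
+            if i == withdraw_layer:
+                # Withdraw vision tokens: later layers see only text tokens.
+                hidden = torch.cat(
+                    [hidden[:, :vis_start], hidden[:, vis_start + vis_len:]],
+                    dim=1)
+            hidden = layer(hidden)
+        return hidden
+
+    # Toy usage with linear layers standing in for transformer blocks.
+    layers = nn.ModuleList([nn.Linear(64, 64) for _ in range(8)])
+    h = torch.randn(2, 1 + 256 + 32, 64)  # [BOS | 256 vision tokens | text]
+    out = forward_with_vtw(layers, h, vis_start=1, vis_len=256, withdraw_layer=4)
+    print(out.shape)  # torch.Size([2, 33, 64]) after withdrawal
+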
+
+
+
+ + ♻ ☆ MS-Twins: Multi-Scale Deep Self-Attention Networks for Medical Image + Segmentation + + +
+ Chest X-ray is one of the most common radiological examination types for the +diagnosis of chest diseases. Nowadays, the automatic classification technology +of radiological images has been widely used in clinical diagnosis and treatment +plans. However, each disease has its own different response characteristic +receptive field region, which is the main challenge for chest disease +classification tasks. Besides, the imbalance of sample data categories further +increases the difficulty of tasks. To solve these problems, we propose a new +multi-label chest disease image classification scheme based on a multi-scale +attention network. In this scheme, multi-scale information is iteratively fused +to focus on regions with a high probability of disease, to effectively mine +more meaningful information from data, and the classification performance can +be improved only by image level annotation. We also designed a new loss +function to improve the rationality of visual perception and the performance of +multi-label image classification by forcing the consistency of attention +regions before and after image transformation. A comprehensive experiment was +carried out on the public Chest X-Ray14 and CheXpert datasets to achieve state +of the art results, which verified the effectiveness of this method in chest +X-ray image classification. + +
+
+
+
+
+ + ♻ ☆ Tracking-Assisted Object Detection with Event Cameras + + +
+ Event-based object detection has recently garnered attention in the
+computer vision community due to the exceptional properties of event cameras,
+such as high dynamic range and no motion blur. However, feature asynchronism
+and sparsity cause objects to become invisible when they have no relative
+motion to the camera, posing a significant challenge in the task. Prior works
+have studied various implicit-learned memories to retain as many temporal cues
+as possible. However, implicit memories still struggle to preserve long-term
+features effectively. In this paper, we consider those invisible objects as
+pseudo-occluded objects and aim to detect them by tracking through occlusions.
+Firstly, we introduce the visibility attribute of objects and contribute an
+auto-labeling algorithm to not only clean the existing event camera dataset
+but also append additional visibility labels to it. Secondly, we exploit
+tracking strategies for pseudo-occluded objects to maintain their permanence
+and retain their bounding boxes, even when features have not been available
+for a very long time. These strategies can be treated as an explicit-learned
+memory guided by the tracking objective to record the displacements of objects
+across frames. Lastly, we propose a spatio-temporal feature aggregation module
+to enrich the latent features and a consistency loss to increase the
+robustness of the overall pipeline. We conduct comprehensive experiments to
+verify our method's effectiveness, showing that still objects are retained
+while truly occluded objects are discarded. The results demonstrate that (1)
+the additional visibility labels can assist in supervised training, and (2)
+our method outperforms state-of-the-art approaches with a significant
+improvement of 7.9% absolute mAP.
+
+
+
+
+
+
+ + ♻ ☆ TP3M: Transformer-based Pseudo 3D Image Matching with Reference Image ICRA 2024 + + +
+ Image matching remains challenging in scenes with large viewpoint or
+illumination changes or with low texture. In this paper, we propose a
+Transformer-based pseudo 3D image matching method. It upgrades the 2D features
+extracted from the source image to 3D features with the help of a reference
+image and matches them to the 2D features extracted from the destination image
+via coarse-to-fine 3D matching. Our key discovery is that by introducing the
+reference image, the source image's fine points are screened and their feature
+descriptors are further enriched from 2D to 3D, which improves the matching
+performance with the destination image. Experimental results on multiple
+datasets show that the proposed method achieves state-of-the-art results on
+the tasks of homography estimation, pose estimation and visual localization,
+especially in challenging scenes.
+
+
+
+ comment: Accepted by ICRA 2024 +
+
+
+
+
+ + ♻ ☆ BackdoorBench: A Comprehensive Benchmark and Analysis of Backdoor + Learning + + +
+ As an emerging and vital topic for studying the vulnerability of deep
+neural networks (DNNs), backdoor learning has attracted increasing interest in
+recent years, and many seminal backdoor attack and defense algorithms are
+being developed successively or concurrently, in a rapid arms race. However,
+mainly due to the diverse settings, and the difficulties of implementation and
+reproducibility of existing works, there is a lack of a unified and
+standardized benchmark of backdoor learning, causing unfair comparisons and
+unreliable conclusions (e.g., misleading, biased or even false conclusions).
+Consequently, it is difficult to evaluate the current progress and design the
+future development roadmap of this literature. To alleviate this dilemma, we
+build a comprehensive benchmark of backdoor learning called BackdoorBench. Our
+benchmark makes three valuable contributions to the research community. 1) We
+provide an integrated implementation of state-of-the-art (SOTA) backdoor
+learning algorithms (currently including 16 attack and 27 defense algorithms),
+based on an extensible modular codebase. 2) We conduct comprehensive
+evaluations of 12 attacks against 16 defenses, with 5 poisoning ratios, based
+on 4 models and 4 datasets, thus 11,492 pairs of evaluations in total. 3)
+Based on the above evaluations, we present abundant analysis from 8
+perspectives via 18 useful analysis tools, and provide several inspiring
+insights about backdoor learning. We hope that our efforts could build a solid
+foundation of backdoor learning to facilitate researchers to investigate
+existing algorithms, develop more innovative algorithms, and explore the
+intrinsic mechanism of backdoor learning. Finally, we have created a
+user-friendly website at http://backdoorbench.com, which collects all
+important information of BackdoorBench, including the codebase, docs,
+leaderboard, and model zoo.
+
+
+
+ comment: We have uploaded a new version, which can be accessed at + arXiv:2407.19845 +
+
+
+
+
+ + ♻ ☆ Universal Approximation Theory: The basic theory for deep learning-based + computer vision models + + +
+ Computer vision (CV) is one of the most crucial fields in artificial
+intelligence. In recent years, a variety of deep learning models based on
+convolutional neural networks (CNNs) and Transformers have been designed to
+tackle diverse problems in CV. These algorithms have found practical
+applications in areas such as robotics and facial recognition. Despite the
+increasing power of current CV models, several fundamental questions remain
+unresolved: Why do CNNs require deep layers? What ensures the generalization
+ability of CNNs? Why do residual-based networks outperform fully convolutional
+networks like VGG? What is the fundamental difference between residual-based
+CNNs and Transformer-based networks? Why can CNNs utilize LoRA and pruning
+techniques? The root cause of these questions lies in the lack of a robust
+theoretical foundation for deep learning models in CV. To address these
+critical questions, we employ the Universal Approximation Theorem (UAT) to
+provide a theoretical basis for convolution- and Transformer-based models in
+CV. By doing so, we aim to elucidate these questions from a theoretical
+perspective.
+
+
+
+
+
+
+ + ♻ ☆ FA-Depth: Toward Fast and Accurate Self-supervised Monocular Depth + Estimation + + +
+ Most existing methods often rely on complex models to predict scene depth +with high accuracy, resulting in slow inference that is not conducive to +deployment. To better balance precision and speed, we first designed SmallDepth +based on sparsity. Second, to enhance the feature representation ability of +SmallDepth during training under the condition of equal complexity during +inference, we propose an equivalent transformation module(ETM). Third, to +improve the ability of each layer in the case of a fixed SmallDepth to perceive +different context information and improve the robustness of SmallDepth to the +left-right direction and illumination changes, we propose pyramid loss. Fourth, +to further improve the accuracy of SmallDepth, we utilized the proposed +function approximation loss (APX) to transfer knowledge in the pretrained +HQDecv2, obtained by optimizing the previous HQDec to address grid artifacts in +some regions, to SmallDepth. Extensive experiments demonstrate that each +proposed component improves the precision of SmallDepth without changing the +complexity of SmallDepth during inference, and the developed approach achieves +state-of-the-art results on KITTI at an inference speed of more than 500 frames +per second and with approximately 2 M parameters. The code and models will be +publicly available at https://github.com/fwucas/FA-Depth. + +
+
+
+
+
+ + ♻ ☆ A Realistic Protocol for Evaluation of Weakly Supervised Object + Localization + + +
+ Weakly Supervised Object Localization (WSOL) allows training deep learning +models for classification and localization (LOC) using only global class-level +labels. The absence of bounding box (bbox) supervision during training raises +challenges in the literature for hyper-parameter tuning, model selection, and +evaluation. WSOL methods rely on a validation set with bbox annotations for +model selection, and a test set with bbox annotations for threshold estimation +for producing bboxes from localization maps. This approach, however, is not +aligned with the WSOL setting as these annotations are typically unavailable in +real-world scenarios. Our initial empirical analysis shows a significant +decline in LOC performance when model selection and threshold estimation rely +solely on class labels and the image itself, respectively, compared to using +manual bbox annotations. This highlights the importance of incorporating bbox +labels for optimal model performance. In this paper, a new WSOL evaluation +protocol is proposed that provides LOC information without the need for manual +bbox annotations. In particular, we generated noisy pseudo-boxes from a +pretrained off-the-shelf region proposal method such as Selective Search, CLIP, +and RPN for model selection. These bboxes are also employed to estimate the +threshold from LOC maps, circumventing the need for test-set bbox annotations. +Our experiments with several WSOL methods on ILSVRC and CUB datasets show that +using the proposed pseudo-bboxes for validation facilitates the model selection +and threshold estimation, with LOC performance comparable to those selected +using GT bboxes on the validation set and threshold estimation on the test set. +It also outperforms models selected using class-level labels, and then +dynamically thresholded based solely on LOC maps. + +
+
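+ A rough sketch of how a box threshold could be estimated from localization
+maps using pseudo-boxes instead of ground-truth boxes, in the spirit of the
+protocol above: binarize each map at a candidate threshold, take the tightest
+box around the activated region, and keep the threshold with the best mean IoU
+against the pseudo-boxes. The candidate grid and function names are
+illustrative assumptions, not the paper's exact procedure.
+
+    import numpy as np
+
+    def box_from_map(loc_map, thr):
+        # Tightest (x1, y1, x2, y2) box around pixels with activation >= thr.
+        ys, xs = np.nonzero(loc_map >= thr)
+        return None if len(ys) == 0 else (xs.min(), ys.min(), xs.max(), ys.max())
+
+    def iou(a, b):
+        ix1, iy1 = max(a[0], b[0]), max(a[1], b[1])
+        ix2, iy2 = min(a[2], b[2]), min(a[3], b[3])
+        inter = max(0, ix2 - ix1 + 1) * max(0, iy2 - iy1 + 1)
+        area = lambda r: (r[2] - r[0] + 1) * (r[3] - r[1] + 1)
+        return inter / float(area(a) + area(b) - inter)
+
+    def estimate_threshold(loc_maps, pseudo_boxes, grid=np.linspace(0.05, 0.95, 19)):
+        scores = []
+        for thr in grid:
+            ious = [iou(box_from_map(m, thr), b)
+                    for m, b in zip(loc_maps, pseudo_boxes)
+                    if box_from_map(m, thr) is not None]
+            scores.append(np.mean(ious) if ious else 0.0)
+        return grid[int(np.argmax(scores))]
+
+    m = np.zeros((32, 32)); m[8:20, 10:22] = 0.8   # synthetic localization map
+    print(estimate_threshold([m], [(10, 8, 21, 19)]))
+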
+ comment: 13 pages, 5 figures +
+
+
+
+
+ + ♻ ☆ MIS-ME: A Multi-modal Framework for Soil Moisture Estimation + + +
+ Soil moisture estimation is an important task to enable precision
+agriculture in creating optimal plans for irrigation, fertilization, and
+harvest. It is common to utilize statistical and machine learning models to
+estimate soil moisture from traditional data sources such as weather
+forecasts, soil properties, and crop properties. However, there is a growing
+interest in utilizing aerial and geospatial imagery to estimate soil moisture.
+Although these images capture high-resolution crop details, they are expensive
+to curate and challenging to interpret. Imagine an AI-enhanced software tool
+that predicts soil moisture using visual cues captured by smartphones and
+statistical data given by weather forecasts. This work is a first step towards
+that goal of developing a multi-modal approach for soil moisture estimation.
+In particular, we curate a dataset consisting of real-world images taken from
+ground stations and their corresponding weather data. We also propose MIS-ME -
+Meteorological & Image based Soil Moisture Estimator, a multi-modal framework
+for soil moisture estimation. Our extensive analysis shows that MIS-ME
+achieves a MAPE of 10.14%, outperforming traditional unimodal approaches with
+a reduction of 3.25% in MAPE for meteorological data and 2.15% in MAPE for
+image data, highlighting the effectiveness of tailored multi-modal approaches.
+
+
+
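+ To make the multi-modal idea concrete, below is a generic two-branch fusion
+model in PyTorch: a small CNN encodes the ground-station photo, an MLP encodes
+tabular weather features, and the concatenated embedding is regressed to a
+moisture value. This is a hedged sketch of one plausible architecture, not the
+MIS-ME model itself.
+
+    import torch
+    import torch.nn as nn
+
+    class ImageWeatherRegressor(nn.Module):
+        def __init__(self, num_weather_feats: int = 6):
+            super().__init__()
+            self.image_branch = nn.Sequential(       # tiny CNN photo encoder
+                nn.Conv2d(3, 16, 3, stride=2, padding=1), nn.ReLU(),
+                nn.Conv2d(16, 32, 3, stride=2, padding=1), nn.ReLU(),
+                nn.AdaptiveAvgPool2d(1), nn.Flatten())
+            self.weather_branch = nn.Sequential(     # MLP for weather features
+                nn.Linear(num_weather_feats, 32), nn.ReLU())
+            self.head = nn.Sequential(               # fused regression head
+                nn.Linear(32 + 32, 32), nn.ReLU(), nn.Linear(32, 1))
+
+        def forward(self, image, weather):
+            z = torch.cat([self.image_branch(image),
+                           self.weather_branch(weather)], dim=1)
+            return self.head(z).squeeze(1)           # predicted soil moisture
+
+    model = ImageWeatherRegressor()
+    pred = model(torch.randn(2, 3, 128, 128), torch.randn(2, 6))
+    print(pred.shape)  # torch.Size([2])
+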
+ comment: Accepted by DSAA2024 +
+
+
+
+
+ + ♻ ☆ BrainFounder: Towards Brain Foundation Models for Neuroimage Analysis + + +
+ The burgeoning field of brain health research increasingly leverages
+artificial intelligence (AI) to interpret and analyze neurological data. This
+study introduces a novel approach towards the creation of medical foundation
+models by integrating a large-scale multi-modal magnetic resonance imaging
+(MRI) dataset derived from 41,400 participants. Our method involves a novel
+two-stage pretraining approach using vision transformers. The first stage is
+dedicated to encoding anatomical structures in generally healthy brains,
+identifying key features such as shapes and sizes of different brain regions.
+The second stage concentrates on spatial information, encompassing aspects
+like location and the relative positioning of brain structures. We rigorously
+evaluate our model, BrainFounder, using the Brain Tumor Segmentation (BraTS)
+challenge and Anatomical Tracings of Lesions After Stroke v2.0 (ATLAS v2.0)
+datasets. BrainFounder demonstrates a significant performance gain, surpassing
+the achievements of the previous winning solutions using fully supervised
+learning. Our findings underscore the impact of scaling up both the complexity
+of the model and the volume of unlabeled training data derived from generally
+healthy brains, which enhances the accuracy and predictive capabilities of the
+model in complex neuroimaging tasks with MRI. The implications of this
+research provide transformative insights and practical applications in
+healthcare and make substantial steps towards the creation of foundation
+models for Medical AI. Our pretrained models and training code can be found at
+https://github.com/lab-smile/GatorBrain.
+
+
+
+ comment: 19 pages, 5 figures, to be published in Medical Image Analysis +
+
+
+
+
+ + ♻ ☆ Is SAM 2 Better than SAM in Medical Image Segmentation? + + +
+ The Segment Anything Model (SAM) has demonstrated impressive performance in +zero-shot promptable segmentation on natural images. The recently released +Segment Anything Model 2 (SAM 2) claims to outperform SAM on images and extends +the model's capabilities to video segmentation. Evaluating the performance of +this new model in medical image segmentation, specifically in a zero-shot +promptable manner, is crucial. In this work, we conducted extensive studies +using multiple datasets from various imaging modalities to compare the +performance of SAM and SAM 2. We employed two point-prompt strategies: (i) +multiple positive prompts where one prompt is placed near the centroid of the +target structure, while the remaining prompts are randomly placed within the +structure, and (ii) combined positive and negative prompts where one positive +prompt is placed near the centroid of the target structure, and two negative +prompts are positioned outside the structure, maximizing the distance from the +positive prompt and from each other. The evaluation encompassed 24 unique +organ-modality combinations, including abdominal structures, cardiac +structures, fetal head images, skin lesions and polyp images across 11 publicly +available MRI, CT, ultrasound, dermoscopy, and endoscopy datasets. Preliminary +results based on 2D images indicate that while SAM 2 may perform slightly +better in a few cases, it does not generally surpass SAM for medical image +segmentation. Notably, SAM 2 performs worse than SAM in lower contrast imaging +modalities, such as CT and ultrasound. However, for MRI images, SAM 2 performs +on par with or better than SAM. Like SAM, SAM 2 also suffers from +over-segmentation issues, particularly when the boundaries of the target organ +are fuzzy. + +
+
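+ A small sketch of how the two point-prompt strategies described above could
+be generated from a binary mask with NumPy: strategy (i) places one point near
+the centroid plus random points inside the structure, and strategy (ii) adds
+background points chosen to be far from the positive prompt and from each
+other. The sampling details are illustrative and may differ from the study's
+exact procedure.
+
+    import numpy as np
+
+    rng = np.random.default_rng(0)
+
+    def centroid_point(mask):
+        ys, xs = np.nonzero(mask)
+        return np.array([ys.mean(), xs.mean()]).round().astype(int)
+
+    def positive_prompts(mask, n_extra=2):
+        # Strategy (i): centroid point plus random points inside the mask.
+        ys, xs = np.nonzero(mask)
+        idx = rng.choice(len(ys), size=n_extra, replace=False)
+        return np.vstack([centroid_point(mask),
+                          np.stack([ys[idx], xs[idx]], axis=1)])
+
+    def negative_prompts(mask, n_neg=2):
+        # Strategy (ii): background points maximizing distance to the positive
+        # prompt and to previously chosen negative points.
+        pos = centroid_point(mask)
+        ys, xs = np.nonzero(~mask.astype(bool))
+        pts = np.stack([ys, xs], axis=1)
+        chosen = [pts[np.argmax(np.linalg.norm(pts - pos, axis=1))]]
+        for _ in range(n_neg - 1):
+            d = np.min([np.linalg.norm(pts - c, axis=1)
+                        for c in chosen + [pos]], axis=0)
+            chosen.append(pts[np.argmax(d)])
+        return np.array(chosen)
+
+    mask = np.zeros((64, 64), dtype=np.uint8)
+    mask[20:40, 25:45] = 1
+    print(positive_prompts(mask), negative_prompts(mask), sep="\n")
+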
+
+
+
+ + ♻ ☆ Commonsense-T2I Challenge: Can Text-to-Image Generation Models + Understand Commonsense? + + +
+ We present a novel task and benchmark for evaluating the ability of +text-to-image(T2I) generation models to produce images that align with +commonsense in real life, which we call Commonsense-T2I. Given two adversarial +text prompts containing an identical set of action words with minor +differences, such as "a lightbulb without electricity" v.s. "a lightbulb with +electricity", we evaluate whether T2I models can conduct visual-commonsense +reasoning, e.g. produce images that fit "the lightbulb is unlit" vs. "the +lightbulb is lit" correspondingly. Commonsense-T2I presents an adversarial +challenge, providing pairwise text prompts along with expected outputs. The +dataset is carefully hand-curated by experts and annotated with fine-grained +labels, such as commonsense type and likelihood of the expected outputs, to +assist analyzing model behavior. We benchmark a variety of state-of-the-art +(sota) T2I models and surprisingly find that, there is still a large gap +between image synthesis and real life photos--even the DALL-E 3 model could +only achieve 48.92% on Commonsense-T2I, and the stable diffusion XL model only +achieves 24.92% accuracy. Our experiments show that GPT-enriched prompts cannot +solve this challenge, and we include a detailed analysis about possible reasons +for such deficiency. We aim for Commonsense-T2I to serve as a high-quality +evaluation benchmark for T2I commonsense checking, fostering advancements in +real life image generation. + +
+
+ comment: COLM 2024, Project Url: https://zeyofu.github.io/CommonsenseT2I/ +
+
+
+
+
+ + ♻ ☆ Sync-NeRF: Generalizing Dynamic NeRFs to Unsynchronized Videos AAAI 2024 + + +
+ Recent advancements in 4D scene reconstruction using neural radiance fields +(NeRF) have demonstrated the ability to represent dynamic scenes from +multi-view videos. However, they fail to reconstruct the dynamic scenes and +struggle to fit even the training views in unsynchronized settings. It happens +because they employ a single latent embedding for a frame while the multi-view +images at the same frame were actually captured at different moments. To +address this limitation, we introduce time offsets for individual +unsynchronized videos and jointly optimize the offsets with NeRF. By design, +our method is applicable for various baselines and improves them with large +margins. Furthermore, finding the offsets naturally works as synchronizing the +videos without manual effort. Experiments are conducted on the common Plenoptic +Video Dataset and a newly built Unsynchronized Dynamic Blender Dataset to +verify the performance of our method. Project page: +https://seoha-kim.github.io/sync-nerf + +
+
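+ One way to picture the learnable time offsets described above: a tiny module
+holds one offset per unsynchronized camera and shifts that camera's frame
+timestamps before they are fed to the dynamic NeRF, so the offsets can be
+optimized jointly with the radiance field. The snippet is a self-contained
+sketch (the NeRF itself is omitted), not the authors' code.
+
+    import torch
+    import torch.nn as nn
+
+    class TimeOffsets(nn.Module):
+        # One learnable temporal offset per (unsynchronized) camera/video.
+        def __init__(self, num_cameras: int):
+            super().__init__()
+            self.offsets = nn.Parameter(torch.zeros(num_cameras))
+
+        def forward(self, cam_ids: torch.Tensor, t: torch.Tensor) -> torch.Tensor:
+            # t: normalized frame times; shift each sample by its camera's offset.
+            return t + self.offsets[cam_ids]
+
+    offsets = TimeOffsets(num_cameras=4)
+    cam_ids = torch.tensor([0, 0, 3, 1])
+    t = torch.tensor([0.10, 0.20, 0.20, 0.50])
+    adjusted_t = offsets(cam_ids, t)  # feed this, not raw t, to the dynamic NeRF
+    # The offsets receive gradients from the photometric loss and are trained
+    # jointly with the NeRF, which effectively synchronizes the videos.
+    print(adjusted_t)
+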
+ comment: AAAI 2024. Project page: https://seoha-kim.github.io/sync-nerf +
+
+
+
+
+ + ♻ ☆ Convergence Properties of Score-Based Models for Linear Inverse Problems + Using Graduated Optimisation + + +
+ The incorporation of generative models as regularisers within variational
+formulations for inverse problems has proven effective across numerous image
+reconstruction tasks. However, the resulting optimisation problem is often
+non-convex and challenging to solve. In this work, we show that score-based
+generative models (SGMs) can be used in a graduated optimisation framework to
+solve inverse problems. We show that the resulting graduated non-convexity
+flow converges to stationary points of the original problem, and we provide a
+numerical convergence analysis of a 2D toy example. We further provide
+experiments on computed tomography image reconstruction, where we show that
+this framework is able to recover high-quality images, independent of the
+initial value. The experiments highlight the potential of using SGMs in
+graduated optimisation frameworks. The source code is publicly available on
+GitHub.
+
+
+
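+ A schematic sketch of a graduated-optimisation loop of the kind described
+above: start from a heavily smoothed surrogate of a non-convex objective and
+progressively reduce the smoothing, warm-starting each stage from the previous
+solution. The 1D toy objective and Monte-Carlo Gaussian smoothing stand in for
+the variational energy and the SGM-based regulariser, which are not reproduced
+here.
+
+    import torch
+
+    def f(x):
+        # Non-convex toy objective with a local and a global minimum.
+        return x.pow(4) - 3 * x.pow(2) + x
+
+    def smoothed_grad(x, sigma, n=256):
+        # Monte-Carlo gradient of the Gaussian-smoothed objective E[f(x + sigma*eps)].
+        xs = (x + sigma * torch.randn(n)).detach().requires_grad_(True)
+        f(xs).sum().backward()
+        return xs.grad.mean()
+
+    x = torch.tensor(2.0)                     # starts in the basin of the local minimum
+    for sigma in [1.0, 0.5, 0.25, 0.1, 0.0]:  # graduated: decreasing smoothing
+        for _ in range(200):                  # warm-started gradient descent per stage
+            if sigma > 0:
+                g = smoothed_grad(x, sigma)
+            else:
+                xg = x.clone().requires_grad_(True)
+                f(xg).backward()
+                g = xg.grad
+            x = x - 0.01 * g
+    print(float(x))  # ends near the global minimizer (about -1.3)
+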
+ comment: 8 pages +
+
+
+
+
+
+
+
+ + Information Retrieval 11 + +
+
+
+ + ☆ Perceptual Similarity for Measuring Decision-Making Style and Policy + Diversity in Games + + +
+ Defining and measuring decision-making styles, also known as playstyles, is +crucial in gaming, where these styles reflect a broad spectrum of individuality +and diversity. However, finding a universally applicable measure for these +styles poses a challenge. Building on Playstyle Distance, the first +unsupervised metric to measure playstyle similarity based on game screens and +raw actions, we introduce three enhancements to increase accuracy: multiscale +analysis with varied state granularity, a perceptual kernel rooted in +psychology, and the utilization of the intersection-over-union method for +efficient evaluation. These innovations not only advance measurement precision +but also offer insights into human cognition of similarity. Across two racing +games and seven Atari games, our techniques significantly improve the precision +of zero-shot playstyle classification, achieving an accuracy exceeding 90 +percent with fewer than 512 observation-action pairs, which is less than half +an episode of these games. Furthermore, our experiments with 2048 and Go +demonstrate the potential of discrete playstyle measures in puzzle and board +games. We also develop an algorithm for assessing decision-making diversity +using these measures. Our findings improve the measurement of end-to-end game +analysis and the evolution of artificial intelligence for diverse playstyles. + +
+
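+ As an illustration of the intersection-over-union evaluation mentioned
+above, the sketch below compares two players by the IoU of their discretized
+(observation, action) sets; the crude grid discretization is a stand-in for
+the paper's multiscale state granularity and perceptual kernel, which are not
+reproduced here.
+
+    def discretize(observation, action, grid=10):
+        # Map a continuous observation to a coarse state id, paired with the action.
+        state = tuple(int(round(x * grid)) for x in observation)
+        return (state, action)
+
+    def playstyle_iou(pairs_a, pairs_b):
+        # Intersection-over-union of the two players' observed (state, action) sets.
+        a, b = set(pairs_a), set(pairs_b)
+        return len(a & b) / len(a | b) if a | b else 0.0
+
+    # Toy usage: two short trajectories of (observation, action) pairs.
+    traj_a = [((0.11, 0.52), "left"), ((0.12, 0.49), "left"), ((0.80, 0.10), "brake")]
+    traj_b = [((0.13, 0.51), "left"), ((0.79, 0.12), "accelerate")]
+    pairs_a = [discretize(o, a) for o, a in traj_a]
+    pairs_b = [discretize(o, a) for o, a in traj_b]
+    print(playstyle_iou(pairs_a, pairs_b))  # 1 shared state-action out of 3 -> ~0.33
+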
+ comment: TMLR 08/2024 https://openreview.net/forum?id=30C9AWBW49 +
+
+
+
+
+ + ☆ The landscape of ontologies in materials science and engineering: A + survey and evaluation + + +
+ Ontologies are widely used in materials science to describe experiments, +processes, material properties, and experimental and computational workflows. +Numerous online platforms are available for accessing and sharing ontologies in +Materials Science and Engineering (MSE). Additionally, several surveys of these +ontologies have been conducted. However, these studies often lack comprehensive +analysis and quality control metrics. This paper provides an overview of +ontologies used in Materials Science and Engineering to assist domain experts +in selecting the most suitable ontology for a given purpose. Sixty selected +ontologies are analyzed and compared based on the requirements outlined in this +paper. Statistical data on ontology reuse and key metrics are also presented. +The evaluation results provide valuable insights into the strengths and +weaknesses of the investigated MSE ontologies. This enables domain experts to +select suitable ontologies and to incorporate relevant terms from existing +resources. + +
+
+
+
+
+ + ☆ ConvKGYarn: Spinning Configurable and Scalable Conversational Knowledge + Graph QA datasets with Large Language Models + + +
+ The rapid advancement of Large Language Models (LLMs) and conversational +assistants necessitates dynamic, scalable, and configurable conversational +datasets for training and evaluation. These datasets must accommodate diverse +user interaction modes, including text and voice, each presenting unique +modeling challenges. Knowledge Graphs (KGs), with their structured and evolving +nature, offer an ideal foundation for current and precise knowledge. Although +human-curated KG-based conversational datasets exist, they struggle to keep +pace with the rapidly changing user information needs. We present ConvKGYarn, a +scalable method for generating up-to-date and configurable conversational KGQA +datasets. Qualitative psychometric analyses confirm our method can generate +high-quality datasets rivaling a popular conversational KGQA dataset while +offering it at scale and covering a wide range of human-interaction +configurations. We showcase its utility by testing LLMs on diverse +conversations - exploring model behavior on conversational KGQA sets with +different configurations grounded in the same KG fact set. Our results +highlight the ability of ConvKGYarn to improve KGQA foundations and evaluate +parametric knowledge of LLMs, thus offering a robust solution to the constantly +evolving landscape of conversational assistants. + +
+
+
+
+
+ + ☆ Optimizing RAG Techniques for Automotive Industry PDF Chatbots: A Case + Study with Locally Deployed Ollama Models + + +
+ With the growing demand for offline PDF chatbots in automotive industrial +production environments, optimizing the deployment of large language models +(LLMs) in local, low-performance settings has become increasingly important. +This study focuses on enhancing Retrieval-Augmented Generation (RAG) techniques +for processing complex automotive industry documents using locally deployed +Ollama models. Based on the Langchain framework, we propose a multi-dimensional +optimization approach for Ollama's local RAG implementation. Our method +addresses key challenges in automotive document processing, including +multi-column layouts and technical specifications. We introduce improvements in +PDF processing, retrieval mechanisms, and context compression, tailored to the +unique characteristics of automotive industry documents. Additionally, we +design custom classes supporting embedding pipelines and an agent supporting +self-RAG based on LangGraph best practices. To evaluate our approach, we +constructed a proprietary dataset comprising typical automotive industry +documents, including technical reports and corporate regulations. We compared +our optimized RAG model and self-RAG agent against a naive RAG baseline across +three datasets: our automotive industry dataset, QReCC, and CoQA. Results +demonstrate significant improvements in context precision, context recall, +answer relevancy, and faithfulness, with particularly notable performance on +the automotive industry dataset. Our optimization scheme provides an effective +solution for deploying local RAG systems in the automotive sector, addressing +the specific needs of PDF chatbots in industrial production environments. This +research has important implications for advancing information processing and +intelligent production in the automotive industry. + +
+
+
+
+
+ + ☆ Low-Rank Approximation, Adaptation, and Other Tales + + +
+ Low-rank approximation is a fundamental technique in modern data analysis, +widely utilized across various fields such as signal processing, machine +learning, and natural language processing. Despite its ubiquity, the mechanics +of low-rank approximation and its application in adaptation can sometimes be +obscure, leaving practitioners and researchers with questions about its true +capabilities and limitations. This paper seeks to clarify low-rank +approximation and adaptation by offering a comprehensive guide that reveals +their inner workings and explains their utility in a clear and accessible way. +Our focus here is to develop a solid intuition for how low-rank approximation +and adaptation operate, and why they are so effective. We begin with basic +concepts and gradually build up to the mathematical underpinnings, ensuring +that readers of all backgrounds can gain a deeper understanding of low-rank +approximation and adaptation. We strive to strike a balance between informal +explanations and rigorous mathematics, ensuring that both newcomers and +experienced experts can benefit from this survey. Additionally, we introduce +new low-rank decomposition and adaptation algorithms that have not yet been +explored in the field, hoping that future researchers will investigate their +potential applicability. + +
+
+
+
+
+ + ☆ Learned Ranking Function: From Short-term Behavior Predictions to + Long-term User Satisfaction RecSys 24 + + +
+ We present the Learned Ranking Function (LRF), a system that takes short-term +user-item behavior predictions as input and outputs a slate of recommendations +that directly optimizes for long-term user satisfaction. Most previous work is +based on optimizing the hyperparameters of a heuristic function. We propose to +model the problem directly as a slate optimization problem with the objective +of maximizing long-term user satisfaction. We also develop a novel constraint +optimization algorithm that stabilizes objective trade-offs for multi-objective +optimization. We evaluate our approach with live experiments and describe its +deployment on YouTube. + +
+
+ comment: RecSys 24 +
+
+
+
+
+ + ☆ Investigating Characteristics of Media Recommendation Solicitation in + r/ifyoulikeblank + + +
+ Despite the existence of search-based recommender systems like Google, +Netflix, and Spotify, online users sometimes turn to crowdsourced +recommendations in places like the r/ifyoulikeblank subreddit. In this +exploratory study, we probe why users go to r/ifyoulikeblank, how they look for +recommendations, and how the subreddit users respond to recommendation requests. +To answer, we collected sample posts from r/ifyoulikeblank and analyzed them +using a qualitative approach. Our analysis reveals that users come to this +subreddit for various reasons, such as exhausting popular search systems, not +knowing what or how to search for an item, and thinking the crowd has better +knowledge than search systems. Examining users' queries and their descriptions, +we found novel kinds of information users provide when seeking recommendations +on r/ifyoulikeblank. For example, sometimes they ask for artifact recommendations +based on the tools used to create them. Likewise, indicating a recommendation +seeker's time constraints can help tailor recommendations to their needs. +Finally, recommendation responses and interactions revealed patterns of how +requesters and responders refine queries and recommendations. Our work informs +the design of future intelligent recommender systems. + +
+
+ comment: page 23 +
+
+
+
+
+ + ☆ Scalable recommender system based on factor analysis + + +
+ Recommender systems have become crucial in the modern digital landscape, +where personalized content, products, and services are essential for enhancing +user experience. This paper explores statistical models for recommender +systems, focusing on crossed random effects models and factor analysis. We +extend the crossed random effects model to include random slopes, enabling the +capture of varying covariate effects among users and items. Additionally, we +investigate the use of factor analysis in recommender systems, particularly for +settings with incomplete data. The paper also discusses scalable solutions +using the Expectation Maximization (EM) and variational EM algorithms for +parameter estimation, highlighting the application of these models to predict +user-item interactions effectively. + +
+
+
+
+
+ + ♻ ☆ Wiping out the limitations of Large Language Models -- A Taxonomy for + Retrieval Augmented Generation + + +
+ Current research on RAGs is distributed across various disciplines, and since +the technology is evolving very quickly, its unit of analysis is mostly on +technological innovations, rather than applications in business contexts. Thus, +in this research, we aim to create a taxonomy to conceptualize a comprehensive +overview of the constituting characteristics that define RAG applications, +facilitating the adoption of this technology in the IS community. To the best +of our knowledge, no RAG application taxonomies have been developed so far. We +describe our methodology for developing the taxonomy, which includes the +criteria for selecting papers, an explanation of our rationale for employing a +Large Language Model (LLM)-supported approach to extract and identify initial +characteristics, and a concise overview of our systematic process for +conceptualizing the taxonomy. Our systematic taxonomy development process +includes four iterative phases designed to refine and enhance our understanding +and presentation of RAG's core dimensions. We have developed a total of five +meta-dimensions and sixteen dimensions to comprehensively capture the concept +of Retrieval-Augmented Generation (RAG) applications. When discussing our +findings, we also detail the specific research areas and pose key research +questions to guide future information system researchers as they explore the +emerging topics of RAG systems. + +
+
+
+
+
+ + ♻ ☆ Protecting Copyrighted Material with Unique Identifiers in Large + Language Model Training + + +
+ A major public concern regarding the training of large language models (LLMs) +is whether they abuse copyrighted online text. Previous membership inference +methods may be misled by similar examples in vast amounts of training data. +Additionally, these methods are often too complex for general users to +understand and use, making them centralized and lacking in transparency and +trustworthiness. To address these issues, we propose an alternative +\textit{insert-and-detection} methodology, advocating that web users and +content platforms employ \textbf{\textit{unique identifiers}} for reliable and +independent membership inference. Users and platforms can create their own +identifiers, embed them in copyrighted text, and independently detect them in +future LLMs. As an initial demonstration, we introduce \textit{ghost +sentences}, a primitive form of unique identifiers, consisting primarily of +passphrases made up of random words. By embedding a ghost sentence in a few +copyrighted texts, users can detect its membership using a perplexity test and +a \textit{user-friendly} last-$k$ words test. The perplexity test is based on +the fact that LLMs trained on natural language should exhibit high perplexity +when encountering unnatural passphrases. As the number of repetitions increases, +users can leverage the verbatim memorization ability of LLMs to perform a +last-$k$ words test by chatting with LLMs without writing any code. Both tests +offer rigorous statistical guarantees for membership inference. For LLaMA-13B, +a perplexity test on 30 ghost sentences with an average of 7 repetitions in +148K examples yields a 0.891 ROC AUC. For the last-$k$ words test with +OpenLLaMA-3B, 11 out of 16 users, with an average of 24 examples each, +successfully identify their data from 1.8M examples. + +
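+ To make the perplexity side of the test concrete, a minimal sketch using
+ Hugging Face Transformers; "gpt2" is only a stand-in model, and the example
+ passphrase and the interpretation of "low" perplexity are illustrative
+ assumptions rather than the authors' exact protocol.
+ ```python
+ import math
+ import torch
+ from transformers import AutoModelForCausalLM, AutoTokenizer
+
+ tok = AutoTokenizer.from_pretrained("gpt2")
+ lm = AutoModelForCausalLM.from_pretrained("gpt2").eval()
+
+ def perplexity(text: str) -> float:
+     ids = tok(text, return_tensors="pt").input_ids
+     with torch.no_grad():
+         loss = lm(ids, labels=ids).loss      # mean token negative log-likelihood
+     return math.exp(loss.item())
+
+ ghost = "copper walrus meadow anvil twilight"   # random-word passphrase
+ print(f"perplexity = {perplexity(ghost):.1f}")
+ # A model that never saw the passphrase should assign it high perplexity;
+ # a value far below that of comparable random passphrases is evidence that
+ # the passphrase (and the text carrying it) was in the training data.
+ ```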
+
+ comment: Preprint, work in progress +
+
+
+
+
+ + ♻ ☆ Enhancing Node Representations for Real-World Complex Networks with + Topological Augmentation ECAI 2024 + + +
+ Graph augmentation methods play a crucial role in improving the performance +and enhancing generalisation capabilities in Graph Neural Networks (GNNs). +Existing graph augmentation methods mainly perturb the graph structures, and +are usually limited to pairwise node relations. These methods cannot fully +address the complexities of real-world large-scale networks, which often +involve higher-order node relations beyond only being pairwise. Meanwhile, +real-world graph datasets are predominantly modelled as simple graphs, due to +the scarcity of data that can be used to form higher-order edges. Therefore, +reconfiguring the higher-order edges as an integration into graph augmentation +strategies lights up a promising research path to address the aforementioned +issues. In this paper, we present Topological Augmentation (TopoAug), a novel +graph augmentation method that builds a combinatorial complex from the original +graph by constructing virtual hyperedges directly from the raw data. TopoAug +then produces auxiliary node features by extracting information from the +combinatorial complex, which are used for enhancing GNN performances on +downstream tasks. We design three diverse virtual hyperedge construction +strategies to accompany the construction of combinatorial complexes: (1) via +graph statistics, (2) from multiple data perspectives, and (3) utilising +multi-modality. Furthermore, to facilitate TopoAug evaluation, we provide 23 +novel real-world graph datasets across various domains including social media, +biology, and e-commerce. Our empirical study shows that TopoAug consistently +and significantly outperforms GNN baselines and other graph augmentation +methods, across a variety of application contexts, which clearly indicates that +it can effectively incorporate higher-order node relations into the graph +augmentation for real-world complex networks. + +
+
+ comment: In 27th European Conference on Artificial Intelligence (ECAI 2024). + 13 pages, 2 figures, 13 tables +
+
+
+
+
+
+
+
+ + Machine Learning 146 + +
+
+
+ + ☆ LOLgorithm: Integrating Semantic,Syntactic and Contextual Elements for + Humor Classification + + +
+ This paper explores humor detection through a linguistic lens, prioritizing +syntactic, semantic, and contextual features over computational methods in +Natural Language Processing. We categorize features into syntactic, semantic, +and contextual dimensions, including lexicons, structural statistics, Word2Vec, +WordNet, and phonetic style. Our proposed model, Colbert, utilizes BERT +embeddings and parallel hidden layers to capture sentence congruity. By +combining syntactic, semantic, and contextual features, we train Colbert for +humor detection. Feature engineering examines essential syntactic and semantic +features alongside BERT embeddings. SHAP interpretations and decision trees +identify influential features, revealing that a holistic approach improves +humor detection accuracy on unseen data. Integrating linguistic cues from +different dimensions enhances the model's ability to understand humor +complexity beyond traditional computational methods. + +
+
+
+
+
+ + ☆ Can We Rely on LLM Agents to Draft Long-Horizon Plans? Let's Take + TravelPlanner as an Example + + +
+ Large language models (LLMs) have brought autonomous agents closer to +artificial general intelligence (AGI) due to their promising generalization and +emergent capabilities. There is, however, a lack of studies on how LLM-based +agents behave, why they could potentially fail, and how to improve them, +particularly in demanding real-world planning tasks. In this paper, as an +effort to fill the gap, we present our study using a realistic benchmark, +TravelPlanner, where an agent must meet multiple constraints to generate +accurate plans. We leverage this benchmark to address four key research +questions: (1) are LLM agents robust enough to lengthy and noisy contexts when +it comes to reasoning and planning? (2) can few-shot prompting adversely impact +the performance of LLM agents in scenarios with long context? (3) can we rely +on refinement to improve plans, and (4) can fine-tuning LLMs with both positive +and negative feedback lead to further improvement? Our comprehensive +experiments indicate that, firstly, LLMs often fail to attend to crucial parts +of a long context, despite their ability to handle extensive reference +information and few-shot examples; secondly, they still struggle with analyzing +the long plans and cannot provide accurate feedback for refinement; thirdly, we +propose Feedback-Aware Fine-Tuning (FAFT), which leverages both positive and +negative feedback, resulting in substantial gains over Supervised Fine-Tuning +(SFT). Our findings offer in-depth insights to the community on various aspects +related to real-world planning applications. + +
+
+ comment: 13 pages, 2 figures, 4 tables +
+
+
+
+
+ + ☆ Body Transformer: Leveraging Robot Embodiment for Policy Learning + + +
+ In recent years, the transformer architecture has become the de facto +standard for machine learning algorithms applied to natural language processing +and computer vision. Despite notable evidence of successful deployment of this +architecture in the context of robot learning, we claim that vanilla +transformers do not fully exploit the structure of the robot learning problem. +Therefore, we propose Body Transformer (BoT), an architecture that leverages +the robot embodiment by providing an inductive bias that guides the learning +process. We represent the robot body as a graph of sensors and actuators, and +rely on masked attention to pool information throughout the architecture. The +resulting architecture outperforms the vanilla transformer, as well as the +classical multilayer perceptron, in terms of task completion, scaling +properties, and computational efficiency when representing either imitation or +reinforcement learning policies. Additional material including the open-source +code is available at https://sferrazza.cc/bot_site. + +
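+ A minimal sketch of the masking idea above: attention between body-part
+ tokens is restricted to graph neighbours (plus self). The toy chain graph,
+ dimensions, and single attention head are illustrative and not the BoT
+ architecture itself.
+ ```python
+ import torch
+ import torch.nn.functional as F
+
+ # toy body graph: 4 nodes in a chain (e.g., torso - hip - knee - ankle)
+ adj = torch.tensor([[1, 1, 0, 0],
+                     [1, 1, 1, 0],
+                     [0, 1, 1, 1],
+                     [0, 0, 1, 1]], dtype=torch.bool)
+
+ d = 16
+ x = torch.randn(4, d)                        # one token per sensor/actuator group
+ Wq, Wk, Wv = (torch.randn(d, d) for _ in range(3))
+
+ q, k, v = x @ Wq, x @ Wk, x @ Wv
+ scores = (q @ k.T) / d ** 0.5
+ scores = scores.masked_fill(~adj, float("-inf"))   # mask non-neighbour pairs
+ out = F.softmax(scores, dim=-1) @ v
+ print(out.shape)                             # torch.Size([4, 16])
+ ```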
+
+
+
+
+ + ☆ Finding Patterns in Ambiguity: Interpretable Stress Testing in the + Decision~Boundary CVPR + + +
+ The increasing use of deep learning across various domains highlights the +importance of understanding the decision-making processes of these black-box +models. Recent research focusing on the decision boundaries of deep +classifiers relies on synthetic instances generated in areas of low +confidence, uncovering samples that challenge both models and humans. We +propose a novel approach to enhance the interpretability of deep binary +classifiers by selecting representative samples from the decision boundary - +prototypes - and applying post-model explanation algorithms. We evaluate the +effectiveness of our approach through 2D visualizations and GradientSHAP +analysis. Our experiments demonstrate the potential of the proposed method, +revealing distinct and compact clusters and diverse prototypes that capture +essential features that lead to low-confidence decisions. By offering a more +aggregated view of deep classifiers' decision boundaries, our work contributes +to the responsible development and deployment of reliable machine learning +systems. + +
+
+ comment: To be published in the Responsible Generative AI workshop at CVPR +
+
+
+
+
+ + ☆ Inverse designing metamaterials with programmable nonlinear functional + responses in graph space + + +
+ Material responses to static and dynamic stimuli, represented as nonlinear +curves, are design targets for engineering functionalities like structural +support, impact protection, and acoustic and photonic bandgaps. +Three-dimensional metamaterials offer significant tunability due to their +internal structure, yet existing methods struggle to capture their complex +behavior-to-structure relationships. We present GraphMetaMat, a graph-based +framework capable of designing three-dimensional metamaterials with +programmable responses and arbitrary manufacturing constraints. Integrating +graph networks, physics biases, reinforcement learning, and tree search, +GraphMetaMat can target stress-strain curves spanning four orders of magnitude +and complex behaviors, as well as viscoelastic transmission responses with +varying attenuation gaps. GraphMetaMat can create cushioning materials for +protective equipment and vibration-damping panels for electric vehicles, +outperforming commercial materials, and enabling the automatic design of +materials with on-demand functionalities. + +
+
+ comment: 19 pages, 5 figures +
+
+
+
+
+ + ☆ LEARN: An Invex Loss for Outlier Oblivious Robust Online Optimization + + +
+ We study a robust online convex optimization framework, where an adversary +can introduce outliers by corrupting loss functions in an arbitrary number of +rounds k, unknown to the learner. Our focus is on a novel setting allowing +unbounded domains and large gradients for the losses without relying on a +Lipschitz assumption. We introduce the Log Exponential Adjusted Robust and +iNvex (LEARN) loss, a non-convex (invex) robust loss function to mitigate the +effects of outliers and develop a robust variant of the online gradient descent +algorithm by leveraging the LEARN loss. We establish tight regret guarantees +(up to constants), in a dynamic setting, with respect to the uncorrupted rounds +and conduct experiments to validate our theory. Furthermore, we present a +unified analysis framework for developing online optimization algorithms for +non-convex (invex) losses, utilizing it to provide regret bounds with respect +to the LEARN loss, which may be of independent interest. + +
+
+
+
+
+ + ☆ The AI Scientist: Towards Fully Automated Open-Ended Scientific + Discovery + + +
+ One of the grand challenges of artificial general intelligence is developing +agents capable of conducting scientific research and discovering new knowledge. +While frontier models have already been used as aids to human scientists, e.g. +for brainstorming ideas, writing code, or prediction tasks, they still conduct +only a small part of the scientific process. This paper presents the first +comprehensive framework for fully automatic scientific discovery, enabling +frontier large language models to perform research independently and +communicate their findings. We introduce The AI Scientist, which generates +novel research ideas, writes code, executes experiments, visualizes results, +describes its findings by writing a full scientific paper, and then runs a +simulated review process for evaluation. In principle, this process can be +repeated to iteratively develop ideas in an open-ended fashion, acting like the +human scientific community. We demonstrate its versatility by applying it to +three distinct subfields of machine learning: diffusion modeling, +transformer-based language modeling, and learning dynamics. Each idea is +implemented and developed into a full paper at a cost of less than $15 per +paper. To evaluate the generated papers, we design and validate an automated +reviewer, which we show achieves near-human performance in evaluating paper +scores. The AI Scientist can produce papers that exceed the acceptance +threshold at a top machine learning conference as judged by our automated +reviewer. This approach signifies the beginning of a new era in scientific +discovery in machine learning: bringing the transformative benefits of AI +agents to the entire research process of AI itself, and taking us closer to a +world where endless affordable creativity and innovation can be unleashed on +the world's most challenging problems. Our code is open-sourced at +https://github.com/SakanaAI/AI-Scientist + +
+
+
+
+
+ + ☆ Mambular: A Sequential Model for Tabular Deep Learning + + +
+ The analysis of tabular data has traditionally been dominated by +gradient-boosted decision trees (GBDTs), known for their proficiency with mixed +categorical and numerical features. However, recent deep learning innovations +are challenging this dominance. We introduce Mambular, an adaptation of the +Mamba architecture optimized for tabular data. We extensively benchmark +Mambular against state-of-the-art models, including neural networks and +tree-based methods, and demonstrate its competitive performance across diverse +datasets. Additionally, we explore various adaptations of Mambular to +understand its effectiveness for tabular data. We investigate different pooling +strategies, feature interaction mechanisms, and bi-directional processing. Our +analysis shows that interpreting features as a sequence and passing them +through Mamba layers results in surprisingly performant models. The results +highlight Mambular's potential as a versatile and powerful architecture for +tabular data analysis, expanding the scope of deep learning applications in +this domain. + The source code is available at https://github.com/basf/mamba-tabular. + +
+
+
+
+
+ + ☆ Synthetic Patient-Physician Dialogue Generation from Clinical Notes + Using LLM + + +
+ Medical dialogue systems (MDS) enhance patient-physician communication, +improve healthcare accessibility, and reduce costs. However, acquiring suitable +data to train these systems poses significant challenges. Privacy concerns +prevent the use of real conversations, necessitating synthetic alternatives. +Synthetic dialogue generation from publicly available clinical notes offers a +promising solution to this issue, providing realistic data while safeguarding +privacy. Our approach, SynDial, uses a single LLM iteratively with zero-shot +prompting and a feedback loop to generate and refine high-quality synthetic +dialogues. The feedback consists of weighted evaluation scores for similarity +and extractiveness. The iterative process ensures dialogues meet predefined +thresholds, achieving superior extractiveness as a result of the feedback loop. +Additionally, evaluation shows that the generated dialogues excel on the +factuality metric compared to the baselines and have diversity scores +comparable to GPT4. + +
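+ A skeleton of the generate-score-refine loop described above; the LLM call
+ and both scoring functions are stubs that only show the control flow, and
+ the weights, threshold, and prompt wording are illustrative assumptions.
+ ```python
+ def generate_dialogue(note: str, feedback: str = "") -> str:
+     raise NotImplementedError("prompt your LLM here (zero-shot + feedback)")
+
+ def similarity(dialogue: str, note: str) -> float:
+     return 0.0   # placeholder: plug in a similarity score
+
+ def extractiveness(dialogue: str, note: str) -> float:
+     return 0.0   # placeholder: plug in an extractiveness score
+
+ def generate_with_feedback(note, w_sim=0.5, w_ext=0.5, threshold=0.8, max_iters=5):
+     feedback, dialogue = "", ""
+     for _ in range(max_iters):
+         dialogue = generate_dialogue(note, feedback)
+         score = w_sim * similarity(dialogue, note) + w_ext * extractiveness(dialogue, note)
+         if score >= threshold:               # dialogue meets the predefined bar
+             break
+         feedback = f"The previous draft scored {score:.2f}; improve extractiveness."
+     return dialogue
+ ```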
+
+
+
+
+ + ☆ Multi-marginal Schrödinger Bridges with Iterative Reference + + +
+ Practitioners frequently aim to infer an unobserved population trajectory +using sample snapshots at multiple time points. For instance, in single-cell +sequencing, scientists would like to learn how gene expression evolves over +time. But sequencing any cell destroys that cell. So we cannot access any +cell's full trajectory, but we can access snapshot samples from many cells. +Stochastic differential equations are commonly used to analyze systems with +full individual-trajectory access; since here we have only sample snapshots, +these methods are inapplicable. The deep learning community has recently +explored using Schr\"odinger bridges (SBs) and their extensions to estimate +these dynamics. However, these methods either (1) interpolate between just two +time points or (2) require a single fixed reference dynamic within the SB, +which is often just set to be Brownian motion. But learning piecewise from +adjacent time points can fail to capture long-term dependencies. And +practitioners are typically able to specify a model class for the reference +dynamic but not the exact values of the parameters within it. So we propose a +new method that (1) learns the unobserved trajectories from sample snapshots +across multiple time points and (2) requires specification only of a class of +reference dynamics, not a single fixed one. In particular, we suggest an +iterative projection method inspired by Schr\"odinger bridges; we alternate +between learning a piecewise SB on the unobserved trajectories and using the +learned SB to refine our best guess for the dynamics within the reference +class. We demonstrate the advantages of our method via a well-known simulated +parametric model from ecology, simulated and real data from systems biology, +and real motion-capture data. + +
+
+
+
+
+ + ☆ Anchored Preference Optimization and Contrastive Revisions: Addressing + Underspecification in Alignment + + +
+ Large Language Models (LLMs) are often aligned using contrastive alignment +objectives and preference pair datasets. The interaction between model, paired +data, and objective makes alignment a complicated procedure, sometimes +producing subpar results. We study this and find that (i) preference data gives +a better learning signal when the underlying responses are contrastive, and +(ii) alignment objectives lead to better performance when they specify more +control over the model during training. Based on these insights, we introduce +Contrastive Learning from AI Revisions (CLAIR), a data-creation method which +leads to more contrastive preference pairs, and Anchored Preference +Optimization (APO), a controllable and more stable alignment objective. We +align Llama-3-8B-Instruct using various comparable datasets and alignment +objectives and measure MixEval-Hard scores, which correlate highly with human +judgments. The CLAIR preferences lead to the strongest performance out of all +datasets, and APO consistently outperforms less controllable objectives. Our +best model, trained on 32K CLAIR preferences with APO, improves +Llama-3-8B-Instruct by 7.65%, closing the gap with GPT4-turbo by 45%. Our code +is available at https://github.com/ContextualAI/CLAIR_and_APO. + +
+
+
+
+
+ + ☆ DUNE: A Machine Learning Deep UNet++ based Ensemble Approach to Monthly, + Seasonal and Annual Climate Forecasting + + +
+ Capitalizing on the recent availability of ERA5 monthly averaged long-term +data records of mean atmospheric and climate fields based on high-resolution +reanalysis, deep-learning architectures offer an alternative to physics-based +daily numerical weather predictions for subseasonal to seasonal (S2S) and +annual means. A novel Deep UNet++-based Ensemble (DUNE) neural architecture is +introduced, employing multi-encoder-decoder structures with residual blocks. +When initialized from a prior month or year, this architecture produced the +first AI-based global monthly, seasonal, or annual mean forecast of 2-meter +temperatures (T2m) and sea surface temperatures (SST). ERA5 monthly mean data +is used as input for T2m over land, SST over oceans, and solar radiation at the +top of the atmosphere for each month of 40 years to train the model. Validation +forecasts are performed for an additional two years, followed by five years of +forecast evaluations to account for natural annual variability. AI-trained +inference forecast weights generate forecasts in seconds, enabling ensemble +seasonal forecasts. Root Mean Squared Error (RMSE), Anomaly Correlation +Coefficient (ACC), and Heidke Skill Score (HSS) statistics are presented +globally and over specific regions. These forecasts outperform persistence, +climatology, and multiple linear regression for all domains. DUNE forecasts +demonstrate comparable statistical accuracy to NOAA's operational monthly and +seasonal probabilistic outlook forecasts over the US but at significantly +higher resolutions. RMSE and ACC error statistics for other recent AI-based +daily forecasts also show superior performance for DUNE-based forecasts. The +DUNE model's application to an ensemble data assimilation cycle shows +comparable forecast accuracy with a single high-resolution model, potentially +eliminating the need for retraining on extrapolated datasets. + +
+
+ comment: Excluding Appendix: 18 pages, 14 figures +
+
+
+
+
+ + ☆ Open-Source Molecular Processing Pipeline for Generating Molecules + + +
+ Generative models for molecules have shown considerable promise for use in +computational chemistry, but remain difficult to use for non-experts. For this +reason, we introduce open-source infrastructure for easily building generative +molecular models into the widely used DeepChem [Ramsundar et al., 2019] library +with the aim of creating a robust and reusable molecular generation pipeline. +In particular, we add high quality PyTorch [Paszke et al., 2019] +implementations of the Molecular Generative Adversarial Networks (MolGAN) [Cao +and Kipf, 2022] and Normalizing Flows [Papamakarios et al., 2021]. Our +implementations show strong performance comparable with past work [Kuznetsov +and Polykovskiy, 2021, Cao and Kipf, 2022]. + +
+
+ comment: Presented at the 2024 Molecular Machine Learning Conference (MoML + 2024) +
+
+
+
+
+ + ☆ Deep Learning System Boundary Testing through Latent Space Style Mixing + + +
+ Evaluating the behavioral frontier of deep learning (DL) systems is crucial +for understanding their generalizability and robustness. However, boundary +testing is challenging due to their high-dimensional input space. Generative +artificial intelligence offers a promising solution by modeling data +distribution within compact latent space representations, thereby facilitating +finer-grained explorations. In this work, we introduce MIMICRY, a novel +black-box system-agnostic test generator that leverages these latent +representations to generate frontier inputs for the DL systems under test. +Specifically, MIMICRY uses style-based generative adversarial networks trained +to learn the representation of inputs with disentangled features. This +representation enables embedding style-mixing operations between a source and a +target input, combining their features to explore the boundary between them. We +evaluated the effectiveness of different MIMICRY configurations in generating +boundary inputs for four popular DL image classification systems. Our results +show that manipulating the latent space allows for effective and efficient +exploration of behavioral frontiers. As opposed to a model-based baseline, +MIMICRY generates a higher quality frontier of behaviors which includes more +and closer inputs. Additionally, we assessed the validity of these inputs, +revealing a high validity rate according to human assessors. + +
+
+
+
+
+ + ☆ Reciprocal Learning + + +
+ We demonstrate that a wide array of machine learning algorithms are specific +instances of one single paradigm: reciprocal learning. These instances range +from active learning over multi-armed bandits to self-training. We show that +all these algorithms do not only learn parameters from data but also vice +versa: They iteratively alter training data in a way that depends on the +current model fit. We introduce reciprocal learning as a generalization of +these algorithms using the language of decision theory. This allows us to study +under what conditions they converge. The key is to guarantee that reciprocal +learning contracts such that the Banach fixed-point theorem applies. In this +way, we find that reciprocal learning algorithms converge at linear rates to an +approximately optimal model under relatively mild assumptions on the loss +function, if their predictions are probabilistic and the sample adaption is +both non-greedy and either randomized or regularized. We interpret these +findings and provide corollaries that relate them to specific active learning, +self-training, and bandit algorithms. + +
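+ A toy illustration of the "model and data update each other until a fixed
+ point" picture above, using 1D self-training with a threshold classifier;
+ this is a simplification for intuition only, not the paper's
+ decision-theoretic formulation or its convergence conditions.
+ ```python
+ import numpy as np
+
+ rng = np.random.default_rng(0)
+ pos = rng.normal(2.0, 1.0, 50)      # labelled positives
+ neg = rng.normal(-2.0, 1.0, 50)     # labelled negatives
+ unlab = rng.normal(0.0, 2.0, 200)   # unlabelled pool
+
+ theta = 0.5 * (pos.mean() + neg.mean())         # initial decision threshold
+ for it in range(50):
+     pseudo_pos = unlab[unlab > theta]           # the model relabels the data ...
+     pseudo_neg = unlab[unlab <= theta]
+     new_theta = 0.5 * (np.r_[pos, pseudo_pos].mean() +
+                        np.r_[neg, pseudo_neg].mean())  # ... and the data refit the model
+     converged = abs(new_theta - theta) < 1e-6   # (approximate) fixed point reached
+     theta = new_theta
+     if converged:
+         break
+ print(f"converged threshold {theta:.4f} after {it + 1} iterations")
+ ```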
+
+ comment: 41 pages, 3 figures +
+
+
+
+
+ + ☆ A Comprehensive Case Study on the Performance of Machine Learning + Methods on the Classification of Solar Panel Electroluminescence Images + + +
+ Photovoltaics (PV) are widely used to harvest solar energy, an important form +of renewable energy. Photovoltaic arrays consist of multiple solar panels +constructed from solar cells. Solar cells in the field are vulnerable to +various defects, and electroluminescence (EL) imaging provides effective and +non-destructive diagnostics to detect those defects. We use multiple +traditional machine learning and modern deep learning models to classify EL +solar cell images into different functional/defective categories. Because of +the asymmetry in the number of functional vs. defective cells, an imbalanced +label problem arises in the EL image data. The current literature lacks +insights on which methods and metrics to use for model training and prediction. +In this paper, we comprehensively compare different machine learning and deep +learning methods under different performance metrics on the classification of +solar cell EL images from monocrystalline and polycrystalline modules. We +provide a comprehensive discussion on different metrics. Our results provide +insights and guidelines for practitioners in selecting prediction methods and +performance metrics. + +
+
+ comment: 30 pages, 14 figures +
+
+
+
+
+ + ☆ A Large-Scale Study of Model Integration in ML-Enabled Software Systems + + +
+ The rise of machine learning (ML) and its embedding in systems has +drastically changed the engineering of software-intensive systems. +Traditionally, software engineering focuses on manually created artifacts such +as source code and the process of creating them, as well as best practices for +integrating them, i.e., software architectures. In contrast, the development of +ML artifacts, i.e. ML models, comes from data science and focuses on the ML +models and their training data. However, to deliver value to end users, these +ML models must be embedded in traditional software, often forming complex +topologies. In fact, ML-enabled software can easily incorporate many different +ML models. While the challenges and practices of building ML-enabled systems +have been studied to some extent, beyond isolated examples, little is known +about the characteristics of real-world ML-enabled systems. Properly embedding +ML models in systems so that they can be easily maintained or reused is far +from trivial. We need to improve our empirical understanding of such systems, +which we address by presenting the first large-scale study of real ML-enabled +software systems, covering over 2,928 open source systems on GitHub. We +classified and analyzed them to determine their characteristics, as well as +their practices for reusing ML models and related code, and the architecture of +these systems. Our findings provide practitioners and researchers with insight +into practices for embedding and integrating ML models, bringing data science +and software engineering closer together. + +
+
+
+
+
+ + ☆ A Digital Twin Framework Utilizing Machine Learning for Robust + Predictive Maintenance: Enhancing Tire Health Monitoring + + +
+ We introduce a novel digital twin framework for predictive maintenance of +long-term physical systems. Using tire health monitoring as an application, we +show how the digital twin framework can be used to enhance automotive safety +and efficiency, and how the technical challenges can be overcome using a +three-step approach. Firstly, for managing the data complexity over a long +operation span, we employ data reduction techniques to concisely represent +physical tires using historical performance and usage data. Relying on these +data, for fast real-time prediction, we train a transformer-based model offline +on our concise dataset to predict future tire health over time, represented as +Remaining Casing Potential (RCP). Based on our architecture, our model +quantifies both epistemic and aleatoric uncertainty, providing reliable +confidence intervals around predicted RCP. Secondly, to incorporate real-time +data, we update the predictive model in the digital twin framework, ensuring +its accuracy throughout its life span with the aid of hybrid modeling and the +use of a discrepancy function. Thirdly, to assist decision making in predictive +maintenance, we implement a Tire State Decision Algorithm, which strategically +determines the optimal timing for tire replacement based on RCP forecasted by +our transformer model. This approach ensures our digital twin accurately +predicts system health, continually refines its digital representation, and +supports predictive maintenance decisions. Our framework effectively embodies a +physical system, leveraging big data and machine learning for predictive +maintenance, model updates, and decision-making. + +
+
+ comment: Paper accepted at ASME IDETC 2024, and fast-tracked for ASME Journal + of Computing and Information Science in Engineering +
+
+
+
+
+ + ☆ Computability of Classification and Deep Learning: From Theoretical + Limits to Practical Feasibility through Quantization + + +
+ The unwavering success of deep learning in the past decade led to the +increasing prevalence of deep learning methods in various application fields. +However, the downsides of deep learning, most prominently its lack of +trustworthiness, may not be compatible with safety-critical or +high-responsibility applications requiring stricter performance guarantees. +Recently, several instances of deep learning applications have been shown to be +subject to theoretical limitations of computability, undermining the +feasibility of performance guarantees when employed on real-world computers. We +extend the findings by studying computability in the deep learning framework +from two perspectives: From an application viewpoint in the context of +classification problems and a general limitation viewpoint in the context of +training neural networks. In particular, we show restrictions on the +algorithmic solvability of classification problems that also render the +algorithmic detection of failure in computations in a general setting +infeasible. Subsequently, we prove algorithmic limitations in training deep +neural networks even in cases where the underlying problem is well-behaved. +Finally, we end with a positive observation, showing that in quantized versions +of classification and deep network training, computability restrictions do not +arise or can be overcome to a certain degree. + +
+
+
+
+
+ + ☆ Improving Structural Diversity of Blackbox LLMs via + Chain-of-Specification Prompting + + +
+ The capability to generate diverse text is a key challenge facing large +language models (LLMs). Thus far, diversity has been studied via metrics such +as $n$-gram diversity or diversity of BERT embeddings. However, for these kinds +of diversity, the user has little control over the dimensions along which +diversity is considered. For example, in the poetry domain, one might desire +diversity in terms of rhyme and meter, whereas in the code domain, one might +desire diversity in terms of the kinds of expressions used to solve a problem. +We propose a diversity metric called structural diversity, where the user +provides a mapping from generated text to features capturing the kinds of +diversity that they care about. In addition, we propose a novel strategy called +chain-of-specification (CoS) prompting for improving diversity by first having +the LLM generate a specification encoding one instance of structural features, +and then prompting the LLM to generate text that satisfies these features; +notably, our strategy works with blackbox LLMs. In our experiments, we show +that for structural diversity in the poetry and code domains, CoS significantly +improves diversity compared to several baselines. + +
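+ A minimal sketch of chain-of-specification prompting together with a
+ simplified structural-diversity count; call_llm is a placeholder for any
+ blackbox LLM client, the poetry features are one user-chosen example of a
+ structural mapping, and the diversity count is a stand-in rather than the
+ paper's exact metric.
+ ```python
+ from collections import Counter
+
+ def call_llm(prompt: str) -> str:
+     raise NotImplementedError("plug in your blackbox LLM client here")
+
+ def chain_of_specification(task: str, feature_names):
+     spec_prompt = (f"Propose one concrete specification for: {task}. "
+                    f"Give values for: {', '.join(feature_names)}.")
+     spec = call_llm(spec_prompt)                    # stage 1: sample a specification
+     gen_prompt = f"{task}\nFollow this specification exactly:\n{spec}"
+     return spec, call_llm(gen_prompt)               # stage 2: generate text for it
+
+ def structural_diversity(feature_tuples):
+     # fraction of distinct structural feature combinations among the samples
+     return len(Counter(feature_tuples)) / max(len(feature_tuples), 1)
+
+ # e.g., features a user extracted from four generated poems
+ print(structural_diversity([("ABAB", "iambic"), ("ABAB", "iambic"),
+                             ("AABB", "free"), ("ABBA", "iambic")]))  # 0.75
+ ```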
+
+
+
+
+ + ☆ Centralized and Federated Heart Disease Classification Models Using UCI + Dataset and their Shapley-value Based Interpretability + + +
+ Cardiovascular diseases are a leading cause of mortality worldwide, +highlighting the need for accurate diagnostic methods. This study benchmarks +centralized and federated machine learning algorithms for heart disease +classification using the UCI dataset which includes 920 patient records from +four hospitals in the USA, Hungary and Switzerland. Our benchmark is supported +by Shapley-value interpretability analysis to quantify features' importance for +classification. In the centralized setup, various binary classification +algorithms are trained on pooled data, with a support vector machine (SVM) +achieving the highest testing accuracy of 83.3\%, surpassing the established +benchmark of 78.7\% with logistic regression. Additionally, federated learning +algorithms with four clients (hospitals) are explored, leveraging the dataset's +natural partition to enhance privacy without sacrificing accuracy. Federated +SVM, an uncommon approach in the literature, achieves a top testing accuracy of +73.8\%. Our interpretability analysis aligns with existing medical knowledge of +heart disease indicators. Overall, this study establishes a benchmark for +efficient and interpretable pre-screening tools for heart disease while +maintaining patients' privacy. + +
+
+
+
+
+ + ☆ A Methodological Report on Anomaly Detection on Dynamic Knowledge Graphs + + +
+ In this paper, we explore different approaches to anomaly detection on +dynamic knowledge graphs, specifically in a microservices environment for +Kubernetes applications. Our approach explores three dynamic knowledge graph +representations: sequential data, one-hop graph structure, and two-hop graph +structure, with each representation incorporating increasingly complex +structural information. Each phase includes different machine learning and deep +learning models. We empirically analyse their performance and propose an +approach based on ensemble learning of these models. Our approach significantly +outperforms the baseline on the ISWC 2024 Dynamic Knowledge Graph Anomaly +Detection dataset, providing a robust solution for anomaly detection in dynamic +complex data. + +
+
+
+
+
+ + ☆ Contexts Matter: An Empirical Study on Contextual Influence in Fairness + Testing for Deep Learning Systems + + +
+ Background: Fairness testing for deep learning systems has become +increasingly important. However, much work assumes perfect context and +conditions from the other parts: well-tuned hyperparameters for accuracy; +rectified bias in data; and mitigated bias in the labeling. Yet, these are +often difficult to achieve in practice due to their resource-/labour-intensive +nature. Aims: In this paper, we aim to understand how varying contexts affect +fairness testing outcomes. Method: We conduct an extensive empirical study, +which covers $10,800$ cases, to investigate how contexts can change the +fairness testing result at the model level against the existing assumptions. We +also study why the outcomes were observed through the lens of correlation/fitness +landscape analysis. Results: Our results show that different context types and +settings generally lead to a significant impact on the testing, which is mainly +caused by the shifts of the fitness landscape under varying contexts. +Conclusions: Our findings provide key insights for practitioners to evaluate +the test generators and hint at future research directions. + +
+
+ comment: Received by ESEM 24 +
+
+
+
+
+ + ☆ Generalization capabilities of MeshGraphNets to unseen geometries for + fluid dynamics + + +
+ This work investigates the generalization capabilities of MeshGraphNets +(MGN) [Pfaff et al. Learning Mesh-Based Simulation with Graph Networks. ICML +2021] to unseen geometries for fluid dynamics, e.g. predicting the flow around +a new obstacle that was not part of the training data. For this purpose, we +create a new benchmark dataset for data-driven computational fluid dynamics +(CFD) which extends DeepMind's flow around a cylinder dataset by including +different shapes and multiple objects. We then use this new dataset to extend +the generalization experiments conducted by DeepMind on MGNs by testing how +well an MGN can generalize to different shapes. In our numerical tests, we show +that MGNs can sometimes generalize well to various shapes by training on a +dataset of one obstacle shape and testing on a dataset of another obstacle +shape. + +
+
+
+
+
+ + ☆ Approximating Discrimination Within Models When Faced With Several + Non-Binary Sensitive Attributes + + +
+ Discrimination mitigation with machine learning (ML) models could be +complicated because multiple factors may interweave with each other including +hierarchically and historically. Yet few existing fairness measures are able to +capture the discrimination level within ML models in the face of multiple +sensitive attributes. To bridge this gap, we propose a fairness measure based +on distances between sets from a manifold perspective, named as 'harmonic +fairness measure via manifolds (HFM)' with two optional versions, which can +deal with a fine-grained discrimination evaluation for several sensitive +attributes of multiple values. To accelerate the computation of distances of +sets, we further propose two approximation algorithms named 'Approximation of +distance between sets for one sensitive attribute with multiple values +(ApproxDist)' and 'Approximation of extended distance between sets for several +sensitive attributes with multiple values (ExtendDist)' to respectively resolve +bias evaluation of one single sensitive attribute with multiple values and that +of several sensitive attributes with multiple values. Moreover, we provide an +algorithmic effectiveness analysis for ApproxDist under certain assumptions to +explain how well it could work. The empirical results demonstrate that our +proposed fairness measure HFM is valid and approximation algorithms (i.e., +ApproxDist and ExtendDist) are effective and efficient. + +
+
+ comment: The first two authors contributed equally, listed in alphabetical + order. arXiv admin note: substantial text overlap with arXiv:2405.09251 +
+
+
+
+
+ + ☆ Building Decision Making Models Through Language Model Regime + + +
+ We propose a novel approach for decision making problems leveraging the +generalization capabilities of large language models (LLMs). Traditional +methods such as expert systems, planning algorithms, and reinforcement learning +often exhibit limited generalization, typically requiring the training of new +models for each unique task. In contrast, LLMs demonstrate remarkable success +in generalizing across varied language tasks, inspiring a new strategy for +training decision making models. Our approach, referred to as "Learning then +Using" (LTU), entails a two-stage process. Initially, the \textit{learning} +phase develops a robust foundational decision making model by integrating +diverse knowledge from various domains and decision making contexts. The +subsequent \textit{using} phase refines this foundation model for specific +decision making scenarios. Distinct from other studies that employ LLMs for +decision making through supervised learning, our LTU method embraces a +versatile training methodology that combines broad pre-training with targeted +fine-tuning. Experiments in e-commerce domains such as advertising and search +optimization have shown that LTU approach outperforms traditional supervised +learning regimes in decision making capabilities and generalization. The LTU +approach is the first practical training architecture for both single-step and +multi-step decision making tasks combined with LLMs, which can be applied +beyond game and robot domains. It provides a robust and adaptable framework for +decision making, enhances the effectiveness and flexibility of various systems +in tackling various challenges. + +
+
+
+
+
+ + ☆ A-BDD: Leveraging Data Augmentations for Safe Autonomous Driving in + Adverse Weather and Lighting + + +
+ High-autonomy vehicle functions rely on machine learning (ML) algorithms to +understand the environment. Despite displaying remarkable performance in fair +weather scenarios, perception algorithms are heavily affected by adverse +weather and lighting conditions. To overcome these difficulties, ML engineers +mainly rely on comprehensive real-world datasets. However, the difficulties in +real-world data collection for critical areas of the operational design domain +(ODD) often means synthetic data is required for perception training and safety +validation. Thus, we present A-BDD, a large set of over 60,000 synthetically +augmented images based on BDD100K that are equipped with semantic segmentation +and bounding box annotations (inherited from the BDD100K dataset). The dataset +contains augmented data for rain, fog, overcast and sunglare/shadow with +varying intensity levels. We further introduce novel strategies utilizing +feature-based image quality metrics like FID and CMMD, which help identify +useful augmented and real-world data for ML training and testing. By conducting +experiments on A-BDD, we provide evidence that data augmentations can play a +pivotal role in closing performance gaps in adverse weather and lighting +conditions. + +
+
+
+
+
+ + ☆ Fully Bayesian Differential Gaussian Processes through Stochastic + Differential Equations + + +
+ Traditional deep Gaussian processes model the data evolution using a discrete +hierarchy, whereas differential Gaussian processes (DIFFGPs) represent the +evolution as an infinitely deep Gaussian process. However, prior DIFFGP methods +often overlook the uncertainty of kernel hyperparameters and assume them to be +fixed and time-invariant, failing to leverage the unique synergy between +continuous-time models and approximate inference. In this work, we propose a +fully Bayesian approach that treats the kernel hyperparameters as random +variables and constructs coupled stochastic differential equations (SDEs) to +learn their posterior distribution and that of inducing points. By +incorporating estimation uncertainty on hyperparameters, our method enhances +the model's flexibility and adaptability to complex dynamics. Additionally, our +approach provides a time-varying, comprehensive, and realistic posterior +approximation through coupling variables using SDE methods. Experimental +results demonstrate the advantages of our method over traditional approaches, +showcasing its superior performance in terms of flexibility, accuracy, and +other metrics. Our work opens up exciting research avenues for advancing +Bayesian inference and offers a powerful modeling tool for continuous-time +Gaussian processes. + +
+
+
+
+
+ + ☆ Don't You (Project Around Discs)? Neural Network Surrogate and Projected + Gradient Descent for Calibrating an Intervertebral Disc Finite Element Model + + +
+ Accurate calibration of finite element (FE) models of human intervertebral +discs (IVDs) is essential for their reliability and application in diagnosing +and planning treatments for spinal conditions. Traditional calibration methods +are computationally intensive, requiring iterative, derivative-free +optimization algorithms that often take hours or days to converge. + This study addresses these challenges by introducing a novel, efficient, and +effective calibration method for an L4-L5 IVD FE model using a neural network +(NN) surrogate. The NN surrogate predicts simulation outcomes with high +accuracy, outperforming other machine learning models, and significantly +reduces the computational cost associated with traditional FE simulations. +Next, a Projected Gradient Descent (PGD) approach guided by gradients of the NN +surrogate is proposed to efficiently calibrate FE models. Our method explicitly +enforces feasibility with a projection step, thus maintaining material bounds +throughout the optimization process. + The proposed method is evaluated against state-of-the-art Genetic Algorithm +(GA) and inverse model baselines on synthetic and in vitro experimental +datasets. Our approach demonstrates superior performance on synthetic data, +achieving a Mean Absolute Error (MAE) of 0.06 compared to the baselines' MAE of +0.18 and 0.54, respectively. On experimental specimens, our method outperforms +the baseline in 5 out of 6 cases. Most importantly, our approach reduces +calibration time to under three seconds, compared to up to 8 days per sample +required by traditional calibration. Such efficiency paves the way for applying +more complex FE models, enabling accurate patient-specific simulations and +advancing spinal treatment planning. + +
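+ A minimal sketch of surrogate-guided projected gradient descent as outlined
+ above: an untrained MLP stands in for the trained FE surrogate, a unit box
+ stands in for the material bounds, and the measured response is random
+ placeholder data.
+ ```python
+ import torch
+
+ surrogate = torch.nn.Sequential(              # placeholder for the trained surrogate
+     torch.nn.Linear(6, 32), torch.nn.ReLU(), torch.nn.Linear(32, 4))
+ for p in surrogate.parameters():
+     p.requires_grad_(False)                   # the surrogate stays fixed
+
+ target = torch.randn(4)                       # measured specimen response
+ params = torch.rand(6, requires_grad=True)    # material parameters to calibrate
+ opt = torch.optim.SGD([params], lr=0.05)
+
+ for _ in range(500):
+     opt.zero_grad()
+     loss = torch.mean(torch.abs(surrogate(params) - target))   # MAE objective
+     loss.backward()
+     opt.step()
+     with torch.no_grad():
+         params.clamp_(0.0, 1.0)               # projection onto the feasible box
+ print("calibrated parameters:", params.detach())
+ ```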
+
+ comment: Under submission. Project code: + https://github.com/matanat/IVD-CalibNN/ +
+
+
+
+
+ + ☆ TruVRF: Towards Triple-Granularity Verification on Machine Unlearning + + +
+ The concept of the right to be forgotten has led to growing interest in +machine unlearning, but reliable validation methods are lacking, creating +opportunities for dishonest model providers to mislead data contributors. +Traditional invasive methods like backdoor injection are not feasible for +legacy data. To address this, we introduce TruVRF, a non-invasive unlearning +verification framework operating at class-, volume-, and sample-level +granularities. TruVRF includes three Unlearning-Metrics designed to detect +different types of dishonest servers: Neglecting, Lazy, and Deceiving. +Unlearning-Metric-I checks class alignment, Unlearning-Metric-II verifies +sample count, and Unlearning-Metric-III confirms specific sample deletion. +Evaluations on three datasets show TruVRF's robust performance, with over 90% +accuracy for Metrics I and III, and a 4.8% to 8.2% inference deviation for +Metric II. TruVRF also demonstrates generalizability and practicality across +various conditions and with state-of-the-art unlearning frameworks like SISA +and Amnesiac Unlearning. + +
+
+
+
+
+ + ☆ Perceptual Similarity for Measuring Decision-Making Style and Policy + Diversity in Games + + +
+ Defining and measuring decision-making styles, also known as playstyles, is +crucial in gaming, where these styles reflect a broad spectrum of individuality +and diversity. However, finding a universally applicable measure for these +styles poses a challenge. Building on Playstyle Distance, the first +unsupervised metric to measure playstyle similarity based on game screens and +raw actions, we introduce three enhancements to increase accuracy: multiscale +analysis with varied state granularity, a perceptual kernel rooted in +psychology, and the utilization of the intersection-over-union method for +efficient evaluation. These innovations not only advance measurement precision +but also offer insights into human cognition of similarity. Across two racing +games and seven Atari games, our techniques significantly improve the precision +of zero-shot playstyle classification, achieving an accuracy exceeding 90 +percent with fewer than 512 observation-action pairs, which is less than half +an episode of these games. Furthermore, our experiments with 2048 and Go +demonstrate the potential of discrete playstyle measures in puzzle and board +games. We also develop an algorithm for assessing decision-making diversity +using these measures. Our findings improve the measurement of end-to-end game +analysis and the evolution of artificial intelligence for diverse playstyles. + +
+
+ comment: TMLR 08/2024 https://openreview.net/forum?id=30C9AWBW49 +
+
+
+
+
+ + ☆ What Ails Generative Structure-based Drug Design: Too Little or Too Much + Expressivity? + + +
+ Several generative models with elaborate training and sampling procedures +have been proposed recently to accelerate structure-based drug design (SBDD); +however, perplexingly, their empirical performance turns out to be suboptimal. +We seek to better understand this phenomenon from both theoretical and +empirical perspectives. Since most of these models apply graph neural networks +(GNNs), one may suspect that they inherit the representational limitations of +GNNs. We analyze this aspect, establishing the first such results for +protein-ligand complexes. A plausible counterview may attribute the +underperformance of these models to their excessive parameterizations, inducing +expressivity at the expense of generalization. We also investigate this +possibility with a simple metric-aware approach that learns an economical +surrogate for affinity to infer an unlabelled molecular graph and optimizes for +labels conditioned on this graph and molecular properties. The resulting model +achieves state-of-the-art results using 100x fewer trainable parameters and +affords up to 1000x speedup. Collectively, our findings underscore the need to +reassess and redirect the existing paradigm and efforts for SBDD. + +
+
+ comment: 25 pages, 11 figures +
+
+
+
+
+ + ☆ Spacetime $E(n)$-Transformer: Equivariant Attention for Spatio-temporal + Graphs + + +
+ We introduce an $E(n)$-equivariant Transformer architecture for +spatio-temporal graph data. By imposing rotation, translation, and permutation +equivariance inductive biases in both space and time, we show that the +Spacetime $E(n)$-Transformer (SET) outperforms purely spatial and temporal +models without symmetry-preserving properties. We benchmark SET against said +models on the charged $N$-body problem, a simple physical system with complex +dynamics. While existing spatio-temporal graph neural networks focus on +sequential modeling, we empirically demonstrate that leveraging underlying +domain symmetries yields considerable improvements for modeling dynamical +systems on graphs. + +
+
+
+
+
+ + ☆ Graph Clustering with Cross-View Feature Propagation + + +
+ Graph clustering is a fundamental and challenging learning task, which is +conventionally approached by grouping similar vertices based on edge structure +and feature similarity. In contrast to previous methods, in this paper, we +investigate how multi-view feature propagation can influence cluster discovery +in graph data. To this end, we present Graph Clustering With Cross-View Feature +Propagation (GCCFP), a novel method that leverages multi-view feature +propagation to enhance cluster identification in graph data. GCCFP employs a +unified objective function that utilizes graph topology and multi-view vertex +features to determine vertex cluster membership, regularized by a module that +supports key latent feature propagation. We derive an iterative algorithm to +optimize this function, prove model convergence within a finite number of +iterations, and analyze its computational complexity. Our experiments on +various real-world graphs demonstrate the superior clustering performance of +GCCFP compared to well-established methods, manifesting its effectiveness +across different scenarios. + +
+
+
+
+
+ + ☆ A Comprehensive Survey on EEG-Based Emotion Recognition: A Graph-Based + Perspective + + +
+ Compared to other modalities, electroencephalogram (EEG) based emotion +recognition can intuitively respond to emotional patterns in the human brain +and, therefore, has become one of the most focused tasks in affective +computing. The nature of emotions is a physiological and psychological state +change in response to brain region connectivity, making emotion recognition +focus more on the dependency between brain regions instead of specific brain +regions. A significant trend is the application of graphs to encapsulate such +dependency as dynamic functional connections between nodes across temporal and +spatial dimensions. Concurrently, the neuroscientific underpinnings behind this +dependency endow the application of graphs in this field with a distinctive +significance. However, there is neither a comprehensive review nor a tutorial +for constructing emotion-relevant graphs in EEG-based emotion recognition. In +this paper, we present a comprehensive survey of these studies, delivering a +systematic review of graph-related methods in this field from a methodological +perspective. We propose a unified framework for graph applications in this +field and categorize these methods on this basis. Finally, based on previous +studies, we also present several open challenges and future directions in this +field. + +
+
+
+
+
+ + ☆ Layer-Specific Optimization: Sensitivity Based Convolution Layers Basis + Search + + +
+ Deep neural network models have a complex architecture and are +overparameterized. The number of parameters can exceed the size of the whole +dataset, which is highly resource-consuming. This complicates their application +and limits their usage on different devices. Reducing the number of network +parameters helps to reduce the size of the model but, applied thoughtlessly, +can lead to a deterioration in the quality of the network. One way to reduce +the number of model parameters is matrix decomposition, where a matrix is +represented as a product of smaller matrices. In this paper, we propose a new +way of applying matrix decomposition to the weights of convolutional layers. +The essence of the method is to train not all convolutions, but only a subset +of convolutions (basis convolutions), and represent the rest as linear +combinations of the basis ones. Experiments on models from the ResNet family +and the CIFAR-10 dataset demonstrate that basis convolutions can not only +reduce the size of the model but also accelerate the forward and backward +passes of the network. Another contribution of this work is a fast method for +selecting a subset of network layers in which the use of matrix decomposition +does not degrade the quality of the final model. + +
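A hedged sketch of the basis-convolution idea: only a small set of basis kernels is trainable, and every output filter of the layer is a learned linear combination of those kernels. The module name, sizes, and initialization below are illustrative, not the authors' code.

```python
import torch
import torch.nn as nn

class BasisConv2d(nn.Module):
    def __init__(self, in_ch, out_ch, n_basis=8, k=3):
        super().__init__()
        # trainable basis kernels: (n_basis, in_ch, k, k)
        self.basis = nn.Parameter(torch.randn(n_basis, in_ch, k, k) * 0.05)
        # mixing coefficients: each output filter is a combination of the basis
        self.coeff = nn.Parameter(torch.randn(out_ch, n_basis) * 0.1)
        self.k = k

    def forward(self, x):
        # build the full weight tensor on the fly from the basis
        weight = torch.einsum("ob,bikl->oikl", self.coeff, self.basis)
        return nn.functional.conv2d(x, weight, padding=self.k // 2)

layer = BasisConv2d(16, 64, n_basis=8)
y = layer(torch.randn(2, 16, 32, 32))   # -> (2, 64, 32, 32)
```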
+
+ comment: A revived draft of an unpublished (and never-to-be-published) + article. For the sake of history, memory, and old times +
+
+
+
+
+ + ☆ Uncertainty-Informed Volume Visualization using Implicit Neural + Representation IEEE VIS 2024 + + +
+ The increasing adoption of Deep Neural Networks (DNNs) has led to their +application in many challenging scientific visualization tasks. While advanced +DNNs offer impressive generalization capabilities, understanding factors such +as model prediction quality, robustness, and uncertainty is crucial. These +insights can enable domain scientists to make informed decisions about their +data. However, DNNs inherently lack ability to estimate prediction uncertainty, +necessitating new research to construct robust uncertainty-aware visualization +techniques tailored for various visualization tasks. In this work, we propose +uncertainty-aware implicit neural representations to model scalar field data +sets effectively and comprehensively study the efficacy and benefits of +estimated uncertainty information for volume visualization tasks. We evaluate +the effectiveness of two principled deep uncertainty estimation techniques: (1) +Deep Ensemble and (2) Monte Carlo Dropout (MCDropout). These techniques enable +uncertainty-informed volume visualization in scalar field data sets. Our +extensive exploration across multiple data sets demonstrates that +uncertainty-aware models produce informative volume visualization results. +Moreover, integrating prediction uncertainty enhances the trustworthiness of +our DNN model, making it suitable for robustly analyzing and visualizing +real-world scientific volumetric data sets. + +
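A minimal sketch of one of the two techniques evaluated above, Monte Carlo Dropout, applied to a coordinate-based scalar-field network; the architecture and sample counts are illustrative assumptions, not the authors' implementation.

```python
import torch
import torch.nn as nn

class DropoutINR(nn.Module):
    def __init__(self, hidden=128, p=0.1):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(3, hidden), nn.ReLU(), nn.Dropout(p),
            nn.Linear(hidden, hidden), nn.ReLU(), nn.Dropout(p),
            nn.Linear(hidden, 1))

    def forward(self, xyz):
        return self.net(xyz)

def mc_dropout_predict(model, coords, n_samples=32):
    model.train()                       # keep dropout active at inference time
    with torch.no_grad():
        preds = torch.stack([model(coords) for _ in range(n_samples)])
    return preds.mean(0), preds.std(0)  # per-point prediction and uncertainty

model = DropoutINR()
coords = torch.rand(1024, 3)            # query positions in the volume
mean, std = mc_dropout_predict(model, coords)
```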
+
+ comment: To appear in IEEE Workshop on Uncertainty Visualization in + conjunction with IEEE VIS 2024, Florida, USA +
+
+
+
+
+ + ☆ LUT Tensor Core: Lookup Table Enables Efficient Low-Bit LLM Inference + Acceleration + + +
+ As large language model (LLM) inference demands ever-greater resources, there +is a rapidly growing trend of using low-bit weights to shrink memory usage and +boost inference efficiency. However, these low-bit LLMs introduce the need for +mixed-precision matrix multiplication (mpGEMM), which is a crucial yet +under-explored operation that involves multiplying lower-precision weights with +higher-precision activations. Unfortunately, current hardware does not natively +support mpGEMM, resulting in indirect and inefficient dequantization-based +implementations. + To address the mpGEMM requirements in low-bit LLMs, we explored the lookup +table (LUT)-based approach for mpGEMM. However, a conventional LUT +implementation falls short of its potential. To fully harness the power of +LUT-based mpGEMM, we introduce LUT Tensor Core, a software-hardware co-design +optimized for low-bit LLM inference. Specifically, we introduce software-based +operator fusion and table symmetrization techniques to optimize table +precompute and table storage, respectively. Then, LUT Tensor Core proposes a +hardware design featuring an elongated tiling shape to enhance table +reuse and a bit-serial design to support various precision combinations in +mpGEMM. Moreover, we design an end-to-end compilation stack with new +instructions for LUT-based mpGEMM, enabling efficient LLM compilation and +optimizations. The evaluation on low-bit LLMs (e.g., BitNet, LLAMA) shows that +LUT Tensor Core achieves more than an order of magnitude improvement in both +compute density and energy efficiency. + +
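A toy illustration of the general LUT idea behind mpGEMM, under the assumption of 1-bit weights and groups of four activations: the partial dot products for all possible weight patterns in a group are computed once and then reused by table lookup instead of dequantize-and-multiply. This is a software sketch of the concept, not the LUT Tensor Core hardware design.

```python
import numpy as np

g = 4                                             # weights handled per lookup
codebook = np.array([-1.0, 1.0])                  # illustrative 1-bit weight values
# all 2**g weight patterns of length g
patterns = np.array(np.meshgrid(*([codebook] * g), indexing="ij")).reshape(g, -1).T

def lut_matvec(W_idx, x):
    """W_idx: (out, in//g) integers indexing the 16 patterns; x: (in,) activations."""
    out = np.zeros(W_idx.shape[0])
    for j in range(W_idx.shape[1]):
        table = patterns @ x[j * g:(j + 1) * g]   # 16 partial sums, computed once
        out += table[W_idx[:, j]]                 # every output row reuses the table
    return out

x = np.random.randn(16).astype(np.float32)
W_idx = np.random.randint(0, 16, size=(8, 4))     # 8x16 binary weight matrix, pattern-coded
y = lut_matvec(W_idx, x)
```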
+
+
+
+
+ + ☆ Transfer learning of state-based potential games for process + optimization in decentralized manufacturing systems + + +
+ This paper presents a novel transfer learning approach in state-based +potential games (TL-SbPGs) for enhancing distributed self-optimization in +manufacturing systems. The approach focuses on the practically relevant +industrial setting where sharing and transferring gained knowledge among +similarly behaved players improves the self-learning mechanism in large-scale +systems. With TL-SbPGs, the gained knowledge can be reused by other players to +optimize their policies, thereby improving the learning outcomes of the players +and accelerating the learning process. To accomplish this goal, we develop +transfer learning concepts and similarity criteria for players, which offer two +distinct settings: (a) predefined similarities between players and (b) +dynamically inferred similarities between players during training. We formally +prove the applicability of the SbPG framework in transfer learning. +Additionally, we introduce an efficient method to determine the optimal timing +and weighting of the transfer learning procedure during the training phase. +Through experiments on a laboratory-scale testbed, we demonstrate that TL-SbPGs +significantly boost production efficiency and reduce the power consumption of +the production schedules, while also outperforming native SbPGs. + +
+
+ comment: This pre-print was submitted to Computers in Industry on May 02, 2024 +
+
+
+
+
+ + ☆ Parameters Inference for Nonlinear Wave Equations with Markovian + Switching + + +
+ Traditional partial differential equations with constant coefficients often +struggle to capture abrupt changes in real-world phenomena, leading to the +development of variable coefficient PDEs and Markovian switching models. +Recently, research has introduced the concept of PDEs with Markov switching +models, established their well-posedness and presented numerical methods. +However, there has been limited discussion on parameter estimation for the jump +coefficients in these models. This paper addresses this gap by focusing on +parameter inference for the wave equation with Markovian switching. We propose +a Bayesian statistical framework using discrete sparse Bayesian learning to +establish its convergence and a uniform error bound. Our method requires fewer +assumptions and enables independent parameter inference for each segment by +allowing different underlying structures for the parameter estimation problem +within each segmented time interval. The effectiveness of our approach is +demonstrated through three numerical cases, which involve noisy spatiotemporal +data from different wave equations with Markovian switching. The results show +strong performance in parameter estimation for variable coefficient PDEs. + +
+
+
+
+
+ + ☆ Global-to-Local Support Spectrums for Language Model Explainability + + +
+ Existing sample-based methods, like influence functions and representer +points, measure the importance of a training point by approximating the effect +of its removal from training. As such, they are skewed towards outliers and +points that are very close to the decision boundaries. The explanations +provided by these methods are often static and not specific enough for +different test points. In this paper, we propose a method to generate an +explanation in the form of support spectrums which are based on two main ideas: +the support sets and a global-to-local importance measure. The support set is +the set of training points, in the predicted class, that ``lie in between'' the +test point and training points in the other classes. They indicate how well the +test point can be distinguished from the points not in the predicted class. The +global-to-local importance measure is obtained by decoupling existing methods +into the global and local components which are then used to select the points +in the support set. Using this method, we are able to generate explanations +that are tailored to specific test points. In the experiments, we show the +effectiveness of the method in image classification and text generation tasks. + +
+
+
+
+
+ + ☆ Target Detection of Safety Protective Gear Using the Improved YOLOv5 + + +
+ In high-risk railway construction, personal protective equipment monitoring +is critical but challenging due to small and frequently obstructed targets. We +propose YOLO-EA, an innovative model that enhances safety measure detection by +integrating the Efficient Channel Attention (ECA) module into its backbone's +convolutional layers, improving discernment of minuscule objects like hardhats. +YOLO-EA further refines target recognition under occlusion by replacing the +Generalized IoU (GIoU) loss with the Efficient IoU (EIoU) loss. YOLO-EA's +effectiveness was empirically substantiated using a dataset derived from +real-world railway construction site surveillance footage. It outperforms +YOLOv5, achieving 98.9% precision and 94.7% recall, up 2.5% and 0.5% +respectively, while maintaining real-time performance at 70.774 fps. This +highly efficient and precise YOLO-EA holds great promise for practical +application in intricate construction scenarios, enforcing stringent safety +compliance during complex railway construction projects. + +
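A hedged sketch of an EIoU-style bounding-box loss of the kind substituted for GIoU above: on top of the IoU term it penalizes the center-point distance and the width/height differences, each normalized by the smallest enclosing box. The exact formulation used in YOLO-EA may differ; boxes here are (x1, y1, x2, y2) tensors.

```python
import torch

def eiou_loss(pred, target, eps=1e-7):
    # intersection-over-union
    ix1, iy1 = torch.max(pred[:, 0], target[:, 0]), torch.max(pred[:, 1], target[:, 1])
    ix2, iy2 = torch.min(pred[:, 2], target[:, 2]), torch.min(pred[:, 3], target[:, 3])
    inter = (ix2 - ix1).clamp(0) * (iy2 - iy1).clamp(0)
    area_p = (pred[:, 2] - pred[:, 0]) * (pred[:, 3] - pred[:, 1])
    area_t = (target[:, 2] - target[:, 0]) * (target[:, 3] - target[:, 1])
    iou = inter / (area_p + area_t - inter + eps)
    # smallest enclosing box
    cw = torch.max(pred[:, 2], target[:, 2]) - torch.min(pred[:, 0], target[:, 0])
    ch = torch.max(pred[:, 3], target[:, 3]) - torch.min(pred[:, 1], target[:, 1])
    # center distance plus width and height differences
    pcx, pcy = (pred[:, 0] + pred[:, 2]) / 2, (pred[:, 1] + pred[:, 3]) / 2
    tcx, tcy = (target[:, 0] + target[:, 2]) / 2, (target[:, 1] + target[:, 3]) / 2
    rho2 = (pcx - tcx) ** 2 + (pcy - tcy) ** 2
    dw = (pred[:, 2] - pred[:, 0]) - (target[:, 2] - target[:, 0])
    dh = (pred[:, 3] - pred[:, 1]) - (target[:, 3] - target[:, 1])
    return (1 - iou + rho2 / (cw ** 2 + ch ** 2 + eps)
            + dw ** 2 / (cw ** 2 + eps) + dh ** 2 / (ch ** 2 + eps))
```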
+
+
+
+
+ + ☆ ConvKGYarn: Spinning Configurable and Scalable Conversational Knowledge + Graph QA datasets with Large Language Models + + +
+ The rapid advancement of Large Language Models (LLMs) and conversational +assistants necessitates dynamic, scalable, and configurable conversational +datasets for training and evaluation. These datasets must accommodate diverse +user interaction modes, including text and voice, each presenting unique +modeling challenges. Knowledge Graphs (KGs), with their structured and evolving +nature, offer an ideal foundation for current and precise knowledge. Although +human-curated KG-based conversational datasets exist, they struggle to keep +pace with the rapidly changing user information needs. We present ConvKGYarn, a +scalable method for generating up-to-date and configurable conversational KGQA +datasets. Qualitative psychometric analyses confirm our method can generate +high-quality datasets rivaling a popular conversational KGQA dataset while +offering it at scale and covering a wide range of human-interaction +configurations. We showcase its utility by testing LLMs on diverse +conversations - exploring model behavior on conversational KGQA sets with +different configurations grounded in the same KG fact set. Our results +highlight the ability of ConvKGYarn to improve KGQA foundations and evaluate +parametric knowledge of LLMs, thus offering a robust solution to the constantly +evolving landscape of conversational assistants. + +
+
+
+
+
+ + ☆ Inverse design of Non-parameterized Ventilated Acoustic Resonator via + Variational Autoencoder with Acoustic Response-encoded Latent Space + + +
+ The ventilated acoustic resonator (VAR), a type of acoustic metamaterial, has +emerged as an alternative for sound attenuation in environments that require +ventilation, owing to its excellent low-frequency attenuation performance and +flexible shape adaptability. However, due to the non-linear acoustic responses +of VARs, the VAR designs are generally obtained within a limited parametrized +design space, and the design relies on iterations of numerical simulations, +which consume a considerable amount of computational time and resources. This +paper proposes an acoustic response-encoded variational autoencoder (AR-VAE), a +novel variational autoencoder-based generative design model for the efficient +and accurate inverse design of VAR even with non-parametrized designs. The +AR-VAE matches the high-dimensional acoustic response with the VAR +cross-section image in the dimension-reduced latent space, which enables the +AR-VAE to generate various non-parametrized VAR cross-section images with the +target acoustic response. AR-VAE generates non-parameterized VARs from target +acoustic responses, which show a 25-fold reduction in mean squared error +compared to conventional deep learning-based parameter searching methods while +exhibiting lower average mean squared error and peak frequency variance. By +combining the VARs inverse-designed by AR-VAE, a multi-cavity VAR was devised +for broadband and multitarget peak frequency attenuation. The proposed design +method presents a new approach for structural inverse-design with a +high-dimensional non-linear physical response. + +
+
+
+
+
+ + ☆ Cluster-Segregate-Perturb (CSP): A Model-agnostic Explainability + Pipeline for Spatiotemporal Land Surface Forecasting Models + + +
+ Satellite images have become increasingly valuable for modelling regional +climate change effects. Earth surface forecasting represents one such task that +integrates satellite images with meteorological data to capture the joint +evolution of regional climate change effects. However, understanding the +complex relationship between specific meteorological variables and land surface +evolution poses a significant challenge. In light of this challenge, our paper +introduces a pipeline that integrates principles from both perturbation-based +explainability techniques like LIME and global marginal explainability +techniques like PDP, besides addressing the constraints of using such +techniques when applying them to high-dimensional spatiotemporal deep models. +The proposed pipeline simplifies the undertaking of diverse investigative +analyses, such as marginal sensitivity analysis, marginal correlation analysis, +lag analysis, etc., on complex land surface forecasting models. In this study, +we utilised Convolutional Long Short-Term Memory (ConvLSTM) as the surface +forecasting model and performed analyses on the Normalized Difference +Vegetation Index (NDVI) of the surface forecasts, since meteorological +variables like temperature, pressure, and precipitation significantly influence +it. The study area encompasses various regions in Europe. Our analyses show +that precipitation exhibits the highest sensitivity in the study area, followed +by temperature and pressure. Pressure has little to no direct effect on NDVI. +Additionally, interesting nonlinear correlations between meteorological +variables and NDVI have been uncovered. + +
+
+
+
+
+ + ☆ Quantum Gradient Class Activation Map for Model Interpretability + + +
+ Quantum machine learning (QML) has recently made significant advancements in +various topics. Despite the successes, the safety and interpretability of QML +applications have not been thoroughly investigated. This work proposes using +Variational Quantum Circuits (VQCs) for activation mapping to enhance model +transparency, introducing the Quantum Gradient Class Activation Map +(QGrad-CAM). This hybrid quantum-classical computing framework leverages both +quantum and classical strengths and gives access to the derivation of an +explicit formula of feature map importance. Experimental results demonstrate +significant, fine-grained, class-discriminative visual explanations generated +across both image and speech datasets. + +
+
+ comment: Submitted to IEEE SiPS 2024 +
+
+
+
+
+ + ☆ Polyp SAM 2: Advancing Zero shot Polyp Segmentation in Colorectal Cancer + Detection + + +
+ Polyp segmentation plays a crucial role in the early detection and diagnosis +of colorectal cancer. However, obtaining accurate segmentations often requires +labor-intensive annotations and specialized models. Recently, Meta AI Research +released a general Segment Anything Model 2 (SAM 2), which has demonstrated +promising performance in several segmentation tasks. In this work, we evaluate +the performance of SAM 2 in segmenting polyps under various prompted settings. +We hope this report will provide insights to advance the field of polyp +segmentation and promote more interesting work in the future. This project is +publicly available at https://github.com/sajjad-sh33/Polyp-SAM-2. + +
+
+
+
+
+ + ☆ Online-Score-Aided Federated Learning: Taming the Resource Constraints + in Wireless Networks + + +
+ While federated learning (FL) is a widely popular distributed machine learning +(ML) strategy that protects data privacy, time-varying wireless network +parameters and heterogeneous system configurations of the wireless devices pose +significant challenges. Although the limited radio and computational resources +of the network and the clients, respectively, are widely acknowledged, two +critical yet often ignored aspects are (a) wireless devices can only dedicate a +small chunk of their limited storage for the FL task and (b) new training +samples may arrive in an online manner in many practical wireless applications. +Therefore, we propose a new FL algorithm called OSAFL, specifically designed to +learn tasks relevant to wireless applications under these practical +considerations. Since it has long been proven that under extreme resource +constraints, clients may perform an arbitrary number of local training steps, +which may lead to client drift under statistically heterogeneous data +distributions, we leverage normalized gradient similarities and weight clients' +updates based on optimized scores that facilitate the convergence rate of the +proposed OSAFL algorithm. Our extensive simulation results on two different +tasks -- each with three different datasets -- with four popular ML models +validate the effectiveness of OSAFL compared to six existing state-of-the-art +FL baselines. + +
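A hedged sketch of the general idea of weighting client updates by normalized gradient similarity during aggregation. The scoring rule below is a simple cosine-similarity heuristic for illustration, not the optimized scores used by OSAFL.

```python
import numpy as np

def score_weighted_aggregate(global_w, client_updates):
    """client_updates: list of flattened model deltas (np arrays of equal length)."""
    avg_dir = np.mean(client_updates, axis=0)
    avg_dir /= (np.linalg.norm(avg_dir) + 1e-12)
    scores = []
    for u in client_updates:
        u_n = u / (np.linalg.norm(u) + 1e-12)
        scores.append(max(float(u_n @ avg_dir), 0.0))  # down-weight clients pulling backwards
    w = np.array(scores)
    w = w / (w.sum() + 1e-12)
    return global_w + sum(wi * ui for wi, ui in zip(w, client_updates))

global_w = np.zeros(10)
updates = [np.random.randn(10) * 0.1 for _ in range(4)]
new_w = score_weighted_aggregate(global_w, updates)
```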
+
+ comment: Under review for possible publication in IEEE Transactions on + Wireless Communications (TWC) +
+
+
+
+
+ + ☆ GFlowNet Training by Policy Gradients + + +
+ Generative Flow Networks (GFlowNets) have been shown to be effective at +generating combinatorial objects with desired properties. We here propose a new +GFlowNet training framework, with policy-dependent rewards, that connects +maintaining the flow balance of GFlowNets with optimizing the expected +accumulated reward in traditional Reinforcement Learning (RL). This enables the +derivation of new policy-based GFlowNet training methods, in contrast to +existing ones resembling value-based RL. It is known that the design of +backward policies in GFlowNet training affects efficiency. We further develop a +coupled training strategy that jointly solves GFlowNet forward policy training +and backward policy design. Performance analysis is provided with a theoretical +guarantee of our policy-based GFlowNet training. Experiments on both simulated +and real-world datasets verify that our policy-based strategies provide +advanced RL perspectives for robust gradient estimation to improve GFlowNet +performance. + +
+
+
+
+
+ + ☆ Low-Rank Approximation, Adaptation, and Other Tales + + +
+ Low-rank approximation is a fundamental technique in modern data analysis, +widely utilized across various fields such as signal processing, machine +learning, and natural language processing. Despite its ubiquity, the mechanics +of low-rank approximation and its application in adaptation can sometimes be +obscure, leaving practitioners and researchers with questions about its true +capabilities and limitations. This paper seeks to clarify low-rank +approximation and adaptation by offering a comprehensive guide that reveals +their inner workings and explains their utility in a clear and accessible way. +Our focus here is to develop a solid intuition for how low-rank approximation +and adaptation operate, and why they are so effective. We begin with basic +concepts and gradually build up to the mathematical underpinnings, ensuring +that readers of all backgrounds can gain a deeper understanding of low-rank +approximation and adaptation. We strive to strike a balance between informal +explanations and rigorous mathematics, ensuring that both newcomers and +experienced experts can benefit from this survey. Additionally, we introduce +new low-rank decomposition and adaptation algorithms that have not yet been +explored in the field, hoping that future researchers will investigate their +potential applicability. + +
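Two small, standard illustrations of the ideas surveyed above: a truncated-SVD low-rank approximation of a matrix (the Eckart-Young construction), and a LoRA-style low-rank update of a frozen weight matrix. Shapes and ranks are arbitrary examples, not material from the paper.

```python
import numpy as np

# 1) Best rank-r approximation via truncated SVD
A = np.random.randn(100, 80)
U, S, Vt = np.linalg.svd(A, full_matrices=False)
r = 10
A_r = U[:, :r] @ np.diag(S[:r]) @ Vt[:r, :]
print("relative error:", np.linalg.norm(A - A_r) / np.linalg.norm(A))

# 2) Low-rank adaptation: keep W frozen, learn only a rank-r correction B @ A_down
W = np.random.randn(512, 512)            # frozen pretrained weight
B = np.zeros((512, r))                   # zero init so the update starts as a no-op
A_down = np.random.randn(r, 512) * 0.01

def adapted_forward(x):
    return x @ (W + B @ A_down).T        # effective weight = W + low-rank update
```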
+
+
+
+
+ + ☆ Operator Learning Using Random Features: A Tool for Scientific Computing + + +
+ Supervised operator learning centers on the use of training data, in the form +of input-output pairs, to estimate maps between infinite-dimensional spaces. It +is emerging as a powerful tool to complement traditional scientific computing, +which may often be framed in terms of operators mapping between spaces of +functions. Building on the classical random features methodology for scalar +regression, this paper introduces the function-valued random features method. +This leads to a supervised operator learning architecture that is practical for +nonlinear problems yet is structured enough to facilitate efficient training +through the optimization of a convex, quadratic cost. Due to the quadratic +structure, the trained model is equipped with convergence guarantees and error +and complexity bounds, properties that are not readily available for most other +operator learning architectures. At its core, the proposed approach builds a +linear combination of random operators. This turns out to be a low-rank +approximation of an operator-valued kernel ridge regression algorithm, and +hence the method also has strong connections to Gaussian process regression. +The paper designs function-valued random features that are tailored to the +structure of two nonlinear operator learning benchmark problems arising from +parametric partial differential equations. Numerical results demonstrate the +scalability, discretization invariance, and transferability of the +function-valued random features method. + +
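A compact sketch of the classical random features recipe that the function-valued method builds on: draw random frequencies once, then fit only the linear coefficients by solving a convex ridge problem. The scalar 1-D regression below is a toy stand-in for the operator-learning setting.

```python
import numpy as np

rng = np.random.default_rng(0)

def random_features(X, W, b):
    return np.sqrt(2.0 / W.shape[0]) * np.cos(X @ W.T + b)

# toy 1-D regression target
X = rng.uniform(-3, 3, size=(200, 1))
y = np.sin(2 * X[:, 0]) + 0.05 * rng.standard_normal(200)

m, lam = 300, 1e-3                        # number of features, ridge penalty
W = rng.normal(scale=2.0, size=(m, 1))    # random frequencies (kernel bandwidth)
b = rng.uniform(0, 2 * np.pi, size=m)
Phi = random_features(X, W, b)
alpha = np.linalg.solve(Phi.T @ Phi + lam * np.eye(m), Phi.T @ y)  # convex quadratic fit
y_hat = random_features(X, W, b) @ alpha
```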
+
+ comment: 36 pages, 1 table, 9 figures. SIGEST version of SIAM J. Sci. Comput. + Vol. 43 No. 5 (2021) pp. A3212-A3243, hence text overlap with + arXiv:2005.10224 +
+
+
+
+
+ + ☆ The NP-hardness of the Gromov-Wasserstein distance + + +
+ This note addresses the property frequently mentioned in the literature that +the Gromov-Wasserstein (GW) distance is NP-hard. We provide the details on the +non-convex nature of the GW optimization problem that imply NP-hardness of the +GW distance between finite spaces for any instance of input data. We further +illustrate the non-convexity of the problem with several explicit examples. + +
+
+
+
+
+ + ☆ From Graphs to Qubits: A Critical Review of Quantum Graph Neural + Networks + + +
+ Quantum Graph Neural Networks (QGNNs) represent a novel fusion of quantum +computing and Graph Neural Networks (GNNs), aimed at overcoming the +computational and scalability challenges inherent in classical GNNs that are +powerful tools for analyzing data with complex relational structures but suffer +from limitations such as high computational complexity and over-smoothing in +large-scale applications. Quantum computing, leveraging principles like +superposition and entanglement, offers a pathway to enhanced computational +capabilities. This paper critically reviews the state-of-the-art in QGNNs, +exploring various architectures. We discuss their applications across diverse +fields such as high-energy physics, molecular chemistry, finance and earth +sciences, highlighting the potential for quantum advantage. Additionally, we +address the significant challenges faced by QGNNs, including noise, +decoherence, and scalability issues, proposing potential strategies to mitigate +these problems. This comprehensive review aims to provide a foundational +understanding of QGNNs, fostering further research and development in this +promising interdisciplinary field. + +
+
+ comment: 21 pages, 9 figures, 2 tables. arXiv admin note: text overlap with + arXiv:1909.12264 by other authors +
+
+
+
+
+ + ☆ Learned Ranking Function: From Short-term Behavior Predictions to + Long-term User Satisfaction RecSys 24 + + +
+ We present the Learned Ranking Function (LRF), a system that takes short-term +user-item behavior predictions as input and outputs a slate of recommendations +that directly optimizes for long-term user satisfaction. Most previous work is +based on optimizing the hyperparameters of a heuristic function. We propose to +model the problem directly as a slate optimization problem with the objective +of maximizing long-term user satisfaction. We also develop a novel constraint +optimization algorithm that stabilizes objective trade-offs for multi-objective +optimization. We evaluate our approach with live experiments and describe its +deployment on YouTube. + +
+
+ comment: RecSys 24 +
+
+
+
+
+ + ☆ Fooling SHAP with Output Shuffling Attacks + + +
+ Explainable AI (XAI) methods such as SHAP can help discover feature +attributions in black-box models. If the method reveals a significant +attribution from a "protected feature" (e.g., gender, race) on the model +output, the model is considered unfair. However, adversarial attacks can +subvert the detection of XAI methods. Previous approaches to constructing such +an adversarial model require access to the underlying data distribution, which +may not be possible in many practical scenarios. We relax this constraint and +propose a novel family of attacks, called shuffling attacks, that are +data-agnostic. The proposed attack strategies can adapt any trained machine +learning model to fool Shapley value-based explanations. We prove that Shapley +values cannot detect shuffling attacks. However, algorithms that estimate +Shapley values, such as linear SHAP and SHAP, can detect these attacks with +varying degrees of effectiveness. We demonstrate the efficacy of the attack +strategies by comparing the performance of linear SHAP and SHAP using +real-world datasets. + +
+
+
+
+
+ + ☆ Prompt Recovery for Image Generation Models: A Comparative Study of + Discrete Optimizers + + +
+ Recovering natural language prompts for image generation models, solely based +on the generated images is a difficult discrete optimization problem. In this +work, we present the first head-to-head comparison of recent discrete +optimization techniques for the problem of prompt inversion. We evaluate Greedy +Coordinate Gradients (GCG), PEZ , Random Search, AutoDAN and BLIP2's image +captioner across various evaluation metrics related to the quality of inverted +prompts and the quality of the images generated by the inverted prompts. We +find that focusing on the CLIP similarity between the inverted prompts and the +ground truth image acts as a poor proxy for the similarity between ground truth +image and the image generated by the inverted prompts. While the discrete +optimizers effectively minimize their objectives, simply using responses from a +well-trained captioner often leads to generated images that more closely +resemble those produced by the original prompts. + +
+
+ comment: 9 Pages, 4 Figures +
+
+
+
+
+ + ☆ Music2Latent: Consistency Autoencoders for Latent Audio Compression + + +
+ Efficient audio representations in a compressed continuous latent space are +critical for generative audio modeling and Music Information Retrieval (MIR) +tasks. However, some existing audio autoencoders have limitations, such as +multi-stage training procedures, slow iterative sampling, or low reconstruction +quality. We introduce Music2Latent, an audio autoencoder that overcomes these +limitations by leveraging consistency models. Music2Latent encodes samples into +a compressed continuous latent space in a single end-to-end training process +while enabling high-fidelity single-step reconstruction. Key innovations +include conditioning the consistency model on upsampled encoder outputs at all +levels through cross connections, using frequency-wise self-attention to +capture long-range frequency dependencies, and employing frequency-wise learned +scaling to handle varying value distributions across frequencies at different +noise levels. We demonstrate that Music2Latent outperforms existing continuous +audio autoencoders in sound quality and reconstruction accuracy while achieving +competitive performance on downstream MIR tasks using its latent +representations. To our knowledge, this represents the first successful attempt +at training an end-to-end consistency autoencoder model. + +
+
+ comment: Accepted to ISMIR 2024 +
+
+
+
+
+ + ☆ Implicit Neural Representation For Accurate CFD Flow Field Prediction + + +
+ Despite the plethora of deep learning frameworks for flow field prediction, +most of them deal with flow fields on regular domains, and although the best +ones can cope with irregular domains, they mostly rely on graph networks, so +that real industrial applications remain currently elusive. We present a deep +learning framework for 3D flow field prediction applied to blades of aircraft +engine turbines and compressors. Crucially, we view any 3D field as a function +from coordinates that is modeled by a neural network we call the backbone-net. +It inherits the property of coordinate-based MLPs, namely the +discretization-agnostic representation of flow fields in domains of arbitrary +topology at infinite resolution. First, we demonstrate the performance of the +backbone-net solo in regressing 3D steady simulations of single blade rows in +various flow regimes: it can accurately render important flow characteristics +such as boundary layers, wakes and shock waves. Second, we introduce a +hyper-net that maps the surface mesh of a blade to the parameters of the +backbone-net. By doing so, the flow solution can be directly predicted from the +blade geometry, irrespective of its parameterization. Together, backbone-net +and hyper-net form a highly-accurate memory-efficient data-driven proxy to CFD +solvers with good generalization on unseen geometries. + +
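A minimal sketch of a coordinate-based "backbone-net" in the spirit described above: the field is modeled as a function from 3D coordinates to flow quantities, so it can be queried at arbitrary resolution. The hyper-network that predicts these weights from the blade mesh is omitted; the layer sizes and output channels are illustrative assumptions.

```python
import torch
import torch.nn as nn

class BackboneNet(nn.Module):
    def __init__(self, hidden=256, out_dim=5):    # e.g. pressure, velocity components, density
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(3, hidden), nn.SiLU(),
            nn.Linear(hidden, hidden), nn.SiLU(),
            nn.Linear(hidden, out_dim))

    def forward(self, xyz):                        # xyz: (N, 3) query coordinates
        return self.net(xyz)

field = BackboneNet()
query = torch.rand(10000, 3)                       # arbitrary, mesh-free sample points
flow = field(query)                                # (10000, 5) predicted flow state
```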
+
+ comment: ECCOMAS CONGRESS 2024, 9th European Congress on Computational Methods + in Applied Sciences and Engineering +
+
+
+
+
+ + ☆ Kernel Sum of Squares for Data Adapted Kernel Learning of Dynamical + Systems from Data: A global optimization approach + + +
+ This paper examines the application of the Kernel Sum of Squares (KSOS) +method for enhancing kernel learning from data, particularly in the context of +dynamical systems. Traditional kernel-based methods, despite their theoretical +soundness and numerical efficiency, frequently struggle with selecting optimal +base kernels and parameter tuning, especially with gradient-based methods prone +to local optima. KSOS mitigates these issues by leveraging a global +optimization framework with kernel-based surrogate functions, thereby achieving +more reliable and precise learning of dynamical systems. Through comprehensive +numerical experiments on the Logistic Map, Hénon Map, and Lorenz System, KSOS +is shown to consistently outperform gradient descent in minimizing the +relative-$\rho$ metric and improving kernel accuracy. These results highlight +KSOS's effectiveness in predicting the behavior of chaotic dynamical systems, +demonstrating its capability to adapt kernels to underlying dynamics and +enhance the robustness and predictive power of kernel-based approaches, making +it a valuable asset for time series analysis in various scientific fields. + +
+
+
+
+
+ + ☆ Wireless Channel Aware Data Augmentation Methods for Deep Leaning-Based + Indoor Localization + + +
+ Indoor localization is a challenging problem that - unlike outdoor +localization - lacks a universal and robust solution. Machine Learning (ML), +particularly Deep Learning (DL), methods have been investigated as a promising +approach. Although such methods bring remarkable localization accuracy, they +heavily depend on the training data collected from the environment. The data +collection is usually a laborious and time-consuming task, but Data +Augmentation (DA) can be used to alleviate this issue. In this paper, different +from previously used DA, we propose methods that utilize the domain knowledge +about wireless propagation channels and devices. The methods exploit the +typical hardware component drift in the transceivers and/or the statistical +behavior of the channel, in combination with the measured Power Delay Profile +(PDP). We comprehensively evaluate the proposed methods to demonstrate their +effectiveness. This investigation mainly focuses on how factors such as the +number of measurements, the augmentation proportion, and the environment of +interest impact the effectiveness of the different DA methods. We show that in +the low-data regime (few actual measurements available), localization accuracy +increases up to 50%, matching non-augmented results in the high-data regime. In +addition, the proposed methods may outperform the measurement-only high-data +performance by up to 33% using only 1/4 of the amount of measured data. We also +show the effect of different training data distributions and quality on the +effectiveness of DA. Finally, we demonstrate the power of the proposed methods +when employed along with Transfer Learning (TL) to address the data scarcity in +target and/or source environments. + +
+
+ comment: 13 pages, 14 figures +
+
+
+
+
+ + ☆ Evaluating Language Models for Efficient Code Generation + + +
+ We introduce Differential Performance Evaluation (DPE), a framework designed +to reliably evaluate Large Language Models (LLMs) for efficient code +generation. Traditional coding benchmarks often fail to provide reliable +insights into code efficiency, due to their reliance on simplistic test inputs +and the absence of effective compound metrics. DPE addresses these issues by +focusing on efficiency-demanding programming tasks and establishing an +insightful compound metric for performance evaluation. DPE operates in two +phases: To curate efficiency datasets, it selects efficiency-demanding tasks +from existing coding benchmarks and generates computationally expensive inputs +to stress the efficiency of LLM solutions. To assess the code efficiency, DPE +profiles the new solution and compares it globally against a set of reference +solutions that exhibit distinct efficiency levels, where the matched level +defines its efficiency score. As a proof of concept, we use DPE to create +EvalPerf, a benchmark with 121 performance-challenging coding tasks. Our +comprehensive evaluation draws interesting findings on the efficiency impact of +model sizes, instruction tuning, and prompting. For example, while the scaling +law fails to account for code efficiency, general instruction tuning benefits +both code correctness and efficiency. We also evaluate the evaluation by +examining the effectiveness of DPE, showing that EvalPerf is reliable and +convenient to use even across platforms. + +
+
+
+
+
+ + ☆ Multi-View Neural Differential Equations for Continuous-Time Stream Data + in Long-Term Traffic Forecasting + + +
+ Long-term traffic flow forecasting plays a crucial role in intelligent +transportation as it allows traffic managers to adjust their decisions in +advance. However, the problem is challenging due to spatio-temporal +correlations and complex dynamic patterns in continuous-time stream data. +Neural Differential Equations (NDEs) are among the state-of-the-art methods for +learning continuous-time traffic dynamics. However, the traditional NDE models +face issues in long-term traffic forecasting due to failures in capturing +delayed traffic patterns, dynamic edge (location-to-location correlation) +patterns, and abrupt trend patterns. To fill this gap, we propose a new NDE +architecture called Multi-View Neural Differential Equations. Our model +captures current states, delayed states, and trends in different state +variables (views) by learning latent multiple representations within Neural +Differential Equations. Extensive experiments conducted on several real-world +traffic datasets demonstrate that our proposed method outperforms the +state-of-the-art and achieves superior prediction accuracy for long-term +forecasting and robustness with noisy or missing inputs. + +
+
+
+
+
+ + ☆ Bayesian Learning in a Nonlinear Multiscale State-Space Model + + +
+ The ubiquity of multiscale interactions in complex systems is +well-recognized, with development and heredity serving as a prime example of +how processes at different temporal scales influence one another. This work +introduces a novel multiscale state-space model to explore the dynamic +interplay between systems interacting across different time scales, with +feedback between each scale. We propose a Bayesian learning framework to +estimate unknown states by learning the unknown process noise covariances +within this multiscale model. We develop a Particle Gibbs with Ancestor +Sampling (PGAS) algorithm for inference and demonstrate through simulations the +efficacy of our approach. + +
+
+
+
+
+ + ☆ Neural Networks as Spin Models: From Glass to Hidden Order Through + Training + + +
+ We explore a one-to-one correspondence between a neural network (NN) and a +statistical mechanical spin model where neurons are mapped to Ising spins and +weights to spin-spin couplings. The process of training an NN produces a family +of spin Hamiltonians parameterized by training time. We study the magnetic +phases and the melting transition temperature as training progresses. First, we +prove analytically that the common initial state before training--an NN with +independent random weights--maps to a layered version of the classical +Sherrington-Kirkpatrick spin glass exhibiting a replica symmetry breaking. The +spin-glass-to-paramagnet transition temperature is calculated. Further, we use +the Thouless-Anderson-Palmer (TAP) equations--a theoretical technique to +analyze the landscape of energy minima of random systems--to determine the +evolution of the magnetic phases on two types of NNs (one with continuous and +one with binarized activations) trained on the MNIST dataset. The two NN types +give rise to similar results, showing a quick destruction of the spin glass and +the appearance of a phase with a hidden order, whose melting transition +temperature $T_c$ grows as a power law in training time. We also discuss the +properties of the spectrum of the spin system's bond matrix in the context of +rich vs. lazy learning. We suggest that this statistical mechanical view of NNs +provides a useful unifying perspective on the training process, which can be +viewed as selecting and strengthening a symmetry-broken state associated with +the training task. + +
+
+ comment: 18 pages, 9 figures +
+
+
+
+
+ + ☆ Source Separation of Multi-source Raw Music using a Residual Quantized + Variational Autoencoder + + +
+ I developed a neural audio codec model based on the residual quantized +variational autoencoder architecture. I train the model on the Slakh2100 +dataset, a standard dataset for musical source separation, composed of +multi-track audio. The model can separate audio sources, achieving almost SoTA +results with much less computing power. The code is publicly available at +github.com/LeonardoBerti00/Source-Separation-of-Multi-source-Music-using-Residual-Quantizad-Variational-Autoencoder + +
+
+ comment: 9 pages +
+
+
+
+
+ + ♻ ☆ Formal-LLM: Integrating Formal Language and Natural Language for + Controllable LLM-based Agents + + +
+ Recent advancements on Large Language Models (LLMs) enable AI Agents to +automatically generate and execute multi-step plans to solve complex tasks. +However, since LLM's content generation process is hardly controllable, current +LLM-based agents frequently generate invalid or non-executable plans, which +jeopardizes the performance of the generated plans and corrupts users' trust in +LLM-based agents. In response, this paper proposes a novel "Formal-LLM" +framework for LLM-based agents by integrating the expressiveness of natural +language and the precision of formal language. Specifically, the framework +allows agent developers to express their requirements or constraints for the +planning process as an automaton. A stack-based LLM plan generation process is +then conducted under the supervision of the automaton to ensure that the +generated plan satisfies the constraints, making the planning process +controllable. We conduct experiments on both benchmark tasks and practical +real-life tasks, and our framework achieves over 50% overall performance +increase, which validates the feasibility and effectiveness of employing +Formal-LLM to guide the plan generation of agents, preventing the agents from +generating invalid and unsuccessful plans. Further, more controllable LLM-based +agents can facilitate the broader utilization of LLM in application scenarios +where high validity of planning is essential. The source code of this work is +available at https://github.com/agiresearch/Formal-LLM. + +
+
+
+
+
+ + ♻ ☆ Benchmarking Cognitive Biases in Large Language Models as Evaluators + + +
+ Large Language Models (LLMs) have recently been shown to be effective as +automatic evaluators with simple prompting and in-context learning. In this +work, we assemble 15 LLMs of four different size ranges and evaluate their +output responses by preference ranking from the other LLMs as evaluators, such +as System Star is better than System Square. We then evaluate the quality of +ranking outputs introducing the Cognitive Bias Benchmark for LLMs as Evaluators +(CoBBLEr), a benchmark to measure six different cognitive biases in LLM +evaluation outputs, such as the Egocentric bias where a model prefers to rank +its own outputs highly in evaluation. We find that LLMs are biased text quality +evaluators, exhibiting strong indications on our bias benchmark (average of 40% +of comparisons across all models) within each of their evaluations that +question their robustness as evaluators. Furthermore, we examine the +correlation between human and machine preferences and calculate the average +Rank-Biased Overlap (RBO) score to be 49.6%, indicating that machine +preferences are misaligned with humans. According to our findings, LLMs may +still be unable to be utilized for automatic annotation aligned with human +preferences. Our project page is at: https://minnesotanlp.github.io/cobbler. + +
+
+ comment: Published in 2024. 29 pages, 9 figures, 14 tables
+
+
+
+
+ + ♻ ☆ Moderating Illicit Online Image Promotion for Unsafe User-Generated + Content Games Using Large Vision-Language Models USENIX Security + + +
+ Online user generated content games (UGCGs) are increasingly popular among +children and adolescents for social interaction and more creative online +entertainment. However, they pose a heightened risk of exposure to explicit +content, raising growing concerns for the online safety of children and +adolescents. Despite these concerns, few studies have addressed the issue of +illicit image-based promotions of unsafe UGCGs on social media, which can +inadvertently attract young users. This challenge arises from the difficulty of +obtaining comprehensive training data for UGCG images and the unique nature of +these images, which differ from traditional unsafe content. In this work, we +take the first step towards studying the threat of illicit promotions of unsafe +UGCGs. We collect a real-world dataset comprising 2,924 images that display +diverse sexually explicit and violent content used to promote UGCGs by their +game creators. Our in-depth studies reveal a new understanding of this problem +and the urgent need for automatically flagging illicit UGCG promotions. We +additionally create a cutting-edge system, UGCG-Guard, designed to aid social +media platforms in effectively identifying images used for illicit UGCG +promotions. This system leverages recently introduced large vision-language +models (VLMs) and employs a novel conditional prompting strategy for zero-shot +domain adaptation, along with chain-of-thought (CoT) reasoning for contextual +identification. UGCG-Guard achieves outstanding results, with an accuracy rate +of 94% in detecting these images used for the illicit promotion of such games +in real-world scenarios. + +
+
+ comment: To Appear in the 33rd USENIX Security Symposium, August 14-16, 2024 +
+
+
+
+
+ + ♻ ☆ KIX: A Knowledge and Interaction-Centric Metacognitive Framework for + Task Generalization + + +
+ People aptly exhibit general intelligence behaviors in solving a variety of +tasks with flexibility and ability to adapt to novel situations by reusing and +applying high-level knowledge acquired over time. But artificial agents are +more like specialists, lacking such generalist behaviors. Artificial agents +will require understanding and exploiting critical structured knowledge +representations. We present a metacognitive generalization framework, +Knowledge-Interaction-eXecution (KIX), and argue that interactions with objects +leveraging type space facilitate the learning of transferable interaction +concepts and generalization. It is a natural way of integrating knowledge into +reinforcement learning and is promising to act as an enabler for autonomous and +generalist behaviors in artificial intelligence systems. + +
+
+
+
+
+ + ♻ ☆ ReLU-KAN: New Kolmogorov-Arnold Networks that Only Need Matrix Addition, + Dot Multiplication, and ReLU + + +
+ Limited by the complexity of basis function (B-spline) calculations, +Kolmogorov-Arnold Networks (KAN) suffer from restricted parallel computing +capability on GPUs. This paper proposes a novel ReLU-KAN implementation that +inherits the core idea of KAN. By adopting ReLU (Rectified Linear Unit) and +point-wise multiplication, we simplify the design of KAN's basis function and +optimize the computation process for efficient CUDA computing. The proposed +ReLU-KAN architecture can be readily implemented on existing deep learning +frameworks (e.g., PyTorch) for both inference and training. Experimental +results demonstrate that ReLU-KAN achieves a 20x speedup compared to +traditional KAN with 4-layer networks. Furthermore, ReLU-KAN exhibits a more +stable training process with superior fitting ability while preserving the +"catastrophic forgetting avoidance" property of KAN. You can get the code in +https://github.com/quiqi/relu_kan + +
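A hedged sketch of the ReLU-based basis idea: each basis function is a squared product of two shifted ReLUs, nonzero only on an interval, so evaluation needs only ReLU and point-wise multiplication. The grid choice and normalization constant below are illustrative and may differ from the paper's exact definition.

```python
import torch

def relu_basis(x, starts, ends):
    """x: (N, 1) inputs; starts/ends: (G,) interval endpoints for G basis functions."""
    left = torch.relu(x - starts)                  # rises inside the interval
    right = torch.relu(ends - x)                   # falls toward the interval end
    bump = (left * right) ** 2                     # smooth, compactly supported bump
    return bump * (16.0 / (ends - starts) ** 4)    # normalize the peak height

x = torch.linspace(0, 1, 100).unsqueeze(1)
starts = torch.tensor([0.0, 0.2, 0.4, 0.6])
ends = starts + 0.4
phi = relu_basis(x, starts, ends)                  # (100, 4) basis activations
out = phi @ torch.randn(4, 1)                      # learnable mixing, as on a KAN edge
```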
+
+
+
+
+ + ♻ ☆ An Experimental Comparison of Partitioning Strategies for Distributed + Graph Neural Network Training EDBT + + +
+ Recently, graph neural networks (GNNs) have gained much attention as a +growing area of deep learning capable of learning on graph-structured data. +However, the computational and memory requirements for training GNNs on +large-scale graphs make it necessary to distribute the training. A prerequisite +for distributed GNN training is to partition the input graph into smaller parts +that are distributed among multiple machines of a compute cluster. Although +graph partitioning has been studied with regard to graph analytics and graph +databases, its effect on GNN training performance is largely unexplored. As a +consequence, it is unclear whether investing computational efforts into +high-quality graph partitioning would pay off in GNN training scenarios. + In this paper, we study the effectiveness of graph partitioning for +distributed GNN training. Our study aims to understand how different factors +such as GNN parameters, mini-batch size, graph type, features size, and +scale-out factor influence the effectiveness of graph partitioning. We conduct +experiments with two different GNN systems using vertex and edge partitioning. +We found that high-quality graph partitioning is a very effective optimization +to speed up GNN training and to reduce memory consumption. Furthermore, our +results show that invested partitioning time can quickly be amortized by +reduced GNN training time, making it a relevant optimization for most GNN +scenarios. Compared to research on distributed graph processing, our study +reveals that graph partitioning plays an even more significant role in +distributed GNN training, which motivates further research on the graph +partitioning problem. + +
+
+ comment: To be published in Proceedings of the 28th International Conference + on Extending Database Technology (EDBT), 25th, March-28th March, 2025 +
+
+
+
+
+ + ♻ ☆ Non-Stationary Latent Auto-Regressive Bandits + + +
+ We consider the stochastic multi-armed bandit problem with non-stationary +rewards. We present a novel formulation of non-stationarity in the environment +where changes in the mean reward of the arms over time are due to some unknown, +latent, auto-regressive (AR) state of order $k$. We call this new environment +the latent AR bandit. Different forms of the latent AR bandit appear in many +real-world settings, especially in emerging scientific fields such as +behavioral health or education where there are few mechanistic models of the +environment. If the AR order $k$ is known, we propose an algorithm that +achieves $\tilde{O}(k\sqrt{T})$ regret in this setting. Empirically, our +algorithm outperforms standard UCB across multiple non-stationary environments, +even if $k$ is mis-specified. + +
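An illustrative simulation of the latent AR(k) bandit environment described above: each arm's mean reward is driven by a shared latent auto-regressive state that the learner never observes. Coefficients, noise scales, and arm offsets are arbitrary choices for the sketch.

```python
import numpy as np

rng = np.random.default_rng(0)
k, T, n_arms = 2, 500, 3
phi = np.array([0.6, 0.3])                  # AR(k) coefficients (stable regime)
z = list(rng.normal(size=k))                # initial latent states
arm_offsets = rng.normal(size=n_arms)

rewards = np.zeros((T, n_arms))
for t in range(T):
    z_t = phi @ np.array(z[-k:][::-1]) + 0.1 * rng.normal()  # latent AR state update
    z.append(z_t)
    rewards[t] = arm_offsets + z_t + 0.05 * rng.normal(size=n_arms)
# A bandit algorithm only observes rewards[t, chosen_arm]; the latent z stays hidden.
```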
+
+
+
+
+ + ♻ ☆ Monitoring Fidelity of Online Reinforcement Learning Algorithms in + Clinical Trials + + +
+ Online reinforcement learning (RL) algorithms offer great potential for +personalizing treatment for participants in clinical trials. However, deploying +an online, autonomous algorithm in the high-stakes healthcare setting makes +quality control and data quality especially difficult to achieve. This paper +proposes algorithm fidelity as a critical requirement for deploying online RL +algorithms in clinical trials. It emphasizes the responsibility of the +algorithm to (1) safeguard participants and (2) preserve the scientific utility +of the data for post-trial analyses. We also present a framework for +pre-deployment planning and real-time monitoring to help algorithm developers +and clinical researchers ensure algorithm fidelity. To illustrate our +framework's practical application, we present real-world examples from the +Oralytics clinical trial. Since Spring 2023, this trial successfully deployed +an autonomous, online RL algorithm to personalize behavioral interventions for +participants at risk for dental disease. + +
+
+
+
+
+ + ♻ ☆ Fair Column Subset Selection KDD 2024 + + +
+ The problem of column subset selection asks for a subset of columns from an +input matrix such that the matrix can be reconstructed as accurately as +possible within the span of the selected columns. A natural extension is to +consider a setting where the matrix rows are partitioned into two groups, and +the goal is to choose a subset of columns that minimizes the maximum +reconstruction error of both groups, relative to their respective best rank-k +approximation. Extending the known results of column subset selection to this +fair setting is not straightforward: in certain scenarios it is unavoidable to +choose columns separately for each group, resulting in double the expected +column count. We propose a deterministic leverage-score sampling strategy for +the fair setting and show that sampling a column subset of minimum size becomes +NP-hard in the presence of two groups. Despite these negative results, we give +an approximation algorithm that guarantees a solution within 1.5 times the +optimal solution size. We also present practical heuristic algorithms based on +rank-revealing QR factorization. Finally, we validate our methods through an +extensive set of experiments using real-world data. + +
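A hedged sketch of leverage-score-based column selection in the two-group setting: compute rank-k leverage scores separately for each row group, then favor columns that are valuable even to their worse-served group. This is a simple illustration of the min-max idea, not the paper's deterministic sampling strategy or QR-based heuristics.

```python
import numpy as np

def leverage_scores(M, k):
    _, _, Vt = np.linalg.svd(M, full_matrices=False)
    return np.sum(Vt[:k] ** 2, axis=0)            # rank-k leverage score of each column

def fair_select(A, groups, k, n_cols):
    per_group = [leverage_scores(A[groups == g], k) for g in np.unique(groups)]
    joint = np.minimum.reduce(per_group)          # a column's value to its worse-served group
    return np.argsort(-joint)[:n_cols]

A = np.random.randn(200, 50)
groups = np.array([0] * 120 + [1] * 80)           # two row groups
cols = fair_select(A, groups, k=5, n_cols=10)
```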
+
+ comment: KDD 2024 +
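+ (Hedged sketch.) The approach above builds on rank-$k$ leverage scores. The snippet below is a minimal, greedy illustration of "fair" selection over two row groups, scoring each column by its worst-group leverage; it is not the paper's algorithm, only a plausible baseline.

```python
import numpy as np

def leverage_scores(A, k):
    """Rank-k leverage scores of the columns of A (squared column norms of V_k)."""
    _, _, Vt = np.linalg.svd(A, full_matrices=False)
    return np.sum(Vt[:k] ** 2, axis=0)

def fair_column_subset(A, groups, k, n_cols):
    """Greedy sketch: pick columns that score highly for *both* row groups."""
    scores = [leverage_scores(A[groups == g], k) for g in np.unique(groups)]
    combined = np.minimum.reduce(scores)   # a column is only as good as its worst group
    return np.argsort(combined)[::-1][:n_cols]

rng = np.random.default_rng(0)
A = rng.normal(size=(100, 20))
groups = (np.arange(100) < 50).astype(int)  # two row groups
print(fair_column_subset(A, groups, k=3, n_cols=5))
```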
+
+
+
+
+ + ♻ ☆ Toward a Surgeon-in-the-Loop Ophthalmic Robotic Apprentice using + Reinforcement and Imitation Learning IROS'24 + + +
+ Robot-assisted surgical systems have demonstrated significant potential in +enhancing surgical precision and minimizing human errors. However, existing +systems cannot accommodate individual surgeons' unique preferences and +requirements. Additionally, they primarily focus on general surgeries (e.g., +laparoscopy) and are unsuitable for highly precise microsurgeries, such as +ophthalmic procedures. Thus, we propose an image-guided approach for +surgeon-centered autonomous agents that can adapt to the individual surgeon's +skill level and preferred surgical techniques during ophthalmic cataract +surgery. Our approach trains reinforcement and imitation learning agents +simultaneously using curriculum learning approaches guided by image data to +perform all tasks of the incision phase of cataract surgery. By integrating the +surgeon's actions and preferences into the training process, our approach +enables the robot to implicitly learn and adapt to the individual surgeon's +unique techniques through surgeon-in-the-loop demonstrations. This results in a +more intuitive and personalized surgical experience for the surgeon while +ensuring consistent performance for the autonomous robotic apprentice. We +define and evaluate the effectiveness of our approach in a simulated +environment using our proposed metrics and highlight the trade-off between a +generic agent and a surgeon-centered adapted agent. Finally, our approach has +the potential to extend to other ophthalmic and microsurgical procedures, +opening the door to a new generation of surgeon-in-the-loop autonomous surgical +robots. We provide an open-source simulation framework for future development +and reproducibility at +https://github.com/amrgomaaelhady/CataractAdaptSurgRobot. + +
+
+ comment: Accepted at IROS'24 +
+
+
+
+
+ + ♻ ☆ Reinforcement Learning in High-frequency Market Making + + +
+ This paper establishes a new and comprehensive theoretical analysis for the +application of reinforcement learning (RL) in high-frequency market making. We +bridge modern RL theory and the continuous-time statistical models in +high-frequency financial economics. Unlike most existing literature, which focuses on +methodological research developing various RL methods for the market making +problem, our work is a pilot study providing theoretical analysis. We target the +effects of sampling frequency, and find an interesting tradeoff between error +and complexity of the RL algorithm when tweaking the values of the time increment +$\Delta$ $-$ as $\Delta$ becomes smaller, the error will be smaller but the +complexity will be larger. We also study the two-player case under the +general-sum game framework and establish the convergence of the Nash equilibrium to +the continuous-time game equilibrium as $\Delta\rightarrow0$. The Nash +Q-learning algorithm, which is an online multi-agent RL method, is applied to +solve the equilibrium. Our theories are not only useful for practitioners to +choose the sampling frequency, but also very general and applicable to other +high-frequency financial decision making problems, e.g., optimal execution, as +long as the time-discretization of a continuous-time Markov decision process is +adopted. Monte Carlo simulation evidence supports all of our theories. + +
+
+
+
+
+ + ♻ ☆ CT evaluation of 2D and 3D holistic deep learning methods for the + volumetric segmentation of airway lesions + + +
+ This research embarked on a comparative exploration of the holistic +segmentation capabilities of Convolutional Neural Networks (CNNs) in both 2D +and 3D formats, focusing on cystic fibrosis (CF) lesions. The study utilized +data from two CF reference centers, covering five major CF structural changes. +Initially, it compared the 2D and 3D models, highlighting the 3D model's +superior capability in capturing complex features like mucus plugs and +consolidations. To improve the 2D model's performance, a loss adapted to fine +structures segmentation was implemented and evaluated, significantly enhancing +its accuracy, though not surpassing the 3D model's performance. The models +underwent further validation through external evaluation against pulmonary +function tests (PFTs), confirming the robustness of the findings. Moreover, +this study went beyond comparing metrics; it also included comprehensive +assessments of the models' interpretability and reliability, providing valuable +insights for their clinical application. + +
+
+ comment: 6 pages, 3 figures, 2 tables, IEEE International Symposium on + Biomedical Imaging (ISBI) 2024 +
+
+
+
+
+ + ♻ ☆ Unified Discrete Diffusion for Categorical Data + + +
+ Discrete diffusion models have seen a surge of attention with applications to +naturally discrete data such as language and graphs. Although discrete-time +discrete diffusion has been established for a while, it was only recently that Campbell et +al. (2022) introduced the first framework for continuous-time discrete +diffusion. However, their training and sampling processes differ significantly +from the discrete-time version, necessitating nontrivial approximations for +tractability. In this paper, we first present a series of mathematical +simplifications of the variational lower bound that enable more accurate and +easy-to-optimize training for discrete diffusion. In addition, we derive a +simple formulation for backward denoising that enables exact and accelerated +sampling, and importantly, an elegant unification of discrete-time and +continuous-time discrete diffusion. Thanks to simpler analytical formulations, +both forward and now also backward probabilities can flexibly accommodate any +noise distribution, including different noise distributions for multi-element +objects. Experiments show that our proposed USD3 (for Unified Simplified +Discrete Denoising Diffusion) outperforms all SOTA baselines on established +datasets. We open-source our unified code at +https://github.com/LingxiaoShawn/USD3. + +
+
+ comment: Unify Discrete Denoising Diffusion +
+
+
+
+
+ + ♻ ☆ MoMa: Efficient Early-Fusion Pre-training with Mixture of Modality-Aware + Experts + + +
+ We introduce MoMa, a novel modality-aware mixture-of-experts (MoE) +architecture designed for pre-training mixed-modal, early-fusion language +models. MoMa processes images and text in arbitrary sequences by dividing +expert modules into modality-specific groups. These groups exclusively process +designated tokens while employing learned routing within each group to maintain +semantically informed adaptivity. Our empirical results reveal substantial +pre-training efficiency gains through this modality-specific parameter +allocation. Under a 1-trillion-token training budget, the MoMa 1.4B model, +featuring 4 text experts and 4 image experts, achieves impressive FLOPs +savings: 3.7x overall, with 2.6x for text and 5.2x for image processing +compared to a compute-equivalent dense baseline, measured by pre-training loss. +This outperforms the standard expert-choice MoE with 8 mixed-modal experts, +which achieves 3x overall FLOPs savings (3x for text, 2.8x for image). +Combining MoMa with mixture-of-depths (MoD) further improves pre-training FLOPs +savings to 4.2x overall (text: 3.4x, image: 5.3x), although this combination +hurts performance in causal inference due to increased sensitivity to router +accuracy. These results demonstrate MoMa's potential to significantly advance +the efficiency of mixed-modal, early-fusion language model pre-training, paving +the way for more resource-efficient and capable multimodal AI systems. + +
+
+ comment: v2 -> update related work section v3 -> fix spelling +
+
+
+
+
+ + ♻ ☆ A Text-guided Protein Design Framework + + +
+ Current AI-assisted protein design mainly utilizes protein sequence and +structural information. Meanwhile, there exists tremendous knowledge curated by +humans in text format describing proteins' high-level functionalities. Yet, +whether the incorporation of such text data can help protein design tasks has +not been explored. To bridge this gap, we propose ProteinDT, a multi-modal +framework that leverages textual descriptions for protein design. ProteinDT +consists of three sequential steps: ProteinCLAP, which aligns the representations +of the two modalities, a facilitator that generates the protein representation from +the text modality, and a decoder that creates the protein sequences from the +representation. To train ProteinDT, we construct a large dataset, +SwissProtCLAP, with 441K text and protein pairs. We quantitatively verify the +effectiveness of ProteinDT on three challenging tasks: (1) over 90\% accuracy +for text-guided protein generation; (2) best hit ratio on 12 zero-shot +text-guided protein editing tasks; (3) superior performance on four out of six +protein property prediction benchmarks. + +
+
+
+
+
+ + ♻ ☆ Decentralized Intelligence Network (DIN) + + +
+ Decentralized Intelligence Network (DIN) is a theoretical framework +addressing data fragmentation and siloing challenges, enabling scalable AI +through data sovereignty. It facilitates effective AI utilization within +sovereign networks by overcoming barriers to accessing diverse data sources, +leveraging: 1) personal data stores to ensure data sovereignty, where data +remains securely within Participants' control; 2) a scalable federated learning +protocol implemented on a public blockchain for decentralized AI training, +where only model parameter updates are shared, keeping data within the personal +data stores; and 3) a scalable, trustless cryptographic rewards mechanism on a +public blockchain to incentivize participation and ensure fair reward +distribution through a decentralized auditing protocol. This approach +guarantees that no entity can prevent or control access to training data or +influence financial benefits, as coordination and reward distribution are +managed on the public blockchain with an immutable record. The framework +supports effective AI training by allowing Participants to maintain control +over their data, benefit financially, and contribute to a decentralized, +scalable ecosystem that leverages collective AI to develop beneficial +algorithms. + +
+
+ comment: 14 pages, 1 figure. This work was selected for a speaker presentation + at the Summit on Responsible Decentralized Intelligence - Future + of Decentralization and AI, hosted by Berkeley RDI on August 6, 2024, at the + Verizon Center, Cornell Tech Campus, Roosevelt Island, NYC +
+
+
+
+
+ + ♻ ☆ Private Fine-tuning of Large Language Models with Zeroth-order + Optimization + + +
+ Differentially private stochastic gradient descent (DP-SGD) allows models to +be trained in a privacy-preserving manner, but has proven difficult to scale to +the era of foundation models. We introduce DP-ZO, a private fine-tuning +framework for large language models that privatizes zeroth-order optimization +methods. A key insight into the design of our method is that the direction of +the gradient in the zeroth-order optimization we use is random and the only +information from training data is the step size, i.e., a scalar. Therefore, we +only need to privatize the scalar step size, which is memory-efficient. DP-ZO +provides a strong privacy-utility trade-off across different tasks and model +sizes, comparable to DP-SGD in $(\varepsilon,\delta)$-DP. Notably, +DP-ZO possesses significant advantages over DP-SGD in memory efficiency, and +obtains higher utility in $\varepsilon$-DP when using the Laplace mechanism. + +
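+ (Hedged sketch, assuming an SPSA-style two-point gradient estimator.) The key idea stated above is that only a scalar touches the data, so only that scalar needs clipping and noising. The function names below are hypothetical and the sensitivity calibration is deliberately simplified.

```python
import numpy as np

def dp_zo_step(params, loss_fn, eps_dp=1.0, mu=1e-3, lr=1e-2, clip=1.0, seed=0):
    """Sketch of one differentially private zeroth-order step.

    The perturbation direction z is data-independent randomness; only the scalar
    finite difference depends on the data, so only that scalar is clipped and
    noised (Laplace mechanism). Noise calibration here is a simplification.
    """
    rng = np.random.default_rng(seed)
    z = rng.normal(size=params.shape)                   # random direction
    scalar = (loss_fn(params + mu * z) - loss_fn(params - mu * z)) / (2 * mu)
    scalar = np.clip(scalar, -clip, clip)               # bound the scalar's magnitude
    scalar += rng.laplace(scale=2 * clip / eps_dp)      # privatize the scalar only
    return params - lr * scalar * z

theta = np.zeros(5)
loss = lambda w: float(np.sum((w - 1.0) ** 2))          # toy quadratic loss
for t in range(100):
    theta = dp_zo_step(theta, loss, seed=t)
print(theta.round(2))
```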
+
+
+
+
+ + ♻ ☆ Leveraging KANs For Enhanced Deep Koopman Operator Discovery + + +
+ Multi-layer perceptrons (MLPs) have been extensively utilized in discovering +Deep Koopman operators for linearizing nonlinear dynamics. With the emergence +of Kolmogorov-Arnold Networks (KANs) as a more efficient and accurate +alternative to the MLP Neural Network, we propose a comparison of the +performance of each network type in the context of learning Koopman operators +with control. In this work, we propose a KANs-based deep Koopman framework with +applications to an orbital Two-Body Problem (2BP) and the pendulum for +data-driven discovery of linear system dynamics. KANs were found to be superior +in nearly all aspects of training; learning 31 times faster, being 15 times +more parameter-efficient, and predicting 1.25 times more accurately compared +to the MLP Deep Neural Networks (DNNs) in the case of the 2BP. Thus, +KANs show potential for being an efficient tool in the development of Deep +Koopman Theory. + +
+
+ comment: 6 pages, 4 figures, 2 tables +
+
+
+
+
+ + ♻ ☆ A mathematical perspective on Transformers + + +
+ Transformers play a central role in the inner workings of large language +models. We develop a mathematical framework for analyzing Transformers based on +their interpretation as interacting particle systems, which reveals that +clusters emerge in long time. Our study explores the underlying theory and +offers new perspectives for mathematicians as well as computer scientists. + +
+
+
+
+
+ + ♻ ☆ Discover-then-Name: Task-Agnostic Concept Bottlenecks via Automated + Concept Discovery ECCV + + +
+ Concept Bottleneck Models (CBMs) have recently been proposed to address the +'black-box' problem of deep neural networks, by first mapping images to a +human-understandable concept space and then linearly combining concepts for +classification. Such models typically require first coming up with a set of +concepts relevant to the task and then aligning the representations of a +feature extractor to map to these concepts. However, even with powerful +foundational feature extractors like CLIP, there are no guarantees that the +specified concepts are detectable. In this work, we leverage recent advances in +mechanistic interpretability and propose a novel CBM approach -- called +Discover-then-Name-CBM (DN-CBM) -- that inverts the typical paradigm: instead +of pre-selecting concepts based on the downstream classification task, we use +sparse autoencoders to first discover concepts learnt by the model, and then +name them and train linear probes for classification. Our concept extraction +strategy is efficient, since it is agnostic to the downstream task, and uses +concepts already known to the model. We perform a comprehensive evaluation +across multiple datasets and CLIP architectures and show that our method yields +semantically meaningful concepts, assigns appropriate names to them that make +them easy to interpret, and yields performant and interpretable CBMs. Code +available at https://github.com/neuroexplicit-saar/discover-then-name. + +
+
+ comment: 40 pages, 21 figures, 6 tables, European Conference on Computer + Vision (ECCV) 2024 +
+
+
+
+
+ + ♻ ☆ On the Generalization of Preference Learning with DPO + + +
+ Large language models (LLMs) have demonstrated remarkable capabilities but +often struggle to align with human preferences, leading to harmful or +undesirable outputs. Preference learning, which trains models to distinguish +between preferred and non-preferred responses based on human feedback, has +become a crucial component for ensuring that LLMs align with human values. +Despite their widespread adoption in real-world systems, a thorough theoretical +understanding of the generalization guarantees for these models remains lacking. +This paper bridges that gap by introducing a new theoretical framework to +analyze the generalization guarantees of models trained with direct preference +optimization (DPO). While existing generalization theory often focuses on +overparameterized models achieving near-optimal loss or models independent of +the training process, our framework rigorously assesses how well models +generalize after a finite number of gradient steps, reflecting real-world LLM +training practices. By analyzing the reward margin associated with each sample +and its trajectory throughout training, we can effectively bound the +generalization error. We derive learning guarantees showing that, under +specific conditions, models trained with DPO can correctly discern preferred +responses on unseen data with high probability. These insights are empirically +validated on contemporary LLMs, underscoring the practical relevance of our +theoretical findings. + +
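+ For readers unfamiliar with DPO, the loss analyzed above is the standard preference objective: a logistic loss on the reward margin between preferred and dispreferred responses, measured relative to a frozen reference policy. A minimal PyTorch version (illustrative, with made-up numbers):

```python
import torch
import torch.nn.functional as F

def dpo_loss(logp_chosen, logp_rejected, ref_logp_chosen, ref_logp_rejected, beta=0.1):
    """Standard DPO objective on summed token log-probabilities of the preferred /
    dispreferred responses under the trained policy and the frozen reference policy."""
    margin = beta * ((logp_chosen - ref_logp_chosen) - (logp_rejected - ref_logp_rejected))
    return -F.logsigmoid(margin).mean()

# Toy usage with hypothetical log-probabilities for a batch of 2 preference pairs.
loss = dpo_loss(
    torch.tensor([-12.0, -9.5]), torch.tensor([-14.0, -9.0]),
    torch.tensor([-12.5, -9.7]), torch.tensor([-13.0, -9.1]),
)
print(float(loss))
```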
+
+
+
+
+ + ♻ ☆ Time is Not Enough: Time-Frequency based Explanation for Time-Series + Black-Box Models CIKM 2024 + + +
+ Despite the massive attention given to time-series explanations due to their +extensive applications, a notable limitation in existing approaches is their +primary reliance on the time domain. This overlooks the inherent characteristic +of time-series data containing both time and frequency features. In this work, +we present Spectral eXplanation (SpectralX), an XAI framework that provides +time-frequency explanations for time-series black-box classifiers. This easily +adaptable framework enables users to "plug-in" various perturbation-based XAI +methods for any pre-trained time-series classification models to assess their +impact on the explanation quality without having to modify the framework +architecture. Additionally, we introduce Feature Importance Approximations +(FIA), a new perturbation-based XAI method. These methods consist of feature +insertion, deletion, and combination techniques to enhance computational +efficiency and class-specific explanations in time-series classification tasks. +We conduct extensive experiments on a generated synthetic dataset and various +UCR time-series datasets to first compare the explanation performance of FIA +and other existing perturbation-based XAI methods in both the time domain and +the time-frequency domain, and then show the superiority of our FIA in the +time-frequency domain with the SpectralX framework. Finally, we conduct a user +study to confirm the practicality of our FIA in the SpectralX framework for +class-specific time-frequency based time-series explanations. The source code +is available at https://github.com/gustmd0121/Time_is_not_Enough + +
+
+ comment: Accepted to CIKM 2024 (10 pages, 9 figures, 9 tables) +
+
+
+
+
+ + ♻ ☆ Improving the Evaluation and Actionability of Explanation Methods for + Multivariate Time Series Classification + + +
+ Explanation for Multivariate Time Series Classification (MTSC) is an +important yet underexplored topic. There are very few quantitative +evaluation methodologies and even fewer examples of actionable explanation, +where the explanation methods are shown to objectively improve specific +computational tasks on time series data. In this paper, we focus on analyzing +InterpretTime, a recent evaluation methodology for attribution methods applied +to MTSC. We showcase some significant weaknesses of the original methodology +and propose ideas to improve both its accuracy and efficiency. Unlike related +work, we go beyond evaluation and also showcase the actionability of the +produced explainer ranking, by using the best attribution methods for the task +of channel selection in MTSC. We find that perturbation-based methods such as +SHAP and Feature Ablation work well across a set of datasets, classifiers and +tasks and outperform gradient-based methods. We apply the best-ranked +explainers to channel selection for MTSC and show significant data size +reduction and improved classifier accuracy. + +
+
+
+
+
+ + ♻ ☆ Investigating the ability of deep learning to predict Welding Depth and + Pore Volume in Hairpin Welding + + +
+ To advance quality assurance in the welding process, this study presents a +deep learning (DL) model that enables the prediction of two critical weld Key +Performance Characteristics (KPCs): welding depth and average pore volume. In +the proposed approach, a wide range of laser welding Key Input Characteristics +(KICs) is utilized, including welding beam geometries, welding feed rates, path +repetitions for weld beam geometries, and bright light weld ratios for all +paths, all of which were obtained from hairpin welding experiments. Two DL +networks are employed with multiple hidden dense layers and linear activation +functions to investigate the capabilities of deep neural networks in capturing +the complex nonlinear relationships between the welding input and output +variables (KICs and KPCs). Applying DL networks to the small numerical +experimental hairpin welding dataset has shown promising results, achieving +Mean Absolute Error (MAE) values of 0.1079 for predicting welding depth and 0.0641 +for average pore volume. This, in turn, promises significant advantages in +controlling welding outcomes, moving beyond the current trend of relying only +on defect classification in weld monitoring, to capture the correlation between +the weld parameters and weld geometries. + +
+
+
+
+
+ + ♻ ☆ Localising the Seizure Onset Zone from Single-Pulse Electrical + Stimulation Responses with a CNN Transformer + + +
+ Epilepsy is one of the most common neurological disorders, often requiring +surgical intervention when medication fails to control seizures. For effective +surgical outcomes, precise localisation of the epileptogenic focus - often +approximated through the Seizure Onset Zone (SOZ) - is critical yet remains a +challenge. Active probing through electrical stimulation is already standard +clinical practice for identifying epileptogenic areas. Our study advances the +application of deep learning for SOZ localisation using Single-Pulse Electrical +Stimulation (SPES) responses, with two key contributions. Firstly, we implement +an existing deep learning model to compare two SPES analysis paradigms: +divergent and convergent. These paradigms evaluate outward and inward effective +connections, respectively. We assess the generalisability of these models to +unseen patients and electrode placements using held-out test sets. Our findings +reveal a notable improvement in moving from a divergent (AUROC: 0.574) to a +convergent approach (AUROC: 0.666), marking the first application of the latter +in this context. Secondly, we demonstrate the efficacy of CNN Transformers with +cross-channel attention in handling heterogeneous electrode placements, +increasing the AUROC to 0.730. These findings represent a significant step in +modelling patient-specific intracranial EEG electrode placements in SPES. +Future work will explore integrating these models into clinical decision-making +processes to bridge the gap between deep learning research and practical +healthcare applications. + +
+
+ comment: 21 pages, 6 figures, accepted at Machine Learning for Healthcare 2024 +
+
+
+
+
+ + ♻ ☆ Generating $SROI^-$ Ontologies via Knowledge Graph Query Embedding + Learning ECAI 2024 + + +
+ Query embedding approaches answer complex logical queries over incomplete +knowledge graphs (KGs) by computing and operating on low-dimensional vector +representations of entities, relations, and queries. However, current query +embedding models heavily rely on excessively parameterized neural networks and +cannot explain the knowledge learned from the graph. We propose a novel query +embedding method, AConE, which explains the knowledge learned from the graph in +the form of $SROI^-$ description logic axioms while being more +parameter-efficient than most existing approaches. AConE associates queries to +a $SROI^-$ description logic concept. Every $SROI^-$ concept is embedded as a +cone in complex vector space, and each $SROI^-$ relation is embedded as a +transformation that rotates and scales cones. We show theoretically that AConE +can learn $SROI^-$ axioms, and defines an algebra whose operations correspond +one to one to $SROI^-$ description logic concept constructs. Our empirical +study on multiple query datasets shows that AConE achieves superior results +over previous baselines with fewer parameters. Notably on the WN18RR dataset, +AConE achieves significant improvement over baseline models. We provide +comprehensive analyses showing that the capability to represent axioms +positively impacts the results of query answering. + +
+
+ comment: Accepted by ECAI 2024 +
+
+
+
+
+ + ♻ ☆ A Survey on LoRA of Large Language Models + + +
+ Low-Rank Adaptation~(LoRA), which updates the dense neural network layers +with pluggable low-rank matrices, is one of the best-performing parameter-efficient +fine-tuning paradigms. Furthermore, it has significant advantages in +cross-task generalization and privacy preservation. Hence, LoRA has gained much +attention recently, and the volume of related literature has grown +exponentially. It is necessary to conduct a comprehensive overview of the +current progress on LoRA. This survey categorizes and reviews the progress from +the perspectives of (1) downstream adaptation improving variants that improve +LoRA's performance on downstream tasks; (2) cross-task generalization methods +that mix multiple LoRA plugins to achieve cross-task generalization; (3) +efficiency-improving methods that boost the computation-efficiency of LoRA; (4) +data privacy-preserving methods that use LoRA in federated learning; (5) +applications. Besides, this survey also discusses the future directions in this +field. Finally, we provide a GitHub +page~\footnote{\href{https://github.com/ZJU-LLMs/Awesome-LoRAs.git}{https://github.com/ZJU-LLMs/Awesome-LoRAs.git}} +for readers to check the updates and initiate discussions on this survey paper. + +
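+ As background for the survey above, a minimal LoRA adapter freezes the dense layer $W_0$ and learns a pluggable low-rank update $BA$ scaled by $\alpha/r$. A self-contained PyTorch sketch, not tied to any specific library reviewed in the survey:

```python
import torch
import torch.nn as nn

class LoRALinear(nn.Module):
    """Minimal LoRA adapter: y = W0 x + (alpha / r) * B A x, with W0 frozen."""

    def __init__(self, base: nn.Linear, r: int = 8, alpha: int = 16):
        super().__init__()
        self.base = base
        for p in self.base.parameters():
            p.requires_grad = False                                 # freeze the dense layer
        self.A = nn.Parameter(torch.randn(r, base.in_features) * 0.01)
        self.B = nn.Parameter(torch.zeros(base.out_features, r))    # zero-init: no change at start
        self.scaling = alpha / r

    def forward(self, x):
        return self.base(x) + self.scaling * (x @ self.A.T @ self.B.T)

layer = LoRALinear(nn.Linear(64, 64))
print(layer(torch.randn(2, 64)).shape)  # torch.Size([2, 64])
```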
+
+
+
+
+ + ♻ ☆ A Look at Value-Based Decision-Time vs. Background Planning Methods + Across Different Settings + + +
+ In model-based reinforcement learning (RL), an agent can leverage a learned +model to improve its way of behaving in different ways. Two of the prevalent +ways to do this are through decision-time and background planning methods. In +this study, we are interested in understanding how the value-based versions of +these two planning methods will compare against each other across different +settings. Towards this goal, we first consider the simplest instantiations of +value-based decision-time and background planning methods and provide +theoretical results on which one will perform better in the regular RL and +transfer learning settings. Then, we consider the modern instantiations of them +and provide hypotheses on which one will perform better in the same settings. +Finally, we perform illustrative experiments to validate these theoretical +results and hypotheses. Overall, our findings suggest that even though +value-based versions of the two planning methods perform on par in their +simplest instantiations, the modern instantiations of value-based decision-time +planning methods can perform on par or better than the modern instantiations of +value-based background planning methods in both the regular RL and transfer +learning settings. + +
+
+ comment: Accepted to EWRL 2024 +
+
+
+
+
+ + ♻ ☆ Identifying Three-Dimensional Radiative Patterns Associated with Early + Tropical Cyclone Intensification + + +
+ Cloud radiative feedback impacts early tropical cyclone (TC) intensification, +but limitations in existing diagnostic frameworks make them unsuitable for +studying asymmetric or transient radiative heating. We propose a linear +Variational Encoder-Decoder (VED) to learn the hidden relationship between +radiation and the surface intensification of realistic simulated TCs. Limiting +VED model inputs enables using its uncertainty to identify periods when +radiation has more importance for intensification. A close examination of the +extracted 3D radiative structures suggests that longwave radiative forcing from +inner core deep convection and shallow clouds both contribute to +intensification, with the deep convection having the most impact overall. We +find that deep convection downwind of the shallow clouds is critical to the +intensification of Haiyan. Our work demonstrates that machine learning can +discover thermodynamic-kinematic relationships without relying on axisymmetric +or deterministic assumptions, paving the way towards the objective discovery of +processes leading to TC intensification in realistic conditions. + +
+
+ comment: 15 pages, 6 figures (main text) +
+
+
+
+
+ + ♻ ☆ Improving Bias Correction Standards by Quantifying its Effects on + Treatment Outcomes ECML + + +
+ With the growing access to administrative health databases, retrospective +studies have become crucial evidence for medical treatments. Yet, +non-randomized studies frequently face selection biases, requiring mitigation +strategies. Propensity score matching (PSM) addresses these biases by selecting +comparable populations, allowing for analysis without further methodological +constraints. However, PSM has several drawbacks. Different matching methods can +produce significantly different Average Treatment Effects (ATE) for the same +task, even when meeting all validation criteria. To prevent cherry-picking the +best method, public authorities must involve field experts and engage in +extensive discussions with researchers. + To address this issue, we introduce a novel metric, A2A, to reduce the number +of valid matches. A2A constructs artificial matching tasks that mirror the +original ones but with known outcomes, assessing each matching method's +performance comprehensively from propensity estimation to ATE estimation. When +combined with Standardized Mean Difference, A2A enhances the precision of model +selection, resulting in a reduction of up to 50% in ATE estimation errors +across synthetic tasks and up to 90% in predicted ATE variability across both +synthetic and real-world datasets. To our knowledge, A2A is the first metric +capable of evaluating outcome correction accuracy using covariates not involved +in selection. + Since computing A2A requires solving hundreds of PSMs, we automate all +manual steps of the PSM pipeline. We integrate PSM methods from Python and R, +our automated pipeline, a new metric, and reproducible experiments into +popmatch, our new Python package, to enhance reproducibility and accessibility +to bias correction methods. + +
+
+ comment: ECML PKDD 2024, 18 pages, 2 figures, 5 tables +
+
+
+
+
+ + ♻ ☆ Manifold Learning by Mixture Models of VAEs for Inverse Problems + + +
+ Representing a manifold of very high-dimensional data with generative models +has been shown to be computationally efficient in practice. However, this +requires that the data manifold admits a global parameterization. In order to +represent manifolds of arbitrary topology, we propose to learn a mixture model +of variational autoencoders. Here, every encoder-decoder pair represents one +chart of a manifold. We propose a loss function for maximum likelihood +estimation of the model weights and choose an architecture that provides us the +analytical expression of the charts and of their inverses. Once the manifold is +learned, we use it for solving inverse problems by minimizing a data fidelity +term restricted to the learned manifold. To solve the arising minimization +problem we propose a Riemannian gradient descent algorithm on the learned +manifold. We demonstrate the performance of our method for low-dimensional toy +examples as well as for deblurring and electrical impedance tomography on +certain image manifolds. + +
+
+
+
+
+ + ♻ ☆ Helios: An extremely low power event-based gesture recognition for + always-on smart eyewear ECCV + + +
+ This paper introduces Helios, the first extremely low-power, real-time, +event-based hand gesture recognition system designed for all-day use on smart +eyewear. As augmented reality (AR) evolves, current smart glasses like the Meta +Ray-Bans prioritize visual and wearable comfort at the expense of +functionality. Existing human-machine interfaces (HMIs) in these devices, such +as capacitive touch and voice controls, present limitations in ergonomics, +privacy and power consumption. Helios addresses these challenges by leveraging +natural hand interactions for a more intuitive and comfortable user experience. +Our system utilizes an extremely low-power and compact 3mmx4mm/20mW event camera +to perform natural hand-based gesture recognition for always-on smart eyewear. +The camera's output is processed by a convolutional neural network (CNN) +running on an NXP Nano UltraLite compute platform, consuming less than 350mW. +Helios can recognize seven classes of gestures, including subtle microgestures +like swipes and pinches, with 91% accuracy. We also demonstrate real-time +performance across 20 users at a remarkably low latency of 60ms. Our user +testing results align with the positive feedback we received during our recent +successful demo at AWE-USA-2024. + +
+
+ comment: Accepted at ECCV-Integrating Computer Vision in Smart Eyewear, 2024. + 18 pages, 10 figures. First three authors contributed equally to this paper +
+
+
+
+
+ + ♻ ☆ IN-Sight: Interactive Navigation through Sight IROS 2024 + + +
+ Current visual navigation systems often treat the environment as static, +lacking the ability to adaptively interact with obstacles. This limitation +leads to navigation failure when encountering unavoidable obstructions. In +response, we introduce IN-Sight, a novel approach to self-supervised path +planning, enabling more effective navigation strategies through interaction +with obstacles. Utilizing RGB-D observations, IN-Sight calculates +traversability scores and incorporates them into a semantic map, facilitating +long-range path planning in complex, maze-like environments. To precisely +navigate around obstacles, IN-Sight employs a local planner, trained +imperatively on a differentiable costmap using representation learning +techniques. The entire framework undergoes end-to-end training within the +state-of-the-art photorealistic Intel SPEAR Simulator. We validate the +effectiveness of IN-Sight through extensive benchmarking in a variety of +simulated scenarios and ablation studies. Moreover, we demonstrate the system's +real-world applicability with zero-shot sim-to-real transfer, deploying our +planner on the legged robot platform ANYmal, showcasing its practical potential +for interactive navigation in real environments. + +
+
+ comment: The 2024 IEEE/RSJ International Conference on Intelligent Robots and + Systems (IROS 2024) +
+
+
+
+
+ + ♻ ☆ Integrating Present and Past in Unsupervised Continual Learning + + +
+ We formulate a unifying framework for unsupervised continual learning (UCL), +which disentangles learning objectives that are specific to the present and the +past data, encompassing stability, plasticity, and cross-task consolidation. +The framework reveals that many existing UCL approaches overlook cross-task +consolidation and try to balance plasticity and stability in a shared embedding +space. This results in worse performance due to a lack of within-task data +diversity and reduced effectiveness in learning the current task. Our method, +Osiris, which explicitly optimizes all three objectives on separate embedding +spaces, achieves state-of-the-art performance on all benchmarks, including two +novel benchmarks proposed in this paper featuring semantically structured task +sequences. Compared to standard benchmarks, these two structured benchmarks +more closely resemble visual signals received by humans and animals when +navigating real-world environments. Finally, we show some preliminary evidence +that continual models can benefit from such realistic learning scenarios. + +
+
+ comment: CoLLAs 2024 (Oral) +
+
+
+
+
+ + ♻ ☆ cDVGAN: One Flexible Model for Multi-class Gravitational Wave Signal and + Glitch Generation + + +
+ Simulating realistic time-domain observations of gravitational waves (GWs) +and GW detector glitches can help in advancing GW data analysis. Simulated data +can be used in downstream tasks by augmenting datasets for signal searches, +balancing data sets for machine learning, and validating detection schemes. In +this work, we present Conditional Derivative GAN (cDVGAN), a novel conditional +model in the Generative Adversarial Network framework for simulating multiple +classes of time-domain observations that represent gravitational waves (GWs) +and detector glitches. cDVGAN can also generate generalized hybrid samples that +span the variation between classes through interpolation in the conditioned +class vector. cDVGAN introduces an additional player into the typical 2-player +adversarial game of GANs, where an auxiliary discriminator analyzes the +first-order derivative time-series. Our results show that this provides +synthetic data that better captures the features of the original data. cDVGAN +conditions on three classes, two denoised from LIGO blip and tomte glitch +events from its 3rd observing run (O3), and the third representing binary black +hole (BBH) mergers. Our proposed cDVGAN outperforms 4 different baseline GAN +models in replicating the features of the three classes. Specifically, our +experiments show that training convolutional neural networks (CNNs) with our +cDVGAN-generated data improves the detection of samples embedded in detector +noise beyond the synthetic data from other state-of-the-art GAN models. Our +best synthetic dataset yields as much as a 4.2% increase in +area-under-the-curve (AUC) performance compared to synthetic datasets from +baseline GANs. Moreover, training the CNN with hybrid samples from our cDVGAN +outperforms CNNs trained only on the standard classes, when identifying real +samples embedded in LIGO detector background (4% AUC improvement for cDVGAN). + +
+
+ comment: 20 pages, 17 figures, 5 tables +
+
+
+
+
+ + ♻ ☆ Doubly Stochastic Adaptive Neighbors Clustering via the Marcus Mapping + + +
+ Clustering is a fundamental task in machine learning and data science, and +similarity graph-based clustering is an important approach within this domain. +Doubly stochastic symmetric similarity graphs provide numerous benefits for +clustering problems and downstream tasks, yet learning such graphs remains a +significant challenge. Marcus theorem states that a strictly positive symmetric +matrix can be transformed into a doubly stochastic symmetric matrix by diagonal +matrices. However, in clustering, learning sparse matrices is crucial for +computational efficiency. We extend Marcus theorem by proposing the Marcus +mapping, which indicates that certain sparse matrices can also be transformed +into doubly stochastic symmetric matrices via diagonal matrices. Additionally, +we introduce rank constraints into the clustering problem and propose the +Doubly Stochastic Adaptive Neighbors Clustering algorithm based on the Marcus +Mapping (ANCMM). This ensures that the learned graph naturally divides into the +desired number of clusters. We validate the effectiveness of our algorithm +through extensive comparisons with state-of-the-art algorithms. Finally, we +explore the relationship between the Marcus mapping and optimal transport. We +prove that the Marcus mapping solves a specific type of optimal transport +problem and demonstrate that solving this problem through Marcus mapping is +more efficient than directly applying optimal transport methods. + +
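+ (Illustrative sketch.) For a strictly positive symmetric similarity matrix, the diagonal scaling discussed above can be computed with a symmetric Sinkhorn-style fixed-point iteration, $P = DSD$. This is the classical dense case only, not the paper's Marcus-mapping algorithm for sparse matrices:

```python
import numpy as np

def symmetric_doubly_stochastic(S, n_iter=500, tol=1e-9):
    """Scale a strictly positive symmetric matrix S into a doubly stochastic
    symmetric matrix P = D S D using a damped fixed-point iteration on the
    diagonal of D (Sinkhorn-Knopp style)."""
    d = np.ones(S.shape[0])
    for _ in range(n_iter):
        row_sums = S @ d * d                 # current row sums of D S D
        d_new = d / np.sqrt(row_sums)
        if np.max(np.abs(d_new - d)) < tol:
            d = d_new
            break
        d = d_new
    return (d[:, None] * S) * d[None, :]

S = np.array([[2.0, 1.0, 0.5], [1.0, 3.0, 1.0], [0.5, 1.0, 2.0]])
P = symmetric_doubly_stochastic(S)
print(P.sum(axis=0).round(6), P.sum(axis=1).round(6))  # both ~[1, 1, 1]
```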
+
+
+
+
+ + ♻ ☆ Achieving More with Less: A Tensor-Optimization-Powered Ensemble Method + + +
+ Ensemble learning is a method that leverages weak learners to produce a +strong learner. However, obtaining a large number of base learners requires +substantial time and computational resources. Therefore, it is meaningful to +study how to achieve the performance typically obtained with many base learners +using only a few. We argue that to achieve this, it is essential to enhance +both classification performance and generalization ability during the ensemble +process. To increase model accuracy, each weak base learner needs to be more +efficiently integrated. It is observed that different base learners exhibit +varying levels of accuracy in predicting different classes. To capitalize on +this, we introduce a confidence tensor $\tilde{\mathbf{\Theta}}$, where +$\tilde{\mathbf{\Theta}}_{rst}$ signifies the degree of confidence that the +$t$-th base classifier assigns a sample to class $r$ when it actually +belongs to class $s$. To the best of our knowledge, this is the first time an +evaluation of the performance of base classifiers across different classes has +been proposed. The proposed confidence tensor compensates for the strengths and +weaknesses of each base classifier in different classes, enabling the method to +achieve superior results with a smaller number of base learners. To enhance +generalization performance, we design a smooth and convex objective function +that leverages the concept of margin, making the strong learner more +discriminative. Furthermore, it is proved that in the gradient matrix of the loss +function, the sum of each column's elements is zero, allowing us to solve a +constrained optimization problem using gradient-based methods. We then compare +our algorithm with random forests of ten times the size and other classical +methods across numerous datasets, demonstrating the superiority of our +approach. + +
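+ (Hedged sketch.) One plausible way to estimate a confidence tensor of the kind described above is to average, per base classifier, the predicted class probabilities over held-out samples grouped by their true class. The estimator below is illustrative, not the paper's procedure:

```python
import numpy as np

def confidence_tensor(probas_per_clf, y_true, n_classes):
    """Theta[r, s, t]: average confidence with which base classifier t assigns
    class r to samples whose true class is s (estimated on held-out predictions)."""
    T = len(probas_per_clf)
    theta = np.zeros((n_classes, n_classes, T))
    for t, P in enumerate(probas_per_clf):          # P: (n_samples, n_classes)
        for s in range(n_classes):
            mask = (y_true == s)
            if mask.any():
                theta[:, s, t] = P[mask].mean(axis=0)
    return theta

rng = np.random.default_rng(0)
y = rng.integers(0, 3, size=30)
probas = [rng.dirichlet(np.ones(3), size=30) for _ in range(4)]  # 4 toy base classifiers
print(confidence_tensor(probas, y, 3).shape)  # (3, 3, 4)
```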
+
+
+
+
+ + ♻ ☆ Representation Learning with Conditional Information Flow Maximization ACL 2024 + + +
+ This paper proposes an information-theoretic representation learning +framework, named conditional information flow maximization, to extract +noise-invariant sufficient representations for the input data and target task. +It encourages the learned representations to have good feature uniformity and +sufficient predictive ability, which can enhance the generalization of +pre-trained language models (PLMs) for the target task. Firstly, an information +flow maximization principle is proposed to learn more sufficient +representations for the input and target by simultaneously maximizing both +input-representation and representation-label mutual information. Unlike the +information bottleneck, we handle the input-representation information in an +opposite way to avoid the over-compression issue of latent representations. +Besides, to mitigate the negative effect of potential redundant features from +the input, we design a conditional information minimization principle to +eliminate negative redundant features while preserving noise-invariant features. +Experiments on 13 language understanding benchmarks demonstrate that our method +effectively improves the performance of PLMs for classification and regression. +Extensive experiments show that the learned representations are more +sufficient, robust and transferable. + +
+
+ comment: 16 pages, accepted to ACL 2024 (main conference), the code is + available at https://github.com/zerohd4869/CIFM +
+
+
+
+
+ + ♻ ☆ FDApy: a Python package for functional data + + +
+ We introduce FDApy, an open-source Python package for the analysis of +functional data. The package provides tools for the representation of +(multivariate) functional data defined on different dimensional domains and for +functional data that is irregularly sampled. Additionally, dimension reduction +techniques are implemented for multivariate and/or multidimensional functional +data that are regularly or irregularly sampled. A toolbox for generating +functional datasets is also provided. The documentation includes installation +and usage instructions, examples on simulated and real datasets and a complete +description of the API. FDApy is released under the MIT license. The code and +documentation are available at https://github.com/StevenGolovkine/FDApy. + +
+
+ comment: 18 pages, 11 figures +
+
+
+
+
+ + ♻ ☆ Confidence Self-Calibration for Multi-Label Class-Incremental Learning ECCV + + +
+ The partial label challenge in Multi-Label Class-Incremental Learning (MLCIL) +arises when only the new classes are labeled during training, while past and +future labels remain unavailable. This issue leads to a proliferation of +false-positive errors due to erroneously high confidence multi-label +predictions, exacerbating catastrophic forgetting within the disjoint label +space. In this paper, we aim to refine multi-label confidence calibration in +MLCIL and propose a Confidence Self-Calibration (CSC) approach. Firstly, for +label relationship calibration, we introduce a class-incremental graph +convolutional network that bridges the isolated label spaces by constructing a +learnable, dynamically extended label relationship graph. Then, for confidence +calibration, we present a max-entropy regularization for each multi-label +increment, facilitating confidence self-calibration through the penalization of +over-confident output distributions. Our approach attains new state-of-the-art +results in MLCIL tasks on both the MS-COCO and PASCAL VOC datasets, with the +calibration of label confidences confirmed through our methodology. + +
+
+ comment: Accepted at the European Conference on Computer Vision (ECCV) 2024 +
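+ (Hedged sketch.) The max-entropy regularization mentioned above can be illustrated for per-label Bernoulli outputs: rewarding entropy penalizes over-confident probabilities near 0 or 1. This is a generic formulation, not necessarily the paper's exact term:

```python
import torch

def max_entropy_penalty(logits):
    """Max-entropy regulariser for multi-label outputs: reward the entropy of each
    per-label Bernoulli distribution, so over-confident predictions are penalised."""
    p = torch.sigmoid(logits)
    entropy = -(p * torch.log(p + 1e-8) + (1 - p) * torch.log(1 - p + 1e-8))
    return -entropy.mean()   # minimising this pushes probabilities away from 0 and 1

logits = torch.tensor([[4.0, -3.0, 0.2], [2.5, 0.0, -5.0]])
print(float(max_entropy_penalty(logits)))  # closer to 0 means more over-confident
```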
+
+
+
+
+ + ♻ ☆ Dense Hopfield Networks in the Teacher-Student Setting + + +
+ Dense Hopfield networks are known for their feature-to-prototype transition +and adversarial robustness. However, previous theoretical studies have been +mostly concerned with their storage capacity. We bridge this gap by studying +the phase diagram of p-body Hopfield networks in the teacher-student setting of +an unsupervised learning problem, uncovering ferromagnetic phases reminiscent +of the prototype and feature learning regimes. On the Nishimori line, we find +the critical size of the training set necessary for efficient pattern +retrieval. Interestingly, we find that the paramagnetic to ferromagnetic +transition of the teacher-student setting coincides with the paramagnetic to +spin-glass transition of the direct model, i.e. with random patterns. Outside +of the Nishimori line, we investigate the learning performance in relation to +the inference temperature and dataset noise. Moreover, we show that using a +larger p for the student than the teacher gives the student an extensive +tolerance to noise. We then derive a closed-form expression measuring the +adversarial robustness of such a student at zero temperature, corroborating the +positive correlation between number of parameters and robustness observed in +large neural networks. We also use our model to clarify why the prototype phase +of modern Hopfield networks is adversarially robust. + +
+
+ comment: 34 pages, 9 figures, updated to match published version, implemented + minor changes proposed in referee reports +
+
+
+
+
+ + ♻ ☆ Control-A-Video: Controllable Text-to-Video Diffusion Models with Motion + Prior and Reward Feedback Learning + + +
+ Recent advances in text-to-image (T2I) diffusion models have enabled +impressive image generation capabilities guided by text prompts. However, +extending these techniques to video generation remains challenging, with +existing text-to-video (T2V) methods often struggling to produce high-quality +and motion-consistent videos. In this work, we introduce Control-A-Video, a +controllable T2V diffusion model that can generate videos conditioned on text +prompts and reference control maps like edge and depth maps. To tackle video +quality and motion consistency issues, we propose novel strategies to +incorporate content prior and motion prior into the diffusion-based generation +process. Specifically, we employ a first-frame condition scheme to transfer +video generation from the image domain. Additionally, we introduce +residual-based and optical flow-based noise initialization to infuse motion +priors from reference videos, promoting relevance among frame latents for +reduced flickering. Furthermore, we present a Spatio-Temporal Reward Feedback +Learning (ST-ReFL) algorithm that optimizes the video diffusion model using +multiple reward models for video quality and motion consistency, leading to +superior outputs. Comprehensive experiments demonstrate that our framework +generates higher-quality, more consistent videos compared to existing +state-of-the-art methods in controllable text-to-video generation + +
+
+
+
+
+ + ♻ ☆ A Natural Gas Consumption Forecasting System for Continual Learning + Scenarios based on Hoeffding Trees with Change Point Detection Mechanism + + +
+ Forecasting natural gas consumption, considering seasonality and trends, is +crucial for planning its supply and consumption and for optimizing procurement +costs, particularly for industrial entities. However, in times of threats to +its supply, it is also a critical element that guarantees the supply of this +raw material to meet individual consumers' needs, ensuring society's energy +security. This article introduces a novel multistep-ahead forecasting approach for +natural gas consumption that integrates change point detection for model +collection selection with continual learning capabilities using data stream +processing. The performance of the forecasting models based on the proposed +approach is evaluated in a complex real-world use case of natural gas +consumption forecasting. We employed Hoeffding tree predictors as forecasting +models and the Pruned Exact Linear Time (PELT) algorithm for the change point +detection procedure. The change point detection integration enables selecting a +different model collection for successive time frames. Thus, three model +collection selection procedures (with and without an error feedback loop) are +defined and evaluated for forecasting scenarios with various densities of +detected change points. These models were compared with change-point-agnostic +baseline approaches. Our experiments show that fewer change points result in a +lower forecasting error regardless of the model collection selection procedure +employed. Also, simpler model collection selection procedures omitting +forecasting error feedback lead to more robust forecasting models suitable for +continual learning tasks. + +
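+ (Hedged sketch, assuming the third-party ruptures and river packages.) A heavily simplified version of the pipeline above: detect change points offline with PELT, then keep one Hoeffding tree per detected regime and update it in a streaming fashion. The data, features, and routing logic are toy stand-ins, not the paper's actual pipeline:

```python
import numpy as np
import ruptures as rpt          # PELT change point detection (assumed installed)
from river import tree          # streaming Hoeffding tree regressor (assumed installed)

rng = np.random.default_rng(0)
consumption = np.concatenate([rng.normal(10, 1, 200), rng.normal(15, 1, 200)])  # synthetic series

# 1) Detect change points on the history observed so far.
breaks = rpt.Pelt(model="rbf").fit(consumption).predict(pen=10)   # breakpoints, last one = series end
segments = list(zip([0] + breaks[:-1], breaks))

# 2) One Hoeffding tree per detected regime; stream each sample to its regime's model.
models = [tree.HoeffdingTreeRegressor() for _ in segments]
errors = []
for t in range(1, len(consumption)):
    i = next(j for j, (s, e) in enumerate(segments) if s <= t < e)
    x = {"lag_1": float(consumption[t - 1])}
    errors.append(abs(models[i].predict_one(x) - consumption[t]))  # one-step-ahead forecast error
    models[i].learn_one(x, float(consumption[t]))                  # continual (streaming) update

print("change points:", breaks, "| MAE:", round(float(np.mean(errors)), 3))
```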
+
+
+
+
+ + ♻ ☆ A Deep Learning Method for Simultaneous Denoising and Missing Wedge + Reconstruction in Cryogenic Electron Tomography + + +
+ Cryogenic electron tomography is a technique for imaging biological samples +in 3D. A microscope collects a series of 2D projections of the sample, and the +goal is to reconstruct the 3D density of the sample, called the tomogram. +Reconstruction is difficult as the 2D projections are noisy and cannot be +recorded from all directions, resulting in a missing wedge of information. +Tomograms conventionally reconstructed with filtered back-projection suffer +from noise and strong artifacts due to the missing wedge. Here, we propose a +deep-learning approach for simultaneous denoising and missing wedge +reconstruction called DeepDeWedge. The algorithm requires no ground truth data +and is based on fitting a neural network to the 2D projections using a +self-supervised loss. DeepDeWedge is simpler than current state-of-the-art +approaches for denoising and missing wedge reconstruction, performs +competitively, and produces more thoroughly denoised tomograms with higher overall +contrast. + +
+
+
+
+
+ + ♻ ☆ Protecting Copyrighted Material with Unique Identifiers in Large + Language Model Training + + +
+ A major public concern regarding the training of large language models (LLMs) +is whether they abuse copyrighted online text. Previous membership inference +methods may be misled by similar examples in vast amounts of training data. +Additionally, these methods are often too complex for general users to +understand and use, making them centralized and lacking in transparency and +trustworthiness. To address these issues, we propose an alternative +\textit{insert-and-detection} methodology, advocating that web users and +content platforms employ \textbf{\textit{unique identifiers}} for reliable and +independent membership inference. Users and platforms can create their own +identifiers, embed them in copyrighted text, and independently detect them in +future LLMs. As an initial demonstration, we introduce \textit{ghost +sentences}, a primitive form of unique identifiers, consisting primarily of +passphrases made up of random words. By embedding a ghost sentence in a few +copyrighted texts, users can detect its membership using a perplexity test and +a \textit{user-friendly} last-$k$ words test. The perplexity test is based on +the fact that LLMs trained on natural language should exhibit high perplexity +when encountering unnatural passphrases. As the repetition increases, users can +leverage the verbatim memorization ability of LLMs to perform a last-$k$ words +test by chatting with LLMs without writing any code. Both tests offer rigorous +statistical guarantees for membership inference. For LLaMA-13B, a perplexity +test on 30 ghost sentences with an average of 7 repetitions in 148K examples +yields a 0.891 ROC AUC. For the last-$k$ words test with OpenLLaMA-3B, 11 out +of 16 users, with an average of 24 examples each, successfully identify their +data from 1.8M examples. + +
+
+ comment: Preprint, work in progress +
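+ (Illustrative sketch.) The perplexity test above reduces to comparing $\exp(-\tfrac{1}{n}\sum_i \log p_i)$ over the passphrase tokens under the suspect model; the per-token log-probabilities below are made up purely for illustration.

```python
import math

def passphrase_perplexity(token_logprobs):
    """Perplexity of a candidate ghost sentence from per-token log-probabilities
    produced by any language model: exp(-mean log p). A model that never saw the
    passphrase should assign it high perplexity; markedly low perplexity suggests
    the passphrase was memorised during training."""
    return math.exp(-sum(token_logprobs) / len(token_logprobs))

# Hypothetical per-token log-probs for the same passphrase under two models.
unseen_model = [-9.1, -8.7, -10.2, -9.8]    # high perplexity expected
trained_model = [-1.2, -0.8, -1.5, -0.9]    # suspiciously low perplexity
print(round(passphrase_perplexity(unseen_model)), round(passphrase_perplexity(trained_model), 2))
```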
+
+
+
+
+ + ♻ ☆ Addressing Maximization Bias in Reinforcement Learning with Two-Sample + Testing + + +
+ Value-based reinforcement-learning algorithms have shown strong results in +games, robotics, and other real-world applications. Overestimation bias is a +known threat to those algorithms and can sometimes lead to dramatic performance +decreases or even complete algorithmic failure. We frame the bias problem +statistically and consider it an instance of estimating the maximum expected +value (MEV) of a set of random variables. We propose the $T$-Estimator (TE) +based on two-sample testing for the mean, that flexibly interpolates between +over- and underestimation by adjusting the significance level of the underlying +hypothesis tests. We also introduce a generalization, termed $K$-Estimator +(KE), that obeys the same bias and variance bounds as the TE and relies on a +nearly arbitrary kernel function. We introduce modifications of $Q$-Learning +and the Bootstrapped Deep $Q$-Network (BDQN) using the TE and the KE, and prove +convergence in the tabular setting. Furthermore, we propose an adaptive variant +of the TE-based BDQN that dynamically adjusts the significance level to +minimize the absolute estimation bias. All proposed estimators and algorithms +are thoroughly tested and validated on diverse tasks and environments, +illustrating the bias control and performance potential of the TE and KE. + +
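+ (Hedged sketch.) The exact $T$-Estimator is defined in the paper; the code below shows one illustrative two-sample-test variant of maximum-expected-value estimation, where the significance level $\alpha$ interpolates between averaging all sample means (underestimation) and taking the plain maximum (overestimation):

```python
import numpy as np
from scipy import stats

def t_estimator_mev(samples_per_arm, alpha=0.05):
    """Illustrative two-sample-test estimator of the maximum expected value:
    keep every arm whose mean is not significantly below the empirically best
    arm (one-sided Welch t-test at level alpha) and average their sample means."""
    means = [np.mean(s) for s in samples_per_arm]
    best = int(np.argmax(means))
    kept = [means[best]]
    for i, s in enumerate(samples_per_arm):
        if i == best:
            continue
        # H1: arm i has a smaller mean than the empirically best arm
        _, p = stats.ttest_ind(s, samples_per_arm[best], equal_var=False, alternative="less")
        if p > alpha:                 # cannot conclude arm i is worse, so keep it
            kept.append(means[i])
    return float(np.mean(kept))

rng = np.random.default_rng(0)
arms = [rng.normal(mu, 1.0, size=50) for mu in (0.0, 0.0, 0.2)]
print(round(t_estimator_mev(arms), 3))
```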
+
+
+
+
+ + ♻ ☆ Between Randomness and Arbitrariness: Some Lessons for Reliable Machine + Learning at Scale + + +
+ To develop rigorous knowledge about ML models -- and the systems in which +they are embedded -- we need reliable measurements. But reliable measurement is +fundamentally challenging, and touches on issues of reproducibility, +scalability, uncertainty quantification, epistemology, and more. This +dissertation addresses criteria needed to take reliability seriously: both +criteria for designing meaningful metrics, and for methodologies that ensure +that we can dependably and efficiently measure these metrics at scale and in +practice. In doing so, this dissertation articulates a research vision for a +new field of scholarship at the intersection of machine learning, law, and +policy. Within this frame, we cover topics that fit under three different +themes: (1) quantifying and mitigating sources of arbitrariness in ML, (2) +taming randomness in uncertainty estimation and optimization algorithms, in +order to achieve scalability without sacrificing reliability, and (3) providing +methods for evaluating generative-AI systems, with specific focuses on +quantifying memorization in language models and training latent diffusion +models on open-licensed data. By making contributions in these three themes, +this dissertation serves as an empirical proof by example that research on +reliable measurement for machine learning is intimately and inescapably bound +up with research in law and policy. These different disciplines pose similar +research questions about reliable measurement in machine learning. They are, in +fact, two complementary sides of the same research vision, which, broadly +construed, aims to construct machine-learning systems that cohere with broader +societal values. + +
+
+ comment: Ph.D. Dissertation +
+
+
+
+
+ + ♻ ☆ Scalable Event-by-event Processing of Neuromorphic Sensory Signals With + Deep State-Space Models + + +
+ Event-based sensors are well suited for real-time processing due to their +fast response times and encoding of the sensory data as successive temporal +differences. These and other valuable properties, such as a high dynamic range, +are suppressed when the data is converted to a frame-based format. However, +most current methods either collapse events into frames or cannot scale up when +processing the event data directly event-by-event. In this work, we address the +key challenges of scaling up event-by-event modeling of the long event streams +emitted by such sensors, which is a particularly relevant problem for +neuromorphic computing. While prior methods can process up to a few thousand +time steps, our model, based on modern recurrent deep state-space models, +scales to event streams of millions of events for both training and +inference. We leverage their stable parameterization for learning long-range +dependencies, parallelizability along the sequence dimension, and their ability +to integrate asynchronous events effectively to scale them up to long event +streams. We further augment these with novel event-centric techniques enabling +our model to match or beat the state-of-the-art performance on several event +stream benchmarks. In the Spiking Speech Commands task, we improve +state-of-the-art by a large margin of 6.6% to 87.1%. On the DVS128-Gestures +dataset, we achieve competitive results without using frames or convolutional +neural networks. Our work demonstrates, for the first time, that it is possible +to use fully event-based processing with purely recurrent networks to achieve +state-of-the-art task performance in several event-based benchmarks. + +
+
+
+
+
+ + ♻ ☆ Exponentially improved efficient machine learning for quantum many-body + states with provable guarantees + + +
+ Solving the ground state and the ground-state properties of quantum many-body +systems is generically a hard task for classical algorithms. For a family of +Hamiltonians defined on an $m$-dimensional space of physical parameters, the +ground state and its properties at an arbitrary parameter configuration can be +predicted via a machine learning protocol up to a prescribed prediction error +$\varepsilon$, provided that a sample set (of size $N$) of the states can be +efficiently prepared and measured. In a recent work [Huang et al., Science 377, +eabk3333 (2022)], a rigorous guarantee for such a generalization was proved. +Unfortunately, an exponential scaling for the provable sample complexity, +$N=m^{{\cal{O}}\left(\frac{1}{\varepsilon}\right)}$, was found to be universal +for generic gapped Hamiltonians. This result applies to the situation where the +dimension of the parameter space is large while the scaling with the accuracy +is not an urgent factor. In this work, we consider an alternative scenario +where $m$ is a finite, not necessarily large constant while the scaling with +the prediction error becomes the central concern. By jointly preserving the +fundamental properties of density matrices in the learning protocol and +utilizing the continuity of quantum states in the parameter range of interest, +we rigorously obtain a polynomial sample complexity for predicting quantum +many-body states and their properties, with respect to the uniform prediction +error $\varepsilon$ and the number of qubits $n$. Moreover, if restricted to +learning local quantum-state properties, the number of samples with respect to +$n$ can be further reduced exponentially. Our results provide theoretical +guarantees for efficient learning of quantum many-body states and their +properties, with model-independent applications not restricted to ground states +of gapped Hamiltonians. + +
+
+ comment: Published on Physical Review Research 6, 033035 (2024) +
+
+
+
+
+ + ♻ ☆ Axiomatic Characterisations of Sample-based Explainers + + +
+ Explaining decisions of black-box classifiers is both important and
+computationally challenging. In this paper, we scrutinize explainers that
+generate feature-based explanations from samples or datasets. We start by
+presenting a set of desirable properties that explainers would ideally satisfy,
+delve into their relationships, and highlight incompatibilities of some of
+them. We identify the entire family of explainers that satisfy two key
+properties which are compatible with all the others. Its instances provide
+sufficient reasons, called weak abductive explanations. We then unravel its
+various subfamilies that satisfy subsets of compatible properties. Indeed, we
+fully characterize all the explainers that satisfy any subset of compatible
+properties. In particular, we introduce the first (broad family of) explainers
+that guarantee the existence of explanations and their global consistency. We
+discuss some of its instances including the irrefutable explainer and the
+surrogate explainer whose explanations can be found in polynomial time.
+
+
+
+
+
+ + ♻ ☆ Encoding Temporal Statistical-space Priors via Augmented Representation IJCAI 2024 + + +
+ Modeling time series data remains a pervasive issue as the temporal dimension
+is inherent to numerous domains. Despite significant strides in time series
+forecasting, high noise-to-signal ratio, non-normality, non-stationarity, and
+lack of data continue to challenge practitioners. In response, we leverage a
+simple representation augmentation technique to overcome these challenges. Our
+augmented representation acts as a statistical-space prior encoded at each time
+step. Accordingly, we name our method Statistical-space Augmented
+Representation (SSAR). The underlying high-dimensional data-generating process
+inspires our representation augmentation. We rigorously examine the empirical
+generalization performance on two data sets with two downstream temporal
+learning algorithms. Our approach significantly outperforms all five up-to-date
+baselines. Moreover, the highly modular nature of our approach allows it to be
+easily applied to various settings. Lastly, fully-fledged theoretical
+perspectives are provided throughout the paper for a clear and rigorous
+understanding.
+
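One plausible reading of a "statistical-space prior encoded at each time step" is to append trailing-window statistics to the raw value at every step. The sketch below illustrates that reading only; the chosen statistics and window size are assumptions, and the paper's exact construction may differ.

```python
import numpy as np

def augment(series, window=16):
    """Augment each time step with statistics of a trailing window."""
    series = np.asarray(series, dtype=float)
    rows = []
    for t in range(len(series)):
        w = series[max(0, t - window + 1): t + 1]
        rows.append([series[t], w.mean(), w.std(), w.min(), w.max()])
    return np.asarray(rows)  # shape [T, 5]: raw value plus four statistical features

print(augment(np.sin(np.linspace(0, 6, 50))).shape)
```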
+
+ comment: IJCAI 2024 STRL Workshop (Oral) +
+
+
+
+
+ + ♻ ☆ MMICT: Boosting Multi-Modal Fine-Tuning with In-Context Examples + + +
+ Although In-Context Learning (ICL) brings remarkable performance gains to +Large Language Models (LLMs), the improvements remain lower than fine-tuning on +downstream tasks. This paper introduces Multi-Modal In-Context Tuning (MMICT), +a novel multi-modal fine-tuning paradigm that boosts multi-modal fine-tuning by +fully leveraging the promising ICL capability of multi-modal LLMs (MM-LLMs). We +propose the Multi-Modal Hub (M-Hub), a unified module that captures various +multi-modal features according to different inputs and objectives. Based on +M-Hub, MMICT enables MM-LLMs to learn from in-context visual-guided textual +features and subsequently generate outputs conditioned on the textual-guided +visual features. Moreover, leveraging the flexibility of M-Hub, we design a +variety of in-context demonstrations. Extensive experiments on a diverse range +of downstream multi-modal tasks demonstrate that MMICT significantly +outperforms traditional fine-tuning strategy and the vanilla ICT method that +directly takes the concatenation of all information from different modalities +as input. Our implementation is available at: +https://github.com/KDEGroup/MMICT. + +
+
+ comment: TOMM 2024 +
+
+
+
+
+ + ♻ ☆ Unsupervised Anomaly Detection in Time-series: An Extensive Evaluation + and Analysis of State-of-the-art Methods + + +
+ Unsupervised anomaly detection in time-series has been extensively +investigated in the literature. Notwithstanding the relevance of this topic in +numerous application fields, a comprehensive and extensive evaluation of recent +state-of-the-art techniques taking into account real-world constraints is still +needed. Some efforts have been made to compare existing unsupervised +time-series anomaly detection methods rigorously. However, only standard +performance metrics, namely precision, recall, and F1-score are usually +considered. Essential aspects for assessing their practical relevance are +therefore neglected. This paper proposes an in-depth evaluation study of recent +unsupervised anomaly detection techniques in time-series. Instead of relying +solely on standard performance metrics, additional yet informative metrics and +protocols are taken into account. In particular, (i) more elaborate performance +metrics specifically tailored for time-series are used; (ii) the model size and +the model stability are studied; (iii) an analysis of the tested approaches +with respect to the anomaly type is provided; and (iv) a clear and unique +protocol is followed for all experiments. Overall, this extensive analysis aims +to assess the maturity of state-of-the-art time-series anomaly detection, give +insights regarding their applicability under real-world setups and provide to +the community a more complete evaluation protocol. + +
+
+ comment: Accepted at Expert Systems with Applications journal +
+
+
+
+
+ + ♻ ☆ A Diagnostic Model for Acute Lymphoblastic Leukemia Using Metaheuristics + and Deep Learning Methods + + +
+ Acute lymphoblastic leukemia (ALL) severity is determined by the presence and
+ratios of blast cells (abnormal white blood cells) in both bone marrow and
+peripheral blood. Manual diagnosis of this disease is a tedious and
+time-consuming operation, making it difficult for professionals to accurately
+examine blast cell characteristics. To address this difficulty, researchers use
+deep learning and machine learning. In this paper, a ResNet-based feature
+extractor is utilized to detect ALL, along with a variety of feature selectors
+and classifiers. To get the best results, a variety of transfer learning
+models, including the ResNet, VGG, EfficientNet, and DenseNet families, are used
+as deep feature extractors. Following extraction, different feature selectors
+are used, including Genetic algorithm, PCA, ANOVA, Random Forest, Univariate,
+Mutual information, Lasso, XGB, Variance, and Binary ant colony. After feature
+qualification, a variety of classifiers are used, with MLP outperforming the
+others. The recommended technique is used to categorize ALL and HEM in the
+selected dataset, C-NMC 2019. This technique achieved an impressive 90.71%
+accuracy and 95.76% sensitivity for the relevant classifications, and its
+metrics on this dataset outperformed others.
+
+
+
+
+
+ + ♻ ☆ One-Shot Collaborative Data Distillation + + +
+ Large machine-learning training datasets can be distilled into small +collections of informative synthetic data samples. These synthetic sets support +efficient model learning and reduce the communication cost of data sharing. +Thus, high-fidelity distilled data can support the efficient deployment of +machine learning applications in distributed network environments. A naive way +to construct a synthetic set in a distributed environment is to allow each +client to perform local data distillation and to merge local distillations at a +central server. However, the quality of the resulting set is impaired by +heterogeneity in the distributions of the local data held by clients. To +overcome this challenge, we introduce the first collaborative data distillation +technique, called CollabDM, which captures the global distribution of the data +and requires only a single round of communication between client and server. +Our method outperforms the state-of-the-art one-shot learning method on skewed +data in distributed learning environments. We also show the promising practical +benefits of our method when applied to attack detection in 5G networks. + +
+
+
+
+
+ + ♻ ☆ Deep learning phase recovery: data-driven, physics-driven, or combining + both? + + +
+ Phase recovery, calculating the phase of a light wave from its intensity
+measurements, is essential for various applications, such as coherent
+diffraction imaging, adaptive optics, and biomedical imaging. It enables the
+reconstruction of an object's refractive index distribution or topography as
+well as the correction of imaging system aberrations. In recent years, deep
+learning has been proven to be highly effective in addressing phase recovery
+problems. The two most direct deep learning phase recovery strategies are
+data-driven (DD) with supervised learning mode and physics-driven (PD) with
+self-supervised learning mode. DD and PD achieve the same goal in different
+ways, yet a systematic study revealing their similarities and differences has
+been lacking. Therefore, in this paper, we comprehensively compare these two
+deep learning phase recovery strategies in terms of time consumption, accuracy,
+generalization ability, ill-posedness adaptability, and prior capacity.
+Moreover, we propose a co-driven (CD) strategy of combining datasets and
+physics to balance high- and low-frequency information. The codes for DD, PD,
+and CD are publicly available at https://github.com/kqwang/DLPR.
+
+
+ comment: 24 pages, 13 figures +
+
+
+
+
+ + ♻ ☆ A Comprehensive Survey on Kolmogorov Arnold Networks (KAN) + + +
+ Through this comprehensive survey of Kolmogorov-Arnold Networks (KAN), we have
+gained a thorough understanding of its theoretical foundation, architectural
+design, application scenarios, and current research progress. KAN, with its
+unique architecture and flexible activation functions, excels in handling
+complex data patterns and nonlinear relationships, demonstrating wide-ranging
+application potential. While challenges remain, KAN is poised to pave the way
+for innovative solutions in various fields, potentially revolutionizing how we
+approach complex computational problems.
+
+
+
+
+
+ + ♻ ☆ LLMLingua-2: Data Distillation for Efficient and Faithful Task-Agnostic + Prompt Compression ACL 2024 + + +
+ This paper focuses on task-agnostic prompt compression for better +generalizability and efficiency. Considering the redundancy in natural +language, existing approaches compress prompts by removing tokens or lexical +units according to their information entropy obtained from a causal language +model such as LLaMa-7B. The challenge is that information entropy may be a +suboptimal compression metric: (i) it only leverages unidirectional context and +may fail to capture all essential information needed for prompt compression; +(ii) it is not aligned with the prompt compression objective. + To address these issues, we propose a data distillation procedure to derive +knowledge from an LLM to compress prompts without losing crucial information, +and meantime, introduce an extractive text compression dataset. We formulate +prompt compression as a token classification problem to guarantee the +faithfulness of the compressed prompt to the original one, and use a +Transformer encoder as the base architecture to capture all essential +information for prompt compression from the full bidirectional context. Our +approach leads to lower latency by explicitly learning the compression +objective with smaller models such as XLM-RoBERTa-large and mBERT. + We evaluate our method on both in-domain and out-of-domain datasets, +including MeetingBank, LongBench, ZeroScrolls, GSM8K, and BBH. Despite its +small size, our model shows significant performance gains over strong baselines +and demonstrates robust generalization ability across different LLMs. +Additionally, our model is 3x-6x faster than existing prompt compression +methods, while accelerating the end-to-end latency by 1.6x-2.9x with +compression ratios of 2x-5x. Our code is available at +https://aka.ms/LLMLingua-2. + +
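As a rough illustration of "prompt compression as token classification", the sketch below scores every token with a bidirectional encoder and keeps only the top-scoring ones. The checkpoint name, the untrained two-label head, and the keep ratio are placeholders, not the released LLMLingua-2 model.

```python
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification

name = "xlm-roberta-large"  # placeholder checkpoint; the keep/drop head here is untrained
tok = AutoTokenizer.from_pretrained(name)
model = AutoModelForTokenClassification.from_pretrained(name, num_labels=2)

def compress(prompt, keep_ratio=0.5):
    enc = tok(prompt, return_tensors="pt", truncation=True)
    with torch.no_grad():
        logits = model(**enc).logits                    # [1, T, 2] per-token scores
    keep_scores = logits[0, :, 1]                       # hypothetical "keep" label
    k = max(1, int(keep_ratio * keep_scores.numel()))
    keep_idx = keep_scores.topk(k).indices.sort().values  # preserve original order
    ids = enc["input_ids"][0, keep_idx]
    return tok.decode(ids, skip_special_tokens=True)

print(compress("Please summarise the following meeting transcript in two sentences."))
```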
+
+ comment: Accepted at Findings of ACL 2024 +
+
+
+
+
+ + ♻ ☆ LongLLMLingua: Accelerating and Enhancing LLMs in Long Context Scenarios + via Prompt Compression ACL 2024 + + +
+ In long context scenarios, large language models (LLMs) face three main +challenges: higher computational cost, performance reduction, and position +bias. Research indicates that LLM performance hinges on the density and +position of key information in the input prompt. Inspired by these findings, we +propose LongLLMLingua for prompt compression towards improving LLMs' perception +of the key information to simultaneously address the three challenges. Our +extensive evaluation across various long context scenarios demonstrates that +LongLLMLingua not only enhances performance but also significantly reduces +costs and latency. For instance, in the NaturalQuestions benchmark, +LongLLMLingua boosts performance by up to 21.4% with around 4x fewer tokens in +GPT-3.5-Turbo, leading to substantial cost savings. It achieves a 94.0% cost +reduction in the LooGLE benchmark. Moreover, when compressing prompts of about +10k tokens at ratios of 2x-6x, LongLLMLingua can accelerate end-to-end latency +by 1.4x-2.6x. Our code is available at https://aka.ms/LongLLMLingua. + +
+
+ comment: Accepted at ACL 2024 +
+
+
+
+
+ + ♻ ☆ A Billion-scale Foundation Model for Remote Sensing Images + + +
+ As the potential of foundation models in visual tasks has garnered
+significant attention, pretraining these models before downstream tasks has
+become a crucial step. The three key factors in pretraining foundation models
+are the pretraining method, the size of the pretraining dataset, and the number
+of model parameters. Recently, research in the remote sensing field has focused
+primarily on the pretraining method and the size of the dataset, with limited
+emphasis on the number of model parameters. This paper addresses this gap by
+examining the effect of increasing the number of model parameters on the
+performance of foundation models in downstream tasks such as rotated object
+detection and semantic segmentation. We pretrained foundation models with
+varying numbers of parameters, including 86M, 605.26M, 1.3B, and 2.4B, to
+determine whether performance in downstream tasks improved with an increase in
+parameters. To the best of our knowledge, this is the first billion-scale
+foundation model in the remote sensing field. Furthermore, we propose an
+effective method for scaling up and fine-tuning a vision transformer in the
+remote sensing field. To evaluate general performance in downstream tasks, we
+employed the DOTA v2.0 and DIOR-R benchmark datasets for rotated object
+detection, and the Potsdam and LoveDA datasets for semantic segmentation.
+Experimental results demonstrated that, across all benchmark datasets and
+downstream tasks, the performance of the foundation models and data efficiency
+improved as the number of parameters increased. Moreover, our models achieve
+state-of-the-art performance on several datasets including DIOR-R, Potsdam,
+and LoveDA.
+
+
+ comment: This manuscript is the accepted version for IEEE Journal of Selected + Topics in Applied Earth Observations and Remote Sensing (IEEE J-STARS) +
+
+
+
+
+ + ♻ ☆ Tracking-Assisted Object Detection with Event Cameras + + +
+ Event-based object detection has recently garnered attention in the computer +vision community due to the exceptional properties of event cameras, such as +high dynamic range and no motion blur. However, feature asynchronism and +sparsity cause invisible objects due to no relative motion to the camera, +posing a significant challenge in the task. Prior works have studied various +implicit-learned memories to retain as many temporal cues as possible. However, +implicit memories still struggle to preserve long-term features effectively. In +this paper, we consider those invisible objects as pseudo-occluded objects and +aim to detect them by tracking through occlusions. Firstly, we introduce the +visibility attribute of objects and contribute an auto-labeling algorithm to +not only clean the existing event camera dataset but also append additional +visibility labels to it. Secondly, we exploit tracking strategies for +pseudo-occluded objects to maintain their permanence and retain their bounding +boxes, even when features have not been available for a very long time. These +strategies can be treated as an explicit-learned memory guided by the tracking +objective to record the displacements of objects across frames. Lastly, we +propose a spatio-temporal feature aggregation module to enrich the latent +features and a consistency loss to increase the robustness of the overall +pipeline. We conduct comprehensive experiments to verify our method's +effectiveness where still objects are retained, but real occluded objects are +discarded. The results demonstrate that (1) the additional visibility labels +can assist in supervised training, and (2) our method outperforms +state-of-the-art approaches with a significant improvement of 7.9% absolute +mAP. + +
+
+
+
+
+ + ♻ ☆ Dyadic Reinforcement Learning + + +
+ Mobile health aims to enhance health outcomes by delivering interventions to
+individuals as they go about their daily life. The involvement of care partners
+and social support networks often proves crucial in helping individuals
+manage burdensome medical conditions. This presents opportunities in mobile
+health to design interventions that target the dyadic relationship -- the
+relationship between a target person and their care partner -- with the aim of
+enhancing social support. In this paper, we develop dyadic RL, an online
+reinforcement learning algorithm designed to personalize intervention delivery
+based on contextual factors and past responses of a target person and their
+care partner. Here, multiple sets of interventions impact the dyad across
+multiple time intervals. The developed dyadic RL is Bayesian and hierarchical.
+We formally introduce the problem setup, develop dyadic RL and establish a
+regret bound. We demonstrate dyadic RL's empirical performance through
+simulation studies on both toy scenarios and a realistic test bed
+constructed from data collected in a mobile health study.
+
+
+
+
+
+ + ♻ ☆ xLSTMTime : Long-term Time Series Forecasting With xLSTM + + +
+ In recent years, transformer-based models have gained prominence in
+multivariate long-term time series forecasting (LTSF), demonstrating
+significant advancements despite facing challenges such as high computational
+demands, difficulty in capturing temporal dynamics, and managing long-term
+dependencies. The emergence of LTSF-Linear, with its straightforward linear
+architecture, has notably outperformed transformer-based counterparts,
+prompting a reevaluation of the transformer's utility in time series
+forecasting. In response, this paper presents an adaptation of a recent
+architecture termed extended LSTM (xLSTM) for LTSF. xLSTM incorporates
+exponential gating and a revised memory structure with higher capacity that has
+good potential for LTSF. Our adapted architecture for LTSF, termed xLSTMTime,
+surpasses current approaches. We compare xLSTMTime's performance against
+various state-of-the-art models across multiple real-world datasets,
+demonstrating superior forecasting capabilities. Our findings suggest that
+refined recurrent architectures can offer competitive alternatives to
+transformer-based models in LTSF tasks, potentially redefining the landscape
+of time series forecasting.
+
+
+
+
+
+ + ♻ ☆ Universal Approximation Theory: The basic theory for large language + models + + +
+ Language models have emerged as a critical area of focus in artificial
+intelligence, particularly with the introduction of groundbreaking innovations
+like ChatGPT. Large-scale Transformer networks have quickly become the leading
+approach for advancing natural language processing algorithms. Built on the
+Transformer architecture, these models enable interactions that closely mimic
+human communication and, equipped with extensive knowledge, can even assist in
+guiding human tasks. Despite their impressive capabilities and growing
+complexity, a key question remains: the theoretical foundations of large
+language models (LLMs). What makes the Transformer so effective for powering
+intelligent language applications, such as translation and coding? What
+underlies LLMs' ability for In-Context Learning (ICL)? How does the LoRA scheme
+enhance the fine-tuning of LLMs? And what supports the practicality of pruning
+LLMs? To address these critical questions and explore the technological
+strategies within LLMs, we leverage the Universal Approximation Theory (UAT) to
+offer a theoretical backdrop, shedding light on the mechanisms that underpin
+these advancements.
+
+
+
+
+
+ + ♻ ☆ Universal Approximation Theory: Foundations for Parallelism in Neural + Networks + + +
+ Neural networks are increasingly evolving towards training large models with +big data, a method that has demonstrated superior performance across many +tasks. However, this approach introduces an urgent problem: current deep +learning models are predominantly serial, meaning that as the number of network +layers increases, so do the training and inference times. This is unacceptable +if deep learning is to continue advancing. Therefore, this paper proposes a +deep learning parallelization strategy based on the Universal Approximation +Theorem (UAT). From this foundation, we designed a parallel network called +Para-Former to test our theory. Unlike traditional serial models, the inference +time of Para-Former does not increase with the number of layers, significantly +accelerating the inference speed of multi-layer networks. Experimental results +validate the effectiveness of this network. + +
+
+
+
+
+ + ♻ ☆ Decoding Speculative Decoding + + +
+ Speculative Decoding is a widely used technique to speed up inference for +Large Language Models (LLMs) without sacrificing quality. When performing +inference, speculative decoding uses a smaller draft model to generate +speculative tokens and then uses the target LLM to verify those draft tokens. +The speedup provided by speculative decoding heavily depends on the choice of +the draft model. In this work, we perform a detailed study comprising over 350 +experiments with LLaMA-65B and OPT-66B using speculative decoding and delineate +the factors that affect the performance gain provided by speculative decoding. +Our experiments indicate that the performance of speculative decoding depends +heavily on the latency of the draft model, and the draft model's capability in +language modeling does not correlate strongly with its performance in +speculative decoding. Based on these insights we explore a new design space for +draft models and design hardware-efficient draft models for speculative +decoding. Our newly designed draft model for LLaMA-65B can provide 111% higher +throughput than existing draft models and can generalize further to the LLaMA-2 +model family and supervised fine-tuned models. + +
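The draft-then-verify loop described above can be sketched with two stand-in callables. Here `draft_next` and `target_next` are hypothetical greedy next-token functions, not LLaMA-65B or OPT-66B, and real implementations verify all draft tokens in a single target forward pass rather than one call per token.

```python
# Toy sketch of greedy speculative decoding with stand-in models.
def speculative_decode(prefix, draft_next, target_next, k=4, max_len=16):
    out = list(prefix)
    while len(out) < max_len:
        # 1) The cheap draft model proposes k tokens.
        proposal, ctx = [], list(out)
        for _ in range(k):
            t = draft_next(ctx)
            proposal.append(t)
            ctx.append(t)
        # 2) The target model verifies; keep the longest agreeing prefix.
        accepted, ctx = 0, list(out)
        for t in proposal:
            if target_next(ctx) != t:
                break
            accepted += 1
            ctx.append(t)
        out.extend(proposal[:accepted])
        # 3) The target model contributes one token where the draft diverged.
        out.append(target_next(out))
    return out

# Dummy "models": the draft occasionally disagrees with the target.
target = lambda seq: (len(seq) * 7) % 11
draft = lambda seq: (len(seq) * 7) % 11 if len(seq) % 5 else 0
print(speculative_decode([1], draft, target))
```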
+
+
+
+
+ + ♻ ☆ Large language models can be zero-shot anomaly detectors for time + series? + + +
+ Recent studies have shown the ability of large language models to perform a +variety of tasks, including time series forecasting. The flexible nature of +these models allows them to be used for many applications. In this paper, we +present a novel study of large language models used for the challenging task of +time series anomaly detection. This problem entails two aspects novel for LLMs: +the need for the model to identify part of the input sequence (or multiple +parts) as anomalous; and the need for it to work with time series data rather +than the traditional text input. We introduce sigllm, a framework for time +series anomaly detection using large language models. Our framework includes a +time-series-to-text conversion module, as well as end-to-end pipelines that +prompt language models to perform time series anomaly detection. We investigate +two paradigms for testing the abilities of large language models to perform the +detection task. First, we present a prompt-based detection method that directly +asks a language model to indicate which elements of the input are anomalies. +Second, we leverage the forecasting capability of a large language model to +guide the anomaly detection process. We evaluated our framework on 11 datasets +spanning various sources and 10 pipelines. We show that the forecasting method +significantly outperformed the prompting method in all 11 datasets with respect +to the F1 score. Moreover, while large language models are capable of finding +anomalies, state-of-the-art deep learning models are still superior in +performance, achieving results 30% better than large language models. + +
+
+
+
+
+ + ♻ ☆ Large Language Models Are Zero-Shot Time Series Forecasters NeurIPS 2023 + + +
+ By encoding time series as a string of numerical digits, we can frame time +series forecasting as next-token prediction in text. Developing this approach, +we find that large language models (LLMs) such as GPT-3 and LLaMA-2 can +surprisingly zero-shot extrapolate time series at a level comparable to or +exceeding the performance of purpose-built time series models trained on the +downstream tasks. To facilitate this performance, we propose procedures for +effectively tokenizing time series data and converting discrete distributions +over tokens into highly flexible densities over continuous values. We argue the +success of LLMs for time series stems from their ability to naturally represent +multimodal distributions, in conjunction with biases for simplicity, and +repetition, which align with the salient features in many time series, such as +repeated seasonal trends. We also show how LLMs can naturally handle missing +data without imputation through non-numerical text, accommodate textual side +information, and answer questions to help explain predictions. While we find +that increasing model size generally improves performance on time series, we +show GPT-4 can perform worse than GPT-3 because of how it tokenizes numbers, +and poor uncertainty calibration, which is likely the result of alignment +interventions such as RLHF. + +
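The core trick, per the abstract, is encoding a numeric series as a digit string so that forecasting becomes next-token prediction. The formatting below (fixed decimals, space-separated digits, comma-separated values) is only an approximation of that idea; see the linked repository for the authors' exact scheme.

```python
# Approximate digit-string encoding of a numeric series for an LLM prompt.
def encode_series(values, precision=2):
    parts = []
    for v in values:
        digits = f"{v:.{precision}f}".replace(".", "")  # e.g. 12.34 -> "1234"
        parts.append(" ".join(digits))                   # -> "1 2 3 4"
    return " , ".join(parts)                             # values separated by " , "

print(encode_series([12.34, 12.56, 13.01]))
# 1 2 3 4 , 1 2 5 6 , 1 3 0 1
```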
+
+ comment: NeurIPS 2023. Code available at: https://github.com/ngruver/llmtime +
+
+
+
+
+ + ♻ ☆ A Realistic Protocol for Evaluation of Weakly Supervised Object + Localization + + +
+ Weakly Supervised Object Localization (WSOL) allows training deep learning +models for classification and localization (LOC) using only global class-level +labels. The absence of bounding box (bbox) supervision during training raises +challenges in the literature for hyper-parameter tuning, model selection, and +evaluation. WSOL methods rely on a validation set with bbox annotations for +model selection, and a test set with bbox annotations for threshold estimation +for producing bboxes from localization maps. This approach, however, is not +aligned with the WSOL setting as these annotations are typically unavailable in +real-world scenarios. Our initial empirical analysis shows a significant +decline in LOC performance when model selection and threshold estimation rely +solely on class labels and the image itself, respectively, compared to using +manual bbox annotations. This highlights the importance of incorporating bbox +labels for optimal model performance. In this paper, a new WSOL evaluation +protocol is proposed that provides LOC information without the need for manual +bbox annotations. In particular, we generated noisy pseudo-boxes from a +pretrained off-the-shelf region proposal method such as Selective Search, CLIP, +and RPN for model selection. These bboxes are also employed to estimate the +threshold from LOC maps, circumventing the need for test-set bbox annotations. +Our experiments with several WSOL methods on ILSVRC and CUB datasets show that +using the proposed pseudo-bboxes for validation facilitates the model selection +and threshold estimation, with LOC performance comparable to those selected +using GT bboxes on the validation set and threshold estimation on the test set. +It also outperforms models selected using class-level labels, and then +dynamically thresholded based solely on LOC maps. + +
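Off-the-shelf region proposals of the kind this protocol relies on can be obtained, for example, with OpenCV's Selective Search (requires opencv-contrib-python). The image path and the number of kept proposals below are placeholders, and this is only one of the proposal sources the paper considers alongside CLIP and RPN.

```python
import cv2

img = cv2.imread("example.jpg")  # placeholder image path
ss = cv2.ximgproc.segmentation.createSelectiveSearchSegmentation()
ss.setBaseImage(img)
ss.switchToSelectiveSearchFast()
proposals = ss.process()          # array of (x, y, w, h) region proposals
pseudo_bboxes = proposals[:10]    # keep a handful as noisy pseudo-boxes
print(pseudo_bboxes)
```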
+
+ comment: 13 pages, 5 figures +
+
+
+
+
+ + ♻ ☆ Tackling the Local Bias in Federated Graph Learning + + +
+ Federated graph learning (FGL) has become an important research topic in
+response to the increasing scale and the distributed nature of graph-structured
+data in the real world. In FGL, a global graph is distributed across different
+clients, where each client holds a subgraph. Existing FGL methods often fail to
+effectively utilize cross-client edges, losing structural information during
+training; additionally, local graphs often exhibit significant distribution
+divergence. These two issues make local models in FGL less desirable than those
+in centralized graph learning; we refer to this as the local bias problem in
+this paper. To solve this problem, we propose a novel FGL framework to make the
+local models similar to the model trained in a centralized setting.
+Specifically, we design a distributed learning scheme, fully leveraging
+cross-client edges to aggregate information from other clients. In addition, we
+propose a label-guided sampling approach to alleviate the imbalanced local data
+and meanwhile substantially reduce the training overhead. Extensive experiments
+demonstrate that local bias can compromise the model performance and slow down
+the convergence during training. Experimental results also verify that our
+framework successfully mitigates local bias, achieving better performance than
+other baselines with lower time and memory overhead.
+
+
+ comment: 19 pages, 6 figures +
+
+
+
+
+ + ♻ ☆ Prioritizing Informative Features and Examples for Deep Learning from + Noisy Data + + +
+ In this dissertation, we propose a systemic framework that prioritizes +informative features and examples to enhance each stage of the development +process. Specifically, we prioritize informative features and examples and +improve the performance of feature learning, data labeling, and data selection. +We first propose an approach to extract only informative features that are +inherent to solving a target task by using auxiliary out-of-distribution data. +We deactivate the noise features in the target distribution by using that in +the out-of-distribution data. Next, we introduce an approach that prioritizes +informative examples from unlabeled noisy data in order to reduce the labeling +cost of active learning. In order to solve the purity-information dilemma, +where an attempt to select informative examples induces the selection of many +noisy examples, we propose a meta-model that finds the best balance between +purity and informativeness. Lastly, we suggest an approach that prioritizes +informative examples from labeled noisy data to preserve the performance of +data selection. For labeled image noise data, we propose a data selection +method that considers the confidence of neighboring samples to maintain the +performance of the state-of-the-art Re-labeling models. For labeled text noise +data, we present an instruction selection method that takes diversity into +account for ranking the quality of instructions with prompting, thereby +enhancing the performance of aligned large language models. + Overall, our unified framework induces the deep learning development process +robust to noisy data, thereby effectively mitigating noisy features and +examples in real-world applications. + +
+
+ comment: PhD thesis +
+
+
+
+
+ + ♻ ☆ Revisiting Inexact Fixed-Point Iterations for Min-Max Problems: + Stochasticity and Structured Nonconvexity + + +
+ We focus on constrained, $L$-smooth, potentially stochastic and
+nonconvex-nonconcave min-max problems either satisfying
+$\rho$-cohypomonotonicity or admitting a solution to the $\rho$-weakly Minty
+Variational Inequality (MVI), where larger values of the parameter $\rho>0$
+correspond to a greater degree of nonconvexity. These problem classes include
+examples in two-player reinforcement learning, interaction-dominant min-max
+problems, and certain synthetic test problems on which classical min-max
+algorithms fail. It has been conjectured that first-order methods can tolerate
+a value of $\rho$ no larger than $\frac{1}{L}$, but existing results in the
+literature have stagnated at the tighter requirement $\rho < \frac{1}{2L}$.
+With a simple argument, we obtain optimal or best-known complexity guarantees
+with cohypomonotonicity or weak MVI conditions for $\rho < \frac{1}{L}$. The
+first main insight behind the improvements in the convergence analyses is to
+harness the recently proposed $\textit{conic nonexpansiveness}$ property of
+operators. Second, we provide a refined analysis for inexact Halpern iteration
+that relaxes the required inexactness level to improve some state-of-the-art
+complexity results even for constrained stochastic convex-concave min-max
+problems. Third, we analyze a stochastic inexact Krasnosel'ski\u{\i}-Mann
+iteration with a multilevel Monte Carlo estimator when the assumptions only
+hold with respect to a solution.
+
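For readers unfamiliar with the Krasnosel'skii-Mann scheme mentioned above, the exact (non-inexact, non-stochastic) iteration on a toy averaged operator looks like the sketch below; the operator and step size are illustrative only, while the paper analyses a stochastic, inexact variant.

```python
import numpy as np

# Krasnosel'skii-Mann iteration: x_{k+1} = (1 - lam) * x_k + lam * T(x_k).
def km_iteration(T, x0, lam=0.5, iters=100):
    x = np.asarray(x0, dtype=float)
    for _ in range(iters):
        x = (1.0 - lam) * x + lam * T(x)
    return x

T = lambda x: 0.5 * x + 1.0           # toy contraction with fixed point x* = 2
print(km_iteration(T, np.zeros(3)))   # converges towards [2, 2, 2]
```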
+
+
+
+
+ + ♻ ☆ Enhancing Node Representations for Real-World Complex Networks with + Topological Augmentation ECAI 2024 + + +
+ Graph augmentation methods play a crucial role in improving the performance +and enhancing generalisation capabilities in Graph Neural Networks (GNNs). +Existing graph augmentation methods mainly perturb the graph structures, and +are usually limited to pairwise node relations. These methods cannot fully +address the complexities of real-world large-scale networks, which often +involve higher-order node relations beyond only being pairwise. Meanwhile, +real-world graph datasets are predominantly modelled as simple graphs, due to +the scarcity of data that can be used to form higher-order edges. Therefore, +reconfiguring the higher-order edges as an integration into graph augmentation +strategies lights up a promising research path to address the aforementioned +issues. In this paper, we present Topological Augmentation (TopoAug), a novel +graph augmentation method that builds a combinatorial complex from the original +graph by constructing virtual hyperedges directly from the raw data. TopoAug +then produces auxiliary node features by extracting information from the +combinatorial complex, which are used for enhancing GNN performances on +downstream tasks. We design three diverse virtual hyperedge construction +strategies to accompany the construction of combinatorial complexes: (1) via +graph statistics, (2) from multiple data perspectives, and (3) utilising +multi-modality. Furthermore, to facilitate TopoAug evaluation, we provide 23 +novel real-world graph datasets across various domains including social media, +biology, and e-commerce. Our empirical study shows that TopoAug consistently +and significantly outperforms GNN baselines and other graph augmentation +methods, across a variety of application contexts, which clearly indicates that +it can effectively incorporate higher-order node relations into the graph +augmentation for real-world complex networks. + +
+
+ comment: In 27th European Conference on Artificial Intelligence (ECAI 2024). + 13 pages, 2 figures, 13 tables +
+
+
+
+
+ + ♻ ☆ Nonconvex Factorization and Manifold Formulations are Almost Equivalent + in Low-rank Matrix Optimization + + +
+ In this paper, we consider the geometric landscape connection of the widely +studied manifold and factorization formulations in low-rank positive +semidefinite (PSD) and general matrix optimization. We establish a sandwich +relation on the spectrum of Riemannian and Euclidean Hessians at first-order +stationary points (FOSPs). As a result of that, we obtain an equivalence on the +set of FOSPs, second-order stationary points (SOSPs) and strict saddles between +the manifold and the factorization formulations. In addition, we show the +sandwich relation can be used to transfer more quantitative geometric +properties from one formulation to another. Similarities and differences in the +landscape connection under the PSD case and the general case are discussed. To +the best of our knowledge, this is the first geometric landscape connection +between the manifold and the factorization formulations for handling rank +constraints, and it provides a geometric explanation for the similar empirical +performance of factorization and manifold approaches in low-rank matrix +optimization observed in the literature. In the general low-rank matrix +optimization, the landscape connection of two factorization formulations +(unregularized and regularized ones) is also provided. By applying these +geometric landscape connections, in particular, the sandwich relation, we are +able to solve unanswered questions in literature and establish stronger results +in the applications on geometric analysis of phase retrieval, well-conditioned +low-rank matrix optimization, and the role of regularization in factorization +arising from machine learning and signal processing. + +
+
+
+
+
+ + ♻ ☆ MIS-ME: A Multi-modal Framework for Soil Moisture Estimation + + +
+ Soil moisture estimation is an important task to enable precision agriculture
+in creating optimal plans for irrigation, fertilization, and harvest. It is
+common to utilize statistical and machine learning models to estimate soil
+moisture from traditional data sources such as weather forecasts, soil
+properties, and crop properties. However, there is a growing interest in
+utilizing aerial and geospatial imagery to estimate soil moisture. Although
+these images capture high-resolution crop details, they are expensive to curate
+and challenging to interpret. Imagine an AI-enhanced software tool that
+predicts soil moisture using visual cues captured by smartphones and
+statistical data given by weather forecasts. This work is a first step towards
+that goal of developing a multi-modal approach for soil moisture estimation. In
+particular, we curate a dataset consisting of real-world images taken from
+ground stations and their corresponding weather data. We also propose MIS-ME -
+Meteorological & Image based Soil Moisture Estimator, a multi-modal framework
+for soil moisture estimation. Our extensive analysis shows that MIS-ME achieves
+a MAPE of 10.14%, outperforming traditional unimodal approaches with a
+reduction of 3.25% in MAPE for meteorological data and 2.15% in MAPE for image
+data, highlighting the effectiveness of tailored multi-modal approaches.
+
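For reference, the MAPE figures quoted above follow the standard definition below; the authors' exact evaluation code may differ, and the inputs shown are toy values.

```python
import numpy as np

def mape(y_true, y_pred):
    """Mean Absolute Percentage Error, in percent."""
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    return 100.0 * np.mean(np.abs((y_true - y_pred) / y_true))

print(mape([0.30, 0.25, 0.40], [0.28, 0.27, 0.41]))  # toy soil-moisture-like values
```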
+
+ comment: Accepted by DSAA2024 +
+
+
+
+
+ + ♻ ☆ Predicting the First Response Latency of Maintainers and Contributors in + Pull Requests + + +
+ The success of a Pull Request (PR) depends on the responsiveness of the +maintainers and the contributor during the review process. Being aware of the +expected waiting times can lead to better interactions and managed expectations +for both the maintainers and the contributor. In this paper, we propose a +machine-learning approach to predict the first response latency of the +maintainers following the submission of a PR, and the first response latency of +the contributor after receiving the first response from the maintainers. We +curate a dataset of 20 large and popular open-source projects on GitHub and +extract 21 features to characterize projects, contributors, PRs, and review +processes. Using these features, we then evaluate seven types of classifiers to +identify the best-performing models. We also conduct permutation feature +importance and SHAP analyses to understand the importance and the impact of +different features on the predicted response latencies. We find that our +CatBoost models are the most effective for predicting the first response +latencies of both maintainers and contributors. We also observe that PRs +submitted earlier in the week, containing an average number of commits, and +with concise descriptions are more likely to receive faster first responses +from the maintainers. Similarly, PRs with a lower first response latency from +maintainers, that received the first response of maintainers earlier in the +week, and containing an average number of commits tend to receive faster first +responses from the contributors. Additionally, contributors with a higher +acceptance rate and a history of timely responses in the project are likely to +both obtain and provide faster first responses. Moreover, we show the +effectiveness of our approach in a cross-project setting. + +
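A minimal sketch of the kind of modeling setup described above, using CatBoost on synthetic stand-in features; the feature matrix, labels, and hyperparameters below are placeholders, not the authors' 21-feature dataset or tuned models.

```python
import numpy as np
from catboost import CatBoostClassifier

rng = np.random.default_rng(0)
X = rng.random((200, 21))            # placeholder for the 21 extracted features
y = rng.integers(0, 2, size=200)     # e.g. fast vs. slow first response

model = CatBoostClassifier(iterations=200, depth=6, verbose=False)
model.fit(X, y)
print(model.predict(X[:5]))
```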
+
+ comment: Manuscript accepted for publication in IEEE Transactions on Software + Engineering (TSE) +
+
+
+
+
+ + ♻ ☆ From $r$ to $Q^*$: Your Language Model is Secretly a Q-Function + + +
+ Reinforcement Learning From Human Feedback (RLHF) has been critical to the +success of the latest generation of generative AI models. In response to the +complex nature of the classical RLHF pipeline, direct alignment algorithms such +as Direct Preference Optimization (DPO) have emerged as an alternative +approach. Although DPO solves the same objective as the standard RLHF setup, +there is a mismatch between the two approaches. Standard RLHF deploys +reinforcement learning in a specific token-level MDP, while DPO is derived as a +bandit problem in which the whole response of the model is treated as a single +arm. In this work we rectify this difference. We theoretically show that we can +derive DPO in the token-level MDP as a general inverse Q-learning algorithm, +which satisfies the Bellman equation. Using our theoretical results, we provide +three concrete empirical insights. First, we show that because of its token +level interpretation, DPO is able to perform some type of credit assignment. +Next, we prove that under the token level formulation, classical search-based +algorithms, such as MCTS, which have recently been applied to the language +generation space, are equivalent to likelihood-based search on a DPO policy. +Empirically we show that a simple beam search yields meaningful improvement +over the base DPO policy. Finally, we show how the choice of reference policy +causes implicit rewards to decline during training. We conclude by discussing +applications of our work, including information elicitation in multi-turn +dialogue, reasoning, agentic applications and end-to-end training of +multi-model systems. + +
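The "simple beam search" over a token-level policy mentioned above can be illustrated with a toy next-token distribution. Here `next_logprobs` is a hypothetical stand-in, not a DPO-trained language model.

```python
import math

# Toy likelihood-based beam search over a token-level policy.
def beam_search(next_logprobs, beam_size=3, steps=4, bos=0):
    """next_logprobs(seq) -> {token: logprob} for the next position."""
    beams = [([bos], 0.0)]
    for _ in range(steps):
        candidates = []
        for seq, score in beams:
            for tok, lp in next_logprobs(seq).items():
                candidates.append((seq + [tok], score + lp))
        beams = sorted(candidates, key=lambda c: c[1], reverse=True)[:beam_size]
    return beams

toy_policy = lambda seq: {1: math.log(0.6), 2: math.log(0.3), 3: math.log(0.1)}
print(beam_search(toy_policy)[0])   # best sequence and its cumulative log-probability
```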
+
+ comment: COLM 2024 +
+
+
+
+
+ + ♻ ☆ Performative Prediction with Bandit Feedback: Learning through + Reparameterization + + +
+ Performative prediction, as introduced by Perdomo et al., is a framework for
+studying social prediction in which the data distribution itself changes in
+response to the deployment of a model. Existing work in this field usually
+hinges on three assumptions that are easily violated in practice: that the
+performative risk is convex over the deployed model, that the mapping from the
+model to the data distribution is known to the model designer in advance, and
+that first-order information of the performative risk is available. In this
+paper, we initiate the study of performative prediction problems that do not
+require these assumptions. Specifically, we develop a reparameterization
+framework that reparametrizes the performative prediction objective as a
+function of the induced data distribution. We then develop a two-level
+zeroth-order optimization procedure, where the first level performs iterative
+optimization on the distribution parameter space, and the second level learns
+the model that induces a particular target distribution at each iteration.
+Under mild conditions, this reparameterization allows us to transform the
+non-convex objective into a convex one and achieve provable regret guarantees.
+In particular, we provide a regret bound that is sublinear in the total number
+of performative samples taken and is only polynomial in the dimension of the
+model parameter.
+
+
+
+
+
+ + ♻ ☆ EvoluNet: Advancing Dynamic Non-IID Transfer Learning on Graphs ICML 2024 + + +
+ Non-IID transfer learning on graphs is crucial in many high-stakes domains. +The majority of existing works assume stationary distribution for both source +and target domains. However, real-world graphs are intrinsically dynamic, +presenting challenges in terms of domain evolution and dynamic discrepancy +between source and target domains. To bridge the gap, we shift the problem to +the dynamic setting and pose the question: given the label-rich source graphs +and the label-scarce target graphs both observed in previous T timestamps, how +can we effectively characterize the evolving domain discrepancy and optimize +the generalization performance of the target domain at the incoming T+1 +timestamp? To answer it, we propose a generalization bound for dynamic non-IID +transfer learning on graphs, which implies the generalization performance is +dominated by domain evolution and domain discrepancy between source and target +graphs. Inspired by the theoretical results, we introduce a novel generic +framework named EvoluNet. It leverages a transformer-based temporal encoding +module to model temporal information of the evolving domains and then uses a +dynamic domain unification module to efficiently learn domain-invariant +representations across the source and target domains. Finally, EvoluNet +outperforms the state-of-the-art models by up to 12.1%, demonstrating its +effectiveness in transferring knowledge from dynamic source graphs to dynamic +target graphs. + +
+
+ comment: Accepted at ICML 2024 +
+
+
+
+
+ + ♻ ☆ Neural Dynamical Operator: Continuous Spatial-Temporal Model with + Gradient-Based and Derivative-Free Optimization Methods + + +
+ Data-driven modeling techniques have been explored in the spatial-temporal +modeling of complex dynamical systems for many engineering applications. +However, a systematic approach is still lacking to leverage the information +from different types of data, e.g., with different spatial and temporal +resolutions, and the combined use of short-term trajectories and long-term +statistics. In this work, we build on the recent progress of neural operator +and present a data-driven modeling framework called neural dynamical operator +that is continuous in both space and time. A key feature of the neural +dynamical operator is the resolution-invariance with respect to both spatial +and temporal discretizations, without demanding abundant training data in +different temporal resolutions. To improve the long-term performance of the +calibrated model, we further propose a hybrid optimization scheme that +leverages both gradient-based and derivative-free optimization methods and +efficiently trains on both short-term time series and long-term statistics. We +investigate the performance of the neural dynamical operator with three +numerical examples, including the viscous Burgers' equation, the Navier-Stokes +equations, and the Kuramoto-Sivashinsky equation. The results confirm the +resolution-invariance of the proposed modeling framework and also demonstrate +stable long-term simulations with only short-term time series data. In +addition, we show that the proposed model can better predict long-term +statistics via the hybrid optimization scheme with a combined use of short-term +and long-term data. + +
+
+
+
+
+ + ♻ ☆ Convergence Properties of Score-Based Models for Linear Inverse Problems + Using Graduated Optimisation + + +
+ The incorporation of generative models as regularisers within variational
+formulations for inverse problems has proven effective across numerous image
+reconstruction tasks. However, the resulting optimisation problem is often
+non-convex and challenging to solve. In this work, we show that score-based
+generative models (SGMs) can be used in a graduated optimisation framework to
+solve inverse problems. We show that the resulting graduated non-convexity flow
+converges to stationary points of the original problem and provide a numerical
+convergence analysis of a 2D toy example. We further provide experiments on
+computed tomography image reconstruction, where we show that this framework is
+able to recover high-quality images, independent of the initial value. The
+experiments highlight the potential of using SGMs in graduated optimisation
+frameworks. The source code is publicly available on GitHub.
+
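The graduated-optimisation idea itself (solving a coarse-to-fine sequence of smoothed problems, each warm-started from the previous solution) can be sketched on a toy 1-D objective. The Gaussian smoothing, schedule, and zeroth-order gradient estimate below are illustrative assumptions, not the paper's SGM-based flow.

```python
import numpy as np

def graduated_minimise(f, x0, sigmas=(2.0, 1.0, 0.5, 0.1), steps=300, lr=0.05, seed=0):
    rng = np.random.default_rng(seed)
    x = float(x0)
    for sigma in sigmas:                      # coarse-to-fine smoothing schedule
        for _ in range(steps):
            eps = rng.standard_normal(64)
            # Monte-Carlo estimate of the gradient of the Gaussian-smoothed objective.
            grad = np.mean((f(x + sigma * eps) - f(x)) * eps) / sigma
            x -= lr * grad
    return x

wiggly = lambda x: x ** 2 + 2.0 * np.sin(5.0 * x)   # non-convex toy objective
print(graduated_minimise(wiggly, x0=3.0))            # typically lands near the global minimum
```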
+
+ comment: 8 pages +
+
+
+
+
+
+
+
+ + Multimedia 12 + +
+
+
+ + ☆ Rethinking Video with a Universal Event-Based Representation + + +
+ Traditionally, video is structured as a sequence of discrete image frames. +Recently, however, a novel video sensing paradigm has emerged which eschews +video frames entirely. These "event" sensors aim to mimic the human vision +system with asynchronous sensing, where each pixel has an independent, sparse +data stream. While these cameras enable high-speed and high-dynamic-range +sensing, researchers often revert to a framed representation of the event data +for existing applications, or build bespoke applications for a particular +camera's event data type. At the same time, classical video systems have +significant computational redundancy at the application layer, since pixel +samples are repeated across frames in the uncompressed domain. + To address the shortcomings of existing systems, I introduce Address, +Decimation, {\Delta}t Event Representation (AD{\Delta}ER, pronounced "adder"), +a novel intermediate video representation and system framework. The framework +transcodes a variety of framed and event camera sources into a single +event-based representation, which supports source-modeled lossy compression and +backward compatibility with traditional frame-based applications. I demonstrate +that AD{\Delta}ER achieves state-of-the-art application speed and compression +performance for scenes with high temporal redundancy. Crucially, I describe how +AD{\Delta}ER unlocks an entirely new control mechanism for computer vision: +application speed can correlate with both the scene content and the level of +lossy compression. Finally, I discuss the implications for event-based video on +large-scale video surveillance and resource-constrained sensing. + +
+
+ comment: 137 pages. PhD dissertation at the University of North Carolina, + Chapel Hill +
+
+
+
+
+ + ☆ Palantir: Towards Efficient Super Resolution for Ultra-high-definition + Live Streaming + + +
+ Neural enhancement through super-resolution deep neural networks opens up new +possibilities for ultra-high-definition live streaming over existing encoding +and networking infrastructure. Yet, the heavy SR DNN inference overhead leads +to severe deployment challenges. To reduce the overhead, existing systems +propose to apply DNN-based SR only on selected anchor frames while upscaling +non-anchor frames via the lightweight reusing-based SR approach. However, +frame-level scheduling is coarse-grained and fails to deliver optimal +efficiency. In this work, we propose Palantir, the first neural-enhanced UHD +live streaming system with fine-grained patch-level scheduling. In the +presented solutions, two novel techniques are incorporated to make good +scheduling decisions for inference overhead optimization and reduce the +scheduling latency. Firstly, under the guidance of our pioneering and +theoretical analysis, Palantir constructs a directed acyclic graph (DAG) for +lightweight yet accurate quality estimation under any possible anchor patch +set. Secondly, to further optimize the scheduling latency, Palantir improves +parallelizability by refactoring the computation subprocedure of the estimation +process into a sparse matrix-matrix multiplication operation. The evaluation +results suggest that Palantir incurs a negligible scheduling latency accounting +for less than 5.7% of the end-to-end latency requirement. When compared to the +state-of-the-art real-time frame-level scheduling strategy, Palantir reduces +the energy overhead of SR-integrated mobile clients by 38.1% at most (and 22.4% +on average) and the monetary costs of cloud-based SR by 80.1% at most (and +38.4% on average). + +
+
+
+
+
+ + ☆ DPDETR: Decoupled Position Detection Transformer for Infrared-Visible + Object Detection + + +
+ Infrared-visible object detection aims to achieve robust object detection by
+leveraging the complementary information of infrared and visible image pairs.
+However, the commonly existing modality misalignment problem presents two
+challenges: fusing misaligned complementary features is difficult, and
+current methods cannot accurately locate objects in both modalities under
+misalignment conditions. In this paper, we propose a Decoupled Position
+Detection Transformer (DPDETR) to address these problems. Specifically, we
+explicitly formulate the object category, visible modality position, and
+infrared modality position to enable the network to learn the intrinsic
+relationships and output accurate positions of objects in both modalities. To
+fuse misaligned object features accurately, we propose a Decoupled Position
+Multispectral Cross-attention module that adaptively samples and aggregates
+multispectral complementary features with the constraint of infrared and
+visible reference positions. Additionally, we design a query-decoupled
+Multispectral Decoder structure to address the optimization gap among the three
+kinds of object information in our task and propose a Decoupled Position
+Contrastive DeNoising Training strategy to enhance the DPDETR's ability to
+learn decoupled positions. Experiments on DroneVehicle and KAIST datasets
+demonstrate significant improvements compared to other state-of-the-art
+methods. The code will be released at https://github.com/gjj45/DPDETR.
+
+
+
+
+
+ + ☆ Freehand Sketch Generation from Mechanical Components ACM MM + + +
+ Drawing freehand sketches of mechanical components on multimedia devices for
+AI-based engineering modeling has become a new trend. However, its development
+is being impeded because existing works cannot produce suitable sketches for
+data-driven research. These works either generate sketches lacking a freehand
+style or utilize generative models not originally designed for this task,
+resulting in poor effectiveness. To address this issue, we design a two-stage
+generative framework mimicking the human sketching behavior pattern, called
+MSFormer, which is the first to produce humanoid freehand sketches tailored
+for mechanical components. The first stage employs Open CASCADE
+technology to obtain multi-view contour sketches from mechanical components,
+filtering perturbing signals for the ensuing generation process. Meanwhile, we
+design a view selector to simulate viewpoint selection tasks during human
+sketching for picking out information-rich sketches. The second stage
+translates contour sketches into freehand sketches by a transformer-based
+generator. To retain essential modeling features as much as possible and
+rationalize stroke distribution, we introduce a novel edge-constraint stroke
+initialization. Furthermore, we utilize a CLIP vision encoder and a new loss
+function incorporating the Hausdorff distance to enhance the generalizability
+and robustness of the model. Extensive experiments demonstrate that our
+approach achieves state-of-the-art performance for generating freehand sketches
+in the mechanical domain. Project page: https://mcfreeskegen.github.io .
+
+
+ comment: Published at ACM Multimedia (ACM MM) 2024 +
+
+
+
+
+ + ☆ A Simple Task-aware Contrastive Local Descriptor Selection Strategy for + Few-shot Learning between inter class and intra class ICANN 2024 + + +
+ Few-shot image classification aims to classify novel classes with few labeled +samples. Recent research indicates that deep local descriptors have better +representational capabilities. These studies recognize the impact of background +noise on classification performance. They typically filter query descriptors +using all local descriptors in the support classes or engage in bidirectional +selection between local descriptors in support and query sets. However, they +ignore the fact that background features may be useful for the classification +performance of specific tasks. This paper proposes a novel task-aware +contrastive local descriptor selection network (TCDSNet). First, we calculate +the contrastive discriminative score for each local descriptor in the support +class, and select discriminative local descriptors to form a support descriptor +subset. Finally, we leverage support descriptor subsets to adaptively select +discriminative query descriptors for specific tasks. Extensive experiments +demonstrate that our method outperforms state-of-the-art methods on both +general and fine-grained datasets. + +
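+ The abstract does not spell out the contrastive discriminative score, so the
+following is only a toy sketch of one plausible reading: score each support
+descriptor by how much more similar it is to its own class than to the other
+classes, then keep the top-scoring ones. The scoring rule and shapes are
+assumptions, not the authors' implementation.
+
+import numpy as np
+
+rng = np.random.default_rng(0)
+descriptors = {c: rng.normal(size=(50, 64)) for c in range(5)}  # 50 per class
+
+def l2_normalize(x):
+    return x / np.linalg.norm(x, axis=-1, keepdims=True)
+
+def select_discriminative(descriptors, cls, keep=20):
+    own = l2_normalize(descriptors[cls])
+    others = l2_normalize(
+        np.concatenate([d for c, d in descriptors.items() if c != cls]))
+    in_sim = own @ own.T
+    np.fill_diagonal(in_sim, 0.0)
+    # Contrastive score: mean in-class similarity minus mean out-of-class one.
+    score = in_sim.mean(axis=1) - (own @ others.T).mean(axis=1)
+    keep_idx = np.argsort(score)[::-1][:keep]
+    return descriptors[cls][keep_idx]
+
+support_subset = select_discriminative(descriptors, cls=0)
+print(support_subset.shape)  # (20, 64)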
+
+ comment: Submitted to ICANN 2024 +
+
+
+
+
+ + ☆ FoVNet: Configurable Field-of-View Speech Enhancement with Low + Computation and Distortion for Smart Glasses INTERSPEECH2024 + + +
+ This paper presents a novel multi-channel speech enhancement approach, +FoVNet, that enables highly efficient speech enhancement within a configurable +field of view (FoV) of a smart-glasses user without needing specific +target-talker(s) directions. It advances over prior works by enhancing all +speakers within any given FoV, with a hybrid signal processing and deep +learning approach designed with high computational efficiency. The neural +network component is designed with ultra-low computation (about 50 MMACS). A +multi-channel Wiener filter and a post-processing module are further used to +improve perceptual quality. We evaluate our algorithm with a microphone array +on smart glasses, providing a configurable, efficient solution for augmented +hearing on energy-constrained devices. FoVNet excels in both computational +efficiency and speech quality across multiple scenarios, making it a promising +solution for smart glasses applications. + +
+
+ comment: Accepted by INTERSPEECH2024 +
+
+
+
+
+ + ☆ Source Separation of Multi-source Raw Music using a Residual Quantized + Variational Autoencoder + + +
+ I developed a neural audio codec model based on the residual quantized
+variational autoencoder architecture. I trained the model on the Slakh2100
+dataset, a standard dataset for musical source separation, composed of
+multi-track audio. The model can separate audio sources, achieving nearly
+state-of-the-art results with much less computing power. The code is publicly
+available at
+github.com/LeonardoBerti00/Source-Separation-of-Multi-source-Music-using-Residual-Quantizad-Variational-Autoencoder
+
+
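+ The core building block of such a codec is residual vector quantization,
+which can be sketched in a few lines. The dimensions and codebook sizes below
+are illustrative and unrelated to the model described above.
+
+import numpy as np
+
+rng = np.random.default_rng(0)
+dim, codebook_size, num_quantizers = 8, 16, 4
+codebooks = rng.normal(size=(num_quantizers, codebook_size, dim))
+
+def rq_encode(z, codebooks):
+    """Quantize latents z with a stack of codebooks, one residual at a time."""
+    codes, quantized = [], np.zeros_like(z)
+    residual = z.copy()
+    for cb in codebooks:
+        # Nearest codeword for the current residual.
+        idx = np.argmin(((residual[:, None, :] - cb[None]) ** 2).sum(-1), axis=1)
+        codes.append(idx)
+        quantized += cb[idx]
+        residual = z - quantized  # quantize only what is still unexplained
+    return np.stack(codes), quantized
+
+latents = rng.normal(size=(10, dim))
+codes, reconstructed = rq_encode(latents, codebooks)
+print(codes.shape, float(np.mean((latents - reconstructed) ** 2)))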
+
+ comment: 9 pages +
+
+
+
+
+ + ☆ BI-MDRG: Bridging Image History in Multimodal Dialogue Response + Generation ECCV 2024 + + +
+ Multimodal Dialogue Response Generation (MDRG) is a recently proposed task +where the model needs to generate responses in texts, images, or a blend of +both based on the dialogue context. Due to the lack of a large-scale dataset +specifically for this task and the benefits of leveraging powerful pre-trained +models, previous work relies on the text modality as an intermediary step for +both the image input and output of the model rather than adopting an end-to-end +approach. However, this approach can overlook crucial information about the +image, hindering 1) image-grounded text response and 2) consistency of objects +in the image response. In this paper, we propose BI-MDRG that bridges the +response generation path such that the image history information is utilized +for enhanced relevance of text responses to the image content and the +consistency of objects in sequential image responses. Through extensive +experiments on the multimodal dialogue benchmark dataset, we show that BI-MDRG +can effectively increase the quality of multimodal dialogue. Additionally, +recognizing the gap in benchmark datasets for evaluating the image consistency +in multimodal dialogue, we have created a curated set of 300 dialogues +annotated to track object consistency across conversations. + +
+
+ comment: ECCV 2024 +
+
+
+
+
+ + ♻ ☆ Diseño de sonido para producciones audiovisuales e historias sonoras + en el aula. Hacia una docencia creativa mediante el uso de herramientas + inteligentes + + +
+ This study shares a teaching experience in sound design for audiovisual
+productions and compares different projects tackled by students. It is not
+intended to be a comparative analysis of different types of teaching but
+rather an analysis of the different problems observed across the profiles of
+students who take the subject in different degree programs. The world of audio
+can be very interesting for a large part of the students, both those with
+creative inclinations and those with technical ones. Musical creation and
+production, synchronization with images, dubbing, and so on are disciplines
+that are generally interesting but can have a very high barrier to entry due
+to their great technical complexity. Sometimes it can take weeks or even
+months for the uninitiated to begin to use audio editing programs, which are
+not always particularly intuitive for students, with the necessary ease.
+Learning through the use of PBL methodologies generates, in our experience,
+results far superior to those observed with other teaching methods such as
+master classes. Students acquire technical skills while developing creative
+projects in which they get personally involved. Despite everything mentioned
+above, most interactions between teachers and students focus on technical
+correction: from different parameters in reverbs (such as pre-delay, decay,
+modulation...) to how to correctly adjust compressors, noise gates, and so on.
+The number of tools for working with audio is extremely extensive, and many of
+their features can differ substantially depending on the manufacturer.
+
+
+
+ comment: 11 pages, in Spanish language. 1 figure. Preprint from La nueva era + del podcast (2023) +
+
+
+
+
+ + ♻ ☆ Control-A-Video: Controllable Text-to-Video Diffusion Models with Motion + Prior and Reward Feedback Learning + + +
+ Recent advances in text-to-image (T2I) diffusion models have enabled
+impressive image generation capabilities guided by text prompts. However,
+extending these techniques to video generation remains challenging, with
+existing text-to-video (T2V) methods often struggling to produce high-quality
+and motion-consistent videos. In this work, we introduce Control-A-Video, a
+controllable T2V diffusion model that can generate videos conditioned on text
+prompts and reference control maps like edge and depth maps. To tackle video
+quality and motion consistency issues, we propose novel strategies to
+incorporate content prior and motion prior into the diffusion-based generation
+process. Specifically, we employ a first-frame condition scheme to transfer
+video generation from the image domain. Additionally, we introduce
+residual-based and optical flow-based noise initialization to infuse motion
+priors from reference videos, promoting relevance among frame latents for
+reduced flickering. Furthermore, we present a Spatio-Temporal Reward Feedback
+Learning (ST-ReFL) algorithm that optimizes the video diffusion model using
+multiple reward models for video quality and motion consistency, leading to
+superior outputs. Comprehensive experiments demonstrate that our framework
+generates higher-quality, more consistent videos compared to existing
+state-of-the-art methods in controllable text-to-video generation.
+
+
+
+
+
+
+ + ♻ ☆ MultiHateClip: A Multilingual Benchmark Dataset for Hateful Video + Detection on YouTube and Bilibili + + +
+ Hate speech is a pressing issue in modern society, with significant effects
+both online and offline. Recent research in hate speech detection has
+primarily centered on text-based media, largely overlooking multimodal content
+such as videos. Existing studies on hateful video datasets have predominantly
+focused on English content within a Western context and have been limited to
+binary labels (hateful or non-hateful), lacking detailed contextual
+information. This study presents MultiHateClip, a novel multilingual dataset
+created through hate lexicons and human annotation. It aims to enhance the
+detection of hateful videos on platforms such as YouTube and Bilibili,
+including content in both English and Chinese. Comprising 2,000 videos
+annotated for hatefulness, offensiveness, and normalcy, this dataset provides
+a cross-cultural perspective on gender-based hate speech. Through a detailed
+examination of human annotation results, we discuss the differences between
+Chinese and English hateful videos and underscore the importance of different
+modalities in hateful and offensive video analysis. Evaluations of
+state-of-the-art video classification models, such as VLM, GPT-4V and Qwen-VL,
+on MultiHateClip highlight the existing challenges in accurately
+distinguishing between hateful and offensive content and the urgent need for
+models that are both multimodally and culturally nuanced. MultiHateClip
+represents a foundational advance in enhancing hateful video detection by
+underscoring the necessity of a multimodal and culturally sensitive approach
+in combating online hate speech.
+
+
+
+ comment: 10 pages, 3 figures, ACM Multimedia 2024 +
+
+
+
+
+ + ♻ ☆ UniAV: Unified Audio-Visual Perception for Multi-Task Video Event + Localization + + +
+ Video localization tasks aim to temporally locate specific instances in +videos, including temporal action localization (TAL), sound event detection +(SED) and audio-visual event localization (AVEL). Existing methods +over-specialize on each task, overlooking the fact that these instances often +occur in the same video to form the complete video content. In this work, we +present UniAV, a Unified Audio-Visual perception network, to achieve joint +learning of TAL, SED and AVEL tasks for the first time. UniAV can leverage +diverse data available in task-specific datasets, allowing the model to learn +and share mutually beneficial knowledge across tasks and modalities. To tackle +the challenges posed by substantial variations in datasets +(size/domain/duration) and distinct task characteristics, we propose to +uniformly encode visual and audio modalities of all videos to derive generic +representations, while also designing task-specific experts to capture unique +knowledge for each task. Besides, we develop a unified language-aware +classifier by utilizing a pre-trained text encoder, enabling the model to +flexibly detect various types of instances and previously unseen ones by simply +changing prompts during inference. UniAV outperforms its single-task +counterparts by a large margin with fewer parameters, achieving on-par or +superior performances compared to state-of-the-art task-specific methods across +ActivityNet 1.3, DESED and UnAV-100 benchmarks. + +
+
+ comment: This work has been submitted to the IEEE for possible publication. + Copyright may be transferred without notice, after which this version may no + longer be accessible +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computation and Language 29 + +
+
+
+ + ☆ LLM-Based Robust Product Classification in Commerce and Compliance + + +
+ Product classification is a crucial task in international trade, as
+compliance regulations are verified and taxes and duties are applied based on
+product categories. Manual classification of products is time-consuming and
+error-prone, and the sheer volume of products imported and exported renders
+the manual process infeasible. Consequently, e-commerce platforms and
+enterprises involved in international trade have turned to automatic product
+classification using machine learning. However, current approaches do not
+consider the real-world challenges associated with product classification,
+such as very abbreviated and incomplete product descriptions. In addition,
+recent advancements in generative Large Language Models (LLMs) and their
+reasoning capabilities are mainly untapped in product classification and
+e-commerce. In this research, we explore the real-life challenges of
+industrial classification and we propose data perturbations that allow for
+realistic data simulation. Furthermore, we employ LLM-based product
+classification to improve the robustness of the prediction in the presence of
+incomplete data. Our research shows that LLMs with in-context learning
+outperform the supervised approaches in the clean-data scenario. Additionally,
+we illustrate that LLMs are significantly more robust than the supervised
+approaches when data attacks are present.
+
+
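+ The paper's exact perturbation scheme is not described in the abstract; the
+snippet below only illustrates the general idea of simulating abbreviated,
+incomplete product descriptions of the kind seen in trade records. The
+truncation and abbreviation rules are assumptions for illustration.
+
+import random
+
+def perturb_description(description: str, keep_ratio: float = 0.5,
+                        seed: int = 0) -> str:
+    """Simulate an abbreviated, incomplete product description."""
+    random.seed(seed)
+    tokens = description.split()
+    kept = max(1, int(len(tokens) * keep_ratio))
+    tokens = tokens[:kept]  # drop trailing detail
+    # Randomly clip long words, mimicking ad-hoc abbreviations in real records.
+    return " ".join(t[:4] if len(t) > 6 and random.random() < 0.5 else t
+                    for t in tokens)
+
+print(perturb_description(
+    "stainless steel insulated water bottle 750ml with bamboo lid"))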
+
+ comment: 11 pages +
+
+
+
+
+ + ☆ Defining Boundaries: A Spectrum of Task Feasibility for Large Language + Models + + +
+ Large language models (LLMs) have shown remarkable performance in various +tasks but often fail to handle queries that exceed their knowledge and +capabilities, leading to incorrect or fabricated responses. This paper +addresses the need for LLMs to recognize and refuse infeasible tasks due to the +required skills surpassing their capabilities. We first systematically +conceptualize infeasible tasks for LLMs, providing formal definitions and +categorizations that cover a spectrum of related hallucinations. We develop and +benchmark a new dataset comprising diverse infeasible and feasible tasks to +test multiple LLMs' abilities on task feasibility. Furthermore, we explore the +potential of training enhancements to increase LLMs' refusal capabilities with +fine-tuning. Experiments validate the effectiveness of our methods, offering +promising directions for refining the operational boundaries of LLMs in real +applications. + +
+
+ comment: 20 pages, 9 tables, 15 Figures +
+
+
+
+
+ + ☆ Iterative Improvement of an Additively Regularized Topic Model + + +
+ Topic modelling is fundamentally a soft clustering problem (of known objects
+-- documents, over unknown clusters -- topics). That is, the task is
+ill-posed. In particular, topic models are unstable and incomplete. All this
+leads to the fact that the process of finding a good topic model (repeated
+hyperparameter selection, model training, and topic quality assessment) can be
+particularly long and labor-intensive. We aim to simplify the process, to make
+it more deterministic and provable. To this end, we present a method for
+iterative training of a topic model. The essence of the method is that a
+series of related topic models are trained so that each subsequent model is at
+least as good as the previous one, i.e., it retains all the good topics found
+earlier. The connection between the models is achieved by additive
+regularization. The result of this iterative training is the last topic model
+in the series, which we call the iteratively updated additively regularized
+topic model (ITAR). Experiments conducted on several collections of natural
+language texts show that the proposed ITAR model performs better than other
+popular topic models (LDA, ARTM, BERTopic), its topics are diverse, and its
+perplexity (ability to "explain" the underlying data) is moderate.
+
+
+
+ comment: A full draft of the second version of the article +
+
+
+
+
+ + ☆ SAGA: A Participant-specific Examination of Story Alternatives and Goal + Applicability for a Deeper Understanding of Complex Events + + +
+ Interpreting and assessing goal driven actions is vital to understanding and +reasoning over complex events. It is important to be able to acquire the +knowledge needed for this understanding, though doing so is challenging. We +argue that such knowledge can be elicited through a participant achievement +lens. We analyze a complex event in a narrative according to the intended +achievements of the participants in that narrative, the likely future actions +of the participants, and the likelihood of goal success. We collect 6.3K high +quality goal and action annotations reflecting our proposed participant +achievement lens, with an average weighted Fleiss-Kappa IAA of 80%. Our +collection contains annotated alternate versions of each narrative. These +alternate versions vary minimally from the "original" story, but can license +drastically different inferences. Our findings suggest that while modern large +language models can reflect some of the goal-based knowledge we study, they +find it challenging to fully capture the design and intent behind concerted +actions, even when the model pretraining included the data from which we +extracted the goal knowledge. We show that smaller models fine-tuned on our +dataset can achieve performance surpassing larger models. + +
+
+ comment: Accepted to Findings of the Association for Computational Linguistics + 2024 +
+
+
+
+
+ + ☆ HiLight: A Hierarchy-aware Light Global Model with Hierarchical Local + ConTrastive Learning + + +
+ Hierarchical text classification (HTC) is a special sub-task of multi-label
+classification (MLC) whose taxonomy is constructed as a tree and each sample
+is assigned at least one path in the tree. The latest HTC models contain three
+modules: a text encoder, a structure encoder and a multi-label classification
+head. Specifically, the structure encoder is designed to encode the hierarchy
+of the taxonomy. However, the structure encoder has a scaling problem: as the
+taxonomy size increases, the learnable parameters of recent HTC works grow
+rapidly. Recursive regularization is another widely used method to introduce
+hierarchical information, but it has a collapse problem and is generally
+relaxed by assigning it a small weight (i.e., 1e-6). In this paper, we propose
+a Hierarchy-aware Light Global model with Hierarchical local conTrastive
+learning (HiLight), a lightweight and efficient global model consisting only
+of a text encoder and a multi-label classification head. We propose a new
+learning task to introduce the hierarchical information, called Hierarchical
+Local Contrastive Learning (HiLCL). Extensive experiments are conducted on two
+benchmark datasets to demonstrate the effectiveness of our model.
+
+
+
+
+
+
+ + ☆ LI-TTA: Language Informed Test-Time Adaptation for Automatic Speech + Recognition INTERSPEECH 2024 + + +
+ Test-Time Adaptation (TTA) has emerged as a crucial solution to the domain +shift challenge, wherein the target environment diverges from the original +training environment. A prime exemplification is TTA for Automatic Speech +Recognition (ASR), which enhances model performance by leveraging output +prediction entropy minimization as a self-supervision signal. However, a key +limitation of this self-supervision lies in its primary focus on acoustic +features, with minimal attention to the linguistic properties of the input. To +address this gap, we propose Language Informed Test-Time Adaptation (LI-TTA), +which incorporates linguistic insights during TTA for ASR. LI-TTA integrates +corrections from an external language model to merge linguistic with acoustic +information by minimizing the CTC loss from the correction alongside the +standard TTA loss. With extensive experiments, we show that LI-TTA effectively +improves the performance of TTA for ASR in various distribution shift +situations. + +
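+ The abstract describes the adapted objective as the standard TTA
+self-supervision plus a CTC loss computed against an LM-corrected hypothesis.
+A hedged PyTorch-style sketch of one adaptation step is below; `asr_model`,
+`lm_correct`, `tokenizer`, and the greedy decoding helper are hypothetical
+stand-ins, and the loss weighting is an assumption.
+
+import torch
+import torch.nn.functional as F
+
+def li_tta_step(asr_model, lm_correct, tokenizer, speech, optimizer, lam=0.3):
+    log_probs = asr_model(speech).log_softmax(dim=-1)   # (T, B, vocab)
+    # Standard TTA self-supervision: entropy minimization on frame posteriors.
+    entropy = -(log_probs.exp() * log_probs).sum(-1).mean()
+
+    # Language-informed term: CTC loss against the LM-corrected transcript.
+    hypothesis = asr_model.greedy_decode(speech)         # hypothetical helper
+    corrected = lm_correct(hypothesis)                   # external LM correction
+    targets = torch.tensor(tokenizer(corrected))         # (B, L) label ids
+    input_lens = torch.full((log_probs.size(1),), log_probs.size(0),
+                            dtype=torch.long)
+    target_lens = torch.tensor([len(t) for t in targets])
+    ctc = F.ctc_loss(log_probs, targets, input_lens, target_lens)
+
+    loss = entropy + lam * ctc
+    optimizer.zero_grad()
+    loss.backward()
+    optimizer.step()
+    return float(loss)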
+
+ comment: INTERSPEECH 2024 +
+
+
+
+
+ + ☆ Reference-free Hallucination Detection for Large Vision-Language Models + + +
+ Large vision-language models (LVLMs) have made significant progress in recent
+years. While LVLMs exhibit excellent ability in language understanding,
+question answering, and conversation about visual inputs, they are prone to
+producing hallucinations. While several methods have been proposed to evaluate
+hallucinations in LVLMs, most are reference-based and depend on external
+tools, which complicates their practical application. To assess the viability
+of alternative methods, it is critical to understand whether reference-free
+approaches, which do not rely on any external tools, can efficiently detect
+hallucinations. Therefore, we initiate an exploratory study to demonstrate the
+effectiveness of different reference-free solutions in detecting
+hallucinations in LVLMs. In particular, we conduct an extensive study on three
+kinds of techniques: uncertainty-based, consistency-based, and supervised
+uncertainty quantification methods on four representative LVLMs across two
+different tasks. The empirical results show that the reference-free approaches
+are capable of effectively detecting non-factual responses in LVLMs, with the
+supervised uncertainty quantification method outperforming the others,
+achieving the best performance across different settings.
+
+
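+ Of the three families above, the consistency-based idea is the simplest to
+illustrate: sample several stochastic responses and treat low mutual agreement
+as a hallucination signal. The sketch below uses a hypothetical `generate`
+callable and a deliberately simple string-overlap similarity; it is not the
+paper's protocol.
+
+from difflib import SequenceMatcher
+
+def consistency_score(generate, image, question, n_samples=5):
+    responses = [generate(image, question, temperature=0.7)
+                 for _ in range(n_samples)]
+    pair_sims = [SequenceMatcher(None, a, b).ratio()
+                 for i, a in enumerate(responses)
+                 for b in responses[i + 1:]]
+    return sum(pair_sims) / len(pair_sims)
+
+def flag_hallucination(generate, image, question, threshold=0.6):
+    # Low average agreement across samples -> likely non-factual response.
+    return consistency_score(generate, image, question) < threshold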
+
+
+
+
+ + ☆ VQ-CTAP: Cross-Modal Fine-Grained Sequence Representation Learning for + Speech Processing + + +
+ Deep learning has brought significant improvements to the field of +cross-modal representation learning. For tasks such as text-to-speech (TTS), +voice conversion (VC), and automatic speech recognition (ASR), a cross-modal +fine-grained (frame-level) sequence representation is desired, emphasizing the +semantic content of the text modality while de-emphasizing the paralinguistic +information of the speech modality. We propose a method called "Vector +Quantized Contrastive Token-Acoustic Pre-training (VQ-CTAP)", which uses the +cross-modal aligned sequence transcoder to bring text and speech into a joint +multimodal space, learning how to connect text and speech at the frame level. +The proposed VQ-CTAP is a paradigm for cross-modal sequence representation +learning, offering a promising solution for fine-grained generation and +recognition tasks in speech processing. The VQ-CTAP can be directly applied to +VC and ASR tasks without fine-tuning or additional structures. We propose a +sequence-aware semantic connector, which connects multiple frozen pre-trained +modules for the TTS task, exhibiting a plug-and-play capability. We design a +stepping optimization strategy to ensure effective model convergence by +gradually injecting and adjusting the influence of various loss components. +Furthermore, we propose a semantic-transfer-wise paralinguistic consistency +loss to enhance representational capabilities, allowing the model to better +generalize to unseen data and capture the nuances of paralinguistic +information. In addition, VQ-CTAP achieves high-compression speech coding at a +rate of 25Hz from 24kHz input waveforms, which is a 960-fold reduction in the +sampling rate. The audio demo is available at +https://qiangchunyu.github.io/VQCTAP/ + +
+
+
+
+
+ + ☆ Language-Informed Beam Search Decoding for Multilingual Machine + Translation ACL 2024 + + +
+ Beam search decoding is the de-facto method for decoding auto-regressive
+Neural Machine Translation (NMT) models, including multilingual NMT where the
+target language is specified as an input. However, decoding multilingual NMT
+models commonly produces "off-target" translations -- yielding translation
+outputs not in the intended language. In this paper, we first conduct an error
+analysis of off-target translations for a strong multilingual NMT model and
+identify how these decodings are produced during beam search. We then propose
+Language-informed Beam Search (LiBS), a general decoding algorithm
+incorporating an off-the-shelf Language Identification (LiD) model into beam
+search decoding to reduce off-target translations. LiBS is an inference-time
+procedure that is NMT-model agnostic and does not require any additional
+parallel data. Results show that our proposed LiBS algorithm on average
+improves +1.1 BLEU and +0.9 BLEU on WMT and OPUS datasets, and reduces
+off-target rates from 22.9% to 7.7% and 65.8% to 25.3% respectively.
+
+
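+ The abstract does not give the exact way LiBS folds the LID signal into beam
+search, so the following is only a hedged sketch of one natural instantiation:
+add a weighted log-probability from an off-the-shelf LID model to each partial
+hypothesis score. `nmt_score` and `lid_prob` are hypothetical callables, and
+the interpolation weight is an assumption.
+
+import math
+
+def rescore_beam(beam, target_lang, nmt_score, lid_prob, alpha=1.0):
+    """beam: list of (tokens, surface_text) partial hypotheses."""
+    rescored = []
+    for tokens, text in beam:
+        lid = max(lid_prob(text, target_lang), 1e-9)  # P(text is target_lang)
+        score = nmt_score(tokens) + alpha * math.log(lid)
+        rescored.append((score, tokens, text))
+    # Hypotheses whose surface text matches the intended language rise to the top.
+    return sorted(rescored, key=lambda item: item[0], reverse=True)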
+
+ comment: ACL 2024 Findings +
+
+
+
+
+ + ☆ Training an NLP Scholar at a Small Liberal Arts College: A Backwards + Designed Course Proposal + + +
+ The rapid growth in natural language processing (NLP) over the last couple +years has generated student interest and excitement in learning more about the +field. In this paper, we present two types of students that NLP courses might +want to train. First, an "NLP engineer" who is able to flexibly design, build +and apply new technologies in NLP for a wide range of tasks. Second, an "NLP +scholar" who is able to pose, refine and answer questions in NLP and how it +relates to the society, while also learning to effectively communicate these +answers to a broader audience. While these two types of skills are not mutually +exclusive -- NLP engineers should be able to think critically, and NLP scholars +should be able to build systems -- we think that courses can differ in the +balance of these skills. As educators at Small Liberal Arts Colleges, the +strengths of our students and our institution favors an approach that is better +suited to train NLP scholars. In this paper we articulate what kinds of skills +an NLP scholar should have, and then adopt a backwards design to propose course +components that can aid the acquisition of these skills. + +
+
+ comment: 9 pages, Presented at 6th Workshop on Teaching NLP +
+
+
+
+
+ + ♻ ☆ LLM Reasoners: New Evaluation, Library, and Analysis of Step-by-Step + Reasoning with Large Language Models + + +
+ Generating accurate step-by-step reasoning is essential for Large Language +Models (LLMs) to address complex problems and enhance robustness and +interpretability. Despite the flux of research on developing advanced reasoning +approaches, systematically analyzing the diverse LLMs and reasoning strategies +in generating reasoning chains remains a significant challenge. The +difficulties stem from the lack of two key elements: (1) an automatic method +for evaluating the generated reasoning chains on different tasks, and (2) a +unified formalism and implementation of the diverse reasoning approaches for +systematic comparison. This paper aims to close the gap: (1) We introduce +AutoRace for fully automated reasoning chain evaluation. Existing metrics rely +on expensive human annotations or pre-defined LLM prompts not adaptable to +different tasks. In contrast, AutoRace automatically creates detailed +evaluation criteria tailored for each task, and uses GPT-4 for accurate +evaluation following the criteria. (2) We develop LLM Reasoners, a library for +standardized modular implementation of existing and new reasoning algorithms, +under a unified formulation of the search, reward, and world model components. +With the new evaluation and library, (3) we conduct extensive study of +different reasoning approaches (e.g., CoT, ToT, RAP). The analysis reveals +interesting findings about different factors contributing to reasoning, +including the reward-guidance, breadth-vs-depth in search, world model, and +prompt formats, etc. + +
+
+ comment: Project website: https://www.llm-reasoners.net/ +
+
+
+
+
+ + ♻ ☆ BiomedGPT: A Generalist Vision-Language Foundation Model for Diverse + Biomedical Tasks + + +
+ Traditional biomedical artificial intelligence (AI) models, designed for +specific tasks or modalities, often exhibit limited flexibility in real-world +deployment and struggle to utilize holistic information. Generalist AI holds +the potential to address these limitations due to its versatility in +interpreting different data types and generating tailored outputs for diverse +needs. However, existing biomedical generalist AI solutions are typically +heavyweight and closed source to researchers, practitioners, and patients. +Here, we propose BiomedGPT, the first open-source and lightweight +vision-language foundation model, designed as a generalist capable of +performing various biomedical tasks. BiomedGPT achieved state-of-the-art +results in 16 out of 25 experiments while maintaining a computing-friendly +model scale. We also conducted human evaluations to assess the capabilities of +BiomedGPT in radiology visual question answering, report generation, and +summarization. BiomedGPT exhibits robust prediction ability with a low error +rate of 3.8% in question answering, satisfactory performance with an error rate +of 8.3% in writing complex radiology reports, and competitive summarization +ability with a nearly equivalent preference score to human experts. Our method +demonstrates that effective training with diverse data can lead to more +practical biomedical AI for improving diagnosis and workflow efficiency. + +
+
+ comment: Fix incorrect citations and add journal reference for the published + version. Nat Med (2024) +
+
+
+
+
+ + ♻ ☆ Prompt-prompted Adaptive Structured Pruning for Efficient LLM Generation + + +
+ Transformer-based large language models (LLMs) have been applied to many
+fields due to their remarkable utility, but this comes at a considerable
+computational cost at deployment. Fortunately, some methods such as pruning or
+constructing a mixture of experts (MoE) aim at exploiting sparsity in
+transformer feedforward (FF) blocks to gain boosts in speed and reduction in
+memory requirements. However, these techniques can be very costly and
+inflexible in practice, as they often require training or are restricted to
+specific types of architectures. To address this, we introduce GRIFFIN, a
+novel training-free and calibration-free method that selects unique FF experts
+at the sequence level for efficient generation across a plethora of LLMs with
+different non-ReLU activation functions. This is possible due to a critical
+observation that many trained LLMs naturally produce highly structured FF
+activation patterns within a sequence, which we call flocking. Despite our
+method's simplicity, we show that with 50% of the FF parameters, GRIFFIN
+maintains the original model's performance with little to no degradation on a
+variety of classification and generation tasks, all while improving latency
+(e.g., 1.29× and 1.25× speed-ups in Gemma 7B and Llama 2 13B, respectively, on
+an NVIDIA L40). Code is available at https://github.com/hdong920/GRIFFIN.
+
+
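+ GRIFFIN's exact selection statistic is not given in the abstract; the sketch
+below only illustrates the general recipe implied by the flocking observation:
+rank FF neurons by their activation magnitude over the prompt sequence and
+keep a fixed fraction of them for the generation phase. The shapes, GELU
+activation, and importance measure are assumptions.
+
+import torch
+
+def select_ff_neurons(hidden_states, w_up, keep_frac=0.5):
+    # hidden_states: (seq_len, d_model) prompt activations; w_up: (d_ff, d_model)
+    acts = torch.nn.functional.gelu(hidden_states @ w_up.T)  # (seq_len, d_ff)
+    importance = acts.abs().sum(dim=0)            # one score per FF neuron
+    k = int(keep_frac * w_up.size(0))
+    return importance.topk(k).indices             # "experts" kept for this sequence
+
+def pruned_ff_forward(x, w_up, w_down, kept):
+    # Generation-time FF block restricted to the selected neurons.
+    # w_down: (d_model, d_ff)
+    return torch.nn.functional.gelu(x @ w_up[kept].T) @ w_down[:, kept].T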
+
+ comment: Revision 1: Updated abstract with code link; re-ran top-k + sampling + rows in Table 4, conclusions unchanged Revision 2: Reframing and new + experiments, conclusions unchanged +
+
+
+
+
+ + ♻ ☆ Bot or Human? Detecting ChatGPT Imposters with A Single Question + + +
+ Large language models (LLMs) like GPT-4 have recently demonstrated impressive +capabilities in natural language understanding and generation. However, there +is a concern that they can be misused for malicious purposes, such as fraud or +denial-of-service attacks. Therefore, it is crucial to develop methods for +detecting whether the party involved in a conversation is a bot or a human. In +this paper, we propose a framework named FLAIR, Finding Large Language Model +Authenticity via a Single Inquiry and Response, to detect conversational bots +in an online manner. Specifically, we target a single question scenario that +can effectively differentiate human users from bots. The questions are divided +into two categories: those that are easy for humans but difficult for bots +(e.g., counting, substitution, searching, and ASCII art reasoning), and those +that are easy for bots but difficult for humans (e.g., memorization and +computation). Our approach shows different strengths of these questions in +their effectiveness, providing a new way for online service providers to +protect themselves against nefarious activities. Our code and question set are +available at https://github.com/hongwang600/FLAIR. + +
+
+
+
+
+ + ♻ ☆ Bridging Information Gaps in Dialogues With Grounded Exchanges Using + Knowledge Graphs SIGDIAL 2024 + + +
+ Knowledge models are fundamental to dialogue systems for enabling +conversational interactions, which require handling domain-specific knowledge. +Ensuring effective communication in information-providing conversations entails +aligning user understanding with the knowledge available to the system. +However, dialogue systems often face challenges arising from semantic +inconsistencies in how information is expressed in natural language compared to +how it is represented within the system's internal knowledge. To address this +problem, we study the potential of large language models for conversational +grounding, a mechanism to bridge information gaps by establishing shared +knowledge between dialogue participants. Our approach involves annotating human +conversations across five knowledge domains to create a new dialogue corpus +called BridgeKG. Through a series of experiments on this dataset, we +empirically evaluate the capabilities of large language models in classifying +grounding acts and identifying grounded information items within a knowledge +graph structure. Our findings offer insights into how these models use +in-context learning for conversational grounding tasks and common prediction +errors, which we illustrate with examples from challenging dialogues. We +discuss how the models handle knowledge graphs as a semantic layer between +unstructured dialogue utterances and structured information items. + +
+
+ comment: Accepted to SIGDIAL 2024 +
+
+
+
+
+ + ♻ ☆ LEGENT: Open Platform for Embodied Agents ACL 2024 + + +
+ Despite advancements in Large Language Models (LLMs) and Large Multimodal +Models (LMMs), their integration into language-grounded, human-like embodied +agents remains incomplete, hindering complex real-life task performance in +physical environments. Existing integrations often feature limited open +sourcing, challenging collective progress in this field. We introduce LEGENT, +an open, scalable platform for developing embodied agents using LLMs and LMMs. +LEGENT offers a dual approach: a rich, interactive 3D environment with +communicable and actionable agents, paired with a user-friendly interface, and +a sophisticated data generation pipeline utilizing advanced algorithms to +exploit supervision from simulated worlds at scale. In our experiments, an +embryonic vision-language-action model trained on LEGENT-generated data +surpasses GPT-4V in embodied tasks, showcasing promising generalization +capabilities. + +
+
+ comment: ACL 2024 System Demonstration +
+
+
+
+
+ + ♻ ☆ Learning or Self-aligning? Rethinking Instruction Fine-tuning ACL2024 + + +
+ Instruction Fine-tuning~(IFT) is a critical phase in building large language +models~(LLMs). Previous works mainly focus on the IFT's role in the transfer of +behavioral norms and the learning of additional world knowledge. However, the +understanding of the underlying mechanisms of IFT remains significantly +limited. In this paper, we design a knowledge intervention framework to +decouple the potential underlying factors of IFT, thereby enabling individual +analysis of different factors. Surprisingly, our experiments reveal that +attempting to learn additional world knowledge through IFT often struggles to +yield positive impacts and can even lead to markedly negative effects. Further, +we discover that maintaining internal knowledge consistency before and after +IFT is a critical factor for achieving successful IFT. Our findings reveal the +underlying mechanisms of IFT and provide robust support for some very recent +and potential future works. + +
+
+ comment: Camera Ready for ACL2024 +
+
+
+
+
+ + ♻ ☆ SQLFixAgent: Towards Semantic-Accurate Text-to-SQL Parsing via + Consistency-Enhanced Multi-Agent Collaboration + + +
+ While fine-tuned large language models (LLMs) excel in generating
+grammatically valid SQL in Text-to-SQL parsing, they often struggle to ensure
+semantic accuracy in queries, leading to user confusion and diminished system
+usability. To tackle this challenge, we introduce SQLFixAgent, a new
+consistency-enhanced multi-agent collaborative framework designed for
+detecting and repairing erroneous SQL. Our framework comprises a core agent,
+SQLRefiner, alongside two auxiliary agents: SQLReviewer and QueryCrafter. The
+SQLReviewer agent employs the rubber duck debugging method to identify
+potential semantic mismatches between the SQL and the user query. If an error
+is detected, the QueryCrafter agent generates multiple SQL statements as
+candidate repairs using a fine-tuned SQLTool. Subsequently, leveraging similar
+repair retrieval and failure memory reflection, the SQLRefiner agent selects
+the most fitting SQL statement from the candidates as the final repair. We
+evaluated our proposed framework on five Text-to-SQL benchmarks. The
+experimental results show that our method consistently enhances the
+performance of the baseline model, specifically achieving an execution
+accuracy improvement of over 3% on the Bird benchmark. Our framework also has
+a higher token efficiency compared to other advanced methods, making it more
+competitive.
+
+
+
+
+
+
+ + ♻ ☆ CodexGraph: Bridging Large Language Models and Code Repositories via + Code Graph Databases + + +
+ Large Language Models (LLMs) excel in stand-alone code tasks like HumanEval +and MBPP, but struggle with handling entire code repositories. This challenge +has prompted research on enhancing LLM-codebase interaction at a repository +scale. Current solutions rely on similarity-based retrieval or manual tools and +APIs, each with notable drawbacks. Similarity-based retrieval often has low +recall in complex tasks, while manual tools and APIs are typically +task-specific and require expert knowledge, reducing their generalizability +across diverse code tasks and real-world applications. To mitigate these +limitations, we introduce CodexGraph, a system that integrates LLM agents with +graph database interfaces extracted from code repositories. By leveraging the +structural properties of graph databases and the flexibility of the graph query +language, CodexGraph enables the LLM agent to construct and execute queries, +allowing for precise, code structure-aware context retrieval and code +navigation. We assess CodexGraph using three benchmarks: CrossCodeEval, +SWE-bench, and EvoCodeBench. Additionally, we develop five real-world coding +applications. With a unified graph database schema, CodexGraph demonstrates +competitive performance and potential in both academic and real-world +environments, showcasing its versatility and efficacy in software engineering. +Our application demo: +https://github.com/modelscope/modelscope-agent/tree/master/apps/codexgraph_agent. + +
+
+ comment: work in progress +
+
+
+
+
+ + ♻ ☆ A Survey on Employing Large Language Models for Text-to-SQL Tasks + + +
+ The increasing volume of data stored in relational databases has led to the +need for efficient querying and utilization of this data in various sectors. +However, writing SQL queries requires specialized knowledge, which poses a +challenge for non-professional users trying to access and query databases. +Text-to-SQL parsing solves this issue by converting natural language queries +into SQL queries, thus making database access more accessible for non-expert +users. To take advantage of the recent developments in Large Language Models +(LLMs), a range of new methods have emerged, with a primary focus on prompt +engineering and fine-tuning. This survey provides a comprehensive overview of +LLMs in text-to-SQL tasks, discussing benchmark datasets, prompt engineering, +fine-tuning methods, and future research directions. We hope this review will +enable readers to gain a broader understanding of the recent advances in this +field and offer some insights into its future trajectory. + +
+
+
+
+
+ + ♻ ☆ SPARSEFIT: Few-shot Prompting with Sparse Fine-tuning for Jointly + Generating Predictions and Natural Language Explanations + + +
+ Models that generate natural language explanations (NLEs) for their +predictions have recently gained increasing interest. However, this approach +usually demands large datasets of human-written NLEs for the ground-truth +answers at training time, which can be expensive and potentially infeasible for +some applications. When only a few NLEs are available (a few-shot setup), +fine-tuning pre-trained language models (PLMs) in conjunction with prompt-based +learning has recently shown promising results. However, PLMs typically have +billions of parameters, making full fine-tuning expensive. We propose +SparseFit, a sparse few-shot fine-tuning strategy that leverages discrete +prompts to jointly generate predictions and NLEs. We experiment with SparseFit +on three sizes of the T5 language model and four datasets and compare it +against existing state-of-the-art Parameter-Efficient Fine-Tuning (PEFT) +techniques. We find that fine-tuning only 6.8% of the model parameters leads to +competitive results for both the task performance and the quality of the +generated NLEs compared to full fine-tuning of the model and produces better +results on average than other PEFT methods in terms of predictive accuracy and +NLE quality. + +
+
+
+
+
+ + ♻ ☆ Integrating Multi-scale Contextualized Information for Byte-based Neural + Machine Translation ACL2024 + + +
+ Subword tokenization is a common method for vocabulary building in Neural +Machine Translation (NMT) models. However, increasingly complex tasks have +revealed its disadvantages. First, a vocabulary cannot be modified once it is +learned, making it hard to adapt to new words. Second, in multilingual +translation, the imbalance in data volumes across different languages spreads +to the vocabulary, exacerbating translations involving low-resource languages. +While byte-based tokenization addresses these issues, byte-based models +struggle with the low information density inherent in UTF-8 byte sequences. +Previous works enhance token semantics through local contextualization but fail +to select an appropriate contextualizing scope based on the input. +Consequently, we propose the Multi-Scale Contextualization (MSC) method, which +learns contextualized information of varying scales across different hidden +state dimensions. It then leverages the attention module to dynamically +integrate the multi-scale contextualized information. Experiments show that MSC +significantly outperforms subword-based and other byte-based methods in both +multilingual and out-of-domain scenarios. Code can be found in +https://github.com/ictnlp/Multiscale-Contextualization. + +
+
+ comment: Accepted by ACL2024 Findings, renew author's email +
+
+
+
+
+ + ♻ ☆ Digital Socrates: Evaluating LLMs through Explanation Critiques ACL 2024 + + +
+ While LLMs can provide reasoned explanations along with their answers, the +nature and quality of those explanations are still poorly understood. In +response, our goal is to define a detailed way of characterizing the +explanation capabilities of modern models and to create a nuanced, +interpretable explanation evaluation tool that can generate such +characterizations automatically, without relying on expensive API calls or +human annotations. Our approach is to (a) define the new task of explanation +critiquing - identifying and categorizing any main flaw in an explanation and +providing suggestions to address the flaw, (b) create a sizeable, +human-verified dataset for this task, and (c) train an open-source, automatic +critique model (called Digital Socrates) using this data. Through quantitative +and qualitative analysis, we demonstrate how Digital Socrates is useful for +revealing insights about student models by examining their reasoning chains, +and how it can provide high-quality, nuanced, automatic evaluation of those +model explanations for the first time. Digital Socrates thus fills an important +gap in evaluation tools for understanding and improving the explanation +behavior of models. + +
+
+ comment: ACL 2024 +
+
+
+
+
+ + ♻ ☆ Time Matters: Examine Temporal Effects on Biomedical Language Models + + +
+ Time matters when applying language models to biomedical applications: models
+are trained on historical data and will be deployed on new or future data,
+which may differ from the training data. While an increasing number of
+biomedical tasks employ state-of-the-art language models, very few studies
+have examined the temporal effects on biomedical models when data shifts
+between development and deployment. This study fills the gap by statistically
+probing the relations between language model performance and data shifts
+across three biomedical tasks. We deploy diverse metrics to evaluate model
+performance, distance methods to measure data drifts, and statistical methods
+to quantify temporal effects on biomedical language models. Our study shows
+that time matters for deploying biomedical language models, while the degree
+of performance degradation varies by biomedical task and statistical
+quantification approach. We believe this study can establish a solid benchmark
+to evaluate and assess temporal effects on deploying biomedical language
+models.
+
+
+
+ comment: Accepted to AMIA 2024 Annual Symposium +
+
+
+
+
+ + ♻ ☆ Parrot: Multilingual Visual Instruction Tuning + + +
+ The rapid development of Multimodal Large Language Models (MLLMs) like GPT-4V +has marked a significant step towards artificial general intelligence. Existing +methods mainly focus on aligning vision encoders with LLMs through supervised +fine-tuning (SFT) to endow LLMs with multimodal abilities, making MLLMs' +inherent ability to react to multiple languages progressively deteriorate as +the training process evolves. We empirically find that the imbalanced SFT +datasets, primarily composed of English-centric image-text pairs, lead to +significantly reduced performance in non-English languages. This is due to the +failure of aligning the vision encoder and LLM with multilingual tokens during +the SFT process. In this paper, we introduce Parrot, a novel method that +utilizes textual guidance to drive visual token alignment at the language +level. Parrot makes the visual tokens condition on diverse language inputs and +uses Mixture-of-Experts (MoE) to promote the alignment of multilingual tokens. +Specifically, to enhance non-English visual tokens alignment, we compute the +cross-attention using the initial visual features and textual embeddings, the +result of which is then fed into the MoE router to select the most relevant +experts. The selected experts subsequently convert the initial visual tokens +into language-specific visual tokens. Moreover, considering the current lack of +benchmarks for evaluating multilingual capabilities within the field, we +collect and make available a Massive Multilingual Multimodal Benchmark which +includes 6 languages, 15 categories, and 12,000 questions, named as MMMB. Our +method not only demonstrates state-of-the-art performance on multilingual +MMBench and MMMB, but also excels across a broad range of multimodal tasks. +Both the source code and the training dataset of Parrot will be made publicly +available. Code is available at: https://github.com/AIDC-AI/Parrot. + +
+
+ comment: Code is available at: https://github.com/AIDC-AI/Parrot +
+
+
+
+
+ + ♻ ☆ PK-ICR: Persona-Knowledge Interactive Context Retrieval for Grounded + Dialogue EMNLP 2023 + + +
+ Identifying relevant persona or knowledge for conversational systems is +critical to grounded dialogue response generation. However, each grounding has +been mostly researched in isolation with more practical multi-context dialogue +tasks introduced in recent works. We define Persona and Knowledge Dual Context +Identification as the task to identify persona and knowledge jointly for a +given dialogue, which could be of elevated importance in complex multi-context +dialogue settings. We develop a novel grounding retrieval method that utilizes +all contexts of dialogue simultaneously. Our method requires less computational +power via utilizing neural QA retrieval models. We further introduce our novel +null-positive rank test which measures ranking performance on semantically +dissimilar samples (i.e. hard negatives) in relation to data augmentation. + +
+
+ comment: Accepted to EMNLP 2023 main conference (Oral). Code available at + https://github.com/minsik-ai/PK-ICR +
+
+
+
+
+ + ♻ ☆ Do Large Language Models Have Compositional Ability? An Investigation + into Limitations and Scalability + + +
+ Large language models (LLMs) have emerged as powerful tools for many AI
+problems and exhibit remarkable in-context learning (ICL) capabilities.
+Compositional ability, solving unseen complex tasks that combine two or more
+simple tasks, is an essential reasoning ability for Artificial General
+Intelligence. Despite the tremendous success of LLMs, how they approach
+composite tasks, especially those not encountered during the pretraining
+phase, remains an open and largely underexplored question. In this study, we
+delve into the ICL capabilities of LLMs on composite tasks, with only simple
+tasks as in-context examples. We develop a test suite of composite tasks
+including linguistic and logical challenges and perform empirical studies
+across different LLM families. We observe that models exhibit divergent
+behaviors: (1) For simpler composite tasks that apply distinct mapping
+mechanisms to different input segments, the models demonstrate decent
+compositional ability, while scaling up the model enhances this ability; (2)
+for more complex composite tasks involving reasoning over multiple steps,
+where each step represents one task, models typically underperform, and
+scaling up generally provides no improvements. We offer theoretical analysis
+in a simplified setting, explaining that models exhibit compositional
+capability when the task handles different input parts separately. We believe
+our work sheds new light on the capabilities of LLMs in solving composite
+tasks regarding the nature of the tasks and model scale. Our dataset and code
+are available at https://github.com/OliverXUZY/LLM_Compose.
+
+
+
+
+
+
+ + ♻ ☆ Large Language Model Tokenizer Bias: A Case Study and Solution on GPT-4o + + +
+ Recent advancements in large language models (LLMs), such as GPT-4 and +GPT-4o, have shown exceptional performance, especially in languages with +abundant resources like English, thanks to extensive datasets that ensure +robust training. Conversely, these models exhibit limitations when processing +under-resourced languages such as Chinese and Korean, where issues including +hallucinatory responses remain prevalent. This paper traces the roots of these +disparities to the tokenization process inherent to these models. Specifically, +it explores how the tokenizer vocabulary, often used to speed up the +tokenization process and reduce tokens but constructed independently of the +actual model training data, inadequately represents non-English languages. This +misrepresentation results in the propagation of 'under-trained' or 'untrained' +tokens, which perpetuate biases and pose serious concerns related to data +security and ethical standards. We aim to dissect the tokenization mechanics of +GPT-4o, illustrating how its simplified token-handling methods amplify these +risks and offer strategic solutions to mitigate associated security and ethical +issues. Through this study, we emphasize the critical need to rethink +tokenization frameworks to foster more equitable and secure AI technologies. + +
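+ The disparity discussed above is easy to observe directly by counting tokens
+for short parallel sentences. A minimal sketch using the tiktoken package and
+the o200k_base encoding (the one associated with GPT-4o) is below; the example
+sentences are illustrative, and the package must be installed separately.
+
+import tiktoken
+
+enc = tiktoken.get_encoding("o200k_base")
+samples = {
+    "English": "The weather is nice today.",
+    "Korean": "오늘 날씨가 좋습니다.",
+    "Chinese": "今天天气很好。",
+}
+for lang, text in samples.items():
+    tokens = enc.encode(text)
+    # Higher tokens-per-character ratios indicate poorer vocabulary coverage.
+    print(f"{lang:8s} chars={len(text):3d} tokens={len(tokens):3d}")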
+
+ comment: 6 pages, 3 figures, and 5 tables +
+
+
+
+
+ + ♻ ☆ Learning to (Learn at Test Time): RNNs with Expressive Hidden States + + +
+ Self-attention performs well in long context but has quadratic complexity. +Existing RNN layers have linear complexity, but their performance in long +context is limited by the expressive power of their hidden state. We propose a +new class of sequence modeling layers with linear complexity and an expressive +hidden state. The key idea is to make the hidden state a machine learning model +itself, and the update rule a step of self-supervised learning. Since the +hidden state is updated by training even on test sequences, our layers are +called Test-Time Training (TTT) layers. We consider two instantiations: +TTT-Linear and TTT-MLP, whose hidden state is a linear model and a two-layer +MLP respectively. We evaluate our instantiations at the scale of 125M to 1.3B +parameters, comparing with a strong Transformer and Mamba, a modern RNN. Both +TTT-Linear and TTT-MLP match or exceed the baselines. Similar to Transformer, +they can keep reducing perplexity by conditioning on more tokens, while Mamba +cannot after 16k context. With preliminary systems optimization, TTT-Linear is +already faster than Transformer at 8k context and matches Mamba in wall-clock +time. TTT-MLP still faces challenges in memory I/O, but shows larger potential +in long context, pointing to a promising direction for future research. + +
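+ The key mechanism described above -- a hidden state that is itself a small
+model, updated by a self-supervised gradient step at every token -- can be
+illustrated with a toy numpy version of a TTT-Linear-style layer. The
+projections, loss, and single inner-loop step below are simplifications for
+illustration, not the paper's implementation.
+
+import numpy as np
+
+def ttt_linear_toy(tokens, d, lr=0.1, seed=0):
+    rng = np.random.default_rng(seed)
+    # Fixed projections producing the self-supervised (key -> value) pair.
+    theta_k, theta_v, theta_q = (rng.normal(scale=0.1, size=(d, d))
+                                 for _ in range(3))
+    W = np.zeros((d, d))              # the hidden state is a linear model
+    outputs = []
+    for x in tokens:                  # x: (d,)
+        k, v, q = theta_k @ x, theta_v @ x, theta_q @ x
+        err = W @ k - v               # self-supervised loss: ||W k - v||^2
+        W -= lr * np.outer(err, k)    # one gradient step = test-time training
+        outputs.append(W @ q)         # output uses the freshly updated state
+    return np.stack(outputs)
+
+out = ttt_linear_toy(np.random.default_rng(1).normal(size=(16, 8)), d=8)
+print(out.shape)  # (16, 8)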
+
+
+
+
+
+
+
+ + Computer Vision and Pattern Recognition 42 + +
+
+
+ + ☆ LaWa: Using Latent Space for In-Generation Image Watermarking + + +
+ With generative models producing high quality images that are +indistinguishable from real ones, there is growing concern regarding the +malicious usage of AI-generated images. Imperceptible image watermarking is one +viable solution towards such concerns. Prior watermarking methods map the image +to a latent space for adding the watermark. Moreover, Latent Diffusion Models +(LDM) generate the image in the latent space of a pre-trained autoencoder. We +argue that this latent space can be used to integrate watermarking into the +generation process. To this end, we present LaWa, an in-generation image +watermarking method designed for LDMs. By using coarse-to-fine watermark +embedding modules, LaWa modifies the latent space of pre-trained autoencoders +and achieves high robustness against a wide range of image transformations +while preserving perceptual quality of the image. We show that LaWa can also be +used as a general image watermarking method. Through extensive experiments, we +demonstrate that LaWa outperforms previous works in perceptual quality, +robustness against attacks, and computational complexity, while having very low +false positive rate. Code is available here. + +
+
+
+
+
+ + ☆ SABER-6D: Shape Representation Based Implicit Object Pose Estimation + + +
+ In this paper, we propose a novel encoder-decoder architecture, named SABER,
+to learn the 6D pose of the object in the embedding space by learning shape
+representation at a given pose. This model enables us to learn pose by
+performing shape representation at a target pose from an RGB image input. We
+perform shape representation as an auxiliary task, which helps us learn the
+rotation space of an object based on 2D images. An image encoder predicts the
+rotation in the embedding space and the DeepSDF-based decoder learns to
+represent the object's shape at the given pose. As our approach is shape
+based, the pipeline is suitable for any type of object irrespective of its
+symmetry. Moreover, we need only a CAD model of the objects to train SABER.
+Our pipeline is based on synthetic data and can also handle symmetric objects
+without symmetry labels; thus, no additional labeled training data is needed.
+The experimental evaluation shows that our method achieves close to benchmark
+results for both symmetric and asymmetric objects on the Occlusion-LineMOD and
+T-LESS datasets.
+
+
+
+
+
+
+ + ☆ Deep Learning in Medical Image Registration: Magic or Mirage? + + +
+ Classical optimization and learning-based methods are the two reigning
+paradigms in deformable image registration. While optimization-based methods
+boast generalizability across modalities and robust performance,
+learning-based methods promise peak performance, incorporating weak
+supervision and amortized optimization. However, the exact conditions for
+either paradigm to outperform the other remain shrouded and are not explicitly
+outlined in the existing literature. In this paper, we make an explicit
+correspondence between the mutual information of the distribution of per-pixel
+intensity and labels, and the performance of classical registration methods.
+This strong correlation hints at the fact that architectural design in
+learning-based methods is unlikely to affect this correlation, and therefore,
+the performance of learning-based methods. This hypothesis is thoroughly
+validated with state-of-the-art classical and learning-based methods. However,
+learning-based methods with weak supervision can perform high-fidelity
+intensity and label registration, which is not possible with classical
+methods. Next, we show that this high-fidelity feature learning does not
+translate to invariance to domain shift, and learning-based methods are
+sensitive to such changes in the data distribution. Finally, we propose a
+general recipe to choose the best paradigm for a given registration problem,
+based on these observations.
+
+
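+ The quantity driving the correspondence above -- mutual information between
+per-pixel intensity and segmentation labels -- can be estimated with a few
+lines of numpy and scikit-learn. The binning, image size, and synthetic data
+below are illustrative assumptions, not the paper's evaluation protocol.
+
+import numpy as np
+from sklearn.metrics import mutual_info_score
+
+def intensity_label_mi(image, labels, n_bins=32):
+    # image: float array; labels: integer segmentation of the same shape.
+    edges = np.histogram_bin_edges(image, bins=n_bins)
+    binned_intensity = np.digitize(image.ravel(), edges)
+    return mutual_info_score(binned_intensity, labels.ravel())  # in nats
+
+rng = np.random.default_rng(0)
+labels = (rng.random((64, 64)) > 0.5).astype(int)
+image = labels * 0.8 + rng.normal(scale=0.1, size=(64, 64))  # intensity tracks label
+print(intensity_label_mi(image, labels))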
+
+
+
+
+ + ☆ Real-Time Drowsiness Detection Using Eye Aspect Ratio and Facial + Landmark Detection + + +
+ Drowsiness detection is essential for improving safety in areas such as
+transportation and workplace health. This study presents a real-time system
+designed to detect drowsiness using the Eye Aspect Ratio (EAR) and facial
+landmark detection techniques. The system leverages Dlib's pre-trained shape
+predictor model to accurately detect and monitor 68 facial landmarks, which are
+used to compute the EAR. By establishing a threshold for the EAR, the system
+identifies when the eyes are closed, indicating potential drowsiness. The
+process involves capturing a live video stream, detecting faces in each frame,
+extracting eye landmarks, and calculating the EAR to assess alertness. Our
+experiments show that the system reliably detects drowsiness with high accuracy
+while maintaining low computational demands. This study offers a robust
+solution for real-time drowsiness detection, with promising applications in
+driver monitoring and workplace safety. Future research will investigate
+incorporating additional physiological and contextual data to further enhance
+detection accuracy and reliability.
+
+
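+ The abstract does not spell out the formula; the sketch below assumes the
+commonly used EAR definition over the six eye landmarks p1..p6 returned by
+Dlib's 68-point predictor, with a purely illustrative closed-eye threshold.
+
+import numpy as np
+
+def eye_aspect_ratio(eye: np.ndarray) -> float:
+    """eye: (6, 2) landmark array ordered p1..p6 (Dlib convention)."""
+    a = np.linalg.norm(eye[1] - eye[5])   # vertical distance p2-p6
+    b = np.linalg.norm(eye[2] - eye[4])   # vertical distance p3-p5
+    c = np.linalg.norm(eye[0] - eye[3])   # horizontal distance p1-p4
+    return (a + b) / (2.0 * c)
+
+EAR_THRESHOLD = 0.25  # assumed value; tune per camera and subject
+
+def eyes_closed(left_eye: np.ndarray, right_eye: np.ndarray) -> bool:
+    ear = (eye_aspect_ratio(left_eye) + eye_aspect_ratio(right_eye)) / 2.0
+    return ear < EAR_THRESHOLD
+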
+
+
+
+
+ + ☆ Robust Domain Generalization for Multi-modal Object Recognition + + +
+ In multi-label classification, machine learning encounters the challenge of +domain generalization when handling tasks with distributions differing from the +training data. Existing approaches primarily focus on vision object recognition +and neglect the integration of natural language. Recent advancements in +vision-language pre-training leverage supervision from extensive +visual-language pairs, enabling learning across diverse domains and enhancing +recognition in multi-modal scenarios. However, these approaches face +limitations in loss function utilization, generality across backbones, and +class-aware visual fusion. This paper proposes solutions to these limitations +by inferring the actual loss, broadening evaluations to larger vision-language +backbones, and introducing Mixup-CLIPood, which incorporates a novel mix-up +loss for enhanced class-aware visual fusion. Our method demonstrates superior +performance in domain generalization across multiple datasets. + +
+
+ comment: 6 pages, 2 figures. This is a preprint version of the article. The + final version will be published in the proceedings of the IEEE conference +
+
+
+
+
+ + ☆ Sampling Foundational Transformer: A Theoretical Perspective + + +
+ The versatility of the self-attention mechanism has earned transformers great
+success in almost all data modalities, despite their quadratic complexity and
+difficulty of training. To apply transformers across different data modalities,
+practitioners have to devise specific, data-modality-dependent constructions.
+In this paper, we propose the Sampling Foundational Transformer (SFT), which
+can work on multiple data modalities (e.g., point cloud, graph, and sequence)
+and constraints (e.g., rotational invariance). The existence of such a model is
+important, as contemporary foundational modeling requires operability on
+multiple data sources. For efficiency on a large number of tokens, our model
+relies on a context-aware sampling-without-replacement mechanism, giving both
+linear asymptotic computational complexity and real inference-time gains. For
+training efficiency, we rely on our newly discovered pseudoconvex formulation
+of the transformer layer to increase the model's convergence rate. As a model
+working on multiple data modalities, SFT achieves competitive results on many
+benchmarks, while being faster at inference than other, more specialized
+models.
+
+
+
+
+
+
+ + ☆ HySparK: Hybrid Sparse Masking for Large Scale Medical Image + Pre-Training MICCAI 2024 + + +
+ The generative self-supervised learning strategy exhibits remarkable
+representation learning capabilities. However, limited attention has been paid
+to end-to-end pre-training methods based on a hybrid architecture of CNN and
+Transformer, which can learn strong local and global representations
+simultaneously. To address this issue, we propose a generative pre-training
+strategy called Hybrid Sparse masKing (HySparK), based on masked image
+modeling, and apply it to large-scale pre-training on medical images. First, we
+perform a bottom-up 3D hybrid masking strategy on the encoder to keep masking
+consistent. Then we utilize sparse convolution for the top CNNs and encode
+unmasked patches for the bottom vision Transformers. Second, we employ a simple
+hierarchical decoder with skip-connections to achieve dense multi-scale feature
+reconstruction. Third, we implement our pre-training method on a collection of
+multiple large-scale 3D medical imaging datasets. Extensive experiments
+indicate that our proposed pre-training strategy demonstrates robust
+transferability in supervised downstream tasks and sheds light on HySparK's
+promising prospects. The code is available at
+https://github.com/FengheTan9/HySparK
+
+
+
+ comment: Early accept at MICCAI 2024 +
+
+
+
+
+ + ☆ Prototype Learning Guided Hybrid Network for Breast Tumor Segmentation + in DCE-MRI + + +
+ Automated breast tumor segmentation based on dynamic contrast-enhanced
+magnetic resonance imaging (DCE-MRI) has shown great promise in clinical
+practice, particularly for identifying the presence of breast disease. However,
+accurate segmentation of breast tumors is a challenging task, often
+necessitating the development of complex networks. To strike an optimal
+trade-off between computational cost and segmentation performance, we propose a
+hybrid network combining convolutional neural network (CNN) and transformer
+layers. Specifically, the hybrid network consists of an encoder-decoder
+architecture built by stacking convolution and deconvolution layers. Effective
+3D transformer layers are then implemented after the encoder subnetworks to
+capture global dependencies between the bottleneck features. To improve the
+efficiency of the hybrid network, two parallel encoder subnetworks are designed
+for the decoder and the transformer layers, respectively. To further enhance
+the discriminative capability of the hybrid network, a prototype learning
+guided prediction module is proposed, where category-specific prototypical
+features are calculated through online clustering. All learned prototypical
+features are finally combined with the features from the decoder for tumor mask
+prediction. The experimental results on private and public DCE-MRI datasets
+demonstrate that the proposed hybrid network achieves superior performance to
+state-of-the-art (SOTA) methods, while maintaining a balance between
+segmentation accuracy and computational cost. Moreover, we demonstrate that
+automatically generated tumor masks can be effectively applied to identify the
+HER2-positive subtype from the HER2-negative subtype with accuracy similar to
+analysis based on manual tumor segmentation. The source code is available at
+https://github.com/ZhouL-lab/PLHN.
+
+
+
+
+
+
+ + ☆ Egocentric Vision Language Planning + + +
+ We explore leveraging large multi-modal models (LMMs) and text2image models +to build a more general embodied agent. LMMs excel in planning long-horizon +tasks over symbolic abstractions but struggle with grounding in the physical +world, often failing to accurately identify object positions in images. A +bridge is needed to connect LMMs to the physical world. The paper proposes a +novel approach, egocentric vision language planning (EgoPlan), to handle +long-horizon tasks from an egocentric perspective in varying household +scenarios. This model leverages a diffusion model to simulate the fundamental +dynamics between states and actions, integrating techniques like style transfer +and optical flow to enhance generalization across different environmental +dynamics. The LMM serves as a planner, breaking down instructions into +sub-goals and selecting actions based on their alignment with these sub-goals, +thus enabling more generalized and effective decision-making. Experiments show +that EgoPlan improves long-horizon task success rates from the egocentric view +compared to baselines across household scenarios. + +
+
+
+
+
+ + ☆ CURLing the Dream: Contrastive Representations for World Modeling in + Reinforcement Learning + + +
+ In this work, we present Curled-Dreamer, a novel reinforcement learning
+algorithm that integrates contrastive learning into the DreamerV3 framework to
+enhance performance in visual reinforcement learning tasks. By incorporating
+the contrastive loss from the CURL algorithm and a reconstruction loss from an
+autoencoder, Curled-Dreamer achieves significant improvements in various
+DeepMind Control Suite tasks. Our extensive experiments demonstrate that
+Curled-Dreamer consistently outperforms state-of-the-art algorithms, achieving
+higher mean and median scores across a diverse set of tasks. The results
+indicate that the proposed approach not only accelerates learning but also
+enhances the robustness of the learned policies. This work highlights the
+potential of combining different learning paradigms to achieve superior
+performance in reinforcement learning applications.
+
+
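+ As a hedged illustration (the exact weighting and encoders used by
+Curled-Dreamer are not given in this abstract), the auxiliary objective can be
+sketched as CURL's bilinear InfoNCE contrastive term plus an autoencoder
+reconstruction term; the function names and weights below are assumptions.
+
+import torch
+import torch.nn.functional as F
+
+def curl_infonce(z_anchor: torch.Tensor, z_pos: torch.Tensor, W: torch.Tensor) -> torch.Tensor:
+    """Bilinear InfoNCE as in CURL: positives lie on the diagonal of z_a W z_p^T."""
+    logits = z_anchor @ W @ z_pos.t()
+    logits = logits - logits.max(dim=1, keepdim=True).values  # numerical stability
+    labels = torch.arange(z_anchor.size(0), device=z_anchor.device)
+    return F.cross_entropy(logits, labels)
+
+def auxiliary_loss(z_anchor, z_pos, W, reconstruction, target, alpha=1.0, beta=1.0):
+    # contrastive term on augmented observation embeddings + pixel reconstruction term
+    return alpha * curl_infonce(z_anchor, z_pos, W) + beta * F.mse_loss(reconstruction, target)
+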
+
+ comment: Paper accepted for 24th International Conference on Control, + Automation and Systems (ICCAS) +
+
+
+
+
+ + ☆ U-DECN: End-to-End Underwater Object Detection ConvNet with Improved + DeNoising Training + + +
+ Underwater object detection places higher demands on the running speed and
+deployment efficiency of the detector due to its specific environmental
+challenges. The NMS of one- and two-stage object detectors and the transformer
+architecture of query-based end-to-end object detectors are not conducive to
+deployment on underwater embedded devices with limited processing power. To
+counter the detrimental effect of underwater color cast noise, recent
+underwater object detectors complicate the network architecture or training,
+which also hinders their application and deployment on underwater vehicle
+platforms. In this paper, we propose Underwater DECO with improved deNoising
+training (U-DECN), a query-based end-to-end object detector (with a ConvNet
+encoder-decoder architecture) for underwater color cast noise that addresses
+the above problems. We integrate advanced technologies from DETR variants into
+DECO and design optimization methods specifically for the ConvNet architecture,
+including Separate Contrastive DeNoising Forward and Deformable Convolution in
+SIM. To address the underwater color cast noise issue, we propose an underwater
+color denoising query to improve the model's generalization to object features
+biased by different color cast noise. Our U-DECN, with a ResNet-50 backbone,
+achieves 61.4 AP (50 epochs), 63.3 AP (72 epochs), and 64.0 AP (100 epochs) on
+DUO, and 21 FPS (5 times faster than Deformable DETR and DINO at 4 FPS) on
+NVIDIA AGX Orin with TensorRT FP16, outperforming the other state-of-the-art
+query-based end-to-end object detectors. The code is available at
+https://github.com/LEFTeyex/U-DECN.
+
+
+
+
+
+
+ + ☆ Seg-CycleGAN : SAR-to-optical image translation guided by a downstream + task + + +
+ Optical remote sensing and Synthetic Aperture Radar (SAR) remote sensing are
+crucial for earth observation, offering complementary capabilities. While
+optical sensors provide high-quality images, they are limited by weather and
+lighting conditions. In contrast, SAR sensors can operate effectively under
+adverse conditions. This letter proposes a GAN-based SAR-to-optical image
+translation method named Seg-CycleGAN, designed to enhance the accuracy of ship
+target translation by leveraging semantic information from a pre-trained
+semantic segmentation model. Our method utilizes the downstream task of ship
+target semantic segmentation to guide the training of the image translation
+network, improving the quality of the output optical-styled images. The
+potential of foundation-model-annotated datasets in SAR-to-optical translation
+tasks is revealed. This work suggests broader research and applications for
+downstream-task-guided frameworks. The code will be available at
+https://github.com/NPULHH/
+
+
+
+ comment: 8 pages, 5 figures +
+
+
+
+
+ + ☆ Efficient Test-Time Prompt Tuning for Vision-Language Models + + +
+ Vision-language models have showcased impressive zero-shot classification +capabilities when equipped with suitable text prompts. Previous studies have +shown the effectiveness of test-time prompt tuning; however, these methods +typically require per-image prompt adaptation during inference, which incurs +high computational budgets and limits scalability and practical deployment. To +overcome this issue, we introduce Self-TPT, a novel framework leveraging +Self-supervised learning for efficient Test-time Prompt Tuning. The key aspect +of Self-TPT is that it turns to efficient predefined class adaptation via +self-supervised learning, thus avoiding computation-heavy per-image adaptation +at inference. Self-TPT begins by co-training the self-supervised and the +classification task using source data, then applies the self-supervised task +exclusively for test-time new class adaptation. Specifically, we propose +Contrastive Prompt Learning (CPT) as the key task for self-supervision. CPT is +designed to minimize the intra-class distances while enhancing inter-class +distinguishability via contrastive learning. Furthermore, empirical evidence +suggests that CPT could closely mimic back-propagated gradients of the +classification task, offering a plausible explanation for its effectiveness. +Motivated by this finding, we further introduce a gradient matching loss to +explicitly enhance the gradient similarity. We evaluated Self-TPT across three +challenging zero-shot benchmarks. The results consistently demonstrate that +Self-TPT not only significantly reduces inference costs but also achieves +state-of-the-art performance, effectively balancing the efficiency-efficacy +trade-off. + +
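+ A minimal, hedged sketch of a gradient matching term (the paper's exact
+formulation may differ): encourage the gradient of the self-supervised
+contrastive prompt learning (CPT) loss with respect to the prompt parameters to
+point in the same direction as the gradient of the classification loss.
+
+import torch
+import torch.nn.functional as F
+
+def gradient_matching_loss(cpt_loss: torch.Tensor,
+                           cls_loss: torch.Tensor,
+                           prompt_params) -> torch.Tensor:
+    """1 - cosine similarity between the two loss gradients w.r.t. the prompts."""
+    g_cpt = torch.autograd.grad(cpt_loss, prompt_params, create_graph=True, retain_graph=True)
+    g_cls = torch.autograd.grad(cls_loss, prompt_params, create_graph=True, retain_graph=True)
+    g_cpt = torch.cat([g.flatten() for g in g_cpt])
+    g_cls = torch.cat([g.flatten() for g in g_cls])
+    return 1.0 - F.cosine_similarity(g_cpt, g_cls, dim=0)
+
+# assumed source-data objective (the weight lambda_gm is an assumption):
+#   loss = cls_loss + cpt_loss + lambda_gm * gradient_matching_loss(cpt_loss, cls_loss, prompts)
+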
+
+
+
+
+ + ☆ An analysis of HOI: using a training-free method with multimodal visual + foundation models when only the test set is available, without the training + set + + +
+ Human-Object Interaction (HOI) aims to identify the pairs of humans and
+objects in images and to recognize their relationships, ultimately forming
+$\langle human, object, verb \rangle$ triplets. Under default settings, HOI
+performance is nearly saturated, with many studies focusing on long-tail
+distributions and zero-shot/few-shot scenarios. Let us consider an intriguing
+problem: "What if only a test dataset is available, without a training dataset,
+and a multimodal visual foundation model is used in a training-free manner?"
+This study uses two experimental settings: ground truth and random arbitrary
+combinations. We reach some interesting conclusions and find that the
+open-vocabulary capabilities of the multimodal visual foundation model are not
+yet fully realized. Additionally, replacing the feature extraction with
+Grounding DINO further confirms these findings.
+
+
+
+
+
+
+ + ☆ PRECISe : Prototype-Reservation for Explainable Classification under + Imbalanced and Scarce-Data Settings + + +
+ Deep learning models used for medical image classification tasks are often +constrained by the limited amount of training data along with severe class +imbalance. Despite these problems, models should be explainable to enable human +trust in the models' decisions to ensure wider adoption in high-risk +situations. In this paper, we propose PRECISe, an explainable-by-design model +meticulously constructed to concurrently address all three challenges. +Evaluation on 2 imbalanced medical image datasets reveals that PRECISe +outperforms the current state-of-the-art methods on data efficient +generalization to minority classes, achieving an accuracy of ~87% in detecting +pneumonia in chest x-rays upon training on <60 images only. Additionally, a +case study is presented to highlight the model's ability to produce easily +interpretable predictions, reinforcing its practical utility and reliability +for medical imaging tasks. + +
+
+
+
+
+ + ☆ RTF-Q: Unsupervised domain adaptation based retraining-free quantization + network + + +
+ Performing unsupervised domain adaptation on resource-constrained edge +devices is a significant task. Although existing research allows edge devices +to use subnets with different computational budgets for inference, they often +require expensive pre-training and do not consider the issues of parameter +precision redundancy in the model, which is not conducive to the deployment of +the model on edge devices. In this paper, we introduce a ReTraining-Free +Quantized (RTF-Q) network based on unsupervised domain adaptation, featuring +quantized subnets of varying computational costs that can operate on devices +with dynamically changing computation budgets. Our network has three switchable +dimensions: width (number of channels), input resolution, and quantization +bit-width. Specifically, we choose subnet dimensions that have minimal impact +on network performance and then directly load the official weight files without +requiring expensive and time-consuming pre-training on Imagenet-1K. To further +reduce the network's computational load and memory usage, we use +quantization-aware training, reducing the BitOPs of full-precision networks by +at least 1/16. We propose a training method called SandwichQ for multiple +quantization bit widths, which can efficiently train multiple quantization +subnets. By training in multiple quantization bit-width spaces simultaneously +and using the proposed SandwichQ rule, we achieve better network performance +compared to using a single quantization bit-width alone. Experimental results +show that our method achieves classification accuracy comparable to SOTA +methods on various UDA tasks, significantly reducing network size and +computational overhead. Code will be available at +https://github.com/dunanyang/RTF-Q. + +
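+ The SandwichQ rule is not fully specified in this abstract; by analogy with
+the sandwich rule used for slimmable networks, a hedged sketch of one training
+step over multiple quantization bit-widths (the set_bitwidth hook, bit-width
+choices, and fake-quantization scheme are assumptions) could look like this:
+
+import random
+import torch
+
+BIT_CHOICES = [8, 6, 4, 2]  # assumed switchable bit-widths
+
+def fake_quant(x: torch.Tensor, bits: int) -> torch.Tensor:
+    """Uniform fake quantization with a straight-through gradient estimator."""
+    scale = x.detach().abs().max().clamp(min=1e-8) / (2 ** (bits - 1) - 1)
+    q = torch.round(x / scale).clamp(-(2 ** (bits - 1)), 2 ** (bits - 1) - 1) * scale
+    return x + (q - x).detach()
+
+def sandwich_step(model, criterion, x, y, n_random: int = 1):
+    # largest, smallest, and a few random intermediate bit-widths per step
+    widths = [max(BIT_CHOICES), min(BIT_CHOICES)] + random.sample(BIT_CHOICES[1:-1], n_random)
+    total = 0.0
+    for bits in widths:
+        model.set_bitwidth(bits)  # hypothetical hook that applies fake_quant(bits) internally
+        total = total + criterion(model(x), y)
+    total.backward()
+    return total
+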
+
+
+
+
+ + ☆ Advancing Re-Ranking with Multimodal Fusion and Target-Oriented + Auxiliary Tasks in E-Commerce Search + + +
+ In the rapidly evolving field of e-commerce, the effectiveness of search
+re-ranking models is crucial for enhancing user experience and driving
+conversion rates. Despite significant advancements in feature representation
+and model architecture, the integration of multimodal information remains
+underexplored. This study addresses this gap by investigating the computation
+and fusion of textual and visual information in the context of re-ranking. We
+propose Advancing Re-Ranking with Multimodal Fusion and Target-Oriented
+Auxiliary Tasks (ARMMT), which integrates an attention-based multimodal fusion
+technique and an auxiliary ranking-aligned task to enhance item representation
+and improve targeting capabilities. This method not only enriches the
+understanding of product attributes but also enables more precise and
+personalized recommendations. Experimental evaluations on JD.com's search
+platform demonstrate that ARMMT achieves state-of-the-art performance in
+multimodal information integration, evidenced by a 0.22% increase in the
+Conversion Rate (CVR), significantly contributing to Gross Merchandise Volume
+(GMV). This pioneering approach has the potential to revolutionize e-commerce
+re-ranking, leading to elevated user satisfaction and business growth.
+
+
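+ The exact ARMMT fusion block is not detailed in this abstract; a generic
+attention-based fusion of an item's textual and visual token embeddings
+(dimensions, head count, and pooling choices below are assumptions) can be
+sketched as follows.
+
+import torch
+import torch.nn as nn
+
+class AttnFusion(nn.Module):
+    """Cross-attend an item query over its concatenated text and image tokens."""
+    def __init__(self, dim: int = 256, heads: int = 4):
+        super().__init__()
+        self.attn = nn.MultiheadAttention(dim, heads, batch_first=True)
+        self.norm = nn.LayerNorm(dim)
+
+    def forward(self, item_emb, text_tokens, image_tokens):
+        # item_emb: (B, 1, D) query; text_tokens: (B, T, D); image_tokens: (B, V, D)
+        context = torch.cat([text_tokens, image_tokens], dim=1)
+        fused, _ = self.attn(item_emb, context, context)
+        return self.norm(item_emb + fused).squeeze(1)  # (B, D) fused item representation
+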
+
+
+
+
+ + ☆ FADE: A Dataset for Detecting Falling Objects around Buildings in Video + + +
+ Falling objects from buildings can cause severe injuries to pedestrians due +to the great impact force they exert. Although surveillance cameras are +installed around some buildings, it is challenging for humans to capture such +events in surveillance videos due to the small size and fast motion of falling +objects, as well as the complex background. Therefore, it is necessary to +develop methods to automatically detect falling objects around buildings in +surveillance videos. To facilitate the investigation of falling object +detection, we propose a large, diverse video dataset called FADE (FAlling +Object DEtection around Buildings) for the first time. FADE contains 1,881 +videos from 18 scenes, featuring 8 falling object categories, 4 weather +conditions, and 4 video resolutions. Additionally, we develop a new object +detection method called FADE-Net, which effectively leverages motion +information and produces small-sized but high-quality proposals for detecting +falling objects around buildings. Importantly, our method is extensively +evaluated and analyzed by comparing it with the previous approaches used for +generic object detection, video object detection, and moving object detection +on the FADE dataset. Experimental results show that the proposed FADE-Net +significantly outperforms other methods, providing an effective baseline for +future research. The dataset and code are publicly available at +https://fadedataset.github.io/FADE.github.io/. + +
+
+ comment: 11 pages, 10 figures +
+
+
+
+
+ + ☆ Efficient and Versatile Robust Fine-Tuning of Zero-shot Models ECCV 2024 + + +
+ Large-scale image-text pre-trained models enable zero-shot classification and +provide consistent accuracy across various data distributions. Nonetheless, +optimizing these models in downstream tasks typically requires fine-tuning, +which reduces generalization to out-of-distribution (OOD) data and demands +extensive computational resources. We introduce Robust Adapter (R-Adapter), a +novel method for fine-tuning zero-shot models to downstream tasks while +simultaneously addressing both these issues. Our method integrates lightweight +modules into the pre-trained model and employs novel self-ensemble techniques +to boost OOD robustness and reduce storage expenses substantially. Furthermore, +we propose MPM-NCE loss designed for fine-tuning on vision-language downstream +tasks. It ensures precise alignment of multiple image-text pairs and +discriminative feature learning. By extending the benchmark for robust +fine-tuning beyond classification to include diverse tasks such as cross-modal +retrieval and open vocabulary segmentation, we demonstrate the broad +applicability of R-Adapter. Our extensive experiments demonstrate that +R-Adapter achieves state-of-the-art performance across a diverse set of tasks, +tuning only 13% of the parameters of the CLIP encoders. + +
+
+ comment: Accepted to ECCV 2024 +
+
+
+
+
+ + ☆ Improving Adversarial Transferability with Neighbourhood Gradient + Information + + +
+ Deep neural networks (DNNs) are known to be susceptible to adversarial
+examples, leading to significant performance degradation. In black-box attack
+scenarios, a considerable attack performance gap between the surrogate model
+and the target model persists. This work focuses on enhancing the
+transferability of adversarial examples to narrow this performance gap. We
+observe that the gradient information around the clean image, i.e. the
+Neighbourhood Gradient Information, can offer high transferability. Leveraging
+this, we propose the NGI-Attack, which incorporates Example Backtracking and
+Multiplex Mask strategies to fully exploit this gradient information and
+enhance transferability. Specifically, we first adopt Example Backtracking to
+accumulate Neighbourhood Gradient Information as the initial momentum term.
+Multiplex Mask, which forms a multi-way attack strategy, aims to force the
+network to focus on non-discriminative regions, which yields richer gradient
+information within only a few iterations. Extensive experiments demonstrate
+that our approach significantly enhances adversarial transferability.
+Especially, when attacking numerous defense models, we achieve an average
+attack success rate of 95.8%. Notably, our method can be plugged into any
+off-the-shelf algorithm to improve its attack performance without additional
+time cost.
+
+
+
+
+
+
+ + ☆ Neural Architecture Search based Global-local Vision Mamba for Palm-Vein + Recognition + + +
+ Due to advantages such as high security, high privacy, and liveness
+recognition, vein recognition has received increasing attention in recent
+years. Recently, deep learning models such as Mamba have shown robust feature
+representation with linear computational complexity and have been successfully
+applied to visual tasks. However, vision Mamba can capture long-distance
+feature dependencies but unfortunately deteriorates local feature details.
+Besides, manually designing a Mamba architecture based on human prior knowledge
+is very time-consuming and error-prone. In this paper, first, we propose a
+hybrid network structure named Global-local Vision Mamba (GLVM) to explicitly
+learn the local correlations in images and the global dependencies among tokens
+for vein feature representation. Secondly, we design a Multi-head Mamba to
+learn the dependencies along different directions, so as to improve the feature
+representation ability of vision Mamba. Thirdly, to learn complementary
+features, we propose a ConvMamba block consisting of three branches, named the
+Multi-head Mamba branch (MHMamba), the Feature Iteration Unit branch (FIU), and
+the Convolutional Neural Network (CNN) branch, where the Feature Iteration Unit
+branch aims to fuse convolutional local features with Mamba-based global
+representations. Finally, a Global-local Alternate Neural Architecture Search
+(GLNAS) method is proposed to search for the optimal architecture of GLVM
+alternately with an evolutionary algorithm, thereby improving the recognition
+performance for vein recognition tasks. We conduct rigorous experiments on
+three public palm-vein databases to evaluate the performance. The experimental
+results demonstrate that the proposed method outperforms the representative
+approaches and achieves state-of-the-art recognition accuracy.
+
+
+
+
+
+
+ + ☆ A Training-Free Framework for Video License Plate Tracking and + Recognition with Only One-Shot + + +
+ Traditional license plate detection and recognition models are often trained +on closed datasets, limiting their ability to handle the diverse license plate +formats across different regions. The emergence of large-scale pre-trained +models has shown exceptional generalization capabilities, enabling few-shot and +zero-shot learning. We propose OneShotLP, a training-free framework for +video-based license plate detection and recognition, leveraging these advanced +models. Starting with the license plate position in the first video frame, our +method tracks this position across subsequent frames using a point tracking +module, creating a trajectory of prompts. These prompts are input into a +segmentation module that uses a promptable large segmentation model to generate +local masks of the license plate regions. The segmented areas are then +processed by multimodal large language models (MLLMs) for accurate license +plate recognition. OneShotLP offers significant advantages, including the +ability to function effectively without extensive training data and +adaptability to various license plate styles. Experimental results on UFPR-ALPR +and SSIG-SegPlate datasets demonstrate the superior accuracy of our approach +compared to traditional methods. This highlights the potential of leveraging +pre-trained models for diverse real-world applications in intelligent +transportation systems. The code is available at +https://github.com/Dinghaoxuan/OneShotLP. + +
+
+
+
+
+ + ☆ Deep Learning with Data Privacy via Residual Perturbation + + +
+ Protecting data privacy in deep learning (DL) is of crucial importance. +Several celebrated privacy notions have been established and used for +privacy-preserving DL. However, many existing mechanisms achieve privacy at the +cost of significant utility degradation and computational overhead. In this +paper, we propose a stochastic differential equation-based residual +perturbation for privacy-preserving DL, which injects Gaussian noise into each +residual mapping of ResNets. Theoretically, we prove that residual perturbation +guarantees differential privacy (DP) and reduces the generalization gap of DL. +Empirically, we show that residual perturbation is computationally efficient +and outperforms the state-of-the-art differentially private stochastic gradient +descent (DPSGD) in utility maintenance without sacrificing membership privacy. + +
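+ A minimal sketch of the mechanism (the noise scale is a hypothetical
+hyperparameter, and the paper's exact placement of the noise may differ):
+inject isotropic Gaussian noise into each residual mapping of a ResNet during
+training.
+
+import torch
+import torch.nn as nn
+
+class NoisyResidualBlock(nn.Module):
+    """y = x + f(x) + sigma * N(0, I); reduces to a plain residual block when sigma = 0."""
+    def __init__(self, f: nn.Module, sigma: float = 0.1):
+        super().__init__()
+        self.f = f
+        self.sigma = sigma
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        out = x + self.f(x)
+        if self.training and self.sigma > 0:
+            out = out + self.sigma * torch.randn_like(out)  # Gaussian perturbation of the residual map
+        return out
+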
+
+
+
+
+ + ☆ Deformable Image Registration with Multi-scale Feature Fusion from + Shared Encoder, Auxiliary and Pyramid Decoders + + +
+ In this work, we propose a novel deformable convolutional pyramid network for +unsupervised image registration. Specifically, the proposed network enhances +the traditional pyramid network by adding an additional shared auxiliary +decoder for image pairs. This decoder provides multi-scale high-level feature +information from unblended image pairs for the registration task. During the +registration process, we also design a multi-scale feature fusion block to +extract the most beneficial features for the registration task from both global +and local contexts. Validation results indicate that this method can capture +complex deformations while achieving higher registration accuracy and +maintaining smooth and plausible deformations. + +
+
+
+
+
+ + ☆ SSL: A Self-similarity Loss for Improving Generative Image + Super-resolution ACM MM 2024 + + +
+ Generative adversarial networks (GAN) and generative diffusion models (DM) +have been widely used in real-world image super-resolution (Real-ISR) to +enhance the image perceptual quality. However, these generative models are +prone to generating visual artifacts and false image structures, resulting in +unnatural Real-ISR results. Based on the fact that natural images exhibit high +self-similarities, i.e., a local patch can have many similar patches to it in +the whole image, in this work we propose a simple yet effective self-similarity +loss (SSL) to improve the performance of generative Real-ISR models, enhancing +the hallucination of structural and textural details while reducing the +unpleasant visual artifacts. Specifically, we compute a self-similarity graph +(SSG) of the ground-truth image, and enforce the SSG of Real-ISR output to be +close to it. To reduce the training cost and focus on edge areas, we generate +an edge mask from the ground-truth image, and compute the SSG only on the +masked pixels. The proposed SSL serves as a general plug-and-play penalty, +which could be easily applied to the off-the-shelf Real-ISR models. Our +experiments demonstrate that, by coupling with SSL, the performance of many +state-of-the-art Real-ISR models, including those GAN and DM based ones, can be +largely improved, reproducing more perceptually realistic image details and +eliminating many false reconstructions and visual artifacts. Codes and +supplementary material can be found at https://github.com/ChrisDud0257/SSL + +
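+ A rough sketch of the idea (patch size and similarity metric are assumptions,
+and the edge-mask restriction described above is omitted for brevity): build a
+cosine self-similarity graph over local patches of the ground truth and
+penalize the distance to the same graph computed on the Real-ISR output.
+
+import torch
+import torch.nn.functional as F
+
+def self_similarity_graph(img: torch.Tensor, patch: int = 7) -> torch.Tensor:
+    """img: (B, C, H, W) -> pairwise cosine similarities between non-overlapping patches."""
+    patches = F.unfold(img, kernel_size=patch, stride=patch)   # (B, C*patch*patch, N)
+    patches = F.normalize(patches.transpose(1, 2), dim=-1)     # (B, N, D)
+    return patches @ patches.transpose(1, 2)                   # (B, N, N)
+
+def self_similarity_loss(sr: torch.Tensor, gt: torch.Tensor) -> torch.Tensor:
+    return F.l1_loss(self_similarity_graph(sr), self_similarity_graph(gt))
+
+# plug-and-play usage: total_loss = base_isr_loss + lambda_ssl * self_similarity_loss(sr, gt)
+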
+
+ comment: Accepted by ACM MM 2024 +
+
+
+
+
+ + ☆ Contrastive masked auto-encoders based self-supervised hashing for 2D + image and 3D point cloud cross-modal retrieval ICME 2024 + + +
+ Implementing cross-modal hashing between 2D images and 3D point-cloud data is +a growing concern in real-world retrieval systems. Simply applying existing +cross-modal approaches to this new task fails to adequately capture latent +multi-modal semantics and effectively bridge the modality gap between 2D and +3D. To address these issues without relying on hand-crafted labels, we propose +contrastive masked autoencoders based self-supervised hashing (CMAH) for +retrieval between images and point-cloud data. We start by contrasting 2D-3D +pairs and explicitly constraining them into a joint Hamming space. This +contrastive learning process ensures robust discriminability for the generated +hash codes and effectively reduces the modality gap. Moreover, we utilize +multi-modal auto-encoders to enhance the model's understanding of multi-modal +semantics. By completing the masked image/point-cloud data modeling task, the +model is encouraged to capture more localized clues. In addition, the proposed +multi-modal fusion block facilitates fine-grained interactions among different +modalities. Extensive experiments on three public datasets demonstrate that the +proposed CMAH significantly outperforms all baseline methods. + +
+
+ comment: Accepted by ICME 2024 +
+
+
+
+
+ + ☆ Efficient Diffusion Transformer with Step-wise Dynamic Attention + Mediators ECCV 2024 + + +
+ This paper identifies significant redundancy in the query-key interactions +within self-attention mechanisms of diffusion transformer models, particularly +during the early stages of denoising diffusion steps. In response to this +observation, we present a novel diffusion transformer framework incorporating +an additional set of mediator tokens to engage with queries and keys +separately. By modulating the number of mediator tokens during the denoising +generation phases, our model initiates the denoising process with a precise, +non-ambiguous stage and gradually transitions to a phase enriched with detail. +Concurrently, integrating mediator tokens simplifies the attention module's +complexity to a linear scale, enhancing the efficiency of global attention +processes. Additionally, we propose a time-step dynamic mediator token +adjustment mechanism that further decreases the required computational FLOPs +for generation, simultaneously facilitating the generation of high-quality +images within the constraints of varied inference budgets. Extensive +experiments demonstrate that the proposed method can improve the generated +image quality while also reducing the inference cost of diffusion transformers. +When integrated with the recent work SiT, our method achieves a +state-of-the-art FID score of 2.01. The source code is available at +https://github.com/LeapLabTHU/Attention-Mediators. + +
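+ The mediator idea can be sketched, under assumptions, as two smaller chained
+attentions routed through m learnable mediator tokens, which keeps the cost
+linear in the sequence length for a fixed m (the paper's exact normalization,
+projections, and time-step-dependent schedule for m are not reproduced here).
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+class MediatorAttention(nn.Module):
+    def __init__(self, dim: int, num_mediators: int = 16):
+        super().__init__()
+        self.mediators = nn.Parameter(torch.randn(num_mediators, dim) * 0.02)
+        self.scale = dim ** -0.5
+
+    def forward(self, q, k, v):
+        # q, k, v: (B, N, D); mediators: (m, D) broadcast over the batch
+        m = self.mediators.unsqueeze(0).expand(q.size(0), -1, -1)
+        summary = F.softmax(m @ k.transpose(1, 2) * self.scale, dim=-1) @ v     # (B, m, D)
+        return F.softmax(q @ m.transpose(1, 2) * self.scale, dim=-1) @ summary  # (B, N, D)
+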
+
+ comment: ECCV 2024 +
+
+
+
+
+ + ♻ ☆ Is Generative Communication between Embodied Agents Good for Zero-Shot + ObjectNav? + + +
+ In Zero-Shot ObjectNav, an embodied ground agent is expected to navigate to a +target object specified by a natural language label without any +environment-specific fine-tuning. This is challenging, given the limited view +of a ground agent and its independent exploratory behavior. To address these +issues, we consider an assistive overhead agent with a bounded global view +alongside the ground agent and present two coordinated navigation schemes for +judicious exploration. We establish the influence of the Generative +Communication (GC) between the embodied agents equipped with Vision-Language +Models (VLMs) in improving zero-shot ObjectNav, achieving a 10% improvement in +the ground agent's ability to find the target object in comparison with an +unassisted setup in simulation. We further analyze the GC for unique traits +quantifying the presence of hallucination and cooperation. In particular, we +identify a unique trait of "preemptive hallucination" specific to our embodied +setting, where the overhead agent assumes that the ground agent has executed an +action in the dialogue when it is yet to move. Finally, we conduct real-world +inferences with GC and showcase qualitative examples where countering +pre-emptive hallucination via prompt finetuning improves real-world ObjectNav +performance. + +
+
+
+
+
+ + ♻ ☆ FreqMamba: Viewing Mamba from a Frequency Perspective for Image + Deraining + + +
+ Images corrupted by rain streaks often lose vital frequency information for
+perception, and image deraining aims to solve this issue, which relies on
+global and local degradation modeling. Recent studies have demonstrated the
+effectiveness and efficiency of Mamba for perceiving global and local
+information, based on its exploitation of local correlations among patches;
+however, few attempts have been made to extend it with frequency analysis for
+image deraining, limiting its ability to perceive global degradation that is
+relevant to frequency modeling (e.g., the Fourier transform). In this paper, we
+propose FreqMamba, an effective and efficient paradigm that leverages the
+complementarity between Mamba and frequency analysis for image deraining. The
+core of our method lies in extending Mamba with frequency analysis from two
+perspectives: extending it with frequency bands for exploiting frequency
+correlation, and connecting it with the Fourier transform for global
+degradation modeling. Specifically, FreqMamba introduces complementary triple
+interaction structures including spatial Mamba, frequency band Mamba, and
+Fourier global modeling. Frequency band Mamba decomposes the image into
+sub-bands of different frequencies to allow 2D scanning from the frequency
+dimension. Furthermore, leveraging Mamba's unique data-dependent properties, we
+use rainy images at different scales to provide degradation priors to the
+network, thereby facilitating efficient training. Extensive experiments show
+that our method outperforms state-of-the-art methods both visually and
+quantitatively.
+
+
+
+
+
+
+ + ♻ Blockwise Self-Supervised Learning at Scale + + +
+ Current state-of-the-art deep networks are all powered by backpropagation. In +this paper, we explore alternatives to full backpropagation in the form of +blockwise learning rules, leveraging the latest developments in self-supervised +learning. We show that a blockwise pretraining procedure consisting of training +independently the 4 main blocks of layers of a ResNet-50 with Barlow Twins' +loss function at each block performs almost as well as end-to-end +backpropagation on ImageNet: a linear probe trained on top of our blockwise +pretrained model obtains a top-1 classification accuracy of 70.48%, only 1.1% +below the accuracy of an end-to-end pretrained network (71.57% accuracy). We +perform extensive experiments to understand the impact of different components +within our method and explore a variety of adaptations of self-supervised +learning to the blockwise paradigm, building an exhaustive understanding of the +critical avenues for scaling local learning rules to large networks, with +implications ranging from hardware design to neuroscience. + +
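+ A hedged sketch of the training loop (block boundaries, pooling, projector
+shapes, and the Barlow Twins implementation are assumptions): each stage is
+trained with its own Barlow Twins loss on inputs detached from the previous
+stage, so no gradient crosses block boundaries.
+
+import torch
+
+def blockwise_ssl_step(blocks, projectors, barlow_twins_loss, view1, view2, optimizers):
+    """blocks / projectors / optimizers: one per ResNet stage; views: two augmentations."""
+    x1, x2 = view1, view2
+    for block, proj, opt in zip(blocks, projectors, optimizers):
+        x1, x2 = block(x1), block(x2)
+        # global-average-pool the stage features before the projector
+        loss = barlow_twins_loss(proj(x1.mean(dim=(2, 3))), proj(x2.mean(dim=(2, 3))))
+        opt.zero_grad()
+        loss.backward()
+        opt.step()
+        x1, x2 = x1.detach(), x2.detach()  # stop gradients from reaching earlier blocks
+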
+
+
+
+
+ + ♻ ☆ Enhancing Object Coherence in Layout-to-Image Synthesis + + +
+ Layout-to-image synthesis is an emerging technique in conditional image +generation. It aims to generate complex scenes, where users require fine +control over the layout of the objects in a scene. However, it remains +challenging to control the object coherence, including semantic coherence +(e.g., the cat looks at the flowers or not) and physical coherence (e.g., the +hand and the racket should not be misaligned). In this paper, we propose a +novel diffusion model with effective global semantic fusion (GSF) and +self-similarity feature enhancement modules to guide the object coherence for +this task. For semantic coherence, we argue that the image caption contains +rich information for defining the semantic relationship within the objects in +the images. Instead of simply employing cross-attention between captions and +latent images, which addresses the highly relevant layout restriction and +semantic coherence requirement separately and thus leads to unsatisfying +results shown in our experiments, we develop GSF to fuse the supervision from +the layout restriction and semantic coherence requirement and exploit it to +guide the image synthesis process. Moreover, to improve the physical coherence, +we develop a Self-similarity Coherence Attention (SCA) module to explicitly +integrate local contextual physical coherence relation into each pixel's +generation process. Specifically, we adopt a self-similarity map to encode the +physical coherence restrictions and employ it to extract coherent features from +text embedding. Through visualization of our self-similarity map, we explore +the essence of SCA, revealing that its effectiveness is not only in capturing +reliable physical coherence patterns but also in enhancing complex texture +generation. Extensive experiments demonstrate the superiority of our proposed +method in both image generation quality and controllability. + +
+
+ comment: Code: https://github.com/CodeGoat24/EOCNet +
+
+
+
+
+ + ♻ ☆ Exploring Plain ViT Reconstruction for Multi-class Unsupervised Anomaly + Detection + + +
+ This work studies a challenging and practical issue known as multi-class
+unsupervised anomaly detection (MUAD). This problem requires only normal images
+for training while simultaneously testing both normal and anomalous images
+across multiple classes. Existing reconstruction-based methods typically adopt
+pyramidal networks as encoders and decoders to obtain multi-resolution
+features, often involving complex sub-modules with extensive handcrafted
+engineering. In contrast, a plain Vision Transformer (ViT), showcasing a more
+straightforward architecture, has proven effective in multiple domains,
+including detection and segmentation tasks. It is simpler, more effective, and
+more elegant. Following this spirit, we explore the use of only plain ViT
+features for MUAD. We first abstract a Meta-AD concept by synthesizing current
+reconstruction-based methods. Subsequently, we instantiate a novel ViT-based
+ViTAD structure, designed incrementally from both global and local
+perspectives. This model provides a strong baseline to facilitate future
+research. Additionally, this paper uncovers several intriguing findings for
+further investigation. Finally, we comprehensively and fairly benchmark various
+approaches using eight metrics. Utilizing a basic training regimen with only an
+MSE loss, ViTAD achieves state-of-the-art results and efficiency on the MVTec
+AD, VisA, and Uni-Medical datasets. E.g., it achieves 85.4 mAD, surpassing
+UniAD by +3.0 on the MVTec AD dataset, and requires only 1.1 hours and 2.3 GB
+of GPU memory to complete model training on a single V100. Full code is
+available at https://zhangzjn.github.io/projects/ViTAD/.
+
+
+
+
+
+
+ + ♻ ☆ EATFormer: Improving Vision Transformer Inspired by Evolutionary + Algorithm + + +
+ Motivated by biological evolution, this paper explains the rationality of the
+Vision Transformer by analogy with the proven, practical evolutionary algorithm
+(EA) and derives that both have a consistent mathematical formulation. Then,
+inspired by effective EA variants, we propose a novel pyramid EATFormer
+backbone that only contains the proposed EA-based transformer (EAT) block,
+which consists of three residual parts, i.e., multi-scale region aggregation,
+global and local interaction, and feed-forward network modules, to model
+multi-scale, interactive, and individual information separately. Moreover, we
+design a task-related head docked with the transformer backbone to complete
+final information fusion more flexibly, and improve a modulated deformable MSA
+to dynamically model irregular locations. Extensive quantitative and
+qualitative experiments on image classification, downstream tasks, and
+explanatory experiments demonstrate the effectiveness and superiority of our
+approach over state-of-the-art methods. E.g., our Mobile (1.8 M), Tiny (6.1 M),
+Small (24.3 M), and Base (49.0 M) models achieve 69.4, 78.4, 83.1, and 83.9
+Top-1 accuracy when trained only on ImageNet-1K with a naive training recipe;
+EATFormer-Tiny/Small/Base armed Mask-R-CNN obtain 45.4/47.4/49.0 box AP and
+41.4/42.9/44.2 mask AP on COCO detection, surpassing contemporary MPViT-T,
+Swin-T, and Swin-S by 0.6/1.4/0.5 box AP and 0.4/1.3/0.9 mask AP respectively
+with fewer FLOPs; our EATFormer-Small/Base achieve 47.3/49.3 mIoU on ADE20K
+with UperNet, exceeding Swin-T/S by 2.8/1.7. Code is available at
+https://github.com/zhangzjn/EATFormer.
+
+
+
+ comment: IJCV'2024 +
+
+
+
+
+ + ♻ ☆ UNK-VQA: A Dataset and a Probe into the Abstention Ability of + Multi-modal Large Models + + +
+ Teaching Visual Question Answering (VQA) models to refrain from answering
+unanswerable questions is necessary for building a trustworthy AI system.
+Existing studies, though they have explored various aspects of VQA, have
+somewhat ignored this particular attribute. This paper aims to bridge the
+research gap by contributing a comprehensive dataset, called UNK-VQA. The
+dataset is specifically designed to address the challenge of questions that
+models do not know. To this end, we first augment the existing data via
+deliberate perturbations on either the image or the question. Specifically, we
+carefully ensure that the question-image semantics remain close to the original
+unperturbed distribution. By this means, the identification of unanswerable
+questions becomes challenging, setting our dataset apart from others that
+involve mere image replacement. We then extensively evaluate the zero- and
+few-shot performance of several emerging multi-modal large models and discover
+their significant limitations when applied to our dataset. Additionally, we
+propose a straightforward method to tackle these unanswerable questions. This
+dataset, we believe, will serve as a valuable benchmark for enhancing the
+abstention capability of VQA models, thereby leading to increased
+trustworthiness of AI systems. We have made the dataset
+(https://github.com/guoyang9/UNK-VQA) available to facilitate further
+exploration in this area.
+
+
+
+ comment: Accepted by TPAMI +
+
+
+
+
+ + ♻ ☆ Unbridled Icarus: A Survey of the Potential Perils of Image Inputs in + Multimodal Large Language Model Security + + +
+ Multimodal Large Language Models (MLLMs) demonstrate remarkable capabilities
+that increasingly influence various aspects of our daily lives, constantly
+defining the new boundary of Artificial General Intelligence (AGI). Image
+modalities, enriched with profound semantic information and a more continuous
+mathematical nature compared to other modalities, greatly enhance the
+functionalities of MLLMs when integrated. However, this integration serves as a
+double-edged sword, providing attackers with expansive vulnerabilities to
+exploit for highly covert and harmful attacks. The pursuit of reliable AI
+systems like powerful MLLMs has emerged as a pivotal area of contemporary
+research. In this paper, we endeavor to demonstrate the multifaceted risks
+associated with the incorporation of image modalities into MLLMs. Initially, we
+delineate the foundational components and training processes of MLLMs.
+Subsequently, we construct a threat model, outlining the security
+vulnerabilities intrinsic to MLLMs. Moreover, we analyze and summarize existing
+scholarly discourses on MLLMs' attack and defense mechanisms, culminating in
+suggestions for future research on MLLM security. Through this comprehensive
+analysis, we aim to deepen the academic understanding of MLLM security
+challenges and propel forward the development of trustworthy MLLM systems.
+
+
+
+ comment: 8 pages, 1 figure. Accepted to 2024 IEEE International Conference on + Systems, Man, and Cybernetics +
+
+
+
+
+ + ♻ ☆ denoiSplit: a method for joint microscopy image splitting and + unsupervised denoising ECCV 2024 + + +
+ In this work, we present denoiSplit, a method to tackle a new analysis task,
+i.e. the challenge of joint semantic image splitting and unsupervised
+denoising. This dual approach has important applications in fluorescence
+microscopy, where semantic image splitting is valuable but noise generally
+hinders the downstream analysis of image content. Image splitting involves
+dissecting an image into its distinguishable semantic structures. We show that
+the current state-of-the-art method for this task struggles in the presence of
+image noise, inadvertently also distributing the noise across the predicted
+outputs. The method we present here can deal with image noise by integrating an
+unsupervised denoising subtask. This integration results in improved semantic
+image unmixing, even in the presence of notable and realistic levels of imaging
+noise. A key innovation in denoiSplit is the use of specifically formulated
+noise models and the suitable adjustment of the KL-divergence loss for the
+high-dimensional hierarchical latent space we are training. We showcase the
+performance of denoiSplit across multiple tasks on real-world microscopy
+images. Additionally, we perform qualitative and quantitative evaluations and
+compare the results to existing benchmarks, demonstrating the effectiveness of
+using denoiSplit: a single Variational Splitting Encoder-Decoder (VSE) network
+using two suitable noise models to jointly perform semantic splitting and
+denoising.
+
+
+
+ comment: Accepted at ECCV 2024 +
+
+
+
+
+ + ♻ ☆ High Fidelity Scene Text Synthesis + + +
+ Scene text synthesis involves rendering specified texts onto arbitrary +images. Current methods typically formulate this task in an end-to-end manner +but lack effective character-level guidance during training. Besides, their +text encoders, pre-trained on a single font type, struggle to adapt to the +diverse font styles encountered in practical applications. Consequently, these +methods suffer from character distortion, repetition, and absence, particularly +in polystylistic scenarios. To this end, this paper proposes DreamText for +high-fidelity scene text synthesis. Our key idea is to reconstruct the +diffusion training process, introducing more refined guidance tailored to this +task, to expose and rectify the model's attention at the character level and +strengthen its learning of text regions. This transformation poses a hybrid +optimization challenge, involving both discrete and continuous variables. To +effectively tackle this challenge, we employ a heuristic alternate optimization +strategy. Meanwhile, we jointly train the text encoder and generator to +comprehensively learn and utilize the diverse font present in the training +dataset. This joint training is seamlessly integrated into the alternate +optimization process, fostering a synergistic relationship between learning +character embedding and re-estimating character attention. Specifically, in +each step, we first encode potential character-generated position information +from cross-attention maps into latent character masks. These masks are then +utilized to update the representation of specific characters in the current +step, which, in turn, enables the generator to correct the character's +attention in the subsequent steps. Both qualitative and quantitative results +demonstrate the superiority of our method to the state of the art. + +
+
+ comment: Code: https://github.com/CodeGoat24/DreamText, Project page: + https://codegoat24.github.io/DreamText/ +
+
+
+
+
+ + ♻ ☆ TalkCLIP: Talking Head Generation with Text-Guided Expressive Speaking + Styles + + +
+ Audio-driven talking head generation has drawn growing attention. To produce +talking head videos with desired facial expressions, previous methods rely on +extra reference videos to provide expression information, which may be +difficult to find and hence limits their usage. In this work, we propose +TalkCLIP, a framework that can generate talking heads where the expressions are +specified by natural language, hence allowing for specifying expressions more +conveniently. To model the mapping from text to expressions, we first construct +a text-video paired talking head dataset where each video has diverse text +descriptions that depict both coarse-grained emotions and fine-grained facial +movements. Leveraging the proposed dataset, we introduce a CLIP-based style +encoder that projects natural language-based descriptions to the +representations of expressions. TalkCLIP can even infer expressions for +descriptions unseen during training. TalkCLIP can also use text to modulate +expression intensity and edit expressions. Extensive experiments demonstrate +that TalkCLIP achieves the advanced capability of generating photo-realistic +talking heads with vivid facial expressions guided by text descriptions. + +
+
+
+
+
+ + ♻ ☆ FSL-Rectifier: Rectify Outliers in Few-Shot Learning via Test-Time + Augmentation + + +
+ Few-shot learning (FSL) commonly requires a model to identify images
+(queries) that belong to classes unseen during training, based on a few labeled
+samples of the new classes (support set) as reference. So far, plenty of
+algorithms involve training data augmentation to improve the generalization
+capability of FSL models, but outlier queries or support images during
+inference can still pose great generalization challenges. In this work, to
+reduce the bias caused by outlier samples, we generate additional test-class
+samples by combining original samples with suitable train-class samples via a
+generative image combiner. Then, we obtain averaged features via an augmentor,
+which leads to more typical representations through averaging. We
+experimentally and theoretically demonstrate the effectiveness of our method,
+e.g., obtaining a test accuracy improvement proportion of around 10% (e.g.,
+from 46.86% to 53.28%) for trained FSL models. Importantly, given a pretrained
+image combiner, our method is training-free for off-the-shelf FSL models, whose
+performance can be improved without extra datasets or further training of the
+models themselves.
+
+
+
+
+
+
+ + ♻ ☆ RAVEN: Rethinking Adversarial Video Generation with Efficient Tri-plane + Networks + + +
+ We present a novel unconditional video generative model designed to address +long-term spatial and temporal dependencies, with attention to computational +and dataset efficiency. To capture long spatio-temporal dependencies, our +approach incorporates a hybrid explicit-implicit tri-plane representation +inspired by 3D-aware generative frameworks developed for three-dimensional +object representation and employs a single latent code to model an entire video +clip. Individual video frames are then synthesized from an intermediate +tri-plane representation, which itself is derived from the primary latent code. +This novel strategy more than halves the computational complexity measured in +FLOPs compared to the most efficient state-of-the-art methods. Consequently, +our approach facilitates the efficient and temporally coherent generation of +videos. Moreover, our joint frame modeling approach, in contrast to +autoregressive methods, mitigates the generation of visual artifacts. We +further enhance the model's capabilities by integrating an optical flow-based +module within our Generative Adversarial Network (GAN) based generator +architecture, thereby compensating for the constraints imposed by a smaller +generator size. As a result, our model synthesizes high-fidelity video clips at +a resolution of $256\times256$ pixels, with durations extending to more than +$5$ seconds at a frame rate of 30 fps. The efficacy and versatility of our +approach are empirically validated through qualitative and quantitative +assessments across three different datasets comprising both synthetic and real +video clips. We will make our training and inference code public. + +
+
+
+
+
+ + ♻ ☆ Stable Diffusion Exposed: Gender Bias from Prompt to Image + + +
+ Several studies have raised awareness about social biases in image generative +models, demonstrating their predisposition towards stereotypes and imbalances. +This paper contributes to this growing body of research by introducing an +evaluation protocol that analyzes the impact of gender indicators at every step +of the generation process on Stable Diffusion images. Leveraging insights from +prior work, we explore how gender indicators not only affect gender +presentation but also the representation of objects and layouts within the +generated images. Our findings include the existence of differences in the +depiction of objects, such as instruments tailored for specific genders, and +shifts in overall layouts. We also reveal that neutral prompts tend to produce +images more aligned with masculine prompts than their feminine counterparts. We +further explore where bias originates through representational disparities and +how it manifests in the images via prompt-image dependencies, and provide +recommendations for developers and users to mitigate potential bias in image +generation. + +
+
+
+
+
+ + ♻ ☆ Towards Trustworthy Dataset Distillation + + +
+ Efficiency and trustworthiness are two eternal pursuits when applying deep +learning in real-world applications. With regard to efficiency, dataset +distillation (DD) endeavors to reduce training costs by distilling the large +dataset into a tiny synthetic dataset. However, existing methods merely +concentrate on in-distribution (InD) classification in a closed-world setting, +disregarding out-of-distribution (OOD) samples. On the other hand, OOD +detection aims to enhance models' trustworthiness, which is always +inefficiently achieved in full-data settings. For the first time, we +simultaneously consider both issues and propose a novel paradigm called +Trustworthy Dataset Distillation (TrustDD). By distilling both InD samples and +outliers, the condensed datasets are capable of training models competent in +both InD classification and OOD detection. To alleviate the requirement of real +outlier data, we further propose to corrupt InD samples to generate +pseudo-outliers, namely Pseudo-Outlier Exposure (POE). Comprehensive +experiments on various settings demonstrate the effectiveness of TrustDD, and +POE surpasses the state-of-the-art method Outlier Exposure (OE). Compared with +the preceding DD, TrustDD is more trustworthy and applicable to open-world +scenarios. Our code is available at https://github.com/mashijie1028/TrustDD + +
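+ The abstract does not fix a particular corruption for Pseudo-Outlier Exposure;
+a hedged sketch with an illustrative jigsaw-style corruption of in-distribution
+images, combined with a standard outlier-exposure term that pushes the
+pseudo-outlier predictions toward the uniform distribution, is given below
+(square images with side divisible by 4 are assumed).
+
+import torch
+import torch.nn.functional as F
+
+def make_pseudo_outliers(x: torch.Tensor) -> torch.Tensor:
+    """Corrupt InD images by shuffling non-overlapping patches (illustrative choice)."""
+    b, c, h, w = x.shape
+    k = h // 4
+    tiles = F.unfold(x, kernel_size=k, stride=k)            # (B, C*k*k, num_tiles)
+    tiles = tiles[:, :, torch.randperm(tiles.size(-1))]
+    return F.fold(tiles, output_size=(h, w), kernel_size=k, stride=k)
+
+def trustdd_style_loss(logits_ind, labels, logits_pseudo_out, lam: float = 0.5):
+    """Cross-entropy on InD samples plus a uniform-prediction term on pseudo-outliers."""
+    oe = -(F.log_softmax(logits_pseudo_out, dim=1).mean(dim=1)).mean()
+    return F.cross_entropy(logits_ind, labels) + lam * oe
+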
+
+ comment: Accepted to Pattern Recognition 2024 +
+
+
+
+
+
+
+
+ + Information Retrieval 9 + +
+
+
+ + ☆ Iterative Improvement of an Additively Regularized Topic Model + + +
+ Topic modelling is fundamentally a soft clustering problem (of known objects
+-- documents, over unknown clusters -- topics). That is, the task is
+ill-posed. In particular, topic models are unstable and incomplete. As a
+result, the process of finding a good topic model (repeated hyperparameter
+selection, model training, and topic quality assessment) can be particularly
+long and labor-intensive. We aim to simplify this process and to make it more
+deterministic and provable. To this end, we present a method for iterative
+training of a topic model. The essence of the method is that a series of
+related topic models are trained so that each subsequent model is at least as
+good as the previous one, i.e., it retains all the good topics found earlier.
+The connection between the models is achieved by additive regularization. The
+result of this iterative training is the last topic model in the series, which
+we call the iteratively updated additively regularized topic model (ITAR).
+Experiments conducted on several collections of natural language texts show
+that the proposed ITAR model performs better than other popular topic models
+(LDA, ARTM, BERTopic), its topics are diverse, and its perplexity (ability to
+"explain" the underlying data) is moderate.
+
+
+ comment: A full draft of the second version of the article +
+
+
+
+
+ + ☆ GraphTransfer: A Generic Feature Fusion Framework for Collaborative + Filtering + + +
+ Graph Neural Networks (GNNs) have demonstrated effectiveness in collaborative +filtering tasks due to their ability to extract powerful structural features. +However, combining the graph features extracted from user-item interactions and +auxiliary features extracted from user genres and item properties remains a +challenge. Currently available fusion methods face two major issues: 1) simple +methods such as concatenation and summation are generic, but not accurate in +capturing feature relationships; 2) task-specific methods like attention +mechanisms and meta paths may not be suitable for general feature fusion. To +address these challenges, we present GraphTransfer, a simple but universal +feature fusion framework for GNN-based collaborative filtering. Our method +accurately fuses different types of features by first extracting graph features +from the user-item interaction graph and auxiliary features from users and +items using GCN. The proposed cross fusion module then effectively bridges the +semantic gaps between the interaction scores of different features. Theoretical +analysis and experiments on public datasets show that GraphTransfer outperforms +other feature fusion methods in CF tasks. Additionally, we demonstrate the +universality of our framework via empirical studies in three other scenarios, +showing that GraphTransfer leads to significant improvements in the performance +of CF algorithms. + +
+
+
+
+
+ + ☆ Advancing Re-Ranking with Multimodal Fusion and Target-Oriented + Auxiliary Tasks in E-Commerce Search + + +
+ In the rapidly evolving field of e-commerce, the effectiveness of search +re-ranking models is crucial for enhancing user experience and driving +conversion rates. Despite significant advancements in feature representation +and model architecture, the integration of multimodal information remains +underexplored. This study addresses this gap by investigating the computation +and fusion of textual and visual information in the context of re-ranking. We +propose \textbf{A}dvancing \textbf{R}e-Ranking with +\textbf{M}ulti\textbf{m}odal Fusion and \textbf{T}arget-Oriented Auxiliary +Tasks (ARMMT), which integrates an attention-based multimodal fusion technique +and an auxiliary ranking-aligned task to enhance item representation and +improve targeting capabilities. This method not only enriches the understanding +of product attributes but also enables more precise and personalized +recommendations. Experimental evaluations on JD.com's search platform +demonstrate that ARMMT achieves state-of-the-art performance in multimodal +information integration, evidenced by a 0.22\% increase in the Conversion Rate +(CVR), significantly contributing to Gross Merchandise Volume (GMV). This +pioneering approach has the potential to revolutionize e-commerce re-ranking, +leading to elevated user satisfaction and business growth. + +
+
+
+
+
+ + ☆ Moment&Cross: Next-Generation Real-Time Cross-Domain CTR Prediction for + Live-Streaming Recommendation at Kuaishou + + +
+ Kuaishou is one of the largest short-video and live-streaming platforms.
+Compared with short-video recommendation, live-streaming recommendation is
+more complex because: (1) a live stream is only temporarily available for
+distribution, (2) a user may watch it for a long time, so feedback is delayed,
+and (3) its content is unpredictable and changes over time. In fact, even if a
+user is interested in the live-streaming author, the watch may still be
+negative (e.g., a short view of < 3s) because the real-time content is not
+attractive enough. Live-streaming recommendation therefore faces a challenging
+task: how do we recommend a live stream to users at the right moment?
+Additionally, the majority of content exposed on our platform is short-video,
+and the amount of exposed short-video is 9x larger than that of exposed
+live-streaming. Users therefore leave far more behaviors on short-videos,
+which leads to a serious data imbalance problem: the live-streaming data alone
+cannot fully reflect user interests. This raises another challenging task: how
+do we utilize users' short-video behaviors to make live-streaming
+recommendation better?
+
+
+ comment: Work in progress +
+
+
+
+
+ + ☆ A Decoding Acceleration Framework for Industrial Deployable LLM-based + Recommender Systems + + +
+ Recently, increasing attention has been paid to LLM-based recommender +systems, but their deployment is still under exploration in the industry. Most +deployments utilize LLMs as feature enhancers, generating augmentation +knowledge in the offline stage. However, in recommendation scenarios, involving +numerous users and items, even offline generation with LLMs consumes +considerable time and resources. This generation inefficiency stems from the +autoregressive nature of LLMs, and a promising direction for acceleration is +speculative decoding, a Draft-then-Verify paradigm that increases the number of +generated tokens per decoding step. In this paper, we first identify that +recommendation knowledge generation is suitable for retrieval-based speculative +decoding. Then, we discern two characteristics: (1) extensive items and users +in RSs bring retrieval inefficiency, and (2) RSs exhibit high diversity +tolerance for text generated by LLMs. Based on the above insights, we propose a +Decoding Acceleration Framework for LLM-based Recommendation (dubbed DARE), +with Customized Retrieval Pool to improve retrieval efficiency and Relaxed +Verification to increase the acceptance rate of draft tokens, respectively. +Extensive experiments demonstrate that DARE achieves a 3-5x speedup and is +compatible with various frameworks and backbone LLMs. DARE has also been +deployed to online advertising scenarios within a large-scale commercial +environment, achieving a 3.45x speedup while maintaining the downstream +performance. + +
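+ The following toy Python sketch illustrates the retrieval-based
+Draft-then-Verify idea with a relaxed (top-k) acceptance rule described above.
+The "model" is a stand-in probability function, and the pool contents,
+acceptance rule, and vocabulary size are illustrative assumptions; this is not
+DARE's retrieval-pool construction or deployed implementation.
+
+import numpy as np
+
+VOCAB = 50
+
+def model_dist(context):
+    # Stand-in next-token distribution (a real LLM forward pass in practice).
+    r = np.random.default_rng(hash(tuple(context)) % (2 ** 32))
+    p = r.random(VOCAB)
+    return p / p.sum()
+
+# Retrieval pool: maps a recent context suffix to a previously seen continuation (the draft).
+pool = {(3, 7): [9, 12, 4], (12, 4): [8, 8, 1]}
+
+def generate(context, steps=20, top_k=10):
+    out = list(context)
+    while len(out) < steps:
+        draft = pool.get(tuple(out[-2:]), [])
+        accepted = 0
+        for tok in draft:                          # verify drafted tokens (a real system batches this)
+            p = model_dist(out)
+            if tok in np.argsort(p)[-top_k:]:      # relaxed verification: accept if within top-k
+                out.append(tok)
+                accepted += 1
+            else:
+                break
+        if accepted == 0:                          # fall back to ordinary greedy decoding
+            out.append(int(model_dist(out).argmax()))
+    return out
+
+print(generate([3, 7]))
+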
+
+
+
+
+ + ☆ Utilizing Large Language Models to Optimize the Detection and + Explainability of Phishing Websites + + +
+ In this paper, we introduce PhishLang, an open-source, lightweight Large
+Language Model (LLM) specifically designed for phishing website detection
+through contextual analysis of the website. Unlike traditional heuristic or
+machine learning models, which rely on static features and struggle to adapt
+to new threats, and deep learning models, which are computationally intensive,
+our model utilizes the advanced language processing capabilities of LLMs to
+learn granular features that are characteristic of phishing attacks.
+Furthermore, PhishLang operates with minimal data preprocessing and offers
+performance comparable to leading deep learning tools, while being
+significantly faster and less resource-intensive. Over a 3.5-month testing
+period, PhishLang successfully identified approximately 26K phishing URLs,
+many of which were undetected by popular anti-phishing blocklists, thus
+demonstrating its potential to aid current detection measures. We also
+evaluate PhishLang against several realistic adversarial attacks and develop
+six patches that make it very robust against such threats. Moreover, we
+integrate PhishLang with GPT-3.5 Turbo to create \textit{explainable
+blocklisting} - warnings that provide users with contextual information about
+different features that led to a website being marked as phishing. Finally, we
+have open-sourced the PhishLang framework and developed a Chromium-based
+browser extension and URL scanner website, which implement explainable
+warnings for end-users.
+
+
+
+
+
+ + ☆ Online Matrix Completion: A Collaborative Approach with Hott Items + + +
+ We investigate the low rank matrix completion problem in an online setting
+with ${M}$ users, ${N}$ items, ${T}$ rounds, and an unknown rank-$r$ reward
+matrix ${R}\in \mathbb{R}^{{M}\times {N}}$. This problem has been well-studied
+in the literature and has several applications in practice. In each round, we
+recommend ${S}$ carefully chosen distinct items to every user and observe noisy
+rewards. In the regime where ${M},{N} >> {T}$, we propose two distinct
+computationally efficient algorithms for recommending items to users and
+analyze them under the benign \emph{hott items} assumption. 1) First, for
+${S}=1$, under additional incoherence/smoothness assumptions on ${R}$, we
+propose the phased algorithm \textsc{PhasedClusterElim}. Our algorithm obtains
+a near-optimal per-user regret of
+$\tilde{O}({N}{M}^{-1}(\Delta^{-1}+\Delta_{{hott}}^{-2}))$ where
+$\Delta_{{hott}},\Delta$ are problem-dependent gap parameters with
+$\Delta_{{hott}} >> \Delta$ almost always. 2) Second, we consider a simplified
+setting with ${S}=r$ where we make significantly milder assumptions on ${R}$.
+Here, we introduce another phased algorithm, \textsc{DeterminantElim}, to
+derive a regret guarantee of $\widetilde{O}({N}{M}^{-1/r}\Delta_{det}^{-1})$
+where $\Delta_{{det}}$ is another problem-dependent gap. Both algorithms
+crucially use collaboration among users to jointly eliminate sub-optimal items
+for groups of users successively in phases, but with distinctive and novel
+approaches.
+
+
+ comment: Appeared at the Forty-first International Conference on Machine + Learning, 2024 +
+
+
+
+
+ + ♻ ☆ Lifelong Personalized Low-Rank Adaptation of Large Language Models for + Recommendation + + +
+ We primarily focus on the field of large language models (LLMs) for +recommendation, which has been actively explored recently and poses a +significant challenge in effectively enhancing recommender systems with logical +reasoning abilities and open-world knowledge. Current mainstream efforts mainly +center around injecting personalized information from recommendation models +into LLMs by customizing input templates or aligning representations between +semantic and recommendation spaces at the prediction layer. However, they face +three significant limitations: (1) LoRA is mostly used as a core component in +existing works, but personalization is not well established in LoRA parameters +as the LoRA matrix shared by every user may not cater to different users' +characteristics, leading to suboptimal performance. (2) Although lifelong +personalized behavior sequences are ideal for personalization, their use raises +effectiveness and efficiency issues since LLMs require escalating training and +inference time to extend text lengths. (3) Existing approaches aren't scalable +for large datasets due to training efficiency constraints. Thus, LLMs only see +a small fraction of the datasets (e.g., less than 10%) instead of the whole +datasets, limiting their exposure to the full training space. To address these +problems, we propose RecLoRA. This model incorporates a Personalized LoRA +module that maintains independent LoRAs for different users and a Long-Short +Modality Retriever that retrieves different history lengths for different +modalities, significantly improving performance while adding minimal time cost. +Furthermore, we design a Few2Many Learning Strategy, using a conventional +recommendation model as a lens to magnify small training spaces to full spaces. +Extensive experiments on public datasets demonstrate the efficacy of our +RecLoRA compared to existing baseline models. + +
+
+
+
+
+ + ♻ ☆ PK-ICR: Persona-Knowledge Interactive Context Retrieval for Grounded + Dialogue EMNLP 2023 + + +
+ Identifying relevant persona or knowledge for conversational systems is
+critical to grounded dialogue response generation. However, each type of
+grounding has mostly been researched in isolation, even as more practical
+multi-context dialogue tasks have been introduced in recent work. We define
+Persona and Knowledge Dual Context Identification as the task of identifying
+persona and knowledge jointly for a given dialogue, which can be of elevated
+importance in complex multi-context dialogue settings. We develop a novel
+grounding retrieval method that utilizes all contexts of the dialogue
+simultaneously. Our method requires less computational power by utilizing
+neural QA retrieval models. We further introduce a novel null-positive rank
+test which measures ranking performance on semantically dissimilar samples
+(i.e., hard negatives) in relation to data augmentation.
+
+
+ comment: Accepted to EMNLP 2023 main conference (Oral). Code available at + https://github.com/minsik-ai/PK-ICR +
+
+
+
+
+
+
+
+ + Machine Learning 29 + +
+
+
+ + ☆ LLM-Based Robust Product Classification in Commerce and Compliance + + +
+ Product classification is a crucial task in international trade, as +compliance regulations are verified and taxes and duties are applied based on +product categories. Manual classification of products is time-consuming and +error-prone, and the sheer volume of products imported and exported renders the +manual process infeasible. Consequently, e-commerce platforms and enterprises +involved in international trade have turned to automatic product classification +using machine learning. However, current approaches do not consider the +real-world challenges associated with product classification, such as very +abbreviated and incomplete product descriptions. In addition, recent +advancements in generative Large Language Models (LLMs) and their reasoning +capabilities are mainly untapped in product classification and e-commerce. In +this research, we explore the real-life challenges of industrial classification +and we propose data perturbations that allow for realistic data simulation. +Furthermore, we employ LLM-based product classification to improve the +robustness of the prediction in presence of incomplete data. Our research shows +that LLMs with in-context learning outperform the supervised approaches in the +clean-data scenario. Additionally, we illustrate that LLMs are significantly +more robust than the supervised approaches when data attacks are present. + +
+
+ comment: 11 pages +
+
+
+
+
+ + ☆ Leveraging Knowledge Graph-Based Human-Like Memory Systems to Solve + Partially Observable Markov Decision Processes + + +
+ Humans observe only part of their environment at any moment but can still
+make complex, long-term decisions thanks to their long-term memory system. To
+test how an AI can learn and utilize its long-term memory system, we have
+developed a partially observable Markov decision process (POMDP) environment,
+where the agent has to answer questions while navigating a maze. The
+environment is completely knowledge graph (KG) based, where the hidden states
+are dynamic KGs. A KG is both human- and machine-readable, making it easy to
+see what the agents remember and forget. We train and compare agents with
+different memory systems, to shed light on how the human brain manages its own
+memory systems. By repurposing the given learning objective as learning a
+memory management policy, we were able to capture the most likely belief
+state, which is not only interpretable but also reusable.
+
+
+
+
+
+ + ☆ Comparative Evaluation of Memory Technologies for Synaptic Crossbar + Arrays- Part 2: Design Knobs and DNN Accuracy Trends + + +
+ Crossbar memory arrays have been touted as the workhorse of in-memory +computing (IMC)-based acceleration of Deep Neural Networks (DNNs), but the +associated hardware non-idealities limit their efficacy. To address this, +cross-layer design solutions that reduce the impact of hardware non-idealities +on DNN accuracy are needed. In Part 1 of this paper, we established the +co-optimization strategies for various memory technologies and their crossbar +arrays, and conducted a comparative technology evaluation in the context of IMC +robustness. In this part, we analyze various design knobs such as array size +and bit-slice (number of bits per device) and their impact on the performance +of 8T SRAM, ferroelectric transistor (FeFET), Resistive RAM (ReRAM) and +spin-orbit-torque magnetic RAM (SOT-MRAM) in the context of inference accuracy +at 7nm technology node. Further, we study the effect of circuit design +solutions such as Partial Wordline Activation (PWA) and custom ADC reference +levels that reduce the hardware non-idealities and comparatively analyze the +response of each technology to such accuracy enhancing techniques. Our results +on ResNet-20 (with CIFAR-10) show that PWA increases accuracy by up to 32.56% +while custom ADC reference levels yield up to 31.62% accuracy enhancement. We +observe that compared to the other technologies, FeFET, by virtue of its small +layout height and high distinguishability of its memory states, is best suited +for large arrays. For higher bit-slices and a more complex dataset (ResNet-50 +with Cifar-100) we found that ReRAM matches the performance of FeFET. + +
+
+
+
+
+ + ☆ Using Retriever Augmented Large Language Models for Attack Graph + Generation + + +
+ As the complexity of modern systems increases, so does the importance of +assessing their security posture through effective vulnerability management and +threat modeling techniques. One powerful tool in the arsenal of cybersecurity +professionals is the attack graph, a representation of all potential attack +paths within a system that an adversary might exploit to achieve a certain +objective. Traditional methods of generating attack graphs involve expert +knowledge, manual curation, and computational algorithms that might not cover +the entire threat landscape due to the ever-evolving nature of vulnerabilities +and exploits. This paper explores the approach of leveraging large language +models (LLMs), such as ChatGPT, to automate the generation of attack graphs by +intelligently chaining Common Vulnerabilities and Exposures (CVEs) based on +their preconditions and effects. It also shows how to utilize LLMs to create +attack graphs from threat reports. + +
+
+
+
+
+ + ☆ On the Robustness of Kernel Goodness-of-Fit Tests + + +
+ Goodness-of-fit testing is often criticized for its lack of practical +relevance; since ``all models are wrong'', the null hypothesis that the data +conform to our model is ultimately always rejected when the sample size is +large enough. Despite this, probabilistic models are still used extensively, +raising the more pertinent question of whether the model is good enough for a +specific task. This question can be formalized as a robust goodness-of-fit +testing problem by asking whether the data were generated by a distribution +corresponding to our model up to some mild perturbation. In this paper, we show +that existing kernel goodness-of-fit tests are not robust according to common +notions of robustness including qualitative and quantitative robustness. We +also show that robust techniques based on tilted kernels from the parameter +estimation literature are not sufficient for ensuring both types of robustness +in the context of goodness-of-fit testing. We therefore propose the first +robust kernel goodness-of-fit test which resolves this open problem using +kernel Stein discrepancy balls, which encompass perturbation models such as +Huber contamination models and density uncertainty bands. + +
+
+ comment: 50 pages, 13 figures +
+
+
+
+
+ + ☆ An End-to-End Model for Time Series Classification In the Presence of + Missing Values + + +
+ Time series classification with missing data is a prevalent issue in time
+series analysis, as temporal data often contain missing values in practical
+applications. The traditional two-stage approach, which handles imputation and
+classification separately, can result in sub-optimal performance as label
+information is not utilized in the imputation process. On the other hand, a
+one-stage approach can learn features under missing information, but feature
+representation is limited as imputation errors are propagated into the
+classification process. To overcome these challenges, this study proposes an
+end-to-end neural network that unifies data imputation and representation
+learning within a single framework, allowing the imputation process to take
+advantage of label information. Differing from previous methods, our approach
+places less emphasis on the accuracy of the imputed data and instead
+prioritizes classification performance. A specifically designed multi-scale
+feature learning module is implemented to extract useful information from the
+noisy imputed data. The proposed model is evaluated on 68 univariate time
+series datasets from the UCR archive, as well as a multivariate time series
+dataset with various missing data ratios and 4 real-world datasets with
+missing information. The results indicate that the proposed model outperforms
+state-of-the-art approaches for incomplete time series classification,
+particularly in scenarios with high levels of missing data.
+
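+ As a minimal sketch of the one-stage idea just described, the PyTorch snippet
+below has a single network impute missing values and classify in one pass,
+trained with a cross-entropy term plus a reconstruction term restricted to
+observed entries. It is a generic illustration, not the authors' multi-scale
+architecture, and the loss weighting and layer sizes are arbitrary
+placeholders.
+
+import torch
+from torch import nn
+
+class JointImputeClassify(nn.Module):
+    def __init__(self, length, n_classes, hidden=64):
+        super().__init__()
+        self.encoder = nn.Sequential(nn.Linear(2 * length, hidden), nn.ReLU())
+        self.impute_head = nn.Linear(hidden, length)
+        self.class_head = nn.Linear(hidden, n_classes)
+
+    def forward(self, x, mask):                    # mask = 1 where a value is observed
+        h = self.encoder(torch.cat([x * mask, mask], dim=-1))
+        return self.impute_head(h), self.class_head(h)
+
+model = JointImputeClassify(length=100, n_classes=5)
+x = torch.randn(32, 100)
+mask = (torch.rand(32, 100) > 0.3).float()         # roughly 30% of entries missing
+labels = torch.randint(0, 5, (32,))
+imputed, logits = model(x, mask)
+recon = ((imputed - x) ** 2 * mask).sum() / mask.sum()   # penalize observed entries only
+loss = nn.functional.cross_entropy(logits, labels) + 0.1 * recon
+loss.backward()
+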
+
+
+
+
+ + ☆ Online Matrix Completion: A Collaborative Approach with Hott Items + + +
+ We investigate the low rank matrix completion problem in an online setting
+with ${M}$ users, ${N}$ items, ${T}$ rounds, and an unknown rank-$r$ reward
+matrix ${R}\in \mathbb{R}^{{M}\times {N}}$. This problem has been well-studied
+in the literature and has several applications in practice. In each round, we
+recommend ${S}$ carefully chosen distinct items to every user and observe noisy
+rewards. In the regime where ${M},{N} >> {T}$, we propose two distinct
+computationally efficient algorithms for recommending items to users and
+analyze them under the benign \emph{hott items} assumption. 1) First, for
+${S}=1$, under additional incoherence/smoothness assumptions on ${R}$, we
+propose the phased algorithm \textsc{PhasedClusterElim}. Our algorithm obtains
+a near-optimal per-user regret of
+$\tilde{O}({N}{M}^{-1}(\Delta^{-1}+\Delta_{{hott}}^{-2}))$ where
+$\Delta_{{hott}},\Delta$ are problem-dependent gap parameters with
+$\Delta_{{hott}} >> \Delta$ almost always. 2) Second, we consider a simplified
+setting with ${S}=r$ where we make significantly milder assumptions on ${R}$.
+Here, we introduce another phased algorithm, \textsc{DeterminantElim}, to
+derive a regret guarantee of $\widetilde{O}({N}{M}^{-1/r}\Delta_{det}^{-1})$
+where $\Delta_{{det}}$ is another problem-dependent gap. Both algorithms
+crucially use collaboration among users to jointly eliminate sub-optimal items
+for groups of users successively in phases, but with distinctive and novel
+approaches.
+
+
+ comment: Appeared at the Forty-first International Conference on Machine + Learning, 2024 +
+
+
+
+
+ + ☆ Divide-and-Conquer Predictive Coding: a structured Bayesian inference + algorithm NeurIPS + + +
+ Unexpected stimuli induce "error" or "surprise" signals in the brain. The +theory of predictive coding promises to explain these observations in terms of +Bayesian inference by suggesting that the cortex implements variational +inference in a probabilistic graphical model. However, when applied to machine +learning tasks, this family of algorithms has yet to perform on par with other +variational approaches in high-dimensional, structured inference problems. To +address this, we introduce a novel predictive coding algorithm for structured +generative models, that we call divide-and-conquer predictive coding (DCPC). +DCPC differs from other formulations of predictive coding, as it respects the +correlation structure of the generative model and provably performs +maximum-likelihood updates of model parameters, all without sacrificing +biological plausibility. Empirically, DCPC achieves better numerical +performance than competing algorithms and provides accurate inference in a +number of problems not previously addressed with predictive coding. We provide +an open implementation of DCPC in Pyro on Github. + +
+
+ comment: 22 pages, 5 figures, submitted to Neural Information Processing + Systems (NeurIPS) 2024 +
+
+
+
+
+ + ☆ Sampling Foundational Transformer: A Theoretical Perspective + + +
+ The versatility of the self-attention mechanism has earned transformers great
+success across almost all data modalities, despite their quadratic complexity
+and difficulty of training. To apply transformers to a new data modality,
+practitioners must devise specific, clever, modality-dependent constructions.
+In this paper, we propose the Sampling Foundational Transformer (SFT), which
+can work on multiple data modalities (e.g., point clouds, graphs, and
+sequences) and under constraints such as rotational invariance. Such a model
+is important because contemporary foundation modeling requires operability
+across multiple data sources. For efficiency on large numbers of tokens, our
+model relies on a context-aware sampling-without-replacement mechanism,
+yielding both linear asymptotic computational complexity and real gains in
+inference time. For training efficiency, we rely on a newly discovered
+pseudoconvex formulation of the transformer layer that increases the model's
+convergence rate. As a model working on multiple data modalities, SFT achieves
+competitive results on many benchmarks while being faster at inference than
+more specialized models.
+
+
+
+
+
+ + ☆ On the Convergence of a Federated Expectation-Maximization Algorithm + + +
+ Data heterogeneity has been a long-standing bottleneck in studying the +convergence rates of Federated Learning algorithms. In order to better +understand the issue of data heterogeneity, we study the convergence rate of +the Expectation-Maximization (EM) algorithm for the Federated Mixture of $K$ +Linear Regressions model. We fully characterize the convergence rate of the EM +algorithm under all regimes of $m/n$ where $m$ is the number of clients and $n$ +is the number of data points per client. We show that with a +signal-to-noise-ratio (SNR) of order $\Omega(\sqrt{K})$, the well-initialized +EM algorithm converges within the minimax distance of the ground truth under +each of the regimes. Interestingly, we identify that when $m$ grows +exponentially in $n$, the EM algorithm only requires a constant number of +iterations to converge. We perform experiments on synthetic datasets to +illustrate our results. Surprisingly, the results show that rather than being a +bottleneck, data heterogeneity can accelerate the convergence of federated +learning algorithms. + +
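+ For readers unfamiliar with the underlying model, the sketch below runs
+centralized EM for a mixture of $K$ linear regressions with Gaussian noise
+(one E-step and one M-step per iteration). The federated aggregation over $m$
+clients analyzed in the paper is not shown, and all data and parameter values
+are synthetic placeholders.
+
+import numpy as np
+
+def em_step(X, y, betas, sigma2, pis):
+    # E-step: responsibilities r[i, k] proportional to pi_k * N(y_i | x_i @ beta_k, sigma2)
+    resid = y[:, None] - X @ betas.T                                   # (n, K)
+    log_r = np.log(pis) - 0.5 * resid ** 2 / sigma2
+    r = np.exp(log_r - log_r.max(axis=1, keepdims=True))
+    r /= r.sum(axis=1, keepdims=True)
+    # M-step: weighted least squares per component, then updated mixing weights
+    K = betas.shape[0]
+    new_betas = np.stack([
+        np.linalg.solve(X.T @ (r[:, k:k + 1] * X), X.T @ (r[:, k] * y)) for k in range(K)
+    ])
+    return new_betas, r.mean(axis=0)
+
+rng = np.random.default_rng(0)
+n, d, K = 500, 5, 2
+X = rng.standard_normal((n, d))
+true_betas = rng.standard_normal((K, d))
+z = rng.integers(K, size=n)
+y = np.einsum("ij,ij->i", X, true_betas[z]) + 0.1 * rng.standard_normal(n)
+betas, pis = rng.standard_normal((K, d)), np.full(K, 1.0 / K)
+for _ in range(20):
+    betas, pis = em_step(X, y, betas, sigma2=0.01, pis=pis)
+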
+
+
+
+
+ + ☆ Kernel Density Estimators in Large Dimensions + + +
+ This paper studies Kernel density estimation for a high-dimensional +distribution $\rho(x)$. Traditional approaches have focused on the limit of +large number of data points $n$ and fixed dimension $d$. We analyze instead the +regime where both the number $n$ of data points $y_i$ and their dimensionality +$d$ grow with a fixed ratio $\alpha=(\log n)/d$. Our study reveals three +distinct statistical regimes for the kernel-based estimate of the density $\hat +\rho_h^{\mathcal {D}}(x)=\frac{1}{n h^d}\sum_{i=1}^n +K\left(\frac{x-y_i}{h}\right)$, depending on the bandwidth $h$: a classical +regime for large bandwidth where the Central Limit Theorem (CLT) holds, which +is akin to the one found in traditional approaches. Below a certain value of +the bandwidth, $h_{CLT}(\alpha)$, we find that the CLT breaks down. The +statistics of $\hat \rho_h^{\mathcal {D}}(x)$ for a fixed $x$ drawn from +$\rho(x)$ is given by a heavy-tailed distribution (an alpha-stable +distribution). In particular below a value $h_G(\alpha)$, we find that $\hat +\rho_h^{\mathcal {D}}(x)$ is governed by extreme value statistics: only a few +points in the database matter and give the dominant contribution to the density +estimator. We provide a detailed analysis for high-dimensional multivariate +Gaussian data. We show that the optimal bandwidth threshold based on +Kullback-Leibler divergence lies in the new statistical regime identified in +this paper. Our findings reveal limitations of classical approaches, show the +relevance of these new statistical regimes, and offer new insights for Kernel +density estimation in high-dimensional settings. + +
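+ As a quick illustration of the estimator analyzed above, the following NumPy
+sketch evaluates $\hat \rho_h(x)=\frac{1}{n h^d}\sum_{i=1}^n K((x-y_i)/h)$
+with a standard Gaussian kernel. The data, bandwidth, and query point are
+illustrative placeholders, not the paper's experimental setup.
+
+import numpy as np
+
+def gaussian_kde(x, data, h):
+    # (1 / (n * h^d)) * sum_i K((x - y_i) / h) with a standard Gaussian kernel
+    n, d = data.shape
+    u = (x - data) / h
+    k = np.exp(-0.5 * np.sum(u ** 2, axis=1)) / (2 * np.pi) ** (d / 2)
+    return k.sum() / (n * h ** d)
+
+rng = np.random.default_rng(0)
+d, n = 50, 2000                       # dimension d and n points, so alpha = log(n) / d is small
+data = rng.standard_normal((n, d))    # placeholder high-dimensional Gaussian data
+x = rng.standard_normal(d)
+print(gaussian_kde(x, data, h=1.0))
+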
+
+
+
+
+ + ☆ A Single Goal is All You Need: Skills and Exploration Emerge from + Contrastive RL without Rewards, Demonstrations, or Subgoals + + +
+ In this paper, we present empirical evidence of skills and directed +exploration emerging from a simple RL algorithm long before any successful +trials are observed. For example, in a manipulation task, the agent is given a +single observation of the goal state and learns skills, first for moving its +end-effector, then for pushing the block, and finally for picking up and +placing the block. These skills emerge before the agent has ever successfully +placed the block at the goal location and without the aid of any reward +functions, demonstrations, or manually-specified distance metrics. Once the +agent has learned to reach the goal state reliably, exploration is reduced. +Implementing our method involves a simple modification of prior work and does +not require density estimates, ensembles, or any additional hyperparameters. +Intuitively, the proposed method seems like it should be terrible at +exploration, and we lack a clear theoretical understanding of why it works so +effectively, though our experiments provide some hints. + +
+
+ comment: Code and videos: https://graliuce.github.io/sgcrl/ +
+
+
+
+
+ + ☆ Time Makes Space: Emergence of Place Fields in Networks Encoding + Temporally Continuous Sensory Experiences + + +
+ The vertebrate hippocampus is believed to use recurrent connectivity in area +CA3 to support episodic memory recall from partial cues. This brain area also +contains place cells, whose location-selective firing fields implement maps +supporting spatial memory. Here we show that place cells emerge in networks +trained to remember temporally continuous sensory episodes. We model CA3 as a +recurrent autoencoder that recalls and reconstructs sensory experiences from +noisy and partially occluded observations by agents traversing simulated rooms. +The agents move in realistic trajectories modeled from rodents and environments +are modeled as high-dimensional sensory experience maps. Training our +autoencoder to pattern-complete and reconstruct experiences with a constraint +on total activity causes spatially localized firing fields, i.e., place cells, +to emerge in the encoding layer. The emergent place fields reproduce key +aspects of hippocampal phenomenology: a) remapping (maintenance of and +reversion to distinct learned maps in different environments), implemented via +repositioning of experience manifolds in the network's hidden layer, b) +orthogonality of spatial representations in different arenas, c) robust place +field emergence in differently shaped rooms, with single units showing multiple +place fields in large or complex spaces, and d) slow representational drift of +place fields. We argue that these results arise because continuous traversal of +space makes sensory experience temporally continuous. We make testable +predictions: a) rapidly changing sensory context will disrupt place fields, b) +place fields will form even if recurrent connections are blocked, but reversion +to previously learned representations upon remapping will be abolished, c) the +dimension of temporally smooth experience sets the dimensionality of place +fields, including during virtual navigation of abstract spaces. + +
+
+
+
+
+ + ☆ A Comparative Study of Convolutional and Recurrent Neural Networks for + Storm Surge Prediction in Tampa Bay + + +
+ In this paper, we compare the performance of three common deep learning +architectures, CNN-LSTM, LSTM, and 3D-CNN, in the context of surrogate storm +surge modeling. The study site for this paper is the Tampa Bay area in Florida. +Using high-resolution atmospheric data from the reanalysis models and +historical water level data from NOAA tide stations, we trained and tested +these models to evaluate their performance. Our findings indicate that the +CNN-LSTM model outperforms the other architectures, achieving a test loss of +0.010 and an R-squared (R2) score of 0.84. The LSTM model, although it achieved +the lowest training loss of 0.007 and the highest training R2 of 0.88, +exhibited poorer generalization with a test loss of 0.014 and an R2 of 0.77. +The 3D-CNN model showed reasonable performance with a test loss of 0.011 and an +R2 of 0.82 but displayed instability under extreme conditions. A case study on +Hurricane Ian, which caused a significant negative surge of -1.5 meters in +Tampa Bay indicates the CNN-LSTM model's robustness and accuracy in extreme +scenarios. + +
+
+
+
+
+ + ☆ Continual Learning of Nonlinear Independent Representations + + +
+ Identifying the causal relations between interested variables plays a pivotal +role in representation learning as it provides deep insights into the dataset. +Identifiability, as the central theme of this approach, normally hinges on +leveraging data from multiple distributions (intervention, distribution shift, +time series, etc.). Despite the exciting development in this field, a practical +but often overlooked problem is: what if those distribution shifts happen +sequentially? In contrast, any intelligence possesses the capacity to abstract +and refine learned knowledge sequentially -- lifelong learning. In this paper, +with a particular focus on the nonlinear independent component analysis (ICA) +framework, we move one step forward toward the question of enabling models to +learn meaningful (identifiable) representations in a sequential manner, termed +continual causal representation learning. We theoretically demonstrate that +model identifiability progresses from a subspace level to a component-wise +level as the number of distributions increases. Empirically, we show that our +method achieves performance comparable to nonlinear ICA methods trained jointly +on multiple offline distributions and, surprisingly, the incoming new +distribution does not necessarily benefit the identification of all latent +variables. + +
+
+ comment: 9 pages, 5 Figures +
+
+
+
+
+ + ☆ On zero-shot learning in neural state estimation of power distribution + systems + + +
+ This paper addresses the challenge of neural state estimation in power +distribution systems. We identified a research gap in the current state of the +art, which lies in the inability of models to adapt to changes in the power +grid, such as loss of sensors and branch switching. Our experiments demonstrate +that graph neural networks are the most promising models for this use case and +that their performance can degrade with scale. We propose augmentations to +remedy this issue and perform a comprehensive grid search of different model +configurations for common zero-shot learning scenarios in neural state +estimation. + +
+
+ comment: 13 pages, 2 figures, associated source code available at + https://gitlab.com/transense/nse-tl-paper +
+
+
+
+
+ + ☆ CURLing the Dream: Contrastive Representations for World Modeling in + Reinforcement Learning + + +
+ In this work, we present Curled-Dreamer, a novel reinforcement learning +algorithm that integrates contrastive learning into the DreamerV3 framework to +enhance performance in visual reinforcement learning tasks. By incorporating +the contrastive loss from the CURL algorithm and a reconstruction loss from +autoencoder, Curled-Dreamer achieves significant improvements in various +DeepMind Control Suite tasks. Our extensive experiments demonstrate that +Curled-Dreamer consistently outperforms state-of-the-art algorithms, achieving +higher mean and median scores across a diverse set of tasks. The results +indicate that the proposed approach not only accelerates learning but also +enhances the robustness of the learned policies. This work highlights the +potential of combining different learning paradigms to achieve superior +performance in reinforcement learning applications. + +
+
+ comment: Paper accepted for 24th International Conference on Control, + Automation and Systems (ICCAS) +
+
+
+
+
+ + ☆ Pareto Front Shape-Agnostic Pareto Set Learning in Multi-Objective + Optimization + + +
+ Pareto set learning (PSL) is an emerging approach for acquiring the complete +Pareto set of a multi-objective optimization problem. Existing methods +primarily rely on the mapping of preference vectors in the objective space to +Pareto optimal solutions in the decision space. However, the sampling of +preference vectors theoretically requires prior knowledge of the Pareto front +shape to ensure high performance of the PSL methods. Designing a sampling +strategy of preference vectors is difficult since the Pareto front shape cannot +be known in advance. To make Pareto set learning work effectively in any Pareto +front shape, we propose a Pareto front shape-agnostic Pareto Set Learning +(GPSL) that does not require the prior information about the Pareto front. The +fundamental concept behind GPSL is to treat the learning of the Pareto set as a +distribution transformation problem. Specifically, GPSL can transform an +arbitrary distribution into the Pareto set distribution. We demonstrate that +training a neural network by maximizing hypervolume enables the process of +distribution transformation. Our proposed method can handle any shape of the +Pareto front and learn the Pareto set without requiring prior knowledge. +Experimental results show the high performance of our proposed method on +diverse test problems compared with recent Pareto set learning algorithms. + +
+
+ comment: 7 pages +
+
+
+
+
+ + ♻ ☆ PoisonedRAG: Knowledge Corruption Attacks to Retrieval-Augmented + Generation of Large Language Models USENIX Security + + +
+ Large language models (LLMs) have achieved remarkable success due to their +exceptional generative capabilities. Despite their success, they also have +inherent limitations such as a lack of up-to-date knowledge and hallucination. +Retrieval-Augmented Generation (RAG) is a state-of-the-art technique to +mitigate these limitations. The key idea of RAG is to ground the answer +generation of an LLM on external knowledge retrieved from a knowledge database. +Existing studies mainly focus on improving the accuracy or efficiency of RAG, +leaving its security largely unexplored. We aim to bridge the gap in this work. +We find that the knowledge database in a RAG system introduces a new and +practical attack surface. Based on this attack surface, we propose PoisonedRAG, +the first knowledge corruption attack to RAG, where an attacker could inject a +few malicious texts into the knowledge database of a RAG system to induce an +LLM to generate an attacker-chosen target answer for an attacker-chosen target +question. We formulate knowledge corruption attacks as an optimization problem, +whose solution is a set of malicious texts. Depending on the background +knowledge (e.g., black-box and white-box settings) of an attacker on a RAG +system, we propose two solutions to solve the optimization problem, +respectively. Our results show PoisonedRAG could achieve a 90% attack success +rate when injecting five malicious texts for each target question into a +knowledge database with millions of texts. We also evaluate several defenses +and our results show they are insufficient to defend against PoisonedRAG, +highlighting the need for new defenses. + +
+
+ comment: To appear in USENIX Security Symposium 2025. The code is available at + https://github.com/sleeepeer/PoisonedRAG +
+
+
+
+
+ + ♻ ☆ Bidirectional Generative Pre-training for Improving Time Series + Representation Learning + + +
+ Learning time-series representations for discriminative tasks, such as +classification and regression, has been a long-standing challenge in the +healthcare domain. Current pre-training methods are limited in either +unidirectional next-token prediction or randomly masked token prediction. We +propose a novel architecture called Bidirectional Timely Generative Pre-trained +Transformer (BiTimelyGPT), which pre-trains on biosignals and longitudinal +clinical records by both next-token and previous-token prediction in +alternating transformer layers. This pre-training task preserves original +distribution and data shapes of the time-series. Additionally, the full-rank +forward and backward attention matrices exhibit more expressive representation +capabilities. Using biosignals and longitudinal clinical records, BiTimelyGPT +demonstrates superior performance in predicting neurological functionality, +disease diagnosis, and physiological signs. By visualizing the attention +heatmap, we observe that the pre-trained BiTimelyGPT can identify +discriminative segments from biosignal time-series sequences, even more so +after fine-tuning on the task. + +
+
+
+
+
+ + ♻ ☆ Residual Corrective Diffusion Modeling for Km-scale Atmospheric + Downscaling + + +
+ The state of the art for physical hazard prediction from weather and climate +requires expensive km-scale numerical simulations driven by coarser resolution +global inputs. Here, a generative diffusion architecture is explored for +downscaling such global inputs to km-scale, as a cost-effective machine +learning alternative. The model is trained to predict 2km data from a regional +weather model over Taiwan, conditioned on a 25km global reanalysis. To address +the large resolution ratio, different physics involved at different scales and +prediction of channels beyond those in the input data, we employ a two-step +approach where a UNet predicts the mean and a corrector diffusion (CorrDiff) +model predicts the residual. CorrDiff exhibits encouraging skill in bulk MAE +and CRPS scores. The predicted spectra and distributions from CorrDiff +faithfully recover important power law relationships in the target data. Case +studies of coherent weather phenomena show that CorrDiff can help sharpen wind +and temperature gradients that co-locate with intense rainfall in cold front, +and can help intensify typhoons and synthesize rain band structures. +Calibration of model uncertainty remains challenging. The prospect of unifying +methods like CorrDiff with coarser resolution global weather models implies a +potential for global-to-regional multi-scale machine learning simulation. + +
+
+
+
+
+ + ♻ ☆ Risk and cross validation in ridge regression with correlated samples + + +
+ Recent years have seen substantial advances in our understanding of +high-dimensional ridge regression, but existing theories assume that training +examples are independent. By leveraging recent techniques from random matrix +theory and free probability, we provide sharp asymptotics for the in- and +out-of-sample risks of ridge regression when the data points have arbitrary +correlations. We demonstrate that in this setting, the generalized cross +validation estimator (GCV) fails to correctly predict the out-of-sample risk. +However, in the case where the noise residuals have the same correlations as +the data points, one can modify the GCV to yield an efficiently-computable +unbiased estimator that concentrates in the high-dimensional limit, which we +dub CorrGCV. We further extend our asymptotic analysis to the case where the +test point has nontrivial correlations with the training set, a setting often +encountered in time series forecasting. Assuming knowledge of the correlation +structure of the time series, this again yields an extension of the GCV +estimator, and sharply characterizes the degree to which such test points yield +an overly optimistic prediction of long-time risk. We validate the predictions +of our theory across a variety of high dimensional data. + +
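+ For reference, the classical GCV score that the paper shows can mis-estimate
+out-of-sample risk under correlated samples is easy to compute for ridge
+regression, as in the sketch below on i.i.d. placeholder data. The CorrGCV
+correction itself depends on the sample correlation structure and is not
+reproduced here.
+
+import numpy as np
+
+def ridge_gcv(X, y, lam):
+    # GCV(lam) = (1/n) * ||(I - S) y||^2 / (1 - tr(S)/n)^2, with S the ridge hat matrix
+    n, d = X.shape
+    S = X @ np.linalg.solve(X.T @ X + lam * np.eye(d), X.T)
+    resid = y - S @ y
+    return (resid @ resid / n) / (1.0 - np.trace(S) / n) ** 2
+
+rng = np.random.default_rng(0)
+X = rng.standard_normal((200, 50))
+y = X @ rng.standard_normal(50) + rng.standard_normal(200)
+lams = np.logspace(-2, 2, 9)
+print("lambda selected by GCV:", min(lams, key=lambda lam: ridge_gcv(X, y, lam)))
+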
+
+ comment: 44 pages, 18 figures. v2: updated funding acknowledgements +
+
+
+
+
+ + ♻ ☆ Prompt-prompted Adaptive Structured Pruning for Efficient LLM Generation + + +
+ With the development of transformer-based large language models (LLMs), they +have been applied to many fields due to their remarkable utility, but this +comes at a considerable computational cost at deployment. Fortunately, some +methods such as pruning or constructing a mixture of experts (MoE) aim at +exploiting sparsity in transformer feedforward (FF) blocks to gain boosts in +speed and reduction in memory requirements. However, these techniques can be +very costly and inflexible in practice, as they often require training or are +restricted to specific types of architectures. To address this, we introduce +GRIFFIN, a novel training-free and calibration-free method that selects unique +FF experts at the sequence level for efficient generation across a plethora of +LLMs with different non-ReLU activation functions. This is possible due to a +critical observation that many trained LLMs naturally produce highly structured +FF activation patterns within a sequence, which we call flocking. Despite our +method's simplicity, we show with 50% of the FF parameters, GRIFFIN maintains +the original model's performance with little to no degradation on a variety of +classification and generation tasks, all while improving latency (e.g. +1.29$\times$ and 1.25$\times$ speed-ups in Gemma 7B and Llama 2 13B, +respectively, on an NVIDIA L40). Code is available at +https://github.com/hdong920/GRIFFIN. + +
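+ A schematic PyTorch sketch of the sequence-level expert selection idea is
+shown below: feed-forward neurons are scored by their activations on the
+prompt and only the top half are kept for generation. The plain GELU block,
+random weights, and activation-norm score are simplifying assumptions for
+illustration; this is not the released GRIFFIN implementation.
+
+import torch
+
+def select_ff_experts(prompt_hidden, W_in, W_out, keep_frac=0.5):
+    # Score each FF neuron by its activation norm over the prompt tokens, keep the top fraction.
+    acts = torch.nn.functional.gelu(prompt_hidden @ W_in)      # (seq_len, d_ff)
+    score = acts.norm(dim=0)                                   # per-neuron relevance on this sequence
+    k = int(keep_frac * W_in.shape[1])
+    idx = score.topk(k).indices
+    return W_in[:, idx], W_out[idx, :]                         # pruned FF block for generation
+
+# Toy usage with random weights standing in for one transformer FF block.
+d_model, d_ff, seq_len = 64, 256, 16
+W_in, W_out = torch.randn(d_model, d_ff), torch.randn(d_ff, d_model)
+h = torch.randn(seq_len, d_model)
+W_in_p, W_out_p = select_ff_experts(h, W_in, W_out)
+y = torch.nn.functional.gelu(h @ W_in_p) @ W_out_p             # generation-time FF pass
+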
+
+ comment: Revision 1: Updated abstract with code link; re-ran top-k + sampling + rows in Table 4, conclusions unchanged Revision 2: Reframing and new + experiments, conclusions unchanged +
+
+
+
+
+ + ♻ ☆ Deep Learning Approach for Changepoint Detection: Penalty Parameter + Optimization + + +
+ Changepoint detection, a technique for identifying significant shifts within +data sequences, is crucial in various fields such as finance, genomics, +medicine, etc. Dynamic programming changepoint detection algorithms are +employed to identify the locations of changepoints within a sequence, which +rely on a penalty parameter to regulate the number of changepoints. To estimate +this penalty parameter, previous work uses simple models such as linear models +or decision trees. This study introduces a novel deep learning method for +predicting penalty parameters, leading to demonstrably improved changepoint +detection accuracy on large benchmark supervised labeled datasets compared to +previous methods. + +
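+ To see how the penalty parameter controls the number of detected
+changepoints, here is a minimal sketch using the ruptures package's PELT
+solver. The hard-coded penalty is a stand-in for the value that the paper's
+deep model would predict from sequence features.
+
+import numpy as np
+import ruptures as rpt   # pip install ruptures
+
+# Synthetic signal with two mean shifts.
+rng = np.random.default_rng(0)
+signal = np.concatenate([rng.normal(0, 1, 100),
+                         rng.normal(5, 1, 100),
+                         rng.normal(1, 1, 100)])
+
+# In the paper, a learned model predicts this penalty; here it is a placeholder constant.
+predicted_penalty = 10.0
+
+algo = rpt.Pelt(model="l2").fit(signal)
+changepoints = algo.predict(pen=predicted_penalty)
+print(changepoints)   # indices of detected segment ends; a larger penalty yields fewer changepoints
+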
+
+ comment: 13 pages, 7 figures +
+
+
+
+
+ + ♻ ☆ Boolean matrix logic programming for active learning of gene functions + in genome-scale metabolic network models + + +
+ Techniques to autonomously drive research have been prominent in +Computational Scientific Discovery, while Synthetic Biology is a field of +science that focuses on designing and constructing new biological systems for +useful purposes. Here we seek to apply logic-based machine learning techniques +to facilitate cellular engineering and drive biological discovery. +Comprehensive databases of metabolic processes called genome-scale metabolic +network models (GEMs) are often used to evaluate cellular engineering +strategies to optimise target compound production. However, predicted host +behaviours are not always correctly described by GEMs, often due to errors in +the models. The task of learning the intricate genetic interactions within GEMs +presents computational and empirical challenges. To address these, we describe +a novel approach called Boolean Matrix Logic Programming (BMLP) by leveraging +boolean matrices to evaluate large logic programs. We introduce a new system, +$BMLP_{active}$, which efficiently explores the genomic hypothesis space by +guiding informative experimentation through active learning. In contrast to +sub-symbolic methods, $BMLP_{active}$ encodes a state-of-the-art GEM of a +widely accepted bacterial host in an interpretable and logical representation +using datalog logic programs. Notably, $BMLP_{active}$ can successfully learn +the interaction between a gene pair with fewer training examples than random +experimentation, overcoming the increase in experimental design space. +$BMLP_{active}$ enables rapid optimisation of metabolic models to reliably +engineer biological systems for producing useful compounds. It offers a +realistic approach to creating a self-driving lab for microbial engineering. + +
+
+
+
+
+ + ♻ Blockwise Self-Supervised Learning at Scale + + +
+ Current state-of-the-art deep networks are all powered by backpropagation. In +this paper, we explore alternatives to full backpropagation in the form of +blockwise learning rules, leveraging the latest developments in self-supervised +learning. We show that a blockwise pretraining procedure consisting of training +independently the 4 main blocks of layers of a ResNet-50 with Barlow Twins' +loss function at each block performs almost as well as end-to-end +backpropagation on ImageNet: a linear probe trained on top of our blockwise +pretrained model obtains a top-1 classification accuracy of 70.48%, only 1.1% +below the accuracy of an end-to-end pretrained network (71.57% accuracy). We +perform extensive experiments to understand the impact of different components +within our method and explore a variety of adaptations of self-supervised +learning to the blockwise paradigm, building an exhaustive understanding of the +critical avenues for scaling local learning rules to large networks, with +implications ranging from hardware design to neuroscience. + +
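+ A compact sketch of the blockwise training rule follows: each block has its
+own Barlow Twins loss and optimizer, and block inputs are detached so no
+gradient crosses block boundaries. The tiny linear blocks and additive-noise
+"views" are placeholders for the ResNet-50 stages, projector heads, and image
+augmentations used in the paper.
+
+import torch
+from torch import nn
+
+def barlow_twins_loss(z1, z2, lambd=5e-3):
+    # Standardize along the batch, then push the cross-correlation matrix toward identity.
+    z1 = (z1 - z1.mean(0)) / (z1.std(0) + 1e-6)
+    z2 = (z2 - z2.mean(0)) / (z2.std(0) + 1e-6)
+    c = (z1.T @ z2) / z1.shape[0]
+    on_diag = (torch.diagonal(c) - 1).pow(2).sum()
+    off_diag = c.pow(2).sum() - torch.diagonal(c).pow(2).sum()
+    return on_diag + lambd * off_diag
+
+blocks = nn.ModuleList([nn.Sequential(nn.Linear(32, 32), nn.ReLU()) for _ in range(4)])
+opts = [torch.optim.SGD(b.parameters(), lr=1e-2) for b in blocks]
+
+x = torch.randn(256, 32)
+v1, v2 = x + 0.1 * torch.randn_like(x), x + 0.1 * torch.randn_like(x)   # two "views"
+for block, opt in zip(blocks, opts):
+    v1, v2 = block(v1.detach()), block(v2.detach())   # detach: no gradient to earlier blocks
+    loss = barlow_twins_loss(v1, v2)
+    opt.zero_grad()
+    loss.backward()
+    opt.step()
+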
+
+
+
+
+ + ♻ ☆ Posterior Sampling for Continuing Environments + + +
+ We develop an extension of posterior sampling for reinforcement learning +(PSRL) that is suited for a continuing agent-environment interface and +integrates naturally into agent designs that scale to complex environments. The +approach, continuing PSRL, maintains a statistically plausible model of the +environment and follows a policy that maximizes expected $\gamma$-discounted +return in that model. At each time, with probability $1-\gamma$, the model is +replaced by a sample from the posterior distribution over environments. For a +choice of discount factor that suitably depends on the horizon $T$, we +establish an $\tilde{O}(\tau S \sqrt{A T})$ bound on the Bayesian regret, where +$S$ is the number of environment states, $A$ is the number of actions, and +$\tau$ denotes the reward averaging time, which is a bound on the duration +required to accurately estimate the average reward of any policy. Our work is +the first to formalize and rigorously analyze the resampling approach with +randomized exploration. + +
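+ A toy tabular sketch of the resampling schedule described above is given
+below: the environment model is redrawn from a Dirichlet posterior with
+probability $1-\gamma$ at each step, and the agent follows the greedy policy
+for the sampled model. Known rewards and the tiny MDP are simplifying
+assumptions for illustration, not the paper's setting.
+
+import numpy as np
+
+rng = np.random.default_rng(0)
+S, A, gamma, T = 5, 2, 0.99, 10_000
+R = rng.uniform(size=(S, A))                       # rewards, assumed known for simplicity
+P_true = rng.dirichlet(np.ones(S), size=(S, A))    # unknown true transition kernel
+counts = np.ones((S, A, S))                        # Dirichlet(1) posterior over transitions
+
+def greedy_policy(P, R, gamma, iters=200):
+    V = np.zeros(S)
+    for _ in range(iters):                         # value iteration on the sampled model
+        Q = R + gamma * P @ V
+        V = Q.max(axis=1)
+    return Q.argmax(axis=1)
+
+def sample_model(counts):
+    return np.array([[rng.dirichlet(counts[s, a]) for a in range(A)] for s in range(S)])
+
+policy, s = greedy_policy(sample_model(counts), R, gamma), 0
+for t in range(T):
+    if rng.random() < 1 - gamma:                   # resample the model with probability 1 - gamma
+        policy = greedy_policy(sample_model(counts), R, gamma)
+    a = policy[s]
+    s_next = rng.choice(S, p=P_true[s, a])
+    counts[s, a, s_next] += 1                      # Bayesian posterior update
+    s = s_next
+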
+
+ comment: RLC 2024 +
+
+
+
+
+ + ♻ ☆ IoT in the Era of Generative AI: Vision and Challenges + + +
+ Advancements in Generative AI hold immense promise to push Internet of Things +(IoT) to the next level. In this article, we share our vision on IoT in the era +of Generative AI. We discuss some of the most important applications of +Generative AI in IoT-related domains. We also identify some of the most +critical challenges and discuss current gaps as well as promising opportunities +on enabling Generative AI for IoT. We hope this article can inspire new +research on IoT in the era of Generative AI. + +
+
+ comment: 8 pages, 3 figures, 1 table +
+
+
+
+
+ + ♻ ☆ Multiview learning with twin parametric margin SVM + + +
+ Multiview learning (MVL) seeks to leverage the benefits of diverse +perspectives to complement each other, effectively extracting and utilizing the +latent information within the dataset. Several twin support vector +machine-based MVL (MvTSVM) models have been introduced and demonstrated +outstanding performance in various learning tasks. However, MvTSVM-based models +face significant challenges in the form of computational complexity due to four +matrix inversions, the need to reformulate optimization problems in order to +employ kernel-generated surfaces for handling non-linear cases, and the +constraint of uniform noise assumption in the training data. Particularly in +cases where the data possesses a heteroscedastic error structure, these +challenges become even more pronounced. In view of the aforementioned +challenges, we propose multiview twin parametric margin support vector machine +(MvTPMSVM). MvTPMSVM constructs parametric margin hyperplanes corresponding to +both classes, aiming to regulate and manage the impact of the heteroscedastic +noise structure existing within the data. The proposed MvTPMSVM model avoids +the explicit computation of matrix inversions in the dual formulation, leading +to enhanced computational efficiency. We perform an extensive assessment of the +MvTPMSVM model using benchmark datasets such as UCI, KEEL, synthetic, and +Animals with Attributes (AwA). Our experimental results, coupled with rigorous +statistical analyses, confirm the superior generalization capabilities of the +proposed MvTPMSVM model compared to the baseline models. The source code of the +proposed MvTPMSVM model is available at +\url{https://github.com/mtanveer1/MvTPMSVM}. + +
+
+
+
+
+
+
+
+ + Multimedia 2 + +
+
+
+ + ☆ Single Image Dehazing Using Scene Depth Ordering + + +
+ Images captured in hazy weather generally suffer from quality degradation, +and many dehazing methods have been developed to solve this problem. However, +single image dehazing problem is still challenging due to its ill-posed nature. +In this paper, we propose a depth order guided single image dehazing method, +which utilizes depth order in hazy images to guide the dehazing process to +achieve a similar depth perception in corresponding dehazing results. The +consistency of depth perception ensures that the regions that look farther or +closer in hazy images also appear farther or closer in the corresponding +dehazing results, and thus effectively avoid the undesired visual effects. To +achieve this goal, a simple yet effective strategy is proposed to extract the +depth order in hazy images, which offers a reference for depth perception in +hazy weather. Additionally, a depth order embedded transformation model is +devised, which performs transmission estimation under the guidance of depth +order to realize an unchanged depth order in the dehazing results. The +extracted depth order provides a powerful global constraint for the dehazing +process, which contributes to the efficient utilization of global information, +thereby bringing an overall improvement in restoration quality. Extensive +experiments demonstrate that the proposed method can better recover potential +structure and vivid color with higher computational efficiency than the +state-of-the-art dehazing methods. + +
+
+ comment: 14 pages, 15 figures +
+
+
+
+
+ + ☆ HateSieve: A Contrastive Learning Framework for Detecting and Segmenting + Hateful Content in Multimodal Memes ACL + + +
+ Amidst the rise of Large Multimodal Models (LMMs) and their widespread +application in generating and interpreting complex content, the risk of +propagating biased and harmful memes remains significant. Current safety +measures often fail to detect subtly integrated hateful content within +``Confounder Memes''. To address this, we introduce \textsc{HateSieve}, a new +framework designed to enhance the detection and segmentation of hateful +elements in memes. \textsc{HateSieve} features a novel Contrastive Meme +Generator that creates semantically paired memes, a customized triplet dataset +for contrastive learning, and an Image-Text Alignment module that produces +context-aware embeddings for accurate meme segmentation. Empirical experiments +on the Hateful Meme Dataset show that \textsc{HateSieve} not only surpasses +existing LMMs in performance with fewer trainable parameters but also offers a +robust mechanism for precisely identifying and isolating hateful content. +\textcolor{red}{Caution: Contains academic discussions of hate speech; viewer +discretion advised.} + +
+
+ comment: 8 pages overall, the accepted paper at the 3rd Workshop on Advances + in Language and Vision Research (ALVR 2024) ACL workshops +
+
+
+
+
+
+
+
 + 
 
 
 
 
 

diff --git a/index.js b/index.js
new file mode 100644
index 00000000..69f5da7b
--- /dev/null
+++ b/index.js
@@ -0,0 +1,39 @@
+/* Expand/Collapse with TAB key */
var expanded = false;
document.onkeydown = function (e) {
    if (e.keyCode === 9) { // Tab key
        expanded = !expanded;
        document.querySelectorAll("details").forEach(detail => detail.open = expanded);
        return false; // suppress the default Tab focus change
    }
};

/* Switch Theme */
const toggleSwitch = document.querySelector('.theme-switch input[type="checkbox"]');

function switchTheme(e) {
    if (e.target.checked) {
        document.documentElement.setAttribute('data-theme', 'light');
        document.getElementById("theme-icon").className = "ri-sun-line";
        localStorage.setItem('theme', 'light'); // persist the chosen theme
    } else {
        document.documentElement.setAttribute('data-theme', 'dark');
        document.getElementById("theme-icon").className = "ri-moon-line";
        localStorage.setItem('theme', 'dark'); // persist the chosen theme
    }
}

toggleSwitch.addEventListener('change', switchTheme, false);
const currentTheme = localStorage.getItem('theme') ? localStorage.getItem('theme') : null;
if (currentTheme) {
    document.documentElement.setAttribute('data-theme', currentTheme);
    if (currentTheme === 'light') {
        toggleSwitch.checked = true;
    }
}

const timestamp = document.getElementById("build-timestamp");
const timestamp_local = new Date(timestamp.getAttribute("datetime")).toLocaleString();

const badge = document.getElementById("build-timestamp-badge");
// badge.src = `https://img.shields.io/github/workflow/status/mlnlp-world/myarxiv/Update?=${timestamp_local}&style=for-the-badge`