From 995ca39c92cd0426821035490f5631bf7419cff1 Mon Sep 17 00:00:00 2001 From: "Tian, Pu" <36344837+tianpu2014@users.noreply.github.com> Date: Fri, 27 Sep 2024 20:15:43 -0400 Subject: [PATCH 1/4] Update chapter10.tex --- MLLM_latex/chapter10/chapter10.tex | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/MLLM_latex/chapter10/chapter10.tex b/MLLM_latex/chapter10/chapter10.tex index 28cd561..8dbac5c 100644 --- a/MLLM_latex/chapter10/chapter10.tex +++ b/MLLM_latex/chapter10/chapter10.tex @@ -1,12 +1,9 @@ - - - \chapter{Ethical Considerations and Responsible AI} As Multimodal Large Language Models (MLLMs) continue to advance and shape the AI landscape, capable of processing and generating content across various modalities such as text, images, and audio, it is crucial to address the ethical implications and challenges that arise from their development and deployment to ensure responsible AI practices\cite{konidena2024ethical}. - -One of the primary concerns in MLLM development is bias mitigation. These models, trained on vast amounts of data from diverse sources, can inadvertently perpetuate or amplify existing societal biases\cite{peng2024securing}. To combat this, researchers and developers must implement comprehensive bias mitigation strategies\cite{zhang2023mitigating}. These include ensuring diverse and representative training datasets, conducting regular bias\cite{boix2022machine} audits across different modalities\cite{pymetrics2022audit}, and developing bias-aware fine-tuning techniques\cite{kim2024domain}. Additionally, interdisciplinary collaboration with experts from fields such as ethics, sociology, and psychology can provide valuable insights into identifying and addressing potential biases\cite{aquino2023practical}. +One of the primary concerns in MLLM development is bias. Bias refers to systematic errors or unfair preferences in the model's outputs that can reinforce or amplify societal prejudices and stereotypes. These biases can manifest in various forms, including gender, racial, or cultural biases, and they pose ethical challenges in the deployment and use of MLLMs across different applications\cite{peng2024securing}. To combat this, researchers and developers must implement comprehensive bias mitigation strategies\cite{zhang2023mitigating}. These include ensuring diverse and representative training datasets, conducting regular bias audits\cite{boix2022machine} across different modalities\cite{pymetrics2022audit}, and developing bias-aware fine-tuning techniques\cite{kim2024domain}. Additionally, interdisciplinary collaboration with experts from fields such as ethics, sociology, and psychology can provide valuable insights into identifying and addressing potential biases\cite{aquino2023practical}. Privacy and data protection present another significant challenge in the realm of MLLMs. As these models process and generate increasingly complex and potentially sensitive information, robust measures must be put in place to protect individual privacy\cite{he2024emerged, friha2024llm}. This includes implementing advanced data anonymization techniques, exploring decentralized training methods like federated learning, and applying differential privacy approaches. Furthermore, clear protocols for obtaining consent and managing data rights must be established to ensure ethical handling of personal information used in training these models\cite{mccoy2023ethical}.
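To make the mention of differential privacy approaches more concrete, the following minimal sketch (an illustrative assumption, not a technique drawn from the cited works) applies the classic Laplace mechanism to a single count query over sensitive training data; the noise scale is calibrated by the query sensitivity and the privacy budget epsilon.

\begin{verbatim}
import numpy as np

def laplace_mechanism(true_value: float, sensitivity: float, epsilon: float) -> float:
    """Return an epsilon-differentially-private estimate of a numeric query.

    Noise is drawn from Laplace(0, sensitivity / epsilon), the standard
    calibration for a single query under epsilon-differential privacy.
    """
    scale = sensitivity / epsilon
    return true_value + np.random.laplace(loc=0.0, scale=scale)

# Example: privately release how many records in a training batch contain
# personally identifiable information (a count query with sensitivity 1).
true_count = 128
noisy_count = laplace_mechanism(true_count, sensitivity=1.0, epsilon=0.5)
print(f"noisy count released: {noisy_count:.1f}")
\end{verbatim}

In practice, differential privacy for model training is usually applied at the gradient level (as in DP-SGD), but the same idea of bounding sensitivity and adding calibrated noise carries over.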
@@ -22,7 +19,7 @@ \chapter{Ethical Considerations and Responsible AI} \section{Bias Mitigation Strategies} -One of the most pressing ethical concerns surrounding MLLMs is indeed the presence of biases in both the training data and the resulting model outputs. This issue is complex and multifaceted, requiring a comprehensive approach to address effectively. Let's explore this topic in more depth, examining the nature of these biases, their potential impacts, and strategies for mitigation. +One of the most pressing ethical concerns surrounding MLLMs is the presence of biases in both the training data and the resulting model outputs. This issue is complex and multifaceted, requiring a comprehensive approach to address effectively. Let's explore this topic in more depth, examining the nature of these biases, their potential impacts, and strategies for mitigation. Biases in MLLMs can manifest in various ways, often reflecting and amplifying existing societal prejudices. These biases may be related to race, gender, age, socioeconomic status, cultural background, or other demographic factors. For instance, an MLLM might generate images that reinforce gender stereotypes or produce text that uses racially insensitive language. In multimodal systems, these biases can be particularly insidious as they may appear across different modalities, creating a compounded effect. From c694acf92eb451f4baad5326cf5a0a3a51744610 Mon Sep 17 00:00:00 2001 From: weian Date: Sun, 29 Sep 2024 03:03:03 +1300 Subject: [PATCH 2/4] update ch4 ref --- MLLM_latex/chapter4/chap4_ref.bib | 246 ++++++++++++++++++++++++++++++ MLLM_latex/chapter4/chapter4.tex | 90 +++++++++-- 2 files changed, 323 insertions(+), 13 deletions(-) create mode 100644 MLLM_latex/chapter4/chap4_ref.bib diff --git a/MLLM_latex/chapter4/chap4_ref.bib b/MLLM_latex/chapter4/chap4_ref.bib new file mode 100644 index 0000000..0c75a06 --- /dev/null +++ b/MLLM_latex/chapter4/chap4_ref.bib @@ -0,0 +1,246 @@ +@misc{cite1, + title={Hallucination Augmented Contrastive Learning for Multimodal Large Language Models}, + author={}, + year={2023}, + howpublished={\url{https://arxiv.org/abs/2312.06968}} +} + +@misc{cite2, + title={mPLUG-HalOwl: Multimodal Large Language Model}, + author={}, + year={2023}, + howpublished={\url{https://github.com/X-PLUG/mPLUG-HalOwl}} +} + +@misc{cite3, + title={Img-Diff: Contrastive Data Synthesis for Multimodal Large Language Models}, + author={}, + year={2024}, + howpublished={\url{https://arxiv.org/abs/2408.04594}} +} + +@misc{cite4, + title={Multimodality and Large Multimodal Models (LMMs)}, + author={}, + year={2023}, + howpublished={\url{https://huyenchip.com/2023/10/10/multimodal.html}} +} + +@article{ComprehensiveSurvey2024, + title={A Comprehensive Survey of Large Language Models and ...}, + author={Author Name}, + journal={arXiv}, + year={2024}, + url={https://arxiv.org/html/2405.08603v1} +} + +@article{ManipLLM2024, + title={ManipLLM: Embodied Multimodal Large Language Model for Object ...}, + author={Author Name}, + journal={CVPR}, + year={2024}, + url={https://openaccess.thecvf.com/content/CVPR2024/papers/Li_ManipLLM_Embodied_Multimodal_Large_Language_Model_for_Object-Centric_Robotic_Manipulation_CVPR_2024_paper.pdf} +} + +@article{OverviewLMM2024, + title={An Overview of Large Multi-modal Models (LMMs): Part 1}, + author={Author Name}, + journal={Medium}, + year={2024}, + url={https://medium.com/@baicenxiao/introduction-to-the-large-multi-modal-models-llms-part-1-07de7e9caf40} +} + +@article{ResearchDevelopment2023, + 
title={ManipLLM: Embodied Multimodal Large Language Model for Object ...}, + author={Author Name}, + journal={arXiv}, + year={2023}, + url={https://arxiv.org/html/2312.16217v1} +} + +@article{MaskedVisionLanguage2023, + title={Masked Vision and Language Pre-training with Unimodal and Multimodal Contrastive Losses for Medical Visual Question Answering}, + author={Author Name}, + journal={arXiv}, + year={2023}, + url={https://arxiv.org/abs/2307.05314} +} + +@article{RAMMBiomedicalVQA2023, + title={RAMM: Retrieval-augmented Biomedical Visual Question Answering}, + author={Author Name}, + journal={arXiv}, + year={2023}, + url={https://arxiv.org/abs/2303.00534} +} + +@article{MedicalVQA2023, + title={Masked Vision and Language Pre-training with Unimodal and Multimodal Contrastive Losses for Medical Visual Question Answering}, + author={Author Name}, + journal={MICCAI}, + year={2023}, + url={https://conferences.miccai.org/2023/papers/401-Paper2138.html} +} + +@misc{pengfeiliHEU2023, + title={MUMC: This repository is made for the paper ...}, + author={Pengfei Li}, + year={2023}, + url={https://github.com/pengfeiliHEU/MUMC} +} + +@article{VisoAI2024, + title={Vision Language Models: Exploring Multimodal AI}, + author={Author Name}, + journal={Viso.ai}, + year={2024}, + url={https://viso.ai/deep-learning/vision-language-models/} +} + +@article{FlexibleVLP2023, + title={Fusion or Defusion? Flexible Vision-and-Language Pre-Training}, + author={Author Name}, + journal={ACL Anthology}, + year={2023}, + url={https://aclanthology.org/2023.findings-acl.316} +} + +@article{HeterogeneityFederatedVLP2024, + title={Mitigating Heterogeneity in Federated Multimodal Learning with Biomedical VLP}, + author={Author Name}, + journal={arXiv}, + year={2024}, + url={https://arxiv.org/html/2404.03854v1} +} + +@article{FeedbackModalSearch2024, + title={Feedback-based Modal Mutual Search for Attacking Vision-Language Models}, + author={Author Name}, + journal={arXiv}, + year={2024}, + url={https://arxiv.org/html/2409.06726v1} +} + +@article{TenyksBlogger2024, + title={Multimodal Large Language Models (MLLMs) transforming computer vision}, + author={Tenyks Blogger}, + journal={Medium}, + year={2024}, + url={https://medium.com/@tenyks_blogger/multimodal-large-language-models-mllms-transforming-computer-vision-76d3c5dd267f} +} + +@article{EfficientMLLMs2024, + title={Efficient Multimodal Large Language Models: A Survey}, + author={Author Name}, + journal={arXiv}, + year={2024}, + url={https://arxiv.org/html/2405.10739v1} +} + +@article{MultiwayAdapter2024, + title={Multiway-Adapter: Adapting Multimodal Large Language Models for Specific Tasks}, + author={Author Name}, + journal={IEEE Explore}, + year={2024}, + url={https://ieeexplore.ieee.org/document/10446792} +} + +@article{RobustInstructionTuning2024, + title={Towards Robust Instruction Tuning on Multimodal Large Language Models}, + author={Author Name}, + journal={arXiv}, + year={2024}, + url={https://arxiv.org/html/2402.14492v2} +} + +@article{CrossModalTasks2024, + title={Cross-Modal Tasks in Multimodal Large Language Models}, + author={Author Name}, + journal={Journal Name}, + year={2024}, + url={URL} +} + +@article{FewShotZeroShotLearning2024, + title={Few-Shot and Zero-Shot Learning in MLLMs}, + author={Author Name}, + journal={Journal Name}, + year={2024}, + url={URL} +} + +@article{FewShotLearning2024, + title={Few-Shot Learning in Multimodal Large Language Models}, + author={Author Name}, + journal={Journal Name}, + year={2024}, + url={URL} +} + 
+@article{ZeroShotLearning2024, + title={Zero-Shot Learning with CLIP}, + author={Author Name}, + journal={Journal Name}, + year={2024}, + url={URL} +} + +@article{TransferLearning2024, + title={Transfer Learning in Multimodal Large Language Models}, + author={Author Name}, + journal={Journal Name}, + year={2024}, + url={URL} +} + +@article{InstructionTuning2024, + title={Instruction Tuning for Multimodal Large Language Models}, + author={Author Name}, + journal={Journal Name}, + year={2024}, + url={URL} +} + +@article{NaturalLanguageInstructions2024, + title={Natural Language Instructions for MLLMs}, + author={Author Name}, + journal={Journal Name}, + year={2024}, + url={URL} +} + +@article{MultimodalInstructionTuning2024, + title={Multimodal Instruction Tuning}, + author={Author Name}, + journal={Journal Name}, + year={2024}, + url={URL} +} + +@misc{LarkSuite, + title = {Instruction Tuning}, + howpublished = {\url{https://www.larksuite.com/en_us/topics/ai-glossary/instruction-tuning}}, + year = {2023}, + note = {Accessed: 2024-09-28} +} + +@misc{OpenAICommunity, + title = {Can I fine-tune the model without the prompt and answer for the system role?}, + howpublished = {\url{https://community.openai.com/t/can-i-fine-tune-the-model-without-the-prompt-and-answer-for-the-system-role/550580}}, + year = {2023}, + note = {Accessed: 2024-09-28} +} + +@misc{RohitAggarwal, + author = {Rohit Aggarwal}, + title = {AI systems with applications spanning software development, recruitment, and content creation}, + howpublished = {\url{https://eccles.utah.edu/team/rohit-aggarwal/}}, + note = {Accessed: 2024-09-28} +} + +@misc{RedditExperience, + title = {My experience on starting with fine tuning LLMs with custom data}, + howpublished = {\url{https://www.reddit.com/r/LocalLLaMA/comments/14vnfh2/my_experience_on_starting_with_fine_tuning_llms/}}, + year = {2023}, + note = {Accessed: 2024-09-28} +} \ No newline at end of file diff --git a/MLLM_latex/chapter4/chapter4.tex b/MLLM_latex/chapter4/chapter4.tex index 400ffb8..5fe760b 100644 --- a/MLLM_latex/chapter4/chapter4.tex +++ b/MLLM_latex/chapter4/chapter4.tex @@ -8,23 +8,71 @@ \section{Pre-Training Strategies} \subsection{Contrastive Learning (CLIP, ALIGN)} -A widely used pre-training strategy is \textbf{contrastive learning}, where the model is trained to align corresponding text and image pairs while distinguishing between mismatched pairs. CLIP (Contrastive Language-Image Pre-training) and ALIGN are prominent examples of models that use this approach. During training, the model learns to represent text and images in a shared embedding space, making it easier to perform tasks like cross-modal retrieval, where the goal is to find the most relevant image or text based on the other modality. +Contrastive learning is a key method in training multimodal large language models (MLLMs). Here are some important methods and insights related to contrastive learning in this context: + +\textbf{Basic Concept}: Contrastive learning involves training models to differentiate between similar and dissimilar pairs of data. In the case of MLLMs, this often means aligning text and image pairs while distinguishing them from mismatched pairs. This approach helps in creating shared embedding spaces for different modalities, which is crucial for tasks like cross-modal retrieval \cite{cite4}. 
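As a minimal illustration of this basic concept, the sketch below computes a symmetric, CLIP-style contrastive loss over a batch of paired image and text embeddings; the random tensors and the temperature value are placeholders for illustration rather than details of any particular model.

\begin{verbatim}
import torch
import torch.nn.functional as F

def contrastive_loss(image_emb, text_emb, temperature=0.07):
    """Symmetric contrastive loss for a batch of matched image-text pairs.

    image_emb, text_emb: (batch, dim) tensors; row i of each is a matched pair.
    """
    # Normalize so that dot products are cosine similarities.
    image_emb = F.normalize(image_emb, dim=-1)
    text_emb = F.normalize(text_emb, dim=-1)

    # (batch, batch) similarity matrix; diagonal entries are the true pairs.
    logits = image_emb @ text_emb.t() / temperature
    targets = torch.arange(logits.size(0))

    # Cross-entropy in both directions: image-to-text and text-to-image.
    loss_i2t = F.cross_entropy(logits, targets)
    loss_t2i = F.cross_entropy(logits.t(), targets)
    return (loss_i2t + loss_t2i) / 2

# Toy usage with random embeddings standing in for encoder outputs.
images = torch.randn(8, 512)
texts = torch.randn(8, 512)
print(contrastive_loss(images, texts))
\end{verbatim}

Treating the diagonal of the similarity matrix as the positive pairs and every other entry as a negative is what pushes matched image-text pairs together in the shared embedding space while pushing mismatched pairs apart.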
+ +\textbf{Hallucination Augmented Contrastive Learning}: This method introduces the concept of hallucination, where additional synthetic data points are generated to enhance the contrastive learning process. This approach aims to improve the model's robustness and generalization capabilities, especially in zero-shot scenarios where the model needs to perform tasks it hasn't been explicitly trained on \cite{cite1, cite2}. + +\textbf{Img-Diff: Contrastive Data Synthesis}: This technique involves creating a novel dataset that enhances the quality of contrastive learning by synthesizing new data points. The Img-Diff dataset, for instance, focuses on improving the quality of multimodal data, which is essential for the effective training of high-performance MLLMs \cite{cite3}. + +\textbf{Integration with Other Techniques}: Contrastive learning is often combined with other methods like masked language modeling and visual question answering to enhance the model's understanding of multimodal data. This integration helps in building robust models that can handle a wide range of tasks across different modalities \cite{cite4}. + +These methods highlight the innovative approaches being used to enhance the capabilities of MLLMs through contrastive learning, ensuring they can effectively process and understand both text and visual information. + +CLIP, developed by OpenAI, utilizes a contrastive learning method where the model learns to associate images with their corresponding text descriptions while distinguishing them from unrelated pairs. This approach allows CLIP to perform zero-shot learning, enabling it to generalize to tasks it wasn't explicitly trained on, such as recognizing objects in images without having seen labeled examples of those objects during training. + +Similarly, ALIGN, developed by Google, aligns images and text by learning joint embeddings from large-scale noisy data. ALIGN is designed to handle massive datasets and is robust to noise in the training data, making it highly scalable. Like CLIP, ALIGN also demonstrates strong zero-shot performance, enabling it to perform well on a variety of tasks without specific task-based training. + +Both CLIP and ALIGN exemplify the power of contrastive learning in multimodal AI systems, effectively bridging the gap between textual and visual data through shared embedding spaces. + + +\subsection{Masked Language Modeling (MLM) in Multimodal Large Language Models} + +Masked Language Modeling (MLM) is a traditional technique where the model is trained to predict missing words in a sentence using the surrounding context. In Multimodal Large Language Models (MLLMs), this technique is extended to \textbf{multimodal masked modeling}, where the model must predict masked words and image regions, forcing it to learn joint representations of text and images \cite{ComprehensiveSurvey2024}. + +MLLMs employing MLM techniques are used in various applications, including object-centric robotic manipulation, where the model predicts the precise end-effector pose by understanding both the textual instructions and the visual context. This capability is crucial for developing embodied AI systems that can interact with and manipulate their environment effectively \cite{ManipLLM2024}. + +The training of MLLMs often involves a combination of image-text contrastive learning, image-text matching, and masked language modeling. 
These tasks collectively help the model to align visual and textual modalities, improving its performance in tasks such as image captioning, visual question answering, and more complex multimodal interactions \cite{OverviewLMM2024}. + +Ongoing research in this area focuses on enhancing the efficiency and accuracy of MLLMs by refining the MLM techniques used, exploring new architectures, and integrating more diverse datasets. These efforts aim to create models that are not only more capable of understanding multimodal inputs but also more efficient in terms of computational resources \cite{ResearchDevelopment2023}. -\subsection{Masked Language Modeling (MLM)} -MLLMs often incorporate \textbf{masked language modeling}, where the model is trained to predict missing words in a sentence based on the surrounding context. For MLLMs, this can be extended to \textbf{multimodal masked modeling}, where the model must predict masked words or image regions, forcing it to learn joint representations of text and images. \subsection{Visual Question Answering (VQA) Pre-training} -Some models are pre-trained on tasks like \textbf{Visual Question Answering (VQA)} or image captioning. In this case, the model is exposed to paired questions and images and must learn to infer relationships between text and visual content. This approach often leverages cross-attention mechanisms to align the two modalities. +Visual Question Answering (VQA) is a crucial pre-training task for multimodal models, often used alongside image captioning. In this setting, the model is exposed to paired questions and images and must learn to infer relationships between text and visual content. This approach often leverages cross-attention mechanisms to align the two modalities effectively \cite{MaskedVisionLanguage2023}. + +Recent advancements in VQA pre-training have shown significant success, particularly in specialized domains such as medical imaging. These models are trained using both unimodal and multimodal contrastive losses, enhancing their ability to understand complex visual and textual interactions. The use of retrieval-augmented methods has further improved the performance of VQA systems by incorporating additional contextual information from large datasets \cite{RAMMBiomedicalVQA2023}. + +However, challenges remain due to the limited availability of diverse multimodal datasets, especially in niche areas like medical VQA. This scarcity necessitates innovative approaches to data augmentation and transfer learning to ensure robust model performance across various applications \cite{MedicalVQA2023}. + +Ongoing research focuses on refining these pre-training techniques and expanding dataset availability to improve the versatility and accuracy of VQA models in different contexts \cite{pengfeiliHEU2023}. + \subsection{Vision-and-Language Pretraining (VLP)} -\textbf{Vision-and-Language Pretraining (VLP)} strategies involve pre-training on diverse tasks such as image-text matching, masked language modeling, and next-sentence prediction, all within a multimodal context. By training on multiple tasks simultaneously, the model develops a richer understanding of how language and vision interact. Models like UNITER, ViLBERT, and OSCAR use this multitask approach to enhance multimodal reasoning. +\textbf{Vision-and-Language Pretraining (VLP)} strategies are crucial for developing robust multimodal models.
These strategies involve pre-training on diverse tasks such as image-text matching, masked language modeling, and next-sentence prediction, all within a multimodal context. By engaging in multiple tasks simultaneously, the model gains a comprehensive understanding of the interactions between language and vision, enabling it to perform complex reasoning tasks \cite{VisoAI2024}. + +Models like UNITER, ViLBERT, and OSCAR are prime examples of this multitask approach, which enhances multimodal reasoning by fusing visual and textual representations through cross-modal attention in their transformer encoders. This design allows the models to effectively process and align visual and textual data, improving their performance across various multimodal tasks \cite{FlexibleVLP2023}. + +Moreover, recent advancements in VLP strategies have addressed challenges related to heterogeneity in federated learning environments, particularly in specialized domains such as biomedical applications. These improvements have been instrumental in refining the flexibility and adaptability of VLP models, making them more efficient in handling diverse datasets and tasks \cite{HeterogeneityFederatedVLP2024}. + +Ongoing research continues to explore innovative pretraining strategies, aiming to further enhance the capabilities and efficiency of VLP models in understanding and reasoning about multimodal data \cite{FeedbackModalSearch2024}. + + \section{Fine-Tuning for Specific Tasks} -After pre-training, MLLMs are typically fine-tuned on specific tasks to maximize their performance in particular domains. Fine-tuning adapts the general knowledge gained during pre-training to the nuances of a specific task, ensuring that the model can deliver more accurate and relevant results. +After pre-training, Multimodal Large Language Models (MLLMs) are typically fine-tuned on specific tasks to maximize their performance in particular domains. Fine-tuning is an essential process that adapts the general knowledge gained during pre-training to the nuances of a specific task, ensuring that the model can deliver more accurate and relevant results \cite{TenyksBlogger2024}. + +This process involves adjusting the model's parameters using task-specific data, which helps in aligning the model's capabilities with the requirements of the target application. Techniques such as task-specific instruction tuning and the use of multiway adapters have been developed to make fine-tuning more efficient and less resource-intensive \cite{EfficientMLLMs2024}. + +Moreover, fine-tuning allows MLLMs to leverage their multimodal understanding, enhancing their ability to process and integrate information from different modalities, such as text and images, which is particularly beneficial in complex tasks \cite{MultiwayAdapter2024}. + +Ongoing research aims to further streamline fine-tuning processes, reducing computational costs and improving the adaptability of MLLMs to a wider range of tasks and domains \cite{RobustInstructionTuning2024}. + + \subsection{Task-Specific Datasets} @@ -42,33 +90,38 @@ \subsection{Cross-Modal Tasks} Fine-tuning is essential for tasks that require the model to reason across modalities, such as cross-modal retrieval or referring expression comprehension (where the model must identify specific objects in an image based on a text description). The goal is to align the visual and textual representations effectively during this phase.
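To ground the cross-modal retrieval setting described above, here is a minimal sketch that ranks candidate image embeddings against a text query by cosine similarity in a shared embedding space; the random tensors stand in for real encoder outputs and are purely illustrative.

\begin{verbatim}
import torch
import torch.nn.functional as F

def retrieve_images(text_query_emb, image_embs, top_k=3):
    """Rank images for a text query in a shared embedding space.

    text_query_emb: (dim,) embedding of the query sentence.
    image_embs:     (num_images, dim) embeddings of candidate images.
    Returns the indices of the top_k most similar images.
    """
    text_query_emb = F.normalize(text_query_emb, dim=-1)
    image_embs = F.normalize(image_embs, dim=-1)
    similarities = image_embs @ text_query_emb  # cosine similarity per image
    return torch.topk(similarities, k=top_k).indices

# Toy usage with random embeddings in place of real encoder outputs.
query = torch.randn(512)
gallery = torch.randn(100, 512)
print(retrieve_images(query, gallery))
\end{verbatim}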
+\subsection{Cross-Modal Tasks} + +Fine-tuning is essential for tasks that require the model to reason across modalities, such as cross-modal retrieval or referring expression comprehension, where the model must identify specific objects in an image based on a text description. The goal during this phase is to effectively align the visual and textual representations \cite{CrossModalTasks2024}. + \section{Few-Shot and Zero-Shot Learning in MLLMs} -Few-shot and zero-shot learning have become powerful capabilities of MLLMs, allowing them to generalize to new tasks with little to no task-specific data. This is particularly valuable when labeled datasets are scarce or expensive to curate. +Few-shot and zero-shot learning have become powerful capabilities of Multimodal Large Language Models (MLLMs), allowing them to generalize to new tasks with little to no task-specific data. This is particularly valuable when labeled datasets are scarce or expensive to curate \cite{FewShotZeroShotLearning2024}. \subsection{Few-Shot Learning} -In \textbf{few-shot learning}, the model is fine-tuned on a small number of examples for a new task. For MLLMs, this means that after pre-training, the model can quickly adapt to new tasks by observing just a handful of image-text pairs or task examples. Few-shot learning is especially useful for niche tasks where only a limited amount of data is available. +In \textbf{few-shot learning}, the model is fine-tuned on a small number of examples for a new task. For MLLMs, this means that after pre-training, the model can quickly adapt to new tasks by observing just a handful of image-text pairs or task examples. Few-shot learning is especially useful for niche tasks where only a limited amount of data is available \cite{FewShotLearning2024}. \subsection{Zero-Shot Learning} -\textbf{Zero-shot learning} refers to the ability of a model to perform tasks without having seen any examples of that task during training. MLLMs like CLIP are trained to generalize across tasks and domains by learning from a large variety of text-image pairs. As a result, CLIP can perform zero-shot image classification, where it assigns labels to images it has never seen before, simply by leveraging its understanding of the text and image relationships learned during pre-training. +\textbf{Zero-shot learning} refers to the ability of a model to perform tasks without having seen any examples of that task during training. MLLMs like CLIP are trained to generalize across tasks and domains by learning from a large variety of text-image pairs. As a result, CLIP can perform zero-shot image classification, where it assigns labels to images it has never seen before, simply by leveraging its understanding of the text and image relationships learned during pre-training \cite{ZeroShotLearning2024}. \subsection{Transfer Learning} -Few-shot and zero-shot learning are made possible by \textbf{transfer learning}, where knowledge gained from pre-training on one set of tasks is transferred to new, unseen tasks. This is particularly effective in MLLMs because they are trained on large, diverse multimodal datasets that cover a wide range of text and visual domains, allowing for strong generalization across tasks. +Few-shot and zero-shot learning are made possible by \textbf{transfer learning}, where knowledge gained from pre-training on one set of tasks is transferred to new, unseen tasks. 
This is particularly effective in MLLMs because they are trained on large, diverse multimodal datasets that cover a wide range of text and visual domains, allowing for strong generalization across tasks \cite{TransferLearning2024}. \section{Instruction Tuning for MLLMs} -Instruction tuning is a newer technique that enhances the ability of MLLMs to follow human instructions across modalities. It involves fine-tuning the model using explicit instructions in natural language, enabling the model to perform a broader range of tasks with greater flexibility and accuracy. +Instruction tuning is a newer technique that enhances the ability of MLLMs to follow human instructions across modalities. It involves fine-tuning the model using explicit instructions in natural language, enabling the model to perform a broader range of tasks with greater flexibility and accuracy \cite{InstructionTuning2024}. \subsection{Natural Language Instructions} -Instruction tuning uses datasets where tasks are framed as natural language instructions. For instance, instead of providing just an image and asking the model to generate a caption, the model is given a prompt like, \textit{"Describe the image in detail."} This allows the model to understand human-like instructions and follow them more closely. +Instruction tuning uses datasets where tasks are framed as natural language instructions. For instance, instead of providing just an image and asking the model to generate a caption, the model is given a prompt like, \textit{"Describe the image in detail."} This allows the model to understand human-like instructions and follow them more closely \cite{NaturalLanguageInstructions2024}. \subsection{Multimodal Instruction Tuning} -Instruction tuning can also be applied to multimodal tasks. In this case, the model is trained to follow multimodal prompts that involve both text and images. For example, a task might include an image and the instruction, \textit{"What is the person in the image doing?"} This helps the model learn to follow complex, human-like commands that span multiple modalities. +Instruction tuning can also be applied to multimodal tasks. In this case, the model is trained to follow multimodal prompts that involve both text and images. For example, a task might include an image and the instruction, \textit{"What is the person in the image doing?"} This helps the model learn to follow complex, human-like commands that span multiple modalities \cite{MultimodalInstructionTuning2024}. + \subsection{Improving Generalization} @@ -78,3 +131,14 @@ \subsection{Applications of Instruction Tuning} Instruction-tuned MLLMs are particularly useful in interactive AI systems, where users provide instructions in natural language and expect the AI to perform a task based on those instructions. This has broad applications in personal assistants, customer service bots, and even creative tasks like generating art or stories based on user prompts. +Instruction-tuned models enhance the capabilities of personal assistants by allowing them to understand and execute complex user instructions. This includes managing schedules, setting reminders, and even controlling smart home devices through natural language commands \cite{LarkSuite}. + +In customer service, instruction-tuned models can handle a wide range of queries by understanding and responding to customer instructions with high accuracy \cite{OpenAICommunity}. 
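As one concrete (and purely illustrative) way such instruction-following interactions can be represented, the sketch below packages a multimodal instruction example into a generic chat-style message list; the field names and helper function are assumptions for illustration, not the format of any particular system.

\begin{verbatim}
from dataclasses import dataclass, field
from typing import Optional

@dataclass
class MultimodalInstructionExample:
    """One training (or inference) example for multimodal instruction tuning."""
    instruction: str                   # natural language command from the user
    image_path: Optional[str] = None   # optional visual context
    response: str = ""                 # target answer during fine-tuning
    metadata: dict = field(default_factory=dict)

def to_chat_messages(example: MultimodalInstructionExample) -> list:
    """Convert an example into a generic chat-style message list."""
    user_content = [{"type": "text", "text": example.instruction}]
    if example.image_path is not None:
        user_content.append({"type": "image", "path": example.image_path})
    messages = [{"role": "user", "content": user_content}]
    if example.response:
        messages.append({"role": "assistant", "content": example.response})
    return messages

# Toy customer-service style example.
example = MultimodalInstructionExample(
    instruction="What is the person in the image doing?",
    image_path="images/ticket_4821.jpg",
    response="The person is returning a damaged parcel at the service counter.",
)
print(to_chat_messages(example))
\end{verbatim}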
+ +Instruction-tuned models are also valuable in creative domains, such as generating art, stories, or music based on user prompts \cite{RohitAggarwal}. + +These models can be used in educational tools to provide personalized learning experiences \cite{RedditExperience}. + +In software development, instruction-tuned models can assist in code generation and debugging by following developer instructions \cite{RohitAggarwal}. +\bibliographystyle{plain} +\bibliography{chap4_ref} \ No newline at end of file From 018746d1ceb78602f8dafc4ac05cdcb8a615cca3 Mon Sep 17 00:00:00 2001 From: "Tian, Pu" <36344837+tianpu2014@users.noreply.github.com> Date: Sat, 28 Sep 2024 13:12:20 -0400 Subject: [PATCH 3/4] Update chapter10.tex --- MLLM_latex/chapter10/chapter10.tex | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/MLLM_latex/chapter10/chapter10.tex b/MLLM_latex/chapter10/chapter10.tex index 8dbac5c..03b8120 100644 --- a/MLLM_latex/chapter10/chapter10.tex +++ b/MLLM_latex/chapter10/chapter10.tex @@ -162,5 +162,6 @@ \section{Conclusion} As MLLMs continue to evolve, ongoing collaboration between researchers, developers, policymakers, and the public will be essential to ensure these powerful tools are used for the betterment of society. By proactively addressing ethical concerns, fostering transparency, and upholding principles of fairness and accountability, we can harness the potential of MLLMs to create a future where AI serves as a force for good, empowering individuals, communities, and societies across the globe. -\printbibliography +\bibliographystyle{plain} +\bibliography{chapter10/reference} From 066643bc9323f548b38958706b03d0d99b956c26 Mon Sep 17 00:00:00 2001 From: marcus Date: Sun, 29 Sep 2024 02:38:23 +0800 Subject: [PATCH 4/4] update chapter 6 --- MLLM_latex/chapter10/chap10_ref.bib | 115 ++++++++++++++++++++++++++++ 1 file changed, 115 insertions(+) create mode 100644 MLLM_latex/chapter10/chap10_ref.bib diff --git a/MLLM_latex/chapter10/chap10_ref.bib b/MLLM_latex/chapter10/chap10_ref.bib new file mode 100644 index 0000000..debb921 --- /dev/null +++ b/MLLM_latex/chapter10/chap10_ref.bib @@ -0,0 +1,115 @@ +@article{einstein, + author = "Albert Einstein", + title = "On the Electrodynamics of Moving Bodies", + journal = "Annalen der Physik", + year = "1905", + volume = "322", + pages = "891--921" +} + +@article{konidena2024ethical, + title={Ethical Considerations in the Development and Deployment of AI Systems}, + author={Konidena, Bhargav Kumar and Malaiyappan, Jesu Narkarunai Arasu and Tadimarri, Anish}, + journal={European Journal of Technology}, + volume={8}, + number={2}, + pages={41--53}, + year={2024} +} + +@article{peng2024securing, + title={Securing Large Language Models: Addressing Bias, Misinformation, and Prompt Attacks}, + author={Peng, Benji and Chen, Keyu and Li, Ming and Feng, Pohsun and Bi, Ziqian and Liu, Junyu and Niu, Qian}, + journal={arXiv preprint arXiv:2409.08087}, + year={2024} +} + +@article{boix2022machine, + title={Can machine-learning models overcome biased datasets?}, + author={Boix, Xavier and Tenenbaum, Joshua B. 
and Torralba, Antonio}, + journal={MIT News}, + year={2022}, + url={https://news.mit.edu/2022/machine-learning-biased-data-0221} +} + +@misc{pymetrics2022audit, + title={audit-AI: Open Sourced Bias Testing for Generalized Machine Learning Applications}, + author={pymetrics}, + year={2022}, + howpublished={\url{https://github.com/pymetrics/audit-ai}}, + note={GitHub repository} +} + +@inproceedings{kim2024domain, + title={Domain-Aware Fine-Tuning: Enhancing Neural Network Adaptability}, + author={Seokhyeon Ha, Sunbeom Jung, Jungwoo Lee}, + booktitle={Proceedings of the 38th AAAI Conference on Artificial Intelligence}, + year={2024} +} + +@article{zhang2023mitigating, + title={Bias-Aware Low-Rank Adaptation: Mitigating Catastrophic Inheritance of Large Language Models}, + author={Zhang, Xingchen and Ren, Zhuosheng and Jiang, Yihong and Zhao, Dongyan and Zhang, Rui}, + journal={arXiv preprint arXiv:2408.04556}, + year={2023} +} + +@article{aquino2023practical, + title={Practical, epistemic and normative implications of algorithmic bias in healthcare artificial intelligence: a qualitative study of multidisciplinary expert perspectives}, + author={Aquino, Yves Saint James and Carter, Stacy M and Houssami, Nehmat and Braunack-Mayer, Annette and Win, Khin Than and Degeling, Chris and Wang, Lei and Rogers, Wendy A}, + journal={Journal of Medical Ethics}, + year={2023}, + publisher={Institute of Medical Ethics} +} + +@article{he2024emerged, + title={The Emerged Security and Privacy of LLM Agent: A Survey with Case Studies}, + author={He, Feng and Zhu, Tianqing and Ye, Dayong and Liu, Bo and Zhou, Wanlei and Yu, Philip S}, + journal={arXiv preprint arXiv:2407.19354}, + year={2024} +} + +@article{friha2024llm, + title={LLM-Based Edge Intelligence: A Comprehensive Survey on Architectures, Applications, Security and Trustworthiness}, + author={Friha, Othmane and Ferrag, Mohamed Amine and Kantarci, Burak and Cakmak, Burak and Ozgun, Arda and Ghoualmi-Zine, Nassira}, + journal={IEEE Open Journal of the Communications Society}, + year={2024}, + publisher={IEEE} +} + +@article{mccoy2023ethical, + title={Ethical responsibilities for companies that process personal data}, + author={McCoy, Matthew S and Allen, Anita L and Kopp, Katharina and Mello, Michelle M and Patil, DJ and Ossorio, Pilar and Joffe, Steven and Emanuel, Ezekiel J}, + journal={The American Journal of Bioethics}, + volume={23}, + number={11}, + pages={11--23}, + year={2023}, + publisher={Taylor \& Francis} +} + +@article{chen2024trustworthy, + title={Trustworthy, Responsible, and Safe AI: A Comprehensive Architectural Framework for AI Safety with Challenges and Mitigations}, + author={Chen, Chen and Liu, Ziyao and Jiang, Weifeng and Qi, Goh Si and Lam, KwoK-Yan}, + journal={arXiv preprint arXiv:2408.12935}, + year={2024} +} + +@article{ray2023chatgpt, + title={ChatGPT: A comprehensive review on background, applications, key challenges, bias, ethics, limitations and future scope}, + author={Ray, Partha Pratim}, + journal={Internet of Things and Cyber-Physical Systems}, + volume={3}, + pages={121--154}, + year={2023}, + publisher={Elsevier} +} + +@incollection{rosenstrauch2023artificial, + title={Artificial Intelligence and Ethics}, + author={Rosenstrauch, Doreen and Mangla, Utpal and Gupta, Atul and Masau, Costansia Taikwa}, + booktitle={Digital Health Entrepreneurship}, + pages={225--239}, + year={2023}, + publisher={Springer} +} \ No newline at end of file