<?xml version="1.0"?>
<oembed><version>1.0</version><provider_name>CROWDWORKS Blog</provider_name><provider_url>http://crowdworks.blog/en/</provider_url><author_name>ahrawriter</author_name><author_url>http://crowdworks.blog/en/author/ahrawriter/</author_url><title>RLHF and DPO Compared - CROWDWORKS Blog</title><type>rich</type><width>600</width><height>338</height><html>&lt;blockquote class="wp-embedded-content" data-secret="cHlKWdSglb"&gt;&lt;a href="http://crowdworks.blog/en/rlhf-and-dpo-compared/"&gt;RLHF and DPO Compared&lt;/a&gt;&lt;/blockquote&gt;&lt;iframe sandbox="allow-scripts" security="restricted" src="http://crowdworks.blog/en/rlhf-and-dpo-compared/embed/#?secret=cHlKWdSglb" width="600" height="338" title="&#x201C;RLHF and DPO Compared&#x201D; &#x2014; CROWDWORKS Blog" data-secret="cHlKWdSglb" frameborder="0" marginwidth="0" marginheight="0" scrolling="no" class="wp-embedded-content"&gt;&lt;/iframe&gt;&lt;script type="text/javascript"&gt;
/* &lt;![CDATA[ */
/*! This file is auto-generated */
!function(d,l){"use strict";l.querySelector&amp;&amp;d.addEventListener&amp;&amp;"undefined"!=typeof URL&amp;&amp;(d.wp=d.wp||{},d.wp.receiveEmbedMessage||(d.wp.receiveEmbedMessage=function(e){var t=e.data;if((t||t.secret||t.message||t.value)&amp;&amp;!/[^a-zA-Z0-9]/.test(t.secret)){for(var s,r,n,a=l.querySelectorAll('iframe[data-secret="'+t.secret+'"]'),o=l.querySelectorAll('blockquote[data-secret="'+t.secret+'"]'),c=new RegExp("^https?:$","i"),i=0;i&lt;o.length;i++)o[i].style.display="none";for(i=0;i&lt;a.length;i++)s=a[i],e.source===s.contentWindow&amp;&amp;(s.removeAttribute("style"),"height"===t.message?(1e3&lt;(r=parseInt(t.value,10))?r=1e3:~~r&lt;200&amp;&amp;(r=200),s.height=r):"link"===t.message&amp;&amp;(r=new URL(s.getAttribute("src")),n=new URL(t.value),c.test(n.protocol))&amp;&amp;n.host===r.host&amp;&amp;l.activeElement===s&amp;&amp;(d.top.location.href=t.value))}},d.addEventListener("message",d.wp.receiveEmbedMessage,!1),l.addEventListener("DOMContentLoaded",function(){for(var e,t,s=l.querySelectorAll("iframe.wp-embedded-content"),r=0;r&lt;s.length;r++)(t=(e=s[r]).getAttribute("data-secret"))||(t=Math.random().toString(36).substring(2,12),e.src+="#?secret="+t,e.setAttribute("data-secret",t)),e.contentWindow.postMessage({message:"ready",secret:t},"*")},!1)))}(window,document);
/* ]]&gt; */
&lt;/script&gt;
</html><thumbnail_url>https://i0.wp.com/crowdworks.blog/wp-content/uploads/2024/01/&#xBE14;&#xB85C;&#xADF8;_&#xD2B8;&#xB79C;&#xB4DC;&#xC378;&#xB124;&#xC77C;03-1.png?fit=600%2C99999</thumbnail_url><thumbnail_width/><thumbnail_height/><description>Introduction Reinforcement Learning from Human Feedback (RLHF) and Direct Preference Optimization (DPO) are two approaches used to enhance large-scale language models through human guidance. This exploration of the two approaches delves into the distinctive features of RLHF and DPO, providing insights into their applications and mechanisms, as well as [&hellip;]</description></oembed>
