[{"data":1,"prerenderedAt":853},["ShallowReactive",2],{"project-dl-dropout-reduces-underfitting":3},{"id":4,"title":5,"description":6,"extension":7,"favorite":8,"icon":9,"meta":10,"publishedAt":841,"readingTime":448,"shortDescription":842,"slug":843,"status":844,"stem":845,"tags":846,"type":851,"__hash__":852},"projects\u002Fprojects\u002Fdl-dropout-reduces-underfitting.md","Dropout Reduces Underfitting","TensorFlow\u002FKeras implementation and reproduction of \"Dropout Reduces Underfitting\" (Liu et al., 2023). A comparative study of Early and Late Dropout strategies to optimize model convergence.","md",false,"i-ph-share-network-duotone",{"body":11},{"type":12,"value":13,"toc":823},"minimark",[14,29,38,53,58,61,87,91,98,103,154,158,168,172,212,216,244,248,251,255,258,343,347,350,451,455,458,647,651,654,747,751,754,762,766,772,776,813,816,819],[15,16,17],"blockquote",{},[18,19,20,24,25,28],"p",{},[21,22,23],"strong",{},"Study and reproduction of the paper:"," Liu, Z., et al. (2023). ",[26,27,5],"em",{},". arXiv:2303.01500.",[18,30,31,32],{},"The paper is available at: ",[33,34,35],"a",{"href":35,"rel":36},"https:\u002F\u002Farxiv.org\u002Fabs\u002F2303.01500",[37],"nofollow",[18,39,40,41,44,45,48,49,52],{},"This repository contains a robust, modular ",[21,42,43],{},"TensorFlow\u002FKeras"," implementation of ",[21,46,47],{},"Early Dropout"," and ",[21,50,51],{},"Late Dropout"," strategies. 
The goal is to verify the hypothesis that dropout, traditionally used to reduce overfitting, can also combat underfitting when applied only during the initial training phase.",[54,55,57],"h2",{"id":56},"scientific-objectives","Scientific Objectives",[18,59,60],{},"The study aims to validate the operating regimes of Dropout described in the paper:",[62,63,64,70,75,81],"ol",{},[65,66,67,69],"li",{},[21,68,47],{}," (Targeting Underfitting): Active only during the initial phase to reduce the variance of mini-batch gradients and align their directions, enabling better final optimization.",[65,71,72,74],{},[21,73,51],{}," (Targeting Overfitting): Disabled at the start to allow rapid learning, then activated to regularize final convergence.",[65,76,77,80],{},[21,78,79],{},"Standard Dropout",": Constant rate throughout training (baseline).",[65,82,83,86],{},[21,84,85],{},"No Dropout",": Control experiment without dropout.",[54,88,90],{"id":89},"technical-architecture","Technical Architecture",[18,92,93,94,97],{},"Unlike naive Keras callback implementations, this project uses a ",[21,95,96],{},"dynamic approach via the TensorFlow graph"," to ensure the dropout rate updates on the GPU without model recompilation.",[99,100,102],"h3",{"id":101},"key-components","Key Components",[104,105,106,123,146],"ul",{},[65,107,108,114,115,118,119,122],{},[21,109,110],{},[111,112,113],"code",{},"DynamicDropout",": A custom layer inheriting from ",[111,116,117],{},"keras.layers.Layer"," that reads its rate from a shared ",[111,120,121],{},"tf.Variable",".",[65,124,125,130,131,134,135,138,139,138,142,145],{},[21,126,127],{},[111,128,129],{},"DropoutScheduler",": A Keras ",[111,132,133],{},"Callback"," that drives the rate variable based on the current epoch and the chosen strategy (",[111,136,137],{},"early",", ",[111,140,141],{},"late",[111,143,144],{},"standard",").",[65,147,148,153],{},[21,149,150],{},[111,151,152],{},"ExperimentPipeline",": An orchestrator class that handles data loading (MNIST, CIFAR-10, 
Fashion MNIST), model creation (Dense or CNN), and execution of comparative benchmarks.",[54,155,157],{"id":156},"file-structure","File Structure",[159,160,165],"pre",{"className":161,"code":163,"language":164},[162],"language-text",".\n├── README.md                         # This documentation file\n├── Dropout reduces underfitting.pdf  # Original research paper\n├── pipeline.py                       # Main experiment pipeline\n├── pipeline.ipynb                    # Jupyter notebook for experiments\n├── pipeline_mnist.ipynb              # Jupyter notebook for MNIST experiments\n├── pipeline_cifar10.ipynb            # Jupyter notebook for CIFAR-10 experiments\n├── pipeline_cifar100.ipynb           # Jupyter notebook for CIFAR-100 experiments\n├── pipeline_fashion_mnist.ipynb      # Jupyter notebook for Fashion MNIST experiments\n├── requirements.txt                  # Python dependencies\n├── .python-version                   # Python version specification\n└── uv.lock                           # Dependency lock file\n","text",[111,166,163],{"__ignoreMap":167},"",[54,169,171],{"id":170},"installation","Installation",[159,173,177],{"className":174,"code":175,"language":176,"meta":167,"style":167},"language-bash shiki shiki-themes material-theme-lighter catppuccin-latte catppuccin-macchiato","# Clone the repository\ngit clone https:\u002F\u002Fgithub.com\u002Farthurdanjou\u002Fdropoutreducesunderfitting.git\ncd dropoutreducesunderfitting\n","bash",[111,178,179,188,202],{"__ignoreMap":167},[180,181,184],"span",{"class":182,"line":183},"line",1,[180,185,187],{"class":186},"sv490","# Clone the repository\n",[180,189,191,195,199],{"class":182,"line":190},2,[180,192,194],{"class":193},"sqbHp","git",[180,196,198],{"class":197},"sJlHP"," clone",[180,200,201],{"class":197}," https:\u002F\u002Fgithub.com\u002Farthurdanjou\u002Fdropoutreducesunderfitting.git\n",[180,203,205,209],{"class":182,"line":204},3,[180,206,208],{"class":207},"sMj0x","cd",[180,210,211],{"class":197}," 
dropoutreducesunderfitting\n",[54,213,215],{"id":214},"install-dependencies","Install dependencies",[159,217,219],{"className":174,"code":218,"language":176,"meta":167,"style":167},"pip install tensorflow numpy matplotlib seaborn scikit-learn\n",[111,220,221],{"__ignoreMap":167},[180,222,223,226,229,232,235,238,241],{"class":182,"line":183},[180,224,225],{"class":193},"pip",[180,227,228],{"class":197}," install",[180,230,231],{"class":197}," tensorflow",[180,233,234],{"class":197}," numpy",[180,236,237],{"class":197}," matplotlib",[180,239,240],{"class":197}," seaborn",[180,242,243],{"class":197}," scikit-learn\n",[54,245,247],{"id":246},"usage","Usage",[18,249,250],{},"The main notebook pipeline.ipynb contains all necessary code. Here is how to run a typical experiment via the pipeline API.",[99,252,254],{"id":253},"_1-initialization","1. Initialization",[18,256,257],{},"Choose your dataset (cifar10, fashion_mnist, mnist) and architecture (cnn, dense).",[159,259,263],{"className":260,"code":261,"language":262,"meta":167,"style":167},"language-python shiki shiki-themes material-theme-lighter catppuccin-latte catppuccin-macchiato","from pipeline import ExperimentPipeline\n\n# Fashion MNIST is recommended to observe underfitting\u002Foverfitting nuances\nexp = ExperimentPipeline(dataset_name=\"fashion_mnist\", model_type=\"cnn\")\n","python",[111,264,265,281,287,292],{"__ignoreMap":167},[180,266,267,271,275,278],{"class":182,"line":183},[180,268,270],{"class":269},"sthAO","from",[180,272,274],{"class":273},"s0g_q"," pipeline ",[180,276,277],{"class":269},"import",[180,279,280],{"class":273}," ExperimentPipeline\n",[180,282,283],{"class":182,"line":190},[180,284,286],{"emptyLinePlaceholder":285},true,"\n",[180,288,289],{"class":182,"line":204},[180,290,291],{"class":186},"# Fashion MNIST is recommended to observe underfitting\u002Foverfitting 
nuances\n",[180,293,295,298,302,306,310,314,316,320,323,325,328,331,333,335,338,340],{"class":182,"line":294},4,[180,296,297],{"class":273},"exp ",[180,299,301],{"class":300},"sn2um","=",[180,303,305],{"class":304},"sung0"," ExperimentPipeline",[180,307,309],{"class":308},"sMKYs","(",[180,311,313],{"class":312},"smoPz","dataset_name",[180,315,301],{"class":300},[180,317,319],{"class":318},"srDDN","\"",[180,321,322],{"class":197},"fashion_mnist",[180,324,319],{"class":318},[180,326,327],{"class":308},",",[180,329,330],{"class":312}," model_type",[180,332,301],{"class":300},[180,334,319],{"class":318},[180,336,337],{"class":197},"cnn",[180,339,319],{"class":318},[180,341,342],{"class":308},")\n",[99,344,346],{"id":345},"_2-learning-curves-comparison","2. Learning Curves Comparison",[18,348,349],{},"Compare training dynamics (loss and accuracy) of the three strategies.",[159,351,353],{"className":260,"code":352,"language":262,"meta":167,"style":167},"exp.compare_learning_curves(\n    modes=[\"standard\", \"early\", \"late\"],\n    switch_epoch=10,  # The epoch where dropout state changes\n    rate=0.4,         # Dropout rate\n    epochs=30\n)\n",[111,354,355,368,404,420,435,446],{"__ignoreMap":167},[180,356,357,360,362,365],{"class":182,"line":183},[180,358,359],{"class":273},"exp",[180,361,122],{"class":308},[180,363,364],{"class":304},"compare_learning_curves",[180,366,367],{"class":308},"(\n",[180,369,370,373,375,378,380,382,384,386,389,391,393,395,397,399,401],{"class":182,"line":190},[180,371,372],{"class":312},"    modes",[180,374,301],{"class":300},[180,376,377],{"class":308},"[",[180,379,319],{"class":318},[180,381,144],{"class":197},[180,383,319],{"class":318},[180,385,327],{"class":308},[180,387,388],{"class":318}," 
\"",[180,390,137],{"class":197},[180,392,319],{"class":318},[180,394,327],{"class":308},[180,396,388],{"class":318},[180,398,141],{"class":197},[180,400,319],{"class":318},[180,402,403],{"class":308},"],\n",[180,405,406,409,411,415,417],{"class":182,"line":204},[180,407,408],{"class":312},"    switch_epoch",[180,410,301],{"class":300},[180,412,414],{"class":413},"sZm5v","10",[180,416,327],{"class":308},[180,418,419],{"class":186},"  # The epoch where dropout state changes\n",[180,421,422,425,427,430,432],{"class":182,"line":294},[180,423,424],{"class":312},"    rate",[180,426,301],{"class":300},[180,428,429],{"class":413},"0.4",[180,431,327],{"class":308},[180,433,434],{"class":186},"         # Dropout rate\n",[180,436,438,441,443],{"class":182,"line":437},5,[180,439,440],{"class":312},"    epochs",[180,442,301],{"class":300},[180,444,445],{"class":413},"30\n",[180,447,449],{"class":182,"line":448},6,[180,450,342],{"class":308},[99,452,454],{"id":453},"_3-ablation-studies","3. Ablation Studies",[18,456,457],{},"Study the impact of the \"Early\" phase duration or Dropout intensity.",[159,459,461],{"className":260,"code":460,"language":262,"meta":167,"style":167},"# Impact of the switch epoch on final performance\nexp.compare_switch_epochs(\n    switch_epochs=[5, 10, 15, 20],\n    modes=[\"early\"],\n    rate=0.4,\n    epochs=30\n)\n\n# Impact of the dropout rate\nexp.compare_drop_rates(\n    rates=[0.2, 0.4, 0.6],\n    modes=[\"standard\", \"early\"],\n    switch_epoch=10,\n    epochs=25\n)\n",[111,462,463,468,479,508,524,535,543,548,553,559,571,596,621,632,642],{"__ignoreMap":167},[180,464,465],{"class":182,"line":183},[180,466,467],{"class":186},"# Impact of the switch epoch on final 
performance\n",[180,469,470,472,474,477],{"class":182,"line":190},[180,471,359],{"class":273},[180,473,122],{"class":308},[180,475,476],{"class":304},"compare_switch_epochs",[180,478,367],{"class":308},[180,480,481,484,486,488,491,493,496,498,501,503,506],{"class":182,"line":204},[180,482,483],{"class":312},"    switch_epochs",[180,485,301],{"class":300},[180,487,377],{"class":308},[180,489,490],{"class":413},"5",[180,492,327],{"class":308},[180,494,495],{"class":413}," 10",[180,497,327],{"class":308},[180,499,500],{"class":413}," 15",[180,502,327],{"class":308},[180,504,505],{"class":413}," 20",[180,507,403],{"class":308},[180,509,510,512,514,516,518,520,522],{"class":182,"line":294},[180,511,372],{"class":312},[180,513,301],{"class":300},[180,515,377],{"class":308},[180,517,319],{"class":318},[180,519,137],{"class":197},[180,521,319],{"class":318},[180,523,403],{"class":308},[180,525,526,528,530,532],{"class":182,"line":437},[180,527,424],{"class":312},[180,529,301],{"class":300},[180,531,429],{"class":413},[180,533,534],{"class":308},",\n",[180,536,537,539,541],{"class":182,"line":448},[180,538,440],{"class":312},[180,540,301],{"class":300},[180,542,445],{"class":413},[180,544,546],{"class":182,"line":545},7,[180,547,342],{"class":308},[180,549,551],{"class":182,"line":550},8,[180,552,286],{"emptyLinePlaceholder":285},[180,554,556],{"class":182,"line":555},9,[180,557,558],{"class":186},"# Impact of the dropout rate\n",[180,560,562,564,566,569],{"class":182,"line":561},10,[180,563,359],{"class":273},[180,565,122],{"class":308},[180,567,568],{"class":304},"compare_drop_rates",[180,570,367],{"class":308},[180,572,574,577,579,581,584,586,589,591,594],{"class":182,"line":573},11,[180,575,576],{"class":312},"    rates",[180,578,301],{"class":300},[180,580,377],{"class":308},[180,582,583],{"class":413},"0.2",[180,585,327],{"class":308},[180,587,588],{"class":413}," 0.4",[180,590,327],{"class":308},[180,592,593],{"class":413}," 
0.6",[180,595,403],{"class":308},[180,597,599,601,603,605,607,609,611,613,615,617,619],{"class":182,"line":598},12,[180,600,372],{"class":312},[180,602,301],{"class":300},[180,604,377],{"class":308},[180,606,319],{"class":318},[180,608,144],{"class":197},[180,610,319],{"class":318},[180,612,327],{"class":308},[180,614,388],{"class":318},[180,616,137],{"class":197},[180,618,319],{"class":318},[180,620,403],{"class":308},[180,622,624,626,628,630],{"class":182,"line":623},13,[180,625,408],{"class":312},[180,627,301],{"class":300},[180,629,414],{"class":413},[180,631,534],{"class":308},[180,633,635,637,639],{"class":182,"line":634},14,[180,636,440],{"class":312},[180,638,301],{"class":300},[180,640,641],{"class":413},"25\n",[180,643,645],{"class":182,"line":644},15,[180,646,342],{"class":308},[99,648,650],{"id":649},"_4-data-regimes-data-scarcity","4. Data Regimes (Data Scarcity)",[18,652,653],{},"Verify the paper's hypothesis that Early Dropout shines on large datasets (or limited models) while Standard Dropout protects small datasets.",[159,655,657],{"className":260,"code":656,"language":262,"meta":167,"style":167},"# Training on 10%, 50% and 100% of the dataset\nexp.run_dataset_size_comparison(\n    fractions=[0.1, 0.5, 1.0],\n    modes=[\"standard\", \"early\"],\n    rate=0.3,\n    switch_epoch=10\n)\n",[111,658,659,664,675,699,723,734,743],{"__ignoreMap":167},[180,660,661],{"class":182,"line":183},[180,662,663],{"class":186},"# Training on 10%, 50% and 100% of the dataset\n",[180,665,666,668,670,673],{"class":182,"line":190},[180,667,359],{"class":273},[180,669,122],{"class":308},[180,671,672],{"class":304},"run_dataset_size_comparison",[180,674,367],{"class":308},[180,676,677,680,682,684,687,689,692,694,697],{"class":182,"line":204},[180,678,679],{"class":312},"    fractions",[180,681,301],{"class":300},[180,683,377],{"class":308},[180,685,686],{"class":413},"0.1",[180,688,327],{"class":308},[180,690,691],{"class":413}," 
0.5",[180,693,327],{"class":308},[180,695,696],{"class":413}," 1.0",[180,698,403],{"class":308},[180,700,701,703,705,707,709,711,713,715,717,719,721],{"class":182,"line":294},[180,702,372],{"class":312},[180,704,301],{"class":300},[180,706,377],{"class":308},[180,708,319],{"class":318},[180,710,144],{"class":197},[180,712,319],{"class":318},[180,714,327],{"class":308},[180,716,388],{"class":318},[180,718,137],{"class":197},[180,720,319],{"class":318},[180,722,403],{"class":308},[180,724,725,727,729,732],{"class":182,"line":437},[180,726,424],{"class":312},[180,728,301],{"class":300},[180,730,731],{"class":413},"0.3",[180,733,534],{"class":308},[180,735,736,738,740],{"class":182,"line":448},[180,737,408],{"class":312},[180,739,301],{"class":300},[180,741,742],{"class":413},"10\n",[180,744,745],{"class":182,"line":545},[180,746,342],{"class":308},[54,748,750],{"id":749},"expected-results","Expected Results",[18,752,753],{},"According to the paper, you should observe:",[104,755,756,759],{},[65,757,758],{},"Early Dropout: Higher initial loss, followed by a sharp drop after the switch_epoch, often reaching a lower minimum than Standard Dropout (reduction of underfitting).",[65,760,761],{},"Late Dropout: Rapid rise in accuracy at the start (potential overfitting), then stabilized by the activation of dropout.",[54,763,765],{"id":764},"detailed-report","Detailed Report",[767,768],"iframe",{"src":769,"width":770,"height":771},"\u002Fprojects\u002Fdropout-reduces-underfitting.pdf","100%","1000px",[54,773,775],{"id":774},"authors","Authors",[104,777,778,785,792,799,806],{},[65,779,780],{},[33,781,784],{"href":782,"rel":783},"https:\u002F\u002Fgithub.com\u002FArthurDanjou",[37],"Arthur Danjou",[65,786,787],{},[33,788,791],{"href":789,"rel":790},"https:\u002F\u002Fgithub.com\u002FAlex6535",[37],"Alexis Mathieu",[65,793,794],{},[33,795,798],{"href":796,"rel":797},"https:\u002F\u002Fgithub.com\u002FAxelleMeric",[37],"Axelle 
Meric",[65,800,801],{},[33,802,805],{"href":803,"rel":804},"https:\u002F\u002Fgithub.com\u002FPhilippine35890",[37],"Philippine Quellec",[65,807,808],{},[33,809,812],{"href":810,"rel":811},"https:\u002F\u002Fgithub.com\u002FMoritzSiem",[37],"Moritz Von Siemens",[18,814,815],{},"M.Sc. Statistical and Financial Engineering (ISF) - Data Science Track at Université Paris-Dauphine PSL",[18,817,818],{},"Based on the work of Liu, Z., et al. (2023). Dropout Reduces Underfitting.",[820,821,822],"style",{},"html pre.shiki code .sv490, html code.shiki .sv490{--shiki-light:#90A4AE;--shiki-light-font-style:italic;--shiki-default:#7C7F93;--shiki-default-font-style:italic;--shiki-dark:#939AB7;--shiki-dark-font-style:italic}html pre.shiki code .sqbHp, html code.shiki .sqbHp{--shiki-light:#E2931D;--shiki-light-font-style:inherit;--shiki-default:#1E66F5;--shiki-default-font-style:italic;--shiki-dark:#8AADF4;--shiki-dark-font-style:italic}html pre.shiki code .sJlHP, html code.shiki .sJlHP{--shiki-light:#91B859;--shiki-default:#40A02B;--shiki-dark:#A6DA95}html pre.shiki code .sMj0x, html code.shiki .sMj0x{--shiki-light:#6182B8;--shiki-light-font-style:inherit;--shiki-default:#D20F39;--shiki-default-font-style:italic;--shiki-dark:#ED8796;--shiki-dark-font-style:italic}html .light .shiki span {color: var(--shiki-light);background: var(--shiki-light-bg);font-style: var(--shiki-light-font-style);font-weight: var(--shiki-light-font-weight);text-decoration: var(--shiki-light-text-decoration);}html.light .shiki span {color: var(--shiki-light);background: var(--shiki-light-bg);font-style: var(--shiki-light-font-style);font-weight: var(--shiki-light-font-weight);text-decoration: var(--shiki-light-text-decoration);}html .default .shiki span {color: var(--shiki-default);background: var(--shiki-default-bg);font-style: var(--shiki-default-font-style);font-weight: var(--shiki-default-font-weight);text-decoration: var(--shiki-default-text-decoration);}html .shiki span {color: 
var(--shiki-default);background: var(--shiki-default-bg);font-style: var(--shiki-default-font-style);font-weight: var(--shiki-default-font-weight);text-decoration: var(--shiki-default-text-decoration);}html .dark .shiki span {color: var(--shiki-dark);background: var(--shiki-dark-bg);font-style: var(--shiki-dark-font-style);font-weight: var(--shiki-dark-font-weight);text-decoration: var(--shiki-dark-text-decoration);}html.dark .shiki span {color: var(--shiki-dark);background: var(--shiki-dark-bg);font-style: var(--shiki-dark-font-style);font-weight: var(--shiki-dark-font-weight);text-decoration: var(--shiki-dark-text-decoration);}html pre.shiki code .sthAO, html code.shiki .sthAO{--shiki-light:#39ADB5;--shiki-light-font-style:italic;--shiki-default:#8839EF;--shiki-default-font-style:inherit;--shiki-dark:#C6A0F6;--shiki-dark-font-style:inherit}html pre.shiki code .s0g_q, html code.shiki .s0g_q{--shiki-light:#90A4AE;--shiki-default:#4C4F69;--shiki-dark:#CAD3F5}html pre.shiki code .sn2um, html code.shiki .sn2um{--shiki-light:#39ADB5;--shiki-default:#179299;--shiki-dark:#8BD5CA}html pre.shiki code .sung0, html code.shiki .sung0{--shiki-light:#6182B8;--shiki-default:#1E66F5;--shiki-dark:#8AADF4}html pre.shiki code .sMKYs, html code.shiki .sMKYs{--shiki-light:#39ADB5;--shiki-default:#7C7F93;--shiki-dark:#939AB7}html pre.shiki code .smoPz, html code.shiki .smoPz{--shiki-light:#90A4AE;--shiki-light-font-style:italic;--shiki-default:#E64553;--shiki-default-font-style:italic;--shiki-dark:#EE99A0;--shiki-dark-font-style:italic}html pre.shiki code .srDDN, html code.shiki .srDDN{--shiki-light:#39ADB5;--shiki-default:#40A02B;--shiki-dark:#A6DA95}html pre.shiki code .sZm5v, html code.shiki 
.sZm5v{--shiki-light:#F76D47;--shiki-default:#FE640B;--shiki-dark:#F5A97F}",{"title":167,"searchDepth":190,"depth":190,"links":824},[825,826,829,830,831,832,838,839,840],{"id":56,"depth":190,"text":57},{"id":89,"depth":190,"text":90,"children":827},[828],{"id":101,"depth":204,"text":102},{"id":156,"depth":190,"text":157},{"id":170,"depth":190,"text":171},{"id":214,"depth":190,"text":215},{"id":246,"depth":190,"text":247,"children":833},[834,835,836,837],{"id":253,"depth":204,"text":254},{"id":345,"depth":204,"text":346},{"id":453,"depth":204,"text":454},{"id":649,"depth":204,"text":650},{"id":749,"depth":190,"text":750},{"id":764,"depth":190,"text":765},{"id":774,"depth":190,"text":775},"2024-12-10","Reproduction of \"Dropout Reduces Underfitting\" with TensorFlow\u002FKeras, comparing Early and Late Dropout strategies.","dl-dropout-reduces-underfitting","Completed","projects\u002Fdl-dropout-reduces-underfitting",[847,848,849,850],"Python","TensorFlow","Deep Learning","Research","Research Project","9yHtYbWhvhwaDH0UHXgis3DxQVLk4kzaGGPxyyvIqFc",1777982163834]