bitsandbytes-rocm/tests/triton_tests/info.jsonl
Mitchell Wortsman 5f3d9ada8d triton-v1
2023-03-29 06:47:08 +00:00

143 lines
113 KiB
JSON

{"repeat": 64, "batch_size": 1024, "dim_out": 4096, "dim_in": 1024, "wm": 4, "switch": false, "standard_fwd": 0.047907233238220215, "standard_gw": 0.04326179623603821, "standard_gx": 0.042986124753952026, "rowwise_fwd": 0.03902614116668701, "rowwise_bwd": 0.038955360651016235, "global_fwd": 0.03974884748458862, "global_bwd": 0.0391639769077301, "x_quantize_rowwise": 0.02619624137878418, "g_quantize_rowwise": 0.02695620059967041, "w_quantize_rowwise": 0.02631545066833496, "w_quantize_colwise_transpose": 0.08677691221237183, "w_quantize_global": 0.07359683513641357, "w_quantize_global_transpose": 0.08226558566093445, "cast_x": 0.007815659046173096, "cast_g": 0.016041100025177002, "cast_w": 0.01600012183189392, "time_standard": 0.13415515422821045, "time_rowwise": 0.28748810291290283, "time_global": 0.33118948340415955}
{"repeat": 64, "batch_size": 1024, "dim_out": 1024, "dim_in": 4096, "wm": 4, "switch": true, "standard_fwd": 0.04236400127410889, "standard_gw": 0.04898756742477417, "standard_gx": 0.04731118679046631, "rowwise_fwd": 0.03933534026145935, "rowwise_bwd": 0.03947317600250244, "global_fwd": 0.03688037395477295, "global_bwd": 0.039167702198028564, "x_quantize_rowwise": 0.02533942461013794, "g_quantize_rowwise": 0.02516806125640869, "w_quantize_rowwise": 0.02528354525566101, "w_quantize_colwise_transpose": 0.0903792679309845, "w_quantize_global": 0.0997595489025116, "w_quantize_global_transpose": 0.10209530591964722, "cast_x": 0.01626834273338318, "cast_g": 0.011973083019256592, "cast_w": 0.016044825315475464, "time_standard": 0.13866275548934937, "time_rowwise": 0.2939663827419281, "time_global": 0.37739798426628113}
{"repeat": 64, "batch_size": 2048, "dim_out": 4096, "dim_in": 1024, "wm": 4, "switch": false, "standard_fwd": 0.07753819227218628, "standard_gw": 0.08026883006095886, "standard_gx": 0.0906921923160553, "rowwise_fwd": 0.0630207359790802, "rowwise_bwd": 0.058263540267944336, "global_fwd": 0.06167963147163391, "global_bwd": 0.05801767110824585, "x_quantize_rowwise": 0.034205615520477295, "g_quantize_rowwise": 0.03341957926750183, "w_quantize_rowwise": 0.03244727849960327, "w_quantize_colwise_transpose": 0.08665025234222412, "w_quantize_global": 0.09483471512794495, "w_quantize_global_transpose": 0.10108202695846558, "cast_x": 0.012032687664031982, "cast_g": 0.03752484917640686, "cast_w": 0.01605972647666931, "time_standard": 0.24849921464920044, "time_rowwise": 0.3882758319377899, "time_global": 0.46350806951522827}
{"repeat": 64, "batch_size": 2048, "dim_out": 1024, "dim_in": 4096, "wm": 4, "switch": true, "standard_fwd": 0.09099021553993225, "standard_gw": 0.0799819827079773, "standard_gx": 0.07644668221473694, "rowwise_fwd": 0.05840510129928589, "rowwise_bwd": 0.06359070539474487, "global_fwd": 0.057831406593322754, "global_bwd": 0.06148591637611389, "x_quantize_rowwise": 0.03434717655181885, "g_quantize_rowwise": 0.03361701965332031, "w_quantize_rowwise": 0.03209337592124939, "w_quantize_colwise_transpose": 0.09028613567352295, "w_quantize_global": 0.0944770872592926, "w_quantize_global_transpose": 0.0994168221950531, "cast_x": 0.03769621253013611, "cast_g": 0.012010335922241211, "cast_w": 0.01600012183189392, "time_standard": 0.24741888046264648, "time_rowwise": 0.39232149720191956, "time_global": 0.4611574113368988}
{"repeat": 64, "batch_size": 4096, "dim_out": 4096, "dim_in": 1024, "wm": 4, "switch": false, "standard_fwd": 0.14450401067733765, "standard_gw": 0.14326348900794983, "standard_gx": 0.14762207865715027, "rowwise_fwd": 0.10525062680244446, "rowwise_bwd": 0.09800493717193604, "global_fwd": 0.10229647159576416, "global_bwd": 0.09718164801597595, "x_quantize_rowwise": 0.03429874777793884, "g_quantize_rowwise": 0.04567950963973999, "w_quantize_rowwise": 0.03365054726600647, "w_quantize_colwise_transpose": 0.08654966950416565, "w_quantize_global": 0.09663775563240051, "w_quantize_global_transpose": 0.10383129119873047, "cast_x": 0.01605972647666931, "cast_g": 0.08305534720420837, "cast_w": 0.01624971628189087, "time_standard": 0.43538957834243774, "time_rowwise": 0.5466975271701813, "time_global": 0.6231889128684998}
{"repeat": 64, "batch_size": 4096, "dim_out": 1024, "dim_in": 4096, "wm": 4, "switch": true, "standard_fwd": 0.14496594667434692, "standard_gw": 0.1412704586982727, "standard_gx": 0.14446303248405457, "rowwise_fwd": 0.10041892528533936, "rowwise_bwd": 0.10674074292182922, "global_fwd": 0.09856373071670532, "global_bwd": 0.10319426655769348, "x_quantize_rowwise": 0.045571476221084595, "g_quantize_rowwise": 0.03273040056228638, "w_quantize_rowwise": 0.033464282751083374, "w_quantize_colwise_transpose": 0.09154900908470154, "w_quantize_global": 0.0964440405368805, "w_quantize_global_transpose": 0.1031048595905304, "cast_x": 0.0835023820400238, "cast_g": 0.016242265701293945, "cast_w": 0.016283243894577026, "time_standard": 0.4306994378566742, "time_rowwise": 0.5517452955245972, "time_global": 0.6208792328834534}
{"repeat": 64, "batch_size": 8192, "dim_out": 4096, "dim_in": 1024, "wm": 4, "switch": false, "standard_fwd": 0.28106942772865295, "standard_gw": 0.2841465175151825, "standard_gx": 0.301852822303772, "rowwise_fwd": 0.19879266619682312, "rowwise_bwd": 0.16228482127189636, "global_fwd": 0.19488856196403503, "global_bwd": 0.1607760787010193, "x_quantize_rowwise": 0.033974647521972656, "g_quantize_rowwise": 0.08221715688705444, "w_quantize_rowwise": 0.03248825669288635, "w_quantize_colwise_transpose": 0.08646398782730103, "w_quantize_global": 0.0939294695854187, "w_quantize_global_transpose": 0.09895861148834229, "cast_x": 0.03753975033760071, "cast_g": 0.15900656580924988, "cast_w": 0.01603737473487854, "time_standard": 0.8670687675476074, "time_rowwise": 0.8803680539131165, "time_global": 0.9488910436630249}
{"repeat": 64, "batch_size": 8192, "dim_out": 1024, "dim_in": 4096, "wm": 4, "switch": true, "standard_fwd": 0.26415660977363586, "standard_gw": 0.2679601311683655, "standard_gx": 0.30617788434028625, "rowwise_fwd": 0.180121511220932, "rowwise_bwd": 0.21555647253990173, "global_fwd": 0.17506256699562073, "global_bwd": 0.2116672694683075, "x_quantize_rowwise": 0.08289515972137451, "g_quantize_rowwise": 0.033795833587646484, "w_quantize_rowwise": 0.03366544842720032, "w_quantize_colwise_transpose": 0.09965524077415466, "w_quantize_global": 0.09595602750778198, "w_quantize_global_transpose": 0.1024976372718811, "cast_x": 0.1602955162525177, "cast_g": 0.03787502646446228, "cast_w": 0.016216188669204712, "time_standard": 0.8382946252822876, "time_rowwise": 0.9136497974395752, "time_global": 0.9698346257209778}
{"repeat": 64, "batch_size": 16384, "dim_out": 4096, "dim_in": 1024, "wm": 4, "switch": false, "standard_fwd": 0.5719438195228577, "standard_gw": 0.524863600730896, "standard_gx": 0.6005167961120605, "rowwise_fwd": 0.3750324249267578, "rowwise_bwd": 0.28166547417640686, "global_fwd": 0.3674700856208801, "global_bwd": 0.2798214554786682, "x_quantize_rowwise": 0.04655122756958008, "g_quantize_rowwise": 0.1555122435092926, "w_quantize_rowwise": 0.03437697887420654, "w_quantize_colwise_transpose": 0.08634477853775024, "w_quantize_global": 0.09759142994880676, "w_quantize_global_transpose": 0.10081753134727478, "cast_x": 0.0828765332698822, "cast_g": 0.31184032559394836, "cast_w": 0.016063451766967773, "time_standard": 1.6973242163658142, "time_rowwise": 1.5043467283248901, "time_global": 1.5726275742053986}
{"repeat": 64, "batch_size": 16384, "dim_out": 1024, "dim_in": 4096, "wm": 4, "switch": true, "standard_fwd": 0.5423910915851593, "standard_gw": 0.5674734711647034, "standard_gx": 0.5907565355300903, "rowwise_fwd": 0.3149174153804779, "rowwise_bwd": 0.3899820148944855, "global_fwd": 0.2909451723098755, "global_bwd": 0.3783814609050751, "x_quantize_rowwise": 0.15584751963615417, "g_quantize_rowwise": 0.04688650369644165, "w_quantize_rowwise": 0.031463801860809326, "w_quantize_colwise_transpose": 0.09072571992874146, "w_quantize_global": 0.09774044156074524, "w_quantize_global_transpose": 0.10405108332633972, "cast_x": 0.3111511468887329, "cast_g": 0.08282437920570374, "cast_w": 0.015992671251296997, "time_standard": 1.700621098279953, "time_rowwise": 1.5972964465618134, "time_global": 1.6413256525993347}
{"repeat": 64, "batch_size": 32768, "dim_out": 4096, "dim_in": 1024, "wm": 4, "switch": false, "standard_fwd": 1.2115389108657837, "standard_gw": 1.1259466409683228, "standard_gx": 1.1027492582798004, "rowwise_fwd": 0.7407031953334808, "rowwise_bwd": 0.5539208650588989, "global_fwd": 0.7214657962322235, "global_bwd": 0.5515590310096741, "x_quantize_rowwise": 0.08765608072280884, "g_quantize_rowwise": 0.3022328019142151, "w_quantize_rowwise": 0.03347545862197876, "w_quantize_colwise_transpose": 0.08694455027580261, "w_quantize_global": 0.09706243872642517, "w_quantize_global_transpose": 0.10102614760398865, "cast_x": 0.1592189073562622, "cast_g": 0.6166175007820129, "cast_w": 0.01607835292816162, "time_standard": 3.440234810113907, "time_rowwise": 2.930879592895508, "time_global": 2.986948937177658}
{"repeat": 64, "batch_size": 32768, "dim_out": 1024, "dim_in": 4096, "wm": 4, "switch": true, "standard_fwd": 1.1010989546775818, "standard_gw": 1.1352524161338806, "standard_gx": 1.1676251888275146, "rowwise_fwd": 0.5864761769771576, "rowwise_bwd": 0.7485374808311462, "global_fwd": 0.5547590553760529, "global_bwd": 0.7249303162097931, "x_quantize_rowwise": 0.3021731972694397, "g_quantize_rowwise": 0.08751824498176575, "w_quantize_rowwise": 0.033952295780181885, "w_quantize_colwise_transpose": 0.09011104702949524, "w_quantize_global": 0.09443238377571106, "w_quantize_global_transpose": 0.10376051068305969, "cast_x": 0.6167255342006683, "cast_g": 0.15922263264656067, "cast_w": 0.016070902347564697, "time_standard": 3.403976559638977, "time_rowwise": 2.984020859003067, "time_global": 3.0028261244297028}
{"repeat": 64, "batch_size": 65536, "dim_out": 4096, "dim_in": 1024, "wm": 4, "switch": false, "standard_fwd": 2.472013235092163, "standard_gw": 2.218998968601227, "standard_gx": 2.2116564214229584, "rowwise_fwd": 1.466125249862671, "rowwise_bwd": 1.0577328503131866, "global_fwd": 1.431729644536972, "global_bwd": 1.0476894676685333, "x_quantize_rowwise": 0.16929209232330322, "g_quantize_rowwise": 0.5952082574367523, "w_quantize_rowwise": 0.032100826501846313, "w_quantize_colwise_transpose": 0.08670613169670105, "w_quantize_global": 0.09590759873390198, "w_quantize_global_transpose": 0.10358169674873352, "cast_x": 0.31175464391708374, "cast_g": 1.2264922261238098, "cast_w": 0.016067177057266235, "time_standard": 6.902668625116348, "time_rowwise": 5.626164376735687, "time_global": 5.662407726049423}
{"repeat": 64, "batch_size": 65536, "dim_out": 1024, "dim_in": 4096, "wm": 4, "switch": true, "standard_fwd": 2.181064337491989, "standard_gw": 2.2256113588809967, "standard_gx": 2.3229196667671204, "rowwise_fwd": 1.0886266827583313, "rowwise_bwd": 1.4654062688350677, "global_fwd": 1.0472461581230164, "global_bwd": 1.433148980140686, "x_quantize_rowwise": 0.5954094231128693, "g_quantize_rowwise": 0.16921386122703552, "w_quantize_rowwise": 0.03442913293838501, "w_quantize_colwise_transpose": 0.09007751941680908, "w_quantize_global": 0.09575113654136658, "w_quantize_global_transpose": 0.10503828525543213, "cast_x": 1.2264810502529144, "cast_g": 0.3119036555290222, "cast_w": 0.01605600118637085, "time_standard": 6.729595363140106, "time_rowwise": 5.668774247169495, "time_global": 5.671419203281403}
{"repeat": 64, "batch_size": 1024, "dim_out": 6144, "dim_in": 1408, "wm": 4.3637, "switch": false, "standard_fwd": 0.08157268166542053, "standard_gw": 0.07601454854011536, "standard_gx": 0.09059160947799683, "rowwise_fwd": 0.053066760301589966, "rowwise_bwd": 0.04787370562553406, "global_fwd": 0.05243346095085144, "global_bwd": 0.04809349775314331, "x_quantize_rowwise": 0.02571195363998413, "g_quantize_rowwise": 0.025898218154907227, "w_quantize_rowwise": 0.02714991569519043, "w_quantize_colwise_transpose": 0.19773468375205994, "w_quantize_global": 0.07273256778717041, "w_quantize_global_transpose": 0.08068978786468506, "cast_x": 0.008046627044677734, "cast_g": 0.0252649188041687, "cast_w": 0.0393986701965332, "time_standard": 0.24817883968353271, "time_rowwise": 0.4534497857093811, "time_global": 0.38157403469085693}
{"repeat": 64, "batch_size": 1024, "dim_out": 1408, "dim_in": 6144, "wm": 4.3637, "switch": true, "standard_fwd": 0.09134411811828613, "standard_gw": 0.07602199912071228, "standard_gx": 0.09555742144584656, "rowwise_fwd": 0.047691166400909424, "rowwise_bwd": 0.05320459604263306, "global_fwd": 0.04759058356285095, "global_bwd": 0.0521540641784668, "x_quantize_rowwise": 0.025313347578048706, "g_quantize_rowwise": 0.025119632482528687, "w_quantize_rowwise": 0.0269375741481781, "w_quantize_colwise_transpose": 0.1857280731201172, "w_quantize_global": 0.07451698184013367, "w_quantize_global_transpose": 0.08009746670722961, "cast_x": 0.02547726035118103, "cast_g": 0.007897615432739258, "cast_w": 0.039536505937576294, "time_standard": 0.26292353868484497, "time_rowwise": 0.44001638889312744, "time_global": 0.3808140754699707}
{"repeat": 64, "batch_size": 131072, "dim_out": 4096, "dim_in": 1024, "wm": 4, "switch": false, "standard_fwd": 4.940010607242584, "standard_gw": 4.434864968061447, "standard_gx": 4.4097937643527985, "rowwise_fwd": 2.9467344284057617, "rowwise_bwd": 2.09181010723114, "global_fwd": 2.8806477785110474, "global_bwd": 2.0816922187805176, "x_quantize_rowwise": 0.33279508352279663, "g_quantize_rowwise": 1.1817067861557007, "w_quantize_rowwise": 0.03306567668914795, "w_quantize_colwise_transpose": 0.08666515350341797, "w_quantize_global": 0.0957287847995758, "w_quantize_global_transpose": 0.10242313146591187, "cast_x": 0.6165988743305206, "cast_g": 2.446405589580536, "cast_w": 0.016100704669952393, "time_standard": 13.78466933965683, "time_rowwise": 11.107642203569412, "time_global": 11.109858751296997}
{"repeat": 64, "batch_size": 131072, "dim_out": 1024, "dim_in": 4096, "wm": 4, "switch": true, "standard_fwd": 4.293464124202728, "standard_gw": 4.461295902729034, "standard_gx": 4.638340324163437, "rowwise_fwd": 2.116892486810684, "rowwise_bwd": 2.9479674994945526, "global_fwd": 2.0760856568813324, "global_bwd": 2.8755851089954376, "x_quantize_rowwise": 1.1818408966064453, "g_quantize_rowwise": 0.33276528120040894, "w_quantize_rowwise": 0.03287568688392639, "w_quantize_colwise_transpose": 0.09038299322128296, "w_quantize_global": 0.09598955512046814, "w_quantize_global_transpose": 0.100649893283844, "cast_x": 2.4467408657073975, "cast_g": 0.6165951490402222, "cast_w": 0.016082078218460083, "time_standard": 13.3931003510952, "time_rowwise": 11.164020746946335, "time_global": 11.12421229481697}
{"repeat": 64, "batch_size": 2048, "dim_out": 6144, "dim_in": 1408, "wm": 4.3637, "switch": false, "standard_fwd": 0.1699887216091156, "standard_gw": 0.14045089483261108, "standard_gx": 0.17407909035682678, "rowwise_fwd": 0.10082125663757324, "rowwise_bwd": 0.08344277739524841, "global_fwd": 0.09941309690475464, "global_bwd": 0.08352473378181458, "x_quantize_rowwise": 0.025317072868347168, "g_quantize_rowwise": 0.03849714994430542, "w_quantize_rowwise": 0.02596527338027954, "w_quantize_colwise_transpose": 0.19767135381698608, "w_quantize_global": 0.07257238030433655, "w_quantize_global_transpose": 0.08127838373184204, "cast_x": 0.012032687664031982, "cast_g": 0.06345659494400024, "cast_w": 0.03953278064727783, "time_standard": 0.48451870679855347, "time_rowwise": 0.612165778875351, "time_global": 0.5410537123680115}
{"repeat": 64, "batch_size": 2048, "dim_out": 1408, "dim_in": 6144, "wm": 4.3637, "switch": true, "standard_fwd": 0.14855340123176575, "standard_gw": 0.15553459525108337, "standard_gx": 0.16282498836517334, "rowwise_fwd": 0.09259581565856934, "rowwise_bwd": 0.11080875992774963, "global_fwd": 0.09166449308395386, "global_bwd": 0.10796263813972473, "x_quantize_rowwise": 0.03939121961593628, "g_quantize_rowwise": 0.025227665901184082, "w_quantize_rowwise": 0.027202069759368896, "w_quantize_colwise_transpose": 0.1940988004207611, "w_quantize_global": 0.07397681474685669, "w_quantize_global_transpose": 0.08178502321243286, "cast_x": 0.065632164478302, "cast_g": 0.01268833875656128, "cast_w": 0.04057586193084717, "time_standard": 0.46691298484802246, "time_rowwise": 0.6448589265346527, "time_global": 0.5755424499511719}
{"repeat": 64, "batch_size": 4096, "dim_out": 6144, "dim_in": 1408, "wm": 4.3637, "switch": false, "standard_fwd": 0.32291561365127563, "standard_gw": 0.2875030040740967, "standard_gx": 0.3379322588443756, "rowwise_fwd": 0.19295886158943176, "rowwise_bwd": 0.16265735030174255, "global_fwd": 0.19031018018722534, "global_bwd": 0.16187503933906555, "x_quantize_rowwise": 0.02730637788772583, "g_quantize_rowwise": 0.06797909736633301, "w_quantize_rowwise": 0.02642720937728882, "w_quantize_colwise_transpose": 0.19745901226997375, "w_quantize_global": 0.07253512740135193, "w_quantize_global_transpose": 0.08047744631767273, "cast_x": 0.022336840629577637, "cast_g": 0.1209154725074768, "cast_w": 0.039268285036087036, "time_standard": 0.9483508765697479, "time_rowwise": 0.9622909128665924, "time_global": 0.8879862725734711}
{"repeat": 64, "batch_size": 4096, "dim_out": 1408, "dim_in": 6144, "wm": 4.3637, "switch": true, "standard_fwd": 0.3019683063030243, "standard_gw": 0.288400799036026, "standard_gx": 0.3154948353767395, "rowwise_fwd": 0.18264353275299072, "rowwise_bwd": 0.2075284719467163, "global_fwd": 0.17072632908821106, "global_bwd": 0.1960061490535736, "x_quantize_rowwise": 0.06893649697303772, "g_quantize_rowwise": 0.02561509609222412, "w_quantize_rowwise": 0.026594847440719604, "w_quantize_colwise_transpose": 0.18575787544250488, "w_quantize_global": 0.07266923785209656, "w_quantize_global_transpose": 0.08060410618782043, "cast_x": 0.12182071805000305, "cast_g": 0.022590160369873047, "cast_w": 0.04000961780548096, "time_standard": 0.9058639407157898, "time_rowwise": 0.9854771196842194, "time_global": 0.9029582142829895}
{"repeat": 64, "batch_size": 8192, "dim_out": 6144, "dim_in": 1408, "wm": 4.3637, "switch": false, "standard_fwd": 0.6489232182502747, "standard_gw": 0.5987770855426788, "standard_gx": 0.6644465029239655, "rowwise_fwd": 0.35867467522621155, "rowwise_bwd": 0.31855329871177673, "global_fwd": 0.353105366230011, "global_bwd": 0.31349435448646545, "x_quantize_rowwise": 0.03382191061973572, "g_quantize_rowwise": 0.12668967247009277, "w_quantize_rowwise": 0.02681836485862732, "w_quantize_colwise_transpose": 0.19756704568862915, "w_quantize_global": 0.07336586713790894, "w_quantize_global_transpose": 0.08036196231842041, "cast_x": 0.0583939254283905, "cast_g": 0.23520365357398987, "cast_w": 0.03935396671295166, "time_standard": 1.912146806716919, "time_rowwise": 1.660902053117752, "time_global": 1.579616218805313}
{"repeat": 64, "batch_size": 8192, "dim_out": 1408, "dim_in": 6144, "wm": 4.3637, "switch": true, "standard_fwd": 0.5789436399936676, "standard_gw": 0.6130896508693695, "standard_gx": 0.6558857858181, "rowwise_fwd": 0.3464221954345703, "rowwise_bwd": 0.3650560975074768, "global_fwd": 0.3174394369125366, "global_bwd": 0.35758689045906067, "x_quantize_rowwise": 0.12686848640441895, "g_quantize_rowwise": 0.034302473068237305, "w_quantize_rowwise": 0.02745911478996277, "w_quantize_colwise_transpose": 0.1847483217716217, "w_quantize_global": 0.07192790508270264, "w_quantize_global_transpose": 0.08050352334976196, "cast_x": 0.23534893989562988, "cast_g": 0.05846098065376282, "cast_w": 0.03949552774429321, "time_standard": 1.847919076681137, "time_rowwise": 1.6979463398456573, "time_global": 1.6017183661460876}
{"repeat": 64, "batch_size": 1024, "dim_out": 5120, "dim_in": 1280, "wm": 4, "switch": false, "standard_fwd": 0.0573769211769104, "standard_gw": 0.061042606830596924, "standard_gx": 0.0783093273639679, "rowwise_fwd": 0.046797096729278564, "rowwise_bwd": 0.04620850086212158, "global_fwd": 0.04521384835243225, "global_bwd": 0.04425644874572754, "x_quantize_rowwise": 0.03257766366004944, "g_quantize_rowwise": 0.03449246287345886, "w_quantize_rowwise": 0.033657997846603394, "w_quantize_colwise_transpose": 0.1426301896572113, "w_quantize_global": 0.09257346391677856, "w_quantize_global_transpose": 0.10266527533531189, "cast_x": 0.011991709470748901, "cast_g": 0.020314007997512817, "cast_w": 0.027321279048919678, "time_standard": 0.19672885537147522, "time_rowwise": 0.39740651845932007, "time_global": 0.41282176971435547}
{"repeat": 64, "batch_size": 1024, "dim_out": 1280, "dim_in": 5120, "wm": 4, "switch": true, "standard_fwd": 0.07858872413635254, "standard_gw": 0.06122514605522156, "standard_gx": 0.05758553743362427, "rowwise_fwd": 0.04598498344421387, "rowwise_bwd": 0.04618242383003235, "global_fwd": 0.04597380757331848, "global_bwd": 0.046450644731521606, "x_quantize_rowwise": 0.03332272171974182, "g_quantize_rowwise": 0.033274292945861816, "w_quantize_rowwise": 0.0337548553943634, "w_quantize_colwise_transpose": 0.14807656407356262, "w_quantize_global": 0.09948387742042542, "w_quantize_global_transpose": 0.10120868682861328, "cast_x": 0.020120292901992798, "cast_g": 0.011488795280456543, "cast_w": 0.027466565370559692, "time_standard": 0.19739940762519836, "time_rowwise": 0.40182098746299744, "time_global": 0.420939177274704}
{"repeat": 64, "batch_size": 16384, "dim_out": 6144, "dim_in": 1408, "wm": 4.3637, "switch": false, "standard_fwd": 1.3515166938304901, "standard_gw": 1.1536777019500732, "standard_gx": 1.224767416715622, "rowwise_fwd": 0.6912238895893097, "rowwise_bwd": 0.5562454462051392, "global_fwd": 0.67867711186409, "global_bwd": 0.5518943071365356, "x_quantize_rowwise": 0.06204098463058472, "g_quantize_rowwise": 0.24417787790298462, "w_quantize_rowwise": 0.025238841772079468, "w_quantize_colwise_transpose": 0.19756704568862915, "w_quantize_global": 0.07240846753120422, "w_quantize_global_transpose": 0.08046254515647888, "cast_x": 0.11138245463371277, "cast_g": 0.4637613892555237, "cast_w": 0.03935769200325012, "time_standard": 3.7299618124961853, "time_rowwise": 2.9301717877388, "time_global": 2.8433389961719513}
{"repeat": 64, "batch_size": 16384, "dim_out": 1408, "dim_in": 6144, "wm": 4.3637, "switch": true, "standard_fwd": 1.2090615928173065, "standard_gw": 1.1396333575248718, "standard_gx": 1.2223869562149048, "rowwise_fwd": 0.5849376320838928, "rowwise_bwd": 0.6985403597354889, "global_fwd": 0.5565173923969269, "global_bwd": 0.6789751350879669, "x_quantize_rowwise": 0.2445802092552185, "g_quantize_rowwise": 0.06200745701789856, "w_quantize_rowwise": 0.027727335691452026, "w_quantize_colwise_transpose": 0.18501654267311096, "w_quantize_global": 0.07182732224464417, "w_quantize_global_transpose": 0.08069723844528198, "cast_x": 0.4638172686100006, "cast_g": 0.11136755347251892, "cast_w": 0.039517879486083984, "time_standard": 3.571081906557083, "time_rowwise": 2.9424428939819336, "time_global": 2.834238111972809}
{"repeat": 64, "batch_size": 32768, "dim_out": 6144, "dim_in": 1408, "wm": 4.3637, "switch": false, "standard_fwd": 2.683013677597046, "standard_gw": 2.2987723350524902, "standard_gx": 2.4510622024536133, "rowwise_fwd": 1.359008252620697, "rowwise_bwd": 1.1018887162208557, "global_fwd": 1.3311207294464111, "global_bwd": 1.0954029858112335, "x_quantize_rowwise": 0.11804327368736267, "g_quantize_rowwise": 0.479232519865036, "w_quantize_rowwise": 0.026308000087738037, "w_quantize_colwise_transpose": 0.1975223422050476, "w_quantize_global": 0.07223710417747498, "w_quantize_global_transpose": 0.08019432425498962, "cast_x": 0.2161264419555664, "cast_g": 0.9207837283611298, "cast_w": 0.03929063677787781, "time_standard": 7.432848215103149, "time_rowwise": 5.580775439739227, "time_global": 5.475003272294998}
{"repeat": 64, "batch_size": 2048, "dim_out": 5120, "dim_in": 1280, "wm": 4, "switch": false, "standard_fwd": 0.11088326573371887, "standard_gw": 0.10994821786880493, "standard_gx": 0.12367218732833862, "rowwise_fwd": 0.07392093539237976, "rowwise_bwd": 0.07127970457077026, "global_fwd": 0.0730752944946289, "global_bwd": 0.07089227437973022, "x_quantize_rowwise": 0.03361701965332031, "g_quantize_rowwise": 0.03525242209434509, "w_quantize_rowwise": 0.03341585397720337, "w_quantize_colwise_transpose": 0.14318525791168213, "w_quantize_global": 0.09704753756523132, "w_quantize_global_transpose": 0.10221078991889954, "cast_x": 0.012002885341644287, "cast_g": 0.05240738391876221, "cast_w": 0.027313828468322754, "time_standard": 0.3445036709308624, "time_rowwise": 0.5006194114685059, "time_global": 0.5220435559749603}
{"repeat": 64, "batch_size": 32768, "dim_out": 1408, "dim_in": 6144, "wm": 4.3637, "switch": true, "standard_fwd": 2.4625882506370544, "standard_gw": 2.421922981739044, "standard_gx": 2.380847930908203, "rowwise_fwd": 1.1231191456317902, "rowwise_bwd": 1.360483467578888, "global_fwd": 1.0947436094284058, "global_bwd": 1.3314113020896912, "x_quantize_rowwise": 0.4795975983142853, "g_quantize_rowwise": 0.11777132749557495, "w_quantize_rowwise": 0.02699345350265503, "w_quantize_colwise_transpose": 0.18484890460968018, "w_quantize_global": 0.07201358675956726, "w_quantize_global_transpose": 0.0803135335445404, "cast_x": 0.920858234167099, "cast_g": 0.21616369485855103, "cast_w": 0.03937259316444397, "time_standard": 7.265359163284302, "time_rowwise": 5.714736878871918, "time_global": 5.597773939371109}
{"repeat": 64, "batch_size": 2048, "dim_out": 1280, "dim_in": 5120, "wm": 4, "switch": true, "standard_fwd": 0.12437254190444946, "standard_gw": 0.11018291115760803, "standard_gx": 0.10970607399940491, "rowwise_fwd": 0.07167831063270569, "rowwise_bwd": 0.07583573460578918, "global_fwd": 0.07314234972000122, "global_bwd": 0.07501617074012756, "x_quantize_rowwise": 0.035624951124191284, "g_quantize_rowwise": 0.0333636999130249, "w_quantize_rowwise": 0.03264099359512329, "w_quantize_colwise_transpose": 0.14795735478401184, "w_quantize_global": 0.09621679782867432, "w_quantize_global_transpose": 0.10380148887634277, "cast_x": 0.05278363823890686, "cast_g": 0.01249462366104126, "cast_w": 0.02767890691757202, "time_standard": 0.3442615270614624, "time_rowwise": 0.5072839558124542, "time_global": 0.5273483693599701}
{"repeat": 64, "batch_size": 4096, "dim_out": 5120, "dim_in": 1280, "wm": 4, "switch": false, "standard_fwd": 0.21922588348388672, "standard_gw": 0.20731613039970398, "standard_gx": 0.23101642727851868, "rowwise_fwd": 0.1423358917236328, "rowwise_bwd": 0.1195073127746582, "global_fwd": 0.1401938498020172, "global_bwd": 0.11940300464630127, "x_quantize_rowwise": 0.03353878855705261, "g_quantize_rowwise": 0.06387382745742798, "w_quantize_rowwise": 0.03428757190704346, "w_quantize_colwise_transpose": 0.14376267790794373, "w_quantize_global": 0.09389594197273254, "w_quantize_global_transpose": 0.10196119546890259, "cast_x": 0.020060688257217407, "cast_g": 0.10236725211143494, "cast_w": 0.02732500433921814, "time_standard": 0.6575584411621094, "time_rowwise": 0.7446222007274628, "time_global": 0.7601827383041382}
{"repeat": 64, "batch_size": 4096, "dim_out": 1280, "dim_in": 5120, "wm": 4, "switch": true, "standard_fwd": 0.20026043057441711, "standard_gw": 0.21172687411308289, "standard_gx": 0.2276189625263214, "rowwise_fwd": 0.12956932187080383, "rowwise_bwd": 0.15310943126678467, "global_fwd": 0.12427568435668945, "global_bwd": 0.14432892203330994, "x_quantize_rowwise": 0.06471946835517883, "g_quantize_rowwise": 0.03309175372123718, "w_quantize_rowwise": 0.03242120146751404, "w_quantize_colwise_transpose": 0.14733895659446716, "w_quantize_global": 0.09280815720558167, "w_quantize_global_transpose": 0.10265037417411804, "cast_x": 0.10267645120620728, "cast_g": 0.020150095224380493, "cast_w": 0.027399510145187378, "time_standard": 0.6396062672138214, "time_rowwise": 0.7719770073890686, "time_global": 0.773601233959198}
{"repeat": 64, "batch_size": 65536, "dim_out": 6144, "dim_in": 1408, "wm": 4.3637, "switch": false, "standard_fwd": 5.324859172105789, "standard_gw": 4.977177828550339, "standard_gx": 4.468705505132675, "rowwise_fwd": 2.7004145085811615, "rowwise_bwd": 2.121664583683014, "global_fwd": 2.648312598466873, "global_bwd": 2.111390233039856, "x_quantize_rowwise": 0.22934377193450928, "g_quantize_rowwise": 0.9496547281742096, "w_quantize_rowwise": 0.02555176615715027, "w_quantize_colwise_transpose": 0.1977868378162384, "w_quantize_global": 0.0727437436580658, "w_quantize_global_transpose": 0.08098781108856201, "cast_x": 0.4259459674358368, "cast_g": 1.8352754414081573, "cast_w": 0.039637088775634766, "time_standard": 14.770742505788803, "time_rowwise": 11.201594024896622, "time_global": 11.069610714912415}
{"repeat": 64, "batch_size": 8192, "dim_out": 5120, "dim_in": 1280, "wm": 4, "switch": false, "standard_fwd": 0.49151480197906494, "standard_gw": 0.4681535065174103, "standard_gx": 0.42366236448287964, "rowwise_fwd": 0.2766512334346771, "rowwise_bwd": 0.2083033323287964, "global_fwd": 0.2709813416004181, "global_bwd": 0.20718947052955627, "x_quantize_rowwise": 0.034555792808532715, "g_quantize_rowwise": 0.11969730257987976, "w_quantize_rowwise": 0.03300607204437256, "w_quantize_colwise_transpose": 0.14345720410346985, "w_quantize_global": 0.09280070662498474, "w_quantize_global_transpose": 0.10214745998382568, "cast_x": 0.052288174629211426, "cast_g": 0.19747763872146606, "cast_w": 0.027339905500411987, "time_standard": 1.3833306729793549, "time_rowwise": 1.2838244438171387, "time_global": 1.2955255806446075}
{"repeat": 64, "batch_size": 8192, "dim_out": 1280, "dim_in": 5120, "wm": 4, "switch": true, "standard_fwd": 0.39635971188545227, "standard_gw": 0.44353678822517395, "standard_gx": 0.4724152386188507, "rowwise_fwd": 0.22813305258750916, "rowwise_bwd": 0.2868436276912689, "global_fwd": 0.2119205892086029, "global_bwd": 0.2749413251876831, "x_quantize_rowwise": 0.12082979083061218, "g_quantize_rowwise": 0.03444403409957886, "w_quantize_rowwise": 0.03444403409957886, "w_quantize_colwise_transpose": 0.14675036072731018, "w_quantize_global": 0.09495392441749573, "w_quantize_global_transpose": 0.1009330153465271, "cast_x": 0.19745156168937683, "cast_g": 0.05227327346801758, "cast_w": 0.027336180210113525, "time_standard": 1.312311738729477, "time_rowwise": 1.294981688261032, "time_global": 1.2815594673156738}
{"repeat": 64, "batch_size": 16384, "dim_out": 5120, "dim_in": 1280, "wm": 4, "switch": false, "standard_fwd": 1.0207034647464752, "standard_gw": 0.897720456123352, "standard_gx": 0.8374936878681183, "rowwise_fwd": 0.5457103252410889, "rowwise_bwd": 0.4088357090950012, "global_fwd": 0.5308091640472412, "global_bwd": 0.40555745363235474, "x_quantize_rowwise": 0.05984678864479065, "g_quantize_rowwise": 0.2306811511516571, "w_quantize_rowwise": 0.0334717333316803, "w_quantize_colwise_transpose": 0.14356523752212524, "w_quantize_global": 0.09340420365333557, "w_quantize_global_transpose": 0.09996071457862854, "cast_x": 0.10207295417785645, "cast_g": 0.3880411386489868, "cast_w": 0.027671456336975098, "time_standard": 2.7559176087379456, "time_rowwise": 2.3198314011096954, "time_global": 2.31797993183136}
{"repeat": 64, "batch_size": 65536, "dim_out": 1408, "dim_in": 6144, "wm": 4.3637, "switch": true, "standard_fwd": 4.502948373556137, "standard_gw": 4.418112337589264, "standard_gx": 4.748217761516571, "rowwise_fwd": 2.1329298615455627, "rowwise_bwd": 2.6968345046043396, "global_fwd": 2.102244645357132, "global_bwd": 2.6461556553840637, "x_quantize_rowwise": 0.9493157267570496, "g_quantize_rowwise": 0.2290569245815277, "w_quantize_rowwise": 0.02551451325416565, "w_quantize_colwise_transpose": 0.18491223454475403, "w_quantize_global": 0.07426366209983826, "w_quantize_global_transpose": 0.08058920502662659, "cast_x": 1.8352717161178589, "cast_g": 0.425681471824646, "cast_w": 0.039402395486831665, "time_standard": 13.669278472661972, "time_rowwise": 10.636676102876663, "time_global": 10.499738156795502}
{"repeat": 64, "batch_size": 16384, "dim_out": 1280, "dim_in": 5120, "wm": 4, "switch": true, "standard_fwd": 0.8179470896720886, "standard_gw": 0.8687414228916168, "standard_gx": 0.9276494383811951, "rowwise_fwd": 0.4481859505176544, "rowwise_bwd": 0.5557462573051453, "global_fwd": 0.4100687801837921, "global_bwd": 0.5317367613315582, "x_quantize_rowwise": 0.2301819622516632, "g_quantize_rowwise": 0.05963817238807678, "w_quantize_rowwise": 0.033523887395858765, "w_quantize_colwise_transpose": 0.14462321996688843, "w_quantize_global": 0.094633549451828, "w_quantize_global_transpose": 0.10088086128234863, "cast_x": 0.3879927098751068, "cast_g": 0.10205060243606567, "cast_w": 0.02714991569519043, "time_standard": 2.6143379509449005, "time_rowwise": 2.3406408727169037, "time_global": 2.295881509780884}
{"repeat": 64, "batch_size": 32768, "dim_out": 5120, "dim_in": 1280, "wm": 4, "switch": false, "standard_fwd": 2.0698904991149902, "standard_gw": 1.7200261354446411, "standard_gx": 1.663345843553543, "rowwise_fwd": 1.0664835572242737, "rowwise_bwd": 0.8059032261371613, "global_fwd": 1.0454729199409485, "global_bwd": 0.801432877779007, "x_quantize_rowwise": 0.1127384603023529, "g_quantize_rowwise": 0.4529319703578949, "w_quantize_rowwise": 0.03398582339286804, "w_quantize_colwise_transpose": 0.14343857765197754, "w_quantize_global": 0.09441003203392029, "w_quantize_global_transpose": 0.09993091225624084, "cast_x": 0.19744038581848145, "cast_g": 0.769149512052536, "cast_w": 0.02734735608100891, "time_standard": 5.453262478113174, "time_rowwise": 4.335507750511169, "time_global": 4.3269433081150055}
{"repeat": 64, "batch_size": 32768, "dim_out": 1280, "dim_in": 5120, "wm": 4, "switch": true, "standard_fwd": 2.758193761110306, "standard_gw": 1.6880109906196594, "standard_gx": 1.8163062632083893, "rowwise_fwd": 0.8343160152435303, "rowwise_bwd": 1.073598861694336, "global_fwd": 0.8045099675655365, "global_bwd": 1.0492689907550812, "x_quantize_rowwise": 0.453021377325058, "g_quantize_rowwise": 0.11304020881652832, "w_quantize_rowwise": 0.0337064266204834, "w_quantize_colwise_transpose": 0.1452416181564331, "w_quantize_global": 0.09451434016227722, "w_quantize_global_transpose": 0.0998079776763916, "cast_x": 0.769101083278656, "cast_g": 0.19731372594833374, "cast_w": 0.027332454919815063, "time_standard": 6.2625110149383545, "time_rowwise": 4.340935498476028, "time_global": 4.302173852920532}
{"repeat": 64, "batch_size": 131072, "dim_out": 6144, "dim_in": 1408, "wm": 4.3637, "switch": false, "standard_fwd": 10.728541761636734, "standard_gw": 9.228862822055817, "standard_gx": 8.837487548589706, "rowwise_fwd": 5.4414160549640656, "rowwise_bwd": 4.186157137155533, "global_fwd": 5.329187959432602, "global_bwd": 4.150416702032089, "x_quantize_rowwise": 0.4517659544944763, "g_quantize_rowwise": 1.890372484922409, "w_quantize_rowwise": 0.027563422918319702, "w_quantize_colwise_transpose": 0.1980513334274292, "w_quantize_global": 0.0733695924282074, "w_quantize_global_transpose": 0.08009746670722961, "cast_x": 0.8449330925941467, "cast_g": 3.6641769111156464, "cast_w": 0.03945454955101013, "time_standard": 28.794892132282257, "time_rowwise": 21.42418920993805, "time_global": 21.20407298207283}
{"repeat": 64, "batch_size": 65536, "dim_out": 5120, "dim_in": 1280, "wm": 4, "switch": false, "standard_fwd": 4.127204418182373, "standard_gw": 3.359321504831314, "standard_gx": 5.557261407375336, "rowwise_fwd": 2.1365806460380554, "rowwise_bwd": 1.6042962670326233, "global_fwd": 2.0923763513565063, "global_bwd": 1.5939176082611084, "x_quantize_rowwise": 0.21954253315925598, "g_quantize_rowwise": 0.8971206843852997, "w_quantize_rowwise": 0.03357976675033569, "w_quantize_colwise_transpose": 0.1431293785572052, "w_quantize_global": 0.10574981570243835, "w_quantize_global_transpose": 0.10281801223754883, "cast_x": 0.38795173168182373, "cast_g": 1.5318207442760468, "cast_w": 0.027142465114593506, "time_standard": 13.043787330389023, "time_rowwise": 8.39357078075409, "time_global": 8.370846509933472}
{"repeat": 64, "batch_size": 65536, "dim_out": 1280, "dim_in": 5120, "wm": 4, "switch": true, "standard_fwd": 5.576469004154205, "standard_gw": 3.361724317073822, "standard_gx": 3.6300085484981537, "rowwise_fwd": 1.6183294355869293, "rowwise_bwd": 2.1462254226207733, "global_fwd": 1.5953555703163147, "global_bwd": 2.0915642380714417, "x_quantize_rowwise": 0.8973218500614166, "g_quantize_rowwise": 0.2197064459323883, "w_quantize_rowwise": 0.03402307629585266, "w_quantize_colwise_transpose": 0.14822185039520264, "w_quantize_global": 0.09706616401672363, "w_quantize_global_transpose": 0.10339170694351196, "cast_x": 1.5312805771827698, "cast_g": 0.3879964351654053, "cast_w": 0.0269375741481781, "time_standard": 12.568201869726181, "time_rowwise": 8.425552397966385, "time_global": 8.366130292415619}
{"repeat": 64, "batch_size": 131072, "dim_out": 1408, "dim_in": 6144, "wm": 4.3637, "switch": true, "standard_fwd": 8.900497108697891, "standard_gw": 9.188394993543625, "standard_gx": 9.503517299890518, "rowwise_fwd": 4.189815372228622, "rowwise_bwd": 5.426768213510513, "global_fwd": 4.155576229095459, "global_bwd": 5.329132080078125, "x_quantize_rowwise": 1.8885880708694458, "g_quantize_rowwise": 0.45193731784820557, "w_quantize_rowwise": 0.025987625122070312, "w_quantize_colwise_transpose": 0.1842118799686432, "w_quantize_global": 0.07349997758865356, "w_quantize_global_transpose": 0.08074194192886353, "cast_x": 3.6639943718910217, "cast_g": 0.8447282016277313, "cast_w": 0.03973767161369324, "time_standard": 27.592409402132034, "time_rowwise": 21.355703473091125, "time_global": 21.167870610952377}
{"repeat": 64, "batch_size": 131072, "dim_out": 5120, "dim_in": 1280, "wm": 4, "switch": false, "standard_fwd": 8.2329623401165, "standard_gw": 6.799045950174332, "standard_gx": 6.893906742334366, "rowwise_fwd": 4.252739250659943, "rowwise_bwd": 3.2025352120399475, "global_fwd": 4.176046699285507, "global_bwd": 3.173377364873886, "x_quantize_rowwise": 0.43221935629844666, "g_quantize_rowwise": 1.7872042953968048, "w_quantize_rowwise": 0.03328174352645874, "w_quantize_colwise_transpose": 0.1431480050086975, "w_quantize_global": 0.09707733988761902, "w_quantize_global_transpose": 0.10161846876144409, "cast_x": 0.7692091166973114, "cast_g": 3.057178109884262, "cast_w": 0.027302652597427368, "time_standard": 21.9259150326252, "time_rowwise": 16.65017381310463, "time_global": 16.56658947467804}
{"repeat": 64, "batch_size": 131072, "dim_out": 1280, "dim_in": 5120, "wm": 4, "switch": true, "standard_fwd": 11.278409510850906, "standard_gw": 6.815284490585327, "standard_gx": 7.280956953763962, "rowwise_fwd": 3.206692636013031, "rowwise_bwd": 4.246953874826431, "global_fwd": 3.1801797449588776, "global_bwd": 4.169579595327377, "x_quantize_rowwise": 1.7862766981124878, "g_quantize_rowwise": 0.4329495131969452, "w_quantize_rowwise": 0.03413483500480652, "w_quantize_colwise_transpose": 0.14493241906166077, "w_quantize_global": 0.09881332516670227, "w_quantize_global_transpose": 0.10376423597335815, "cast_x": 3.057088702917099, "cast_g": 0.7693544030189514, "cast_w": 0.027261674404144287, "time_standard": 25.374650955200195, "time_rowwise": 16.66722446680069, "time_global": 16.586847603321075}
{"repeat": 64, "batch_size": 1024, "dim_out": 8192, "dim_in": 1664, "wm": 4.9231, "switch": false, "standard_fwd": 0.11636316776275635, "standard_gw": 0.11816620826721191, "standard_gx": 0.11482089757919312, "rowwise_fwd": 0.08482113480567932, "rowwise_bwd": 0.06284937262535095, "global_fwd": 0.08296221494674683, "global_bwd": 0.061664730310440063, "x_quantize_rowwise": 0.026706606149673462, "g_quantize_rowwise": 0.025641173124313354, "w_quantize_rowwise": 0.03740563988685608, "w_quantize_colwise_transpose": 0.2965778112411499, "w_quantize_global": 0.11304393410682678, "w_quantize_global_transpose": 0.12390688061714172, "cast_x": 0.008635222911834717, "cast_g": 0.037532299757003784, "cast_w": 0.06856024265289307, "time_standard": 0.3493502736091614, "time_rowwise": 0.652167946100235, "time_global": 0.5520917475223541}
{"repeat": 64, "batch_size": 1024, "dim_out": 1664, "dim_in": 8192, "wm": 4.9231, "switch": true, "standard_fwd": 0.11609122157096863, "standard_gw": 0.11704489588737488, "standard_gx": 0.11566653847694397, "rowwise_fwd": 0.06706640124320984, "rowwise_bwd": 0.09074807167053223, "global_fwd": 0.06621330976486206, "global_bwd": 0.0859871506690979, "x_quantize_rowwise": 0.027574598789215088, "g_quantize_rowwise": 0.02520531415939331, "w_quantize_rowwise": 0.04095584154129028, "w_quantize_colwise_transpose": 0.37036463618278503, "w_quantize_global": 0.11350959539413452, "w_quantize_global_transpose": 0.12202560901641846, "cast_x": 0.03780052065849304, "cast_g": 0.00860169529914856, "cast_w": 0.06864592432975769, "time_standard": 0.3488026559352875, "time_rowwise": 0.7389597594738007, "time_global": 0.5575604736804962}
{"repeat": 64, "batch_size": 2048, "dim_out": 8192, "dim_in": 1664, "wm": 4.9231, "switch": false, "standard_fwd": 0.22610649466514587, "standard_gw": 0.2229548990726471, "standard_gx": 0.22150203585624695, "rowwise_fwd": 0.1421608030796051, "rowwise_bwd": 0.10771304368972778, "global_fwd": 0.13930723071098328, "global_bwd": 0.10715052485466003, "x_quantize_rowwise": 0.02812594175338745, "g_quantize_rowwise": 0.04733726382255554, "w_quantize_rowwise": 0.03758445382118225, "w_quantize_colwise_transpose": 0.29515475034713745, "w_quantize_global": 0.11344626545906067, "w_quantize_global_transpose": 0.12392178177833557, "cast_x": 0.013589859008789062, "cast_g": 0.08285418152809143, "cast_w": 0.06850436329841614, "time_standard": 0.6705634295940399, "time_rowwise": 0.8810311555862427, "time_global": 0.7822439074516296}
{"repeat": 64, "batch_size": 2048, "dim_out": 1664, "dim_in": 8192, "wm": 4.9231, "switch": true, "standard_fwd": 0.20173192024230957, "standard_gw": 0.2351999282836914, "standard_gx": 0.24710968136787415, "rowwise_fwd": 0.12035667896270752, "rowwise_bwd": 0.153418630361557, "global_fwd": 0.11473894119262695, "global_bwd": 0.14553219079971313, "x_quantize_rowwise": 0.04762038588523865, "g_quantize_rowwise": 0.02557411789894104, "w_quantize_rowwise": 0.04055723547935486, "w_quantize_colwise_transpose": 0.32641738653182983, "w_quantize_global": 0.1138448715209961, "w_quantize_global_transpose": 0.12255832552909851, "cast_x": 0.08405372500419617, "cast_g": 0.013835728168487549, "cast_w": 0.06961449980735779, "time_standard": 0.6840415298938751, "time_rowwise": 0.9491443634033203, "time_global": 0.8050687611103058}
{"repeat": 64, "batch_size": 4096, "dim_out": 8192, "dim_in": 1664, "wm": 4.9231, "switch": false, "standard_fwd": 0.48126280307769775, "standard_gw": 0.46824291348457336, "standard_gx": 0.45252591371536255, "rowwise_fwd": 0.2749897539615631, "rowwise_bwd": 0.2111680805683136, "global_fwd": 0.2689175307750702, "global_bwd": 0.2104043960571289, "x_quantize_rowwise": 0.02676248550415039, "g_quantize_rowwise": 0.0842660665512085, "w_quantize_rowwise": 0.037495046854019165, "w_quantize_colwise_transpose": 0.2952851355075836, "w_quantize_global": 0.11366978287696838, "w_quantize_global_transpose": 0.12461841106414795, "cast_x": 0.0283755362033844, "cast_g": 0.1590624451637268, "cast_w": 0.06854161620140076, "time_standard": 1.4020316302776337, "time_rowwise": 1.3982094824314117, "time_global": 1.2968815863132477}
{"repeat": 64, "batch_size": 4096, "dim_out": 1664, "dim_in": 8192, "wm": 4.9231, "switch": true, "standard_fwd": 0.4076175391674042, "standard_gw": 0.45526400208473206, "standard_gx": 0.4996545612812042, "rowwise_fwd": 0.238761305809021, "rowwise_bwd": 0.2913624048233032, "global_fwd": 0.2149641513824463, "global_bwd": 0.2717897295951843, "x_quantize_rowwise": 0.0845976173877716, "g_quantize_rowwise": 0.0266246497631073, "w_quantize_rowwise": 0.04038959741592407, "w_quantize_colwise_transpose": 0.33299997448921204, "w_quantize_global": 0.11374801397323608, "w_quantize_global_transpose": 0.12202560901641846, "cast_x": 0.15895813703536987, "cast_g": 0.028312206268310547, "cast_w": 0.06841868162155151, "time_standard": 1.3625361025333405, "time_rowwise": 1.4699995517730713, "time_global": 1.2890137732028961}
{"repeat": 64, "batch_size": 8192, "dim_out": 8192, "dim_in": 1664, "wm": 4.9231, "switch": false, "standard_fwd": 1.02214515209198, "standard_gw": 0.9412020444869995, "standard_gx": 0.883936882019043, "rowwise_fwd": 0.5209781229496002, "rowwise_bwd": 0.41617080569267273, "global_fwd": 0.5089044570922852, "global_bwd": 0.4142932593822479, "x_quantize_rowwise": 0.03763660788536072, "g_quantize_rowwise": 0.15798211097717285, "w_quantize_rowwise": 0.0375211238861084, "w_quantize_colwise_transpose": 0.2973228693008423, "w_quantize_global": 0.11317431926727295, "w_quantize_global_transpose": 0.12396648526191711, "cast_x": 0.0685863196849823, "cast_g": 0.311531126499176, "cast_w": 0.0685080885887146, "time_standard": 2.8472840785980225, "time_rowwise": 2.4088136851787567, "time_global": 2.2971592843532562}
{"repeat": 64, "batch_size": 8192, "dim_out": 1664, "dim_in": 8192, "wm": 4.9231, "switch": true, "standard_fwd": 0.8539073169231415, "standard_gw": 0.9352751076221466, "standard_gx": 0.9567439556121826, "rowwise_fwd": 0.4599541425704956, "rowwise_bwd": 0.531073659658432, "global_fwd": 0.42063742876052856, "global_bwd": 0.5125999450683594, "x_quantize_rowwise": 0.1581348478794098, "g_quantize_rowwise": 0.03755837678909302, "w_quantize_rowwise": 0.04056468605995178, "w_quantize_colwise_transpose": 0.3295913338661194, "w_quantize_global": 0.11314079165458679, "w_quantize_global_transpose": 0.12153387069702148, "cast_x": 0.3114752471446991, "cast_g": 0.06850063800811768, "cast_w": 0.06839632987976074, "time_standard": 2.7459263801574707, "time_rowwise": 2.492152154445648, "time_global": 2.2988803684711456}
{"repeat": 64, "batch_size": 16384, "dim_out": 8192, "dim_in": 1664, "wm": 4.9231, "switch": false, "standard_fwd": 2.0550191402435303, "standard_gw": 1.7850138247013092, "standard_gx": 1.7571337521076202, "rowwise_fwd": 1.026798039674759, "rowwise_bwd": 0.8242167532444, "global_fwd": 1.0042376816272736, "global_bwd": 0.8189938962459564, "x_quantize_rowwise": 0.0688992440700531, "g_quantize_rowwise": 0.3054179251194, "w_quantize_rowwise": 0.03757700324058533, "w_quantize_colwise_transpose": 0.2973712980747223, "w_quantize_global": 0.11324509978294373, "w_quantize_global_transpose": 0.12398511171340942, "cast_x": 0.13050436973571777, "cast_g": 0.6165280938148499, "cast_w": 0.06848573684692383, "time_standard": 5.59716671705246, "time_rowwise": 4.345294088125229, "time_global": 4.2197927832603455}
{"repeat": 64, "batch_size": 16384, "dim_out": 1664, "dim_in": 8192, "wm": 4.9231, "switch": true, "standard_fwd": 1.79310142993927, "standard_gw": 1.7801076173782349, "standard_gx": 1.9140169024467468, "rowwise_fwd": 0.8629709482192993, "rowwise_bwd": 1.0353922843933105, "global_fwd": 0.8200556039810181, "global_bwd": 1.002725213766098, "x_quantize_rowwise": 0.30517578125, "g_quantize_rowwise": 0.06880238652229309, "w_quantize_rowwise": 0.040318816900253296, "w_quantize_colwise_transpose": 0.3413744270801544, "w_quantize_global": 0.11326000094413757, "w_quantize_global_transpose": 0.12197345495223999, "cast_x": 0.6162337958812714, "cast_g": 0.13053417205810547, "cast_w": 0.06848946213722229, "time_standard": 5.487225949764252, "time_rowwise": 4.4341422617435455, "time_global": 4.212100058794022}
{"repeat": 64, "batch_size": 32768, "dim_out": 8192, "dim_in": 1664, "wm": 4.9231, "switch": false, "standard_fwd": 4.0736086666584015, "standard_gw": 3.595758229494095, "standard_gx": 3.7020929157733917, "rowwise_fwd": 2.0306408405303955, "rowwise_bwd": 1.635722815990448, "global_fwd": 1.9890740513801575, "global_bwd": 1.627359539270401, "x_quantize_rowwise": 0.13131648302078247, "g_quantize_rowwise": 0.6001107394695282, "w_quantize_rowwise": 0.03781542181968689, "w_quantize_colwise_transpose": 0.2975836396217346, "w_quantize_global": 0.11357292532920837, "w_quantize_global_transpose": 0.12416765093803406, "cast_x": 0.2544410526752472, "cast_g": 1.2265890836715698, "cast_w": 0.06866827607154846, "time_standard": 11.371459811925888, "time_rowwise": 8.32894816994667, "time_global": 8.181359618902206}
{"repeat": 64, "batch_size": 32768, "dim_out": 1664, "dim_in": 8192, "wm": 4.9231, "switch": true, "standard_fwd": 3.525231033563614, "standard_gw": 3.489706665277481, "standard_gx": 3.9937011897563934, "rowwise_fwd": 1.6627348959445953, "rowwise_bwd": 2.0311400294303894, "global_fwd": 1.6270726919174194, "global_bwd": 1.988884061574936, "x_quantize_rowwise": 0.5999915301799774, "g_quantize_rowwise": 0.1310594379901886, "w_quantize_rowwise": 0.04043802618980408, "w_quantize_colwise_transpose": 0.32950565218925476, "w_quantize_global": 0.11298432946205139, "w_quantize_global_transpose": 0.12201443314552307, "cast_x": 1.2257546186447144, "cast_g": 0.25444477796554565, "cast_w": 0.06848573684692383, "time_standard": 11.008638888597488, "time_rowwise": 8.28457623720169, "time_global": 8.071713149547577}
{"repeat": 64, "batch_size": 65536, "dim_out": 8192, "dim_in": 1664, "wm": 4.9231, "switch": false, "standard_fwd": 8.123598992824554, "standard_gw": 8.085217326879501, "standard_gx": 7.293816655874252, "rowwise_fwd": 4.07782569527626, "rowwise_bwd": 3.196723759174347, "global_fwd": 4.001103341579437, "global_bwd": 3.1843744218349457, "x_quantize_rowwise": 0.2560615539550781, "g_quantize_rowwise": 1.1893659830093384, "w_quantize_rowwise": 0.037297606468200684, "w_quantize_colwise_transpose": 0.29668211936950684, "w_quantize_global": 0.11358782649040222, "w_quantize_global_transpose": 0.12476742267608643, "cast_x": 0.5020052194595337, "cast_g": 2.4454034864902496, "cast_w": 0.0684782862663269, "time_standard": 23.502632975578308, "time_rowwise": 17.139174044132233, "time_global": 16.95447787642479}
{"repeat": 64, "batch_size": 65536, "dim_out": 1664, "dim_in": 8192, "wm": 4.9231, "switch": true, "standard_fwd": 6.932958960533142, "standard_gw": 7.0609524846076965, "standard_gx": 7.460080087184906, "rowwise_fwd": 3.1809918582439423, "rowwise_bwd": 4.078391939401627, "global_fwd": 3.185112029314041, "global_bwd": 3.99089977145195, "x_quantize_rowwise": 1.1891834437847137, "g_quantize_rowwise": 0.25588274002075195, "w_quantize_rowwise": 0.0406019389629364, "w_quantize_colwise_transpose": 0.3389529883861542, "w_quantize_global": 0.11313334107398987, "w_quantize_global_transpose": 0.12241676449775696, "cast_x": 2.4446770548820496, "cast_g": 0.5022138357162476, "cast_w": 0.06857141852378845, "time_standard": 21.453991532325745, "time_rowwise": 16.14495739340782, "time_global": 15.9175805747509}
{"repeat": 64, "batch_size": 131072, "dim_out": 8192, "dim_in": 1664, "wm": 4.9231, "switch": false, "standard_fwd": 16.38999581336975, "standard_gw": 15.075922012329102, "standard_gx": 14.479495584964752, "rowwise_fwd": 8.128684014081955, "rowwise_bwd": 6.41091912984848, "global_fwd": 7.977847009897232, "global_bwd": 6.362702697515488, "x_quantize_rowwise": 0.5057230591773987, "g_quantize_rowwise": 2.3681968450546265, "w_quantize_rowwise": 0.037435442209243774, "w_quantize_colwise_transpose": 0.29555708169937134, "w_quantize_global": 0.11360272765159607, "w_quantize_global_transpose": 0.12426823377609253, "cast_x": 0.997692346572876, "cast_g": 4.8848651349544525, "cast_w": 0.0685565173625946, "time_standard": 45.945413410663605, "time_rowwise": 32.82243758440018, "time_global": 32.528262585401535}
{"repeat": 64, "batch_size": 131072, "dim_out": 1664, "dim_in": 8192, "wm": 4.9231, "switch": true, "standard_fwd": 14.838922768831253, "standard_gw": 15.112213790416718, "standard_gx": 14.869242906570435, "rowwise_fwd": 6.402213126420975, "rowwise_bwd": 8.132629096508026, "global_fwd": 6.36359304189682, "global_bwd": 7.9823993146419525, "x_quantize_rowwise": 2.367999404668808, "g_quantize_rowwise": 0.5056969821453094, "w_quantize_rowwise": 0.04053488373756409, "w_quantize_colwise_transpose": 0.3559887409210205, "w_quantize_global": 0.1136288046836853, "w_quantize_global_transpose": 0.125102698802948, "cast_x": 4.880473017692566, "cast_g": 0.9965412318706512, "cast_w": 0.06855279207229614, "time_standard": 44.820379465818405, "time_rowwise": 32.91727602481842, "time_global": 32.57063403725624}
{"repeat": 64, "batch_size": 1024, "dim_out": 8192, "dim_in": 2048, "wm": 4, "switch": false, "standard_fwd": 0.15426427125930786, "standard_gw": 0.14531239867210388, "standard_gx": 0.1703128218650818, "rowwise_fwd": 0.09618699550628662, "rowwise_bwd": 0.10633841156959534, "global_fwd": 0.09483471512794495, "global_bwd": 0.10636076331138611, "x_quantize_rowwise": 0.02434849739074707, "g_quantize_rowwise": 0.026009976863861084, "w_quantize_rowwise": 0.04366040229797363, "w_quantize_colwise_transpose": 0.34148991107940674, "w_quantize_global": 0.13587623834609985, "w_quantize_global_transpose": 0.14698877930641174, "cast_x": 0.009745359420776367, "cast_g": 0.03773719072341919, "cast_w": 0.08277222514152527, "time_standard": 0.46988949179649353, "time_rowwise": 0.7833465933799744, "time_global": 0.6797313690185547}
{"repeat": 64, "batch_size": 1024, "dim_out": 2048, "dim_in": 8192, "wm": 4, "switch": true, "standard_fwd": 0.16738846898078918, "standard_gw": 0.14199689030647278, "standard_gx": 0.15476346015930176, "rowwise_fwd": 0.11660531163215637, "rowwise_bwd": 0.1050308346748352, "global_fwd": 0.11050701141357422, "global_bwd": 0.09868666529655457, "x_quantize_rowwise": 0.02781301736831665, "g_quantize_rowwise": 0.024966895580291748, "w_quantize_rowwise": 0.047437846660614014, "w_quantize_colwise_transpose": 0.5995631217956543, "w_quantize_global": 0.1362822949886322, "w_quantize_global_transpose": 0.14807283878326416, "cast_x": 0.0377558171749115, "cast_g": 0.00973045825958252, "cast_w": 0.0828281044960022, "time_standard": 0.4641488194465637, "time_rowwise": 1.063413918018341, "time_global": 0.6883256137371063}
{"repeat": 64, "batch_size": 2048, "dim_out": 8192, "dim_in": 2048, "wm": 4, "switch": false, "standard_fwd": 0.2727396786212921, "standard_gw": 0.2711080014705658, "standard_gx": 0.3120154142379761, "rowwise_fwd": 0.16424059867858887, "rowwise_bwd": 0.17686933279037476, "global_fwd": 0.161685049533844, "global_bwd": 0.17517060041427612, "x_quantize_rowwise": 0.025484710931777954, "g_quantize_rowwise": 0.047635287046432495, "w_quantize_rowwise": 0.04380941390991211, "w_quantize_colwise_transpose": 0.3401711583137512, "w_quantize_global": 0.13605505228042603, "w_quantize_global_transpose": 0.14705583453178406, "cast_x": 0.01584365963935852, "cast_g": 0.08274242281913757, "cast_w": 0.08281320333480835, "time_standard": 0.855863094329834, "time_rowwise": 1.0693185031414032, "time_global": 0.9641945362091064}
{"repeat": 64, "batch_size": 2048, "dim_out": 2048, "dim_in": 8192, "wm": 4, "switch": true, "standard_fwd": 0.28916075825691223, "standard_gw": 0.29472261667251587, "standard_gx": 0.30096620321273804, "rowwise_fwd": 0.19618868827819824, "rowwise_bwd": 0.17556175589561462, "global_fwd": 0.18328800797462463, "global_bwd": 0.16647577285766602, "x_quantize_rowwise": 0.047441571950912476, "g_quantize_rowwise": 0.026609748601913452, "w_quantize_rowwise": 0.04766508936882019, "w_quantize_colwise_transpose": 0.6060972809791565, "w_quantize_global": 0.1363418996334076, "w_quantize_global_transpose": 0.14806538820266724, "cast_x": 0.08295103907585144, "cast_g": 0.015836209058761597, "cast_w": 0.08285045623779297, "time_standard": 0.8848495781421661, "time_rowwise": 1.3942867517471313, "time_global": 1.0029450058937073}
{"repeat": 64, "batch_size": 4096, "dim_out": 8192, "dim_in": 2048, "wm": 4, "switch": false, "standard_fwd": 0.6430819630622864, "standard_gw": 0.5622953176498413, "standard_gx": 0.5780421197414398, "rowwise_fwd": 0.318676233291626, "rowwise_bwd": 0.29438361525535583, "global_fwd": 0.31290948390960693, "global_bwd": 0.290747731924057, "x_quantize_rowwise": 0.027455389499664307, "g_quantize_rowwise": 0.08405372500419617, "w_quantize_rowwise": 0.04369765520095825, "w_quantize_colwise_transpose": 0.34110620617866516, "w_quantize_global": 0.1360774040222168, "w_quantize_global_transpose": 0.14697015285491943, "cast_x": 0.037614256143569946, "cast_g": 0.15922263264656067, "cast_w": 0.08288025856018066, "time_standard": 1.7834194004535675, "time_rowwise": 1.671668142080307, "time_global": 1.560509204864502}
{"repeat": 64, "batch_size": 4096, "dim_out": 2048, "dim_in": 8192, "wm": 4, "switch": true, "standard_fwd": 0.551275908946991, "standard_gw": 0.591665506362915, "standard_gx": 0.6067268550395966, "rowwise_fwd": 0.33493712544441223, "rowwise_bwd": 0.32918527722358704, "global_fwd": 0.29528141021728516, "global_bwd": 0.31659379601478577, "x_quantize_rowwise": 0.08441135287284851, "g_quantize_rowwise": 0.025656074285507202, "w_quantize_rowwise": 0.04745647311210632, "w_quantize_colwise_transpose": 0.5993843078613281, "w_quantize_global": 0.1359879970550537, "w_quantize_global_transpose": 0.14815106987953186, "cast_x": 0.15932321548461914, "cast_g": 0.037439167499542236, "cast_w": 0.08288398385047913, "time_standard": 1.7496682703495026, "time_rowwise": 2.0126961171627045, "time_global": 1.5977472066879272}
{"repeat": 64, "batch_size": 8192, "dim_out": 8192, "dim_in": 2048, "wm": 4, "switch": false, "standard_fwd": 1.2295916676521301, "standard_gw": 1.116037368774414, "standard_gx": 1.1164769530296326, "rowwise_fwd": 0.603698194026947, "rowwise_bwd": 0.5168020725250244, "global_fwd": 0.5922466516494751, "global_bwd": 0.5151033401489258, "x_quantize_rowwise": 0.0437907874584198, "g_quantize_rowwise": 0.157918781042099, "w_quantize_rowwise": 0.044032931327819824, "w_quantize_colwise_transpose": 0.34073740243911743, "w_quantize_global": 0.13559311628341675, "w_quantize_global_transpose": 0.14679506421089172, "cast_x": 0.08263811469078064, "cast_g": 0.3115162253379822, "cast_w": 0.08287280797958374, "time_standard": 3.4621059894561768, "time_rowwise": 2.8230175375938416, "time_global": 2.707485109567642}
{"repeat": 64, "batch_size": 8192, "dim_out": 2048, "dim_in": 8192, "wm": 4, "switch": true, "standard_fwd": 1.090865582227707, "standard_gw": 1.1468492448329926, "standard_gx": 1.1166594922542572, "rowwise_fwd": 0.5559474229812622, "rowwise_bwd": 0.6105974316596985, "global_fwd": 0.5200020968914032, "global_bwd": 0.592011958360672, "x_quantize_rowwise": 0.15802308917045593, "g_quantize_rowwise": 0.04357844591140747, "w_quantize_rowwise": 0.04709511995315552, "w_quantize_colwise_transpose": 0.5969703197479248, "w_quantize_global": 0.13620033860206604, "w_quantize_global_transpose": 0.148136168718338, "cast_x": 0.31115859746932983, "cast_g": 0.08263811469078064, "cast_w": 0.08268281817436218, "time_standard": 3.3543743193149567, "time_rowwise": 3.159061074256897, "time_global": 2.744801342487335}
{"repeat": 64, "batch_size": 16384, "dim_out": 8192, "dim_in": 2048, "wm": 4, "switch": false, "standard_fwd": 2.4665743112564087, "standard_gw": 2.1993443369865417, "standard_gx": 2.1993033587932587, "rowwise_fwd": 1.192428171634674, "rowwise_bwd": 1.023314893245697, "global_fwd": 1.1711902916431427, "global_bwd": 1.0202191770076752, "x_quantize_rowwise": 0.08077174425125122, "g_quantize_rowwise": 0.30520185828208923, "w_quantize_rowwise": 0.043783336877822876, "w_quantize_colwise_transpose": 0.339999794960022, "w_quantize_global": 0.13628602027893066, "w_quantize_global_transpose": 0.14696642756462097, "cast_x": 0.15902891755104065, "cast_g": 0.6164535880088806, "cast_w": 0.08285418152809143, "time_standard": 6.865222007036209, "time_rowwise": 5.184844136238098, "time_global": 5.059979856014252}
{"repeat": 64, "batch_size": 16384, "dim_out": 2048, "dim_in": 8192, "wm": 4, "switch": true, "standard_fwd": 2.1861791610717773, "standard_gw": 2.157818526029587, "standard_gx": 2.321537584066391, "rowwise_fwd": 1.0536126792430878, "rowwise_bwd": 1.1971630156040192, "global_fwd": 1.02127343416214, "global_bwd": 1.1707991361618042, "x_quantize_rowwise": 0.30522048473358154, "g_quantize_rowwise": 0.08065253496170044, "w_quantize_rowwise": 0.04741176962852478, "w_quantize_colwise_transpose": 0.5979575216770172, "w_quantize_global": 0.1362040638923645, "w_quantize_global_transpose": 0.14854222536087036, "cast_x": 0.6162486970424652, "cast_g": 0.1591891050338745, "cast_w": 0.08288398385047913, "time_standard": 6.665535271167755, "time_rowwise": 5.439836531877518, "time_global": 5.020510405302048}
{"repeat": 64, "batch_size": 32768, "dim_out": 8192, "dim_in": 2048, "wm": 4, "switch": false, "standard_fwd": 4.891645163297653, "standard_gw": 4.233300685882568, "standard_gx": 4.2071714997291565, "rowwise_fwd": 2.3616664111614227, "rowwise_bwd": 1.9419342279434204, "global_fwd": 2.3244209587574005, "global_bwd": 1.9598640501499176, "x_quantize_rowwise": 0.15483051538467407, "g_quantize_rowwise": 0.6008371710777283, "w_quantize_rowwise": 0.043839216232299805, "w_quantize_colwise_transpose": 0.3400743007659912, "w_quantize_global": 0.1362822949886322, "w_quantize_global_transpose": 0.14691054821014404, "cast_x": 0.31141936779022217, "cast_g": 1.2254081666469574, "cast_w": 0.08280202746391296, "time_standard": 13.332117348909378, "time_rowwise": 9.676482528448105, "time_global": 9.556446224451065}
{"repeat": 64, "batch_size": 32768, "dim_out": 2048, "dim_in": 8192, "wm": 4, "switch": true, "standard_fwd": 4.267625510692596, "standard_gw": 4.237007349729538, "standard_gx": 4.666488617658615, "rowwise_fwd": 1.9670464098453522, "rowwise_bwd": 2.362079918384552, "global_fwd": 1.9469596445560455, "global_bwd": 2.32585147023201, "x_quantize_rowwise": 0.6000921130180359, "g_quantize_rowwise": 0.15481188893318176, "w_quantize_rowwise": 0.04725530743598938, "w_quantize_colwise_transpose": 0.5976222455501556, "w_quantize_global": 0.13619661331176758, "w_quantize_global_transpose": 0.14815852046012878, "cast_x": 1.2261345982551575, "cast_g": 0.3117173910140991, "cast_w": 0.08279457688331604, "time_standard": 13.17112147808075, "time_rowwise": 9.965915232896805, "time_global": 9.549077600240707}
{"repeat": 64, "batch_size": 65536, "dim_out": 8192, "dim_in": 2048, "wm": 4, "switch": false, "standard_fwd": 9.787477552890778, "standard_gw": 8.533861488103867, "standard_gx": 8.979786187410355, "rowwise_fwd": 4.741787910461426, "rowwise_bwd": 3.871854394674301, "global_fwd": 4.674319177865982, "global_bwd": 3.9110779762268066, "x_quantize_rowwise": 0.3025829792022705, "g_quantize_rowwise": 1.1898204684257507, "w_quantize_rowwise": 0.043705105781555176, "w_quantize_colwise_transpose": 0.33997371792793274, "w_quantize_global": 0.13592839241027832, "w_quantize_global_transpose": 0.14724954962730408, "cast_x": 0.6160177290439606, "cast_g": 2.4440810084342957, "cast_w": 0.08280575275421143, "time_standard": 27.301125228405, "time_rowwise": 19.023586064577103, "time_global": 18.89484003186226}
{"repeat": 64, "batch_size": 65536, "dim_out": 2048, "dim_in": 8192, "wm": 4, "switch": true, "standard_fwd": 8.461769670248032, "standard_gw": 8.428700268268585, "standard_gx": 9.447630494832993, "rowwise_fwd": 3.881257027387619, "rowwise_bwd": 4.7471001744270325, "global_fwd": 3.9101652801036835, "global_bwd": 4.662122577428818, "x_quantize_rowwise": 1.1892355978488922, "g_quantize_rowwise": 0.3024376928806305, "w_quantize_rowwise": 0.04708021879196167, "w_quantize_colwise_transpose": 0.5982778966426849, "w_quantize_global": 0.13624131679534912, "w_quantize_global_transpose": 0.1484602689743042, "cast_x": 2.4463236331939697, "cast_g": 0.6163865327835083, "cast_w": 0.08278340101242065, "time_standard": 26.33810043334961, "time_rowwise": 19.194088876247406, "time_global": 18.777363002300262}
{"repeat": 64, "batch_size": 131072, "dim_out": 8192, "dim_in": 2048, "wm": 4, "switch": false, "standard_fwd": 19.699689000844955, "standard_gw": 16.89574122428894, "standard_gx": 17.907552421092987, "rowwise_fwd": 9.453803300857544, "rowwise_bwd": 7.8153833746910095, "global_fwd": 9.313825517892838, "global_bwd": 7.8215524554252625, "x_quantize_rowwise": 0.5986690521240234, "g_quantize_rowwise": 2.368006855249405, "w_quantize_rowwise": 0.043682754039764404, "w_quantize_colwise_transpose": 0.3406330943107605, "w_quantize_global": 0.13626739382743835, "w_quantize_global_transpose": 0.14715641736984253, "cast_x": 1.2262165546417236, "cast_g": 4.8834048211574554, "cast_w": 0.08272379636764526, "time_standard": 54.50298264622688, "time_rowwise": 37.51591965556145, "time_global": 37.28121891617775}
{"repeat": 64, "batch_size": 131072, "dim_out": 2048, "dim_in": 8192, "wm": 4, "switch": true, "standard_fwd": 18.66700127720833, "standard_gw": 18.56840029358864, "standard_gx": 18.049821257591248, "rowwise_fwd": 7.742393761873245, "rowwise_bwd": 9.479016065597534, "global_fwd": 7.806576788425446, "global_bwd": 9.328477084636688, "x_quantize_rowwise": 2.368297427892685, "g_quantize_rowwise": 0.5978643894195557, "w_quantize_rowwise": 0.047303736209869385, "w_quantize_colwise_transpose": 0.5982741713523865, "w_quantize_global": 0.13678893446922302, "w_quantize_global_transpose": 0.1488029956817627, "cast_x": 4.880513995885849, "cast_g": 1.2248307466506958, "cast_w": 0.08270144462585449, "time_standard": 55.285222828388214, "time_rowwise": 39.401549845933914, "time_global": 38.955207914114}
{"repeat": 64, "batch_size": 1024, "dim_out": 16384, "dim_in": 4096, "wm": 4, "switch": false, "standard_fwd": 0.529509037733078, "standard_gw": 0.5781911313533783, "standard_gx": 0.6095841526985168, "rowwise_fwd": 0.2811029553413391, "rowwise_bwd": 0.3345906734466553, "global_fwd": 0.27928128838539124, "global_bwd": 0.33126771450042725, "x_quantize_rowwise": 0.025760382413864136, "g_quantize_rowwise": 0.06494298577308655, "w_quantize_rowwise": 0.15570968389511108, "w_quantize_colwise_transpose": 1.6086548566818237, "w_quantize_global": 0.481434166431427, "w_quantize_global_transpose": 0.505443662405014, "cast_x": 0.01582130789756775, "cast_g": 0.08295103907585144, "cast_w": 0.311531126499176, "time_standard": 1.7172843217849731, "time_rowwise": 3.048952668905258, "time_global": 2.2663213312625885}
{"repeat": 64, "batch_size": 1024, "dim_out": 4096, "dim_in": 16384, "wm": 4, "switch": true, "standard_fwd": 0.5729459226131439, "standard_gw": 0.5789846181869507, "standard_gx": 0.5775243043899536, "rowwise_fwd": 0.36711618304252625, "rowwise_bwd": 0.2913735806941986, "global_fwd": 0.33703818917274475, "global_bwd": 0.2821236848831177, "x_quantize_rowwise": 0.064849853515625, "g_quantize_rowwise": 0.025060027837753296, "w_quantize_rowwise": 0.22537633776664734, "w_quantize_colwise_transpose": 3.6401040852069855, "w_quantize_global": 0.4818551242351532, "w_quantize_global_transpose": 0.5101114511489868, "cast_x": 0.08286535739898682, "cast_g": 0.015828758478164673, "cast_w": 0.3114677965641022, "time_standard": 1.7294548451900482, "time_rowwise": 5.192864686250687, "time_global": 2.2800229489803314}
{"repeat": 64, "batch_size": 2048, "dim_out": 16384, "dim_in": 4096, "wm": 4, "switch": false, "standard_fwd": 1.1735819280147552, "standard_gw": 1.121576875448227, "standard_gx": 1.1242404580116272, "rowwise_fwd": 0.5535706877708435, "rowwise_bwd": 0.5567893385887146, "global_fwd": 0.5486570298671722, "global_bwd": 0.551365315914154, "x_quantize_rowwise": 0.02710893750190735, "g_quantize_rowwise": 0.11784210801124573, "w_quantize_rowwise": 0.15565752983093262, "w_quantize_colwise_transpose": 1.607745885848999, "w_quantize_global": 0.4824437201023102, "w_quantize_global_transpose": 0.5060508847236633, "cast_x": 0.03808736801147461, "cast_g": 0.15912577509880066, "cast_w": 0.31150132417678833, "time_standard": 3.4193992614746094, "time_rowwise": 4.14029136300087, "time_global": 3.35504487156868}
{"repeat": 64, "batch_size": 2048, "dim_out": 4096, "dim_in": 16384, "wm": 4, "switch": true, "standard_fwd": 1.1169910430908203, "standard_gw": 1.1065900325775146, "standard_gx": 1.1815577745437622, "rowwise_fwd": 0.5917288362979889, "rowwise_bwd": 0.5614385008811951, "global_fwd": 0.5646944046020508, "global_bwd": 0.5500949919223785, "x_quantize_rowwise": 0.118207186460495, "g_quantize_rowwise": 0.025041401386260986, "w_quantize_rowwise": 0.22566691040992737, "w_quantize_colwise_transpose": 3.635551780462265, "w_quantize_global": 0.4815608263015747, "w_quantize_global_transpose": 0.509701669216156, "cast_x": 0.15912950038909912, "cast_g": 0.03797560930252075, "cast_w": 0.3114044666290283, "time_standard": 3.405138850212097, "time_rowwise": 6.264224648475647, "time_global": 3.3558905124664307}
{"repeat": 64, "batch_size": 4096, "dim_out": 16384, "dim_in": 4096, "wm": 4, "switch": false, "standard_fwd": 2.3259930312633514, "standard_gw": 2.1472275257110596, "standard_gx": 2.213582396507263, "rowwise_fwd": 1.0509602725505829, "rowwise_bwd": 0.9888559579849243, "global_fwd": 1.0398179292678833, "global_bwd": 0.9887740015983582, "x_quantize_rowwise": 0.04647299647331238, "g_quantize_rowwise": 0.22570788860321045, "w_quantize_rowwise": 0.1554824411869049, "w_quantize_colwise_transpose": 1.610085368156433, "w_quantize_global": 0.48134103417396545, "w_quantize_global_transpose": 0.5054809153079987, "cast_x": 0.08297711610794067, "cast_g": 0.3115646541118622, "cast_w": 0.31159818172454834, "time_standard": 6.686802953481674, "time_rowwise": 6.224792450666428, "time_global": 5.434822291135788}
{"repeat": 64, "batch_size": 4096, "dim_out": 4096, "dim_in": 16384, "wm": 4, "switch": true, "standard_fwd": 2.19760462641716, "standard_gw": 2.2860951721668243, "standard_gx": 2.290956676006317, "rowwise_fwd": 1.0311491787433624, "rowwise_bwd": 1.0555200278759003, "global_fwd": 0.9858310222625732, "global_bwd": 1.0394863784313202, "x_quantize_rowwise": 0.22591277956962585, "g_quantize_rowwise": 0.046234577894210815, "w_quantize_rowwise": 0.22603943943977356, "w_quantize_colwise_transpose": 3.628809005022049, "w_quantize_global": 0.4819147288799286, "w_quantize_global_transpose": 0.5104243755340576, "cast_x": 0.3114528954029083, "cast_g": 0.08296966552734375, "cast_w": 0.3116317093372345, "time_standard": 6.7746564745903015, "time_rowwise": 8.499760180711746, "time_global": 5.575899034738541}
{"repeat": 64, "batch_size": 8192, "dim_out": 16384, "dim_in": 4096, "wm": 4, "switch": false, "standard_fwd": 4.633370786905289, "standard_gw": 4.397690296173096, "standard_gx": 4.286538809537888, "rowwise_fwd": 2.089906483888626, "rowwise_bwd": 1.9657425582408905, "global_fwd": 2.0679645240306854, "global_bwd": 1.9629858434200287, "x_quantize_rowwise": 0.08271634578704834, "g_quantize_rowwise": 0.43905526399612427, "w_quantize_rowwise": 0.1551508903503418, "w_quantize_colwise_transpose": 1.6106180846691132, "w_quantize_global": 0.48185884952545166, "w_quantize_global_transpose": 0.506274402141571, "cast_x": 0.15918537974357605, "cast_g": 0.6163418292999268, "cast_w": 0.311531126499176, "time_standard": 13.317599892616272, "time_rowwise": 10.74087992310524, "time_global": 9.938545525074005}
{"repeat": 64, "batch_size": 8192, "dim_out": 4096, "dim_in": 16384, "wm": 4, "switch": true, "standard_fwd": 4.424266517162323, "standard_gw": 4.391487687826157, "standard_gx": 4.61186096072197, "rowwise_fwd": 1.9874684512615204, "rowwise_bwd": 2.093140035867691, "global_fwd": 1.9647255539894104, "global_bwd": 2.06940621137619, "x_quantize_rowwise": 0.43999403715133667, "g_quantize_rowwise": 0.08271634578704834, "w_quantize_rowwise": 0.22581592202186584, "w_quantize_colwise_transpose": 3.631964325904846, "w_quantize_global": 0.4821456968784332, "w_quantize_global_transpose": 0.5102343857288361, "cast_x": 0.6164386868476868, "cast_g": 0.1591108739376068, "cast_w": 0.31154975295066833, "time_standard": 13.42761516571045, "time_rowwise": 12.852586805820465, "time_global": 9.940709918737411}
{"repeat": 64, "batch_size": 16384, "dim_out": 16384, "dim_in": 4096, "wm": 4, "switch": false, "standard_fwd": 9.229827672243118, "standard_gw": 8.319318294525146, "standard_gx": 8.652344346046448, "rowwise_fwd": 4.163607954978943, "rowwise_bwd": 3.778301179409027, "global_fwd": 4.121184349060059, "global_bwd": 3.7708766758441925, "x_quantize_rowwise": 0.1553669571876526, "g_quantize_rowwise": 0.8715838193893433, "w_quantize_rowwise": 0.15540048480033875, "w_quantize_colwise_transpose": 1.6092769801616669, "w_quantize_global": 0.4813969135284424, "w_quantize_global_transpose": 0.5070343613624573, "cast_x": 0.31150132417678833, "cast_g": 1.2259706854820251, "cast_w": 0.311482697725296, "time_standard": 26.201490312814713, "time_rowwise": 19.052855670452118, "time_global": 18.226761370897293}
{"repeat": 64, "batch_size": 16384, "dim_out": 4096, "dim_in": 16384, "wm": 4, "switch": true, "standard_fwd": 8.577890694141388, "standard_gw": 9.073298424482346, "standard_gx": 9.210295975208282, "rowwise_fwd": 3.7784352898597717, "rowwise_bwd": 4.165928810834885, "global_fwd": 3.7702471017837524, "global_bwd": 4.121150821447372, "x_quantize_rowwise": 0.868629664182663, "g_quantize_rowwise": 0.1554340124130249, "w_quantize_rowwise": 0.22614002227783203, "w_quantize_colwise_transpose": 3.6367811262607574, "w_quantize_global": 0.4828609526157379, "w_quantize_global_transpose": 0.510137528181076, "cast_x": 1.2258104979991913, "cast_g": 0.31299516558647156, "cast_w": 0.3114677965641022, "time_standard": 26.861485093832016, "time_rowwise": 21.90464735031128, "time_global": 18.981758505105972}
{"repeat": 64, "batch_size": 32768, "dim_out": 16384, "dim_in": 4096, "wm": 4, "switch": false, "standard_fwd": 18.52763444185257, "standard_gw": 17.835520207881927, "standard_gx": 17.375655472278595, "rowwise_fwd": 8.35346058011055, "rowwise_bwd": 7.584303617477417, "global_fwd": 8.300606161355972, "global_bwd": 7.550913840532303, "x_quantize_rowwise": 0.3016740083694458, "g_quantize_rowwise": 1.7321519553661346, "w_quantize_rowwise": 0.15538185834884644, "w_quantize_colwise_transpose": 1.6110800206661224, "w_quantize_global": 0.4815198481082916, "w_quantize_global_transpose": 0.5066357553005219, "cast_x": 0.6163753569126129, "cast_g": 2.4452805519104004, "cast_w": 0.31156837940216064, "time_standard": 53.73881012201309, "time_rowwise": 37.573572248220444, "time_global": 36.7090217769146}
{"repeat": 64, "batch_size": 32768, "dim_out": 4096, "dim_in": 16384, "wm": 4, "switch": true, "standard_fwd": 18.073823302984238, "standard_gw": 16.71283319592476, "standard_gx": 18.46104860305786, "rowwise_fwd": 7.542364299297333, "rowwise_bwd": 8.374195545911789, "global_fwd": 7.5644850730896, "global_bwd": 8.26016440987587, "x_quantize_rowwise": 1.7326027154922485, "g_quantize_rowwise": 0.30233338475227356, "w_quantize_rowwise": 0.2259574830532074, "w_quantize_colwise_transpose": 3.634512424468994, "w_quantize_global": 0.48204511404037476, "w_quantize_global_transpose": 0.5093887448310852, "cast_x": 2.445656806230545, "cast_g": 0.6163381040096283, "cast_w": 0.31144917011260986, "time_standard": 53.24770510196686, "time_rowwise": 38.524799048900604, "time_global": 35.56385263800621}
{"repeat": 64, "batch_size": 65536, "dim_out": 16384, "dim_in": 4096, "wm": 4, "switch": false, "standard_fwd": 36.123402416706085, "standard_gw": 32.68447890877724, "standard_gx": 34.13737937808037, "rowwise_fwd": 16.65867120027542, "rowwise_bwd": 15.004873275756836, "global_fwd": 16.536589711904526, "global_bwd": 14.949381351470947, "x_quantize_rowwise": 0.5952902138233185, "g_quantize_rowwise": 3.4581348299980164, "w_quantize_rowwise": 0.15559792518615723, "w_quantize_colwise_transpose": 1.6055963933467865, "w_quantize_global": 0.48203766345977783, "w_quantize_global_transpose": 0.5048215389251709, "cast_x": 1.2256354093551636, "cast_g": 4.875503480434418, "cast_w": 0.3110244870185852, "time_standard": 102.94526070356369, "time_rowwise": 70.16264274716377, "time_global": 69.210734218359}
{"repeat": 64, "batch_size": 65536, "dim_out": 4096, "dim_in": 16384, "wm": 4, "switch": true, "standard_fwd": 35.0223146378994, "standard_gw": 32.84081444144249, "standard_gx": 35.984884947538376, "rowwise_fwd": 15.018381178379059, "rowwise_bwd": 16.69919490814209, "global_fwd": 14.942582696676254, "global_bwd": 16.529250890016556, "x_quantize_rowwise": 3.442291170358658, "g_quantize_rowwise": 0.5951747298240662, "w_quantize_rowwise": 0.22576376795768738, "w_quantize_colwise_transpose": 3.621157258749008, "w_quantize_global": 0.48135966062545776, "w_quantize_global_transpose": 0.5095489323139191, "cast_x": 4.875205457210541, "cast_g": 1.2237727642059326, "cast_w": 0.3110431134700775, "time_standard": 103.84801402688026, "time_rowwise": 72.44277745485306, "time_global": 69.3410225212574}
{"repeat": 64, "batch_size": 131072, "dim_out": 16384, "dim_in": 4096, "wm": 4, "switch": false, "standard_fwd": 72.33698666095734, "standard_gw": 71.31465151906013, "standard_gx": 69.32922825217247, "rowwise_fwd": 33.37707370519638, "rowwise_bwd": 30.1642008125782, "global_fwd": 33.002063632011414, "global_bwd": 30.003495514392853, "x_quantize_rowwise": 1.1819563806056976, "g_quantize_rowwise": 6.896954029798508, "w_quantize_rowwise": 0.15557929873466492, "w_quantize_colwise_transpose": 1.6083605587482452, "w_quantize_global": 0.48125162720680237, "w_quantize_global_transpose": 0.5055665969848633, "cast_x": 2.442535012960434, "cast_g": 9.750165045261383, "cast_w": 0.31094998121261597, "time_standard": 212.98086643218994, "time_rowwise": 144.69877630472183, "time_global": 143.38593930006027}
{"repeat": 64, "batch_size": 131072, "dim_out": 4096, "dim_in": 16384, "wm": 4, "switch": true, "standard_fwd": 70.24158909916878, "standard_gw": 72.03734293580055, "standard_gx": 72.01339676976204, "rowwise_fwd": 30.072908848524094, "rowwise_bwd": 33.376410603523254, "global_fwd": 29.965493828058243, "global_bwd": 33.01112726330757, "x_quantize_rowwise": 6.894122809171677, "g_quantize_rowwise": 1.1817142367362976, "w_quantize_rowwise": 0.22567808628082275, "w_quantize_colwise_transpose": 3.616899251937866, "w_quantize_global": 0.4819147288799286, "w_quantize_global_transpose": 0.5107112228870392, "cast_x": 9.750377386808395, "cast_g": 2.4411343038082123, "cast_w": 0.31099095940589905, "time_standard": 214.29232880473137, "time_rowwise": 147.40507677197456, "time_global": 144.0824270248413}
{"repeat": 64, "batch_size": 65536, "dim_out": 32384, "dim_in": 8096, "wm": 4, "switch": false, "standard_fwd": 138.23134452104568, "standard_gw": 131.48364424705505, "standard_gx": 141.09868183732033, "rowwise_fwd": 65.38830325007439, "rowwise_bwd": 58.39048698544502, "global_fwd": 65.2194656431675, "global_bwd": 58.58004465699196, "x_quantize_rowwise": 1.1899955570697784, "g_quantize_rowwise": 6.623774766921997, "w_quantize_rowwise": 0.5935952067375183, "w_quantize_colwise_transpose": 24.08137544989586, "w_quantize_global": 1.740824431180954, "w_quantize_global_transpose": 1.8664970993995667, "cast_x": 2.413548529148102, "cast_g": 9.63655486702919, "cast_w": 1.1956281960010529, "time_standard": 410.81367060542107, "time_rowwise": 287.7511754631996, "time_global": 266.7042464017868}
{"repeat": 64, "batch_size": 65536, "dim_out": 8096, "dim_in": 32384, "wm": 4, "switch": true, "standard_fwd": 141.08363911509514, "standard_gw": 133.26667994260788, "standard_gx": 136.0350362956524, "rowwise_fwd": 58.49892646074295, "rowwise_bwd": 65.34496694803238, "global_fwd": 58.73573571443558, "global_bwd": 65.30505418777466, "x_quantize_rowwise": 6.648071110248566, "g_quantize_rowwise": 1.1903978884220123, "w_quantize_rowwise": 0.8329600095748901, "w_quantize_colwise_transpose": 15.297897160053253, "w_quantize_global": 1.7403066158294678, "w_quantize_global_transpose": 1.8791332840919495, "cast_x": 9.636614471673965, "cast_g": 2.4122819304466248, "cast_w": 1.1954344809055328, "time_standard": 410.3853553533554, "time_rowwise": 281.07989951968193, "time_global": 268.7653787434101}
{"repeat": 64, "batch_size": 1024, "dim_out": 32384, "dim_in": 8096, "wm": 4, "switch": false, "standard_fwd": 2.535879611968994, "standard_gw": 2.249978482723236, "standard_gx": 2.2262558341026306, "rowwise_fwd": 1.085665076971054, "rowwise_bwd": 1.069542020559311, "global_fwd": 1.0830685496330261, "global_bwd": 1.0597631335258484, "x_quantize_rowwise": 0.02650916576385498, "g_quantize_rowwise": 0.1200847327709198, "w_quantize_rowwise": 0.5937665700912476, "w_quantize_colwise_transpose": 23.926906287670135, "w_quantize_global": 1.7397291958332062, "w_quantize_global_transpose": 1.8652454018592834, "cast_x": 0.03688782453536987, "cast_g": 0.15725940465927124, "cast_w": 1.1969134211540222, "time_standard": 7.012113928794861, "time_rowwise": 29.07245233654976, "time_global": 8.144378662109375}
{"repeat": 64, "batch_size": 1024, "dim_out": 8096, "dim_in": 32384, "wm": 4, "switch": true, "standard_fwd": 2.245493233203888, "standard_gw": 2.2966675460338593, "standard_gx": 2.216015011072159, "rowwise_fwd": 1.1000856757164001, "rowwise_bwd": 1.0902360081672668, "global_fwd": 1.0597333312034607, "global_bwd": 1.0812543332576752, "x_quantize_rowwise": 0.11992454528808594, "g_quantize_rowwise": 0.026784837245941162, "w_quantize_rowwise": 0.8310377597808838, "w_quantize_colwise_transpose": 15.30550792813301, "w_quantize_global": 1.7401352524757385, "w_quantize_global_transpose": 1.8841177225112915, "cast_x": 0.1573599874973297, "cast_g": 0.03676116466522217, "cast_w": 1.195952296257019, "time_standard": 6.758175790309906, "time_rowwise": 20.770244300365448, "time_global": 8.208617568016052}
{"repeat": 64, "batch_size": 2048, "dim_out": 32384, "dim_in": 8096, "wm": 4, "switch": false, "standard_fwd": 4.197858273983002, "standard_gw": 4.288379102945328, "standard_gx": 4.155721515417099, "rowwise_fwd": 2.0567886531352997, "rowwise_bwd": 1.9073635339736938, "global_fwd": 2.0506344735622406, "global_bwd": 1.9086338579654694, "x_quantize_rowwise": 0.04758685827255249, "g_quantize_rowwise": 0.22284314036369324, "w_quantize_rowwise": 0.5935467779636383, "w_quantize_colwise_transpose": 23.935042321681976, "w_quantize_global": 1.7397813498973846, "w_quantize_global_transpose": 1.8662959337234497, "cast_x": 0.08194148540496826, "cast_g": 0.3077872097492218, "cast_w": 1.1968687176704407, "time_standard": 12.641958892345428, "time_rowwise": 33.05155038833618, "time_global": 12.124154716730118}
{"repeat": 64, "batch_size": 2048, "dim_out": 8096, "dim_in": 32384, "wm": 4, "switch": true, "standard_fwd": 4.126541316509247, "standard_gw": 4.309836775064468, "standard_gx": 4.117351025342941, "rowwise_fwd": 1.9266381859779358, "rowwise_bwd": 2.0577237010002136, "global_fwd": 1.908630132675171, "global_bwd": 2.0505934953689575, "x_quantize_rowwise": 0.22304058074951172, "g_quantize_rowwise": 0.04766136407852173, "w_quantize_rowwise": 0.8306317031383514, "w_quantize_colwise_transpose": 15.309855341911316, "w_quantize_global": 1.7415396869182587, "w_quantize_global_transpose": 1.8827766180038452, "cast_x": 0.30782073736190796, "cast_g": 0.08186325430870056, "cast_w": 1.1955127120018005, "time_standard": 12.553729116916656, "time_rowwise": 24.70538765192032, "time_global": 12.164078652858734}
{"repeat": 64, "batch_size": 4096, "dim_out": 32384, "dim_in": 8096, "wm": 4, "switch": false, "standard_fwd": 8.298952132463455, "standard_gw": 8.345257490873337, "standard_gx": 8.647706359624863, "rowwise_fwd": 4.106882959604263, "rowwise_bwd": 3.8046911358833313, "global_fwd": 4.09451499581337, "global_bwd": 3.8078874349594116, "x_quantize_rowwise": 0.08447840809822083, "g_quantize_rowwise": 0.4291348159313202, "w_quantize_rowwise": 0.5934201180934906, "w_quantize_colwise_transpose": 23.843105882406235, "w_quantize_global": 1.7399191856384277, "w_quantize_global_transpose": 1.8653236329555511, "cast_x": 0.1577921211719513, "cast_g": 0.6089024245738983, "cast_w": 1.1952444911003113, "time_standard": 25.291915982961655, "time_rowwise": 41.2069708108902, "time_global": 20.366515964269638}
{"repeat": 64, "batch_size": 4096, "dim_out": 8096, "dim_in": 32384, "wm": 4, "switch": true, "standard_fwd": 8.323360234498978, "standard_gw": 8.433796465396881, "standard_gx": 8.236430585384369, "rowwise_fwd": 3.8114115595817566, "rowwise_bwd": 4.106346517801285, "global_fwd": 3.8080140948295593, "global_bwd": 4.094675183296204, "x_quantize_rowwise": 0.4288516938686371, "g_quantize_rowwise": 0.08437782526016235, "w_quantize_rowwise": 0.8310228586196899, "w_quantize_colwise_transpose": 15.306610614061356, "w_quantize_global": 1.741155982017517, "w_quantize_global_transpose": 1.8809586763381958, "cast_x": 0.6091706454753876, "cast_g": 0.157233327627182, "cast_w": 1.1953115463256836, "time_standard": 24.993587285280228, "time_rowwise": 33.00241753458977, "time_global": 20.471829921007156}
{"repeat": 64, "batch_size": 8192, "dim_out": 32384, "dim_in": 8096, "wm": 4, "switch": false, "standard_fwd": 16.656354069709778, "standard_gw": 17.066240310668945, "standard_gx": 17.252348363399506, "rowwise_fwd": 8.220307528972626, "rowwise_bwd": 7.2372183203697205, "global_fwd": 8.2036592066288, "global_bwd": 7.236208766698837, "x_quantize_rowwise": 0.15832111239433289, "g_quantize_rowwise": 0.8406005799770355, "w_quantize_rowwise": 0.5935393273830414, "w_quantize_colwise_transpose": 23.86143058538437, "w_quantize_global": 1.7401576042175293, "w_quantize_global_transpose": 1.8653534352779388, "cast_x": 0.3079026937484741, "cast_g": 1.209162175655365, "cast_w": 1.1951625347137451, "time_standard": 50.97494274377823, "time_rowwise": 57.97765776515007, "time_global": 37.11054101586342}
{"repeat": 64, "batch_size": 8192, "dim_out": 8096, "dim_in": 32384, "wm": 4, "switch": true, "standard_fwd": 17.398890107870102, "standard_gw": 18.470749258995056, "standard_gx": 16.520217061042786, "rowwise_fwd": 7.235266268253326, "rowwise_bwd": 8.207589387893677, "global_fwd": 7.235914468765259, "global_bwd": 8.204508572816849, "x_quantize_rowwise": 0.8409880101680756, "g_quantize_rowwise": 0.15821680426597595, "w_quantize_rowwise": 0.8324198424816132, "w_quantize_colwise_transpose": 15.305522829294205, "w_quantize_global": 1.7396919429302216, "w_quantize_global_transpose": 1.8805749714374542, "cast_x": 1.2103468179702759, "cast_g": 0.30729547142982483, "cast_w": 1.1953599750995636, "time_standard": 52.389856427907944, "time_rowwise": 51.05075240135193, "time_global": 38.53064402937889}
{"repeat": 64, "batch_size": 16384, "dim_out": 32384, "dim_in": 8096, "wm": 4, "switch": false, "standard_fwd": 33.533211797475815, "standard_gw": 33.00020843744278, "standard_gx": 34.614477306604385, "rowwise_fwd": 16.364943236112595, "rowwise_bwd": 14.551006257534027, "global_fwd": 16.33496955037117, "global_bwd": 14.513172209262848, "x_quantize_rowwise": 0.3053396940231323, "g_quantize_rowwise": 1.6693994402885437, "w_quantize_rowwise": 0.5936138331890106, "w_quantize_colwise_transpose": 23.89485388994217, "w_quantize_global": 1.741711050271988, "w_quantize_global_transpose": 1.8656104803085327, "cast_x": 0.6089657545089722, "cast_g": 2.4122074246406555, "cast_w": 1.1951886117458344, "time_standard": 101.14789754152298, "time_rowwise": 90.37936478853226, "time_global": 69.430410861969}
{"repeat": 64, "batch_size": 16384, "dim_out": 8096, "dim_in": 32384, "wm": 4, "switch": true, "standard_fwd": 33.65536406636238, "standard_gw": 33.02193805575371, "standard_gx": 33.10496360063553, "rowwise_fwd": 14.54489678144455, "rowwise_bwd": 16.36252924799919, "global_fwd": 14.50401172041893, "global_bwd": 16.33254438638687, "x_quantize_rowwise": 1.6695670783519745, "g_quantize_rowwise": 0.3054291009902954, "w_quantize_rowwise": 0.83121657371521, "w_quantize_colwise_transpose": 15.305932611227036, "w_quantize_global": 1.7382949590682983, "w_quantize_global_transpose": 1.880194991827011, "cast_x": 2.412091940641403, "cast_g": 0.6079599261283875, "cast_w": 1.1950358748435974, "time_standard": 99.78226572275162, "time_rowwise": 82.04150944948196, "time_global": 69.45198029279709}
{"repeat": 64, "batch_size": 32768, "dim_out": 32384, "dim_in": 8096, "wm": 4, "switch": false, "standard_fwd": 67.96638667583466, "standard_gw": 67.99514591693878, "standard_gx": 69.66376304626465, "rowwise_fwd": 33.51752087473869, "rowwise_bwd": 29.131878167390823, "global_fwd": 32.65715390443802, "global_bwd": 29.13403883576393, "x_quantize_rowwise": 0.6002038717269897, "g_quantize_rowwise": 3.3336542546749115, "w_quantize_rowwise": 0.5934685468673706, "w_quantize_colwise_transpose": 23.92345294356346, "w_quantize_global": 1.7405375838279724, "w_quantize_global_transpose": 1.8656738102436066, "cast_x": 1.2112446129322052, "cast_g": 4.81804832816124, "cast_w": 1.1952146887779236, "time_standard": 205.6252956390381, "time_rowwise": 159.09532457590103, "time_global": 137.3264081776142}
{"repeat": 64, "batch_size": 32768, "dim_out": 8096, "dim_in": 32384, "wm": 4, "switch": true, "standard_fwd": 68.2341456413269, "standard_gw": 65.5074268579483, "standard_gx": 67.13805347681046, "rowwise_fwd": 29.153641313314438, "rowwise_bwd": 32.71844983100891, "global_fwd": 29.124341905117035, "global_bwd": 32.65979886054993, "x_quantize_rowwise": 3.3318176865577698, "g_quantize_rowwise": 0.6004795432090759, "w_quantize_rowwise": 0.8309967815876007, "w_quantize_colwise_transpose": 15.305690467357635, "w_quantize_global": 1.7405711114406586, "w_quantize_global_transpose": 1.8802620470523834, "cast_x": 4.8183538019657135, "cast_g": 1.2096390128135681, "cast_w": 1.1951103806495667, "time_standard": 200.87962597608566, "time_rowwise": 147.44850248098373, "time_global": 134.84469801187515}
{"repeat": 64, "batch_size": 1024, "dim_out": 5632, "dim_in": 1408, "wm": 4, "switch": false, "standard_fwd": 0.07764250040054321, "standard_gw": 0.07398426532745361, "standard_gx": 0.08482858538627625, "rowwise_fwd": 0.05266070365905762, "rowwise_bwd": 0.04478543996810913, "global_fwd": 0.052012503147125244, "global_bwd": 0.044364482164382935, "x_quantize_rowwise": 0.02640858292579651, "g_quantize_rowwise": 0.02539902925491333, "w_quantize_rowwise": 0.026457011699676514, "w_quantize_colwise_transpose": 0.17770379781723022, "w_quantize_global": 0.07440149784088135, "w_quantize_global_transpose": 0.08142739534378052, "cast_x": 0.008150935173034668, "cast_g": 0.022415071725845337, "cast_w": 0.03479421138763428, "time_standard": 0.23645535111427307, "time_rowwise": 0.42739883065223694, "time_global": 0.3779977560043335}
{"repeat": 64, "batch_size": 1024, "dim_out": 1408, "dim_in": 5632, "wm": 4, "switch": true, "standard_fwd": 0.08524581789970398, "standard_gw": 0.07383152842521667, "standard_gx": 0.07564574480056763, "rowwise_fwd": 0.04478171467781067, "rowwise_bwd": 0.052671879529953, "global_fwd": 0.04452839493751526, "global_bwd": 0.05219504237174988, "x_quantize_rowwise": 0.025328248739242554, "g_quantize_rowwise": 0.027123838663101196, "w_quantize_rowwise": 0.025607645511627197, "w_quantize_colwise_transpose": 0.17121434211730957, "w_quantize_global": 0.07916614413261414, "w_quantize_global_transpose": 0.08177384734153748, "cast_x": 0.022619962692260742, "cast_g": 0.008556991815567017, "cast_w": 0.034421682357788086, "time_standard": 0.23472309112548828, "time_rowwise": 0.42055919766426086, "time_global": 0.3839470446109772}
{"repeat": 64, "batch_size": 2048, "dim_out": 5632, "dim_in": 1408, "wm": 4, "switch": false, "standard_fwd": 0.13731792569160461, "standard_gw": 0.13414397835731506, "standard_gx": 0.14049187302589417, "rowwise_fwd": 0.10158121585845947, "rowwise_bwd": 0.07804110646247864, "global_fwd": 0.09908527135848999, "global_bwd": 0.07766112685203552, "x_quantize_rowwise": 0.026516616344451904, "g_quantize_rowwise": 0.03666803240776062, "w_quantize_rowwise": 0.024981796741485596, "w_quantize_colwise_transpose": 0.17706677317619324, "w_quantize_global": 0.07443130016326904, "w_quantize_global_transpose": 0.07870793342590332, "cast_x": 0.01224130392074585, "cast_g": 0.05828961730003357, "cast_w": 0.03501400351524353, "time_standard": 0.41195377707481384, "time_rowwise": 0.5789995193481445, "time_global": 0.5272142589092255}
{"repeat": 64, "batch_size": 2048, "dim_out": 1408, "dim_in": 5632, "wm": 4, "switch": true, "standard_fwd": 0.14651194214820862, "standard_gw": 0.14011189341545105, "standard_gx": 0.140264630317688, "rowwise_fwd": 0.081576406955719, "rowwise_bwd": 0.10671466588973999, "global_fwd": 0.08158013224601746, "global_bwd": 0.10219961404800415, "x_quantize_rowwise": 0.03775954246520996, "g_quantize_rowwise": 0.026103109121322632, "w_quantize_rowwise": 0.02656877040863037, "w_quantize_colwise_transpose": 0.17822161316871643, "w_quantize_global": 0.07506832480430603, "w_quantize_global_transpose": 0.07928535342216492, "cast_x": 0.05893409252166748, "cast_g": 0.012326985597610474, "cast_w": 0.03498047590255737, "time_standard": 0.42688846588134766, "time_rowwise": 0.5970560014247894, "time_global": 0.5421079695224762}
{"repeat": 64, "batch_size": 4096, "dim_out": 5632, "dim_in": 1408, "wm": 4, "switch": false, "standard_fwd": 0.2734065055847168, "standard_gw": 0.25558844208717346, "standard_gx": 0.29174983501434326, "rowwise_fwd": 0.173322856426239, "rowwise_bwd": 0.1515895128250122, "global_fwd": 0.17048418521881104, "global_bwd": 0.1506991684436798, "x_quantize_rowwise": 0.025950372219085693, "g_quantize_rowwise": 0.0653192400932312, "w_quantize_rowwise": 0.027138739824295044, "w_quantize_colwise_transpose": 0.17699971795082092, "w_quantize_global": 0.07373467087745667, "w_quantize_global_transpose": 0.07901713252067566, "cast_x": 0.02214685082435608, "cast_g": 0.11127442121505737, "cast_w": 0.03481656312942505, "time_standard": 0.8207447826862335, "time_rowwise": 0.8759088814258575, "time_global": 0.8207932114601135}
{"repeat": 64, "batch_size": 4096, "dim_out": 1408, "dim_in": 5632, "wm": 4, "switch": true, "standard_fwd": 0.27839839458465576, "standard_gw": 0.2537444233894348, "standard_gx": 0.28207898139953613, "rowwise_fwd": 0.16542896628379822, "rowwise_bwd": 0.18540024757385254, "global_fwd": 0.15722215175628662, "global_bwd": 0.17368420958518982, "x_quantize_rowwise": 0.06661936640739441, "g_quantize_rowwise": 0.027049332857131958, "w_quantize_rowwise": 0.025507062673568726, "w_quantize_colwise_transpose": 0.1741349697113037, "w_quantize_global": 0.07463246583938599, "w_quantize_global_transpose": 0.07879361510276794, "cast_x": 0.11301413178443909, "cast_g": 0.023346394300460815, "cast_w": 0.03505498170852661, "time_standard": 0.8142217993736267, "time_rowwise": 0.8978843688964844, "time_global": 0.8317455649375916}
{"repeat": 64, "batch_size": 8192, "dim_out": 5632, "dim_in": 1408, "wm": 4, "switch": false, "standard_fwd": 0.5755424499511719, "standard_gw": 0.5219094455242157, "standard_gx": 0.5992203950881958, "rowwise_fwd": 0.33193081617355347, "rowwise_bwd": 0.295441597700119, "global_fwd": 0.32791122794151306, "global_bwd": 0.2906434237957001, "x_quantize_rowwise": 0.0337548553943634, "g_quantize_rowwise": 0.1225881278514862, "w_quantize_rowwise": 0.024937093257904053, "w_quantize_colwise_transpose": 0.17729029059410095, "w_quantize_global": 0.0730752944946289, "w_quantize_global_transpose": 0.07835403084754944, "cast_x": 0.058166682720184326, "cast_g": 0.21592900156974792, "cast_w": 0.03454089164733887, "time_standard": 1.6966722905635834, "time_rowwise": 1.5078522264957428, "time_global": 1.4482364058494568}
{"repeat": 64, "batch_size": 8192, "dim_out": 1408, "dim_in": 5632, "wm": 4, "switch": true, "standard_fwd": 0.5104020237922668, "standard_gw": 0.5302242934703827, "standard_gx": 0.5842559039592743, "rowwise_fwd": 0.32220035791397095, "rowwise_bwd": 0.3576017916202545, "global_fwd": 0.2939775586128235, "global_bwd": 0.3313682973384857, "x_quantize_rowwise": 0.12369826436042786, "g_quantize_rowwise": 0.03423169255256653, "w_quantize_rowwise": 0.026501715183258057, "w_quantize_colwise_transpose": 0.16975775361061096, "w_quantize_global": 0.0768713653087616, "w_quantize_global_transpose": 0.08094683289527893, "cast_x": 0.21589547395706177, "cast_g": 0.05825608968734741, "cast_w": 0.03466010093688965, "time_standard": 1.6248822212219238, "time_rowwise": 1.5642158687114716, "time_global": 1.4713183045387268}
{"repeat": 64, "batch_size": 16384, "dim_out": 5632, "dim_in": 1408, "wm": 4, "switch": false, "standard_fwd": 1.194491982460022, "standard_gw": 1.0553859174251556, "standard_gx": 1.0726377367973328, "rowwise_fwd": 0.636763870716095, "rowwise_bwd": 0.5154944956302643, "global_fwd": 0.6281323730945587, "global_bwd": 0.5117170512676239, "x_quantize_rowwise": 0.062175095081329346, "g_quantize_rowwise": 0.23643672466278076, "w_quantize_rowwise": 0.025566667318344116, "w_quantize_colwise_transpose": 0.17768144607543945, "w_quantize_global": 0.07302314043045044, "w_quantize_global_transpose": 0.07866695523262024, "cast_x": 0.11140108108520508, "cast_g": 0.42498111724853516, "cast_w": 0.034831464290618896, "time_standard": 3.3225156366825104, "time_rowwise": 2.7095042169094086, "time_global": 2.645537257194519}
{"repeat": 64, "batch_size": 16384, "dim_out": 1408, "dim_in": 5632, "wm": 4, "switch": true, "standard_fwd": 1.0797791182994843, "standard_gw": 1.062549650669098, "standard_gx": 1.104947179555893, "rowwise_fwd": 0.5390122532844543, "rowwise_bwd": 0.6449781358242035, "global_fwd": 0.5145668983459473, "global_bwd": 0.6276033818721771, "x_quantize_rowwise": 0.23603439331054688, "g_quantize_rowwise": 0.062234699726104736, "w_quantize_rowwise": 0.02781301736831665, "w_quantize_colwise_transpose": 0.1703314483165741, "w_quantize_global": 0.07431954145431519, "w_quantize_global_transpose": 0.08028373122215271, "cast_x": 0.4249885678291321, "cast_g": 0.1113303005695343, "cast_w": 0.0348016619682312, "time_standard": 3.247275948524475, "time_rowwise": 2.742953598499298, "time_global": 2.657592296600342}
{"repeat": 64, "batch_size": 32768, "dim_out": 5632, "dim_in": 1408, "wm": 4, "switch": false, "standard_fwd": 2.392485737800598, "standard_gw": 2.046734094619751, "standard_gx": 2.177651971578598, "rowwise_fwd": 1.252591609954834, "rowwise_bwd": 1.0205842554569244, "global_fwd": 1.230098307132721, "global_bwd": 1.0132193565368652, "x_quantize_rowwise": 0.11823698878288269, "g_quantize_rowwise": 0.4639141261577606, "w_quantize_rowwise": 0.02602487802505493, "w_quantize_colwise_transpose": 0.17801672220230103, "w_quantize_global": 0.07301196455955505, "w_quantize_global_transpose": 0.07893890142440796, "cast_x": 0.21591037511825562, "cast_g": 0.843394547700882, "cast_w": 0.03460049629211426, "time_standard": 6.616871803998947, "time_rowwise": 5.106102675199509, "time_global": 5.0241537392139435}
{"repeat": 64, "batch_size": 32768, "dim_out": 1408, "dim_in": 5632, "wm": 4, "switch": true, "standard_fwd": 2.205628901720047, "standard_gw": 1.9917488098144531, "standard_gx": 2.1518059074878693, "rowwise_fwd": 1.040138304233551, "rowwise_bwd": 1.2538731098175049, "global_fwd": 1.0131187736988068, "global_bwd": 1.2291893362998962, "x_quantize_rowwise": 0.46381354331970215, "g_quantize_rowwise": 0.11790916323661804, "w_quantize_rowwise": 0.027123838663101196, "w_quantize_colwise_transpose": 0.17021596431732178, "w_quantize_global": 0.0752471387386322, "w_quantize_global_transpose": 0.08159875869750977, "cast_x": 0.8433908224105835, "cast_g": 0.215873122215271, "cast_w": 0.03452599048614502, "time_standard": 6.349183619022369, "time_rowwise": 5.064822733402252, "time_global": 4.972625523805618}
{"repeat": 64, "batch_size": 65536, "dim_out": 5632, "dim_in": 1408, "wm": 4, "switch": false, "standard_fwd": 4.755370318889618, "standard_gw": 4.736289381980896, "standard_gx": 4.0378570556640625, "rowwise_fwd": 2.4783052504062653, "rowwise_bwd": 1.9634142518043518, "global_fwd": 2.435591071844101, "global_bwd": 1.9498206675052643, "x_quantize_rowwise": 0.22948533296585083, "g_quantize_rowwise": 0.9186491370201111, "w_quantize_rowwise": 0.028233975172042847, "w_quantize_colwise_transpose": 0.17858296632766724, "w_quantize_global": 0.07418543100357056, "w_quantize_global_transpose": 0.07958710193634033, "cast_x": 0.4257224500179291, "cast_g": 1.680031418800354, "cast_w": 0.03458559513092041, "time_standard": 13.529516756534576, "time_rowwise": 10.532960295677185, "time_global": 10.423608124256134}
{"repeat": 64, "batch_size": 65536, "dim_out": 1408, "dim_in": 5632, "wm": 4, "switch": true, "standard_fwd": 4.050172865390778, "standard_gw": 3.916766494512558, "standard_gx": 4.281226545572281, "rowwise_fwd": 1.9789263606071472, "rowwise_bwd": 2.477586269378662, "global_fwd": 1.9495487213134766, "global_bwd": 2.434592694044113, "x_quantize_rowwise": 0.918261706829071, "g_quantize_rowwise": 0.22961944341659546, "w_quantize_rowwise": 0.025540590286254883, "w_quantize_colwise_transpose": 0.17032772302627563, "w_quantize_global": 0.07384642958641052, "w_quantize_global_transpose": 0.08105114102363586, "cast_x": 1.679886132478714, "cast_g": 0.42508915066719055, "cast_w": 0.03442913293838501, "time_standard": 12.248165905475616, "time_rowwise": 9.717028588056564, "time_global": 9.60368663072586}
{"repeat": 64, "batch_size": 131072, "dim_out": 5632, "dim_in": 1408, "wm": 4, "switch": false, "standard_fwd": 9.53347235918045, "standard_gw": 8.138865232467651, "standard_gx": 7.9666972160339355, "rowwise_fwd": 4.984956234693527, "rowwise_bwd": 3.850068897008896, "global_fwd": 4.9025751650333405, "global_bwd": 3.820303827524185, "x_quantize_rowwise": 0.45222043991088867, "g_quantize_rowwise": 1.8290691077709198, "w_quantize_rowwise": 0.026736408472061157, "w_quantize_colwise_transpose": 0.17832592129707336, "w_quantize_global": 0.07471069693565369, "w_quantize_global_transpose": 0.08177757263183594, "cast_x": 0.8435025811195374, "cast_g": 3.3529214560985565, "cast_w": 0.03475695848464966, "time_standard": 25.639034807682037, "time_rowwise": 19.460242241621017, "time_global": 19.299522042274475}
{"repeat": 64, "batch_size": 131072, "dim_out": 1408, "dim_in": 5632, "wm": 4, "switch": true, "standard_fwd": 7.996037602424622, "standard_gw": 8.2748644053936, "standard_gx": 8.523400872945786, "rowwise_fwd": 3.8556940853595734, "rowwise_bwd": 4.966288805007935, "global_fwd": 3.820043057203293, "global_bwd": 4.882067441940308, "x_quantize_rowwise": 1.8279887735843658, "g_quantize_rowwise": 0.4520900547504425, "w_quantize_rowwise": 0.02676248550415039, "w_quantize_colwise_transpose": 0.17083808779716492, "w_quantize_global": 0.07691606879234314, "w_quantize_global_transpose": 0.08223950862884521, "cast_x": 3.3530443906784058, "cast_g": 0.8434318006038666, "cast_w": 0.034671276807785034, "time_standard": 24.794302880764008, "time_rowwise": 19.574526697397232, "time_global": 19.416209310293198}
{"repeat": 64, "batch_size": 1024, "dim_out": 6656, "dim_in": 1664, "wm": 4, "switch": false, "standard_fwd": 0.09413063526153564, "standard_gw": 0.10038167238235474, "standard_gx": 0.09725615382194519, "rowwise_fwd": 0.05979463458061218, "rowwise_bwd": 0.0525452196598053, "global_fwd": 0.059057027101516724, "global_bwd": 0.05194917321205139, "x_quantize_rowwise": 0.02664700150489807, "g_quantize_rowwise": 0.02642720937728882, "w_quantize_rowwise": 0.030562281608581543, "w_quantize_colwise_transpose": 0.2400912344455719, "w_quantize_global": 0.09407848119735718, "w_quantize_global_transpose": 0.10256841778755188, "cast_x": 0.008724629878997803, "cast_g": 0.028502196073532104, "cast_w": 0.05552172660827637, "time_standard": 0.29176846146583557, "time_rowwise": 0.5364492535591125, "time_global": 0.4611089825630188}
{"repeat": 64, "batch_size": 1024, "dim_out": 1664, "dim_in": 6656, "wm": 4, "switch": true, "standard_fwd": 0.09753555059432983, "standard_gw": 0.10102242231369019, "standard_gx": 0.09121373295783997, "rowwise_fwd": 0.052150338888168335, "rowwise_bwd": 0.059779733419418335, "global_fwd": 0.05161017179489136, "global_bwd": 0.05943328142166138, "x_quantize_rowwise": 0.026702880859375, "g_quantize_rowwise": 0.02469494938850403, "w_quantize_rowwise": 0.03324449062347412, "w_quantize_colwise_transpose": 0.23468583822250366, "w_quantize_global": 0.09394437074661255, "w_quantize_global_transpose": 0.10142102837562561, "cast_x": 0.028360635042190552, "cast_g": 0.008717179298400879, "cast_w": 0.05577504634857178, "time_standard": 0.28977170586586, "time_rowwise": 0.5322806537151337, "time_global": 0.4588291049003601}
{"repeat": 64, "batch_size": 2048, "dim_out": 6656, "dim_in": 1664, "wm": 4, "switch": false, "standard_fwd": 0.18056854605674744, "standard_gw": 0.18374621868133545, "standard_gx": 0.19219890236854553, "rowwise_fwd": 0.1150965690612793, "rowwise_bwd": 0.0903494656085968, "global_fwd": 0.11263042688369751, "global_bwd": 0.08984282612800598, "x_quantize_rowwise": 0.027067959308624268, "g_quantize_rowwise": 0.040043145418167114, "w_quantize_rowwise": 0.03063306212425232, "w_quantize_colwise_transpose": 0.24128705263137817, "w_quantize_global": 0.09361281991004944, "w_quantize_global_transpose": 0.1024976372718811, "cast_x": 0.01381710171699524, "cast_g": 0.06845593452453613, "cast_w": 0.05572289228439331, "time_standard": 0.5565136671066284, "time_rowwise": 0.7282234728336334, "time_global": 0.6494410336017609}
{"repeat": 64, "batch_size": 2048, "dim_out": 1664, "dim_in": 6656, "wm": 4, "switch": true, "standard_fwd": 0.16536936163902283, "standard_gw": 0.19479170441627502, "standard_gx": 0.18597766757011414, "rowwise_fwd": 0.09634345769882202, "rowwise_bwd": 0.11937320232391357, "global_fwd": 0.09264424443244934, "global_bwd": 0.11524930596351624, "x_quantize_rowwise": 0.04038214683532715, "g_quantize_rowwise": 0.025559216737747192, "w_quantize_rowwise": 0.03334507346153259, "w_quantize_colwise_transpose": 0.23956596851348877, "w_quantize_global": 0.09445473551750183, "w_quantize_global_transpose": 0.1020580530166626, "cast_x": 0.06891414523124695, "cast_g": 0.013861805200576782, "cast_w": 0.05607306957244873, "time_standard": 0.546138733625412, "time_rowwise": 0.7493607699871063, "time_global": 0.6651394069194794}
{"repeat": 64, "batch_size": 4096, "dim_out": 6656, "dim_in": 1664, "wm": 4, "switch": false, "standard_fwd": 0.36064907908439636, "standard_gw": 0.3711991012096405, "standard_gx": 0.3863237798213959, "rowwise_fwd": 0.22270530462265015, "rowwise_bwd": 0.1760348677635193, "global_fwd": 0.21781772375106812, "global_bwd": 0.17484650015830994, "x_quantize_rowwise": 0.02625212073326111, "g_quantize_rowwise": 0.07131323218345642, "w_quantize_rowwise": 0.030372291803359985, "w_quantize_colwise_transpose": 0.23974105715751648, "w_quantize_global": 0.09407475590705872, "w_quantize_global_transpose": 0.1024492084980011, "cast_x": 0.028584152460098267, "cast_g": 0.1303069293498993, "cast_w": 0.05582347512245178, "time_standard": 1.1181719601154327, "time_rowwise": 1.137617975473404, "time_global": 1.057952642440796}
{"repeat": 64, "batch_size": 4096, "dim_out": 1664, "dim_in": 6656, "wm": 4, "switch": true, "standard_fwd": 0.32703205943107605, "standard_gw": 0.3764517605304718, "standard_gx": 0.3938935697078705, "rowwise_fwd": 0.18771737813949585, "rowwise_bwd": 0.2374798059463501, "global_fwd": 0.1843757927417755, "global_bwd": 0.23005902767181396, "x_quantize_rowwise": 0.07155537605285645, "g_quantize_rowwise": 0.02625212073326111, "w_quantize_rowwise": 0.03294646739959717, "w_quantize_colwise_transpose": 0.23755058646202087, "w_quantize_global": 0.09388476610183716, "w_quantize_global_transpose": 0.10246038436889648, "cast_x": 0.13131648302078247, "cast_g": 0.028781592845916748, "cast_w": 0.05638599395751953, "time_standard": 1.0973773896694183, "time_rowwise": 1.1699534952640533, "time_global": 1.0850392282009125}
{"repeat": 64, "batch_size": 8192, "dim_out": 6656, "dim_in": 1664, "wm": 4, "switch": false, "standard_fwd": 0.7961541414260864, "standard_gw": 0.7424280047416687, "standard_gx": 0.8688867092132568, "rowwise_fwd": 0.432576984167099, "rowwise_bwd": 0.34543126821517944, "global_fwd": 0.4248805344104767, "global_bwd": 0.3432855010032654, "x_quantize_rowwise": 0.03750622272491455, "g_quantize_rowwise": 0.13292208313941956, "w_quantize_rowwise": 0.030599534511566162, "w_quantize_colwise_transpose": 0.24292618036270142, "w_quantize_global": 0.09351596236228943, "w_quantize_global_transpose": 0.1026056706905365, "cast_x": 0.06843730807304382, "cast_g": 0.2539418637752533, "cast_w": 0.05568563938140869, "time_standard": 2.407468855381012, "time_rowwise": 1.9643902778625488, "time_global": 1.8771439790725708}
{"repeat": 64, "batch_size": 8192, "dim_out": 1664, "dim_in": 6656, "wm": 4, "switch": true, "standard_fwd": 0.7150471210479736, "standard_gw": 0.7525831460952759, "standard_gx": 0.8075274527072906, "rowwise_fwd": 0.36595389246940613, "rowwise_bwd": 0.4404708743095398, "global_fwd": 0.3485158085823059, "global_bwd": 0.4275962710380554, "x_quantize_rowwise": 0.1329965889453888, "g_quantize_rowwise": 0.03767386078834534, "w_quantize_rowwise": 0.03295019268989563, "w_quantize_colwise_transpose": 0.23509934544563293, "w_quantize_global": 0.09398534893989563, "w_quantize_global_transpose": 0.10186433792114258, "cast_x": 0.2537667751312256, "cast_g": 0.06839632987976074, "cast_w": 0.05571544170379639, "time_standard": 2.27515771985054, "time_rowwise": 1.9977279007434845, "time_global": 1.8952153623104095}
{"repeat": 64, "batch_size": 16384, "dim_out": 6656, "dim_in": 1664, "wm": 4, "switch": false, "standard_fwd": 1.6392990946769714, "standard_gw": 1.4941170811653137, "standard_gx": 1.4451220631599426, "rowwise_fwd": 0.8369758725166321, "rowwise_bwd": 0.6830468773841858, "global_fwd": 0.8197203278541565, "global_bwd": 0.6782263517379761, "x_quantize_rowwise": 0.06883591413497925, "g_quantize_rowwise": 0.2565309405326843, "w_quantize_rowwise": 0.03046169877052307, "w_quantize_colwise_transpose": 0.2430342137813568, "w_quantize_global": 0.09346380829811096, "w_quantize_global_transpose": 0.10301917791366577, "cast_x": 0.13044849038124084, "cast_g": 0.5010999739170074, "cast_w": 0.05590170621871948, "time_standard": 4.578538239002228, "time_rowwise": 3.613002598285675, "time_global": 3.5139136016368866}
{"repeat": 64, "batch_size": 16384, "dim_out": 1664, "dim_in": 6656, "wm": 4, "switch": true, "standard_fwd": 1.4654621481895447, "standard_gw": 1.5012174844741821, "standard_gx": 1.5183314681053162, "rowwise_fwd": 0.7059797644615173, "rowwise_bwd": 0.8470229804515839, "global_fwd": 0.6788894534111023, "global_bwd": 0.8200779557228088, "x_quantize_rowwise": 0.2564750611782074, "g_quantize_rowwise": 0.06899237632751465, "w_quantize_rowwise": 0.03293529152870178, "w_quantize_colwise_transpose": 0.23559853434562683, "w_quantize_global": 0.09375810623168945, "w_quantize_global_transpose": 0.10203942656517029, "cast_x": 0.5010105669498444, "cast_g": 0.13037025928497314, "cast_w": 0.05577504634857178, "time_standard": 4.485011100769043, "time_rowwise": 3.648221492767334, "time_global": 3.521449863910675}
{"repeat": 64, "batch_size": 32768, "dim_out": 6656, "dim_in": 1664, "wm": 4, "switch": false, "standard_fwd": 3.236088901758194, "standard_gw": 2.8601549565792084, "standard_gx": 2.8000958263874054, "rowwise_fwd": 1.6548968851566315, "rowwise_bwd": 1.3559646904468536, "global_fwd": 1.6249343752861023, "global_bwd": 1.3474412262439728, "x_quantize_rowwise": 0.13122707605361938, "g_quantize_rowwise": 0.5038455128669739, "w_quantize_rowwise": 0.03061816096305847, "w_quantize_colwise_transpose": 0.24301931262016296, "w_quantize_global": 0.09343400597572327, "w_quantize_global_transpose": 0.10178983211517334, "cast_x": 0.25383010506629944, "cast_g": 0.9955987334251404, "cast_w": 0.05569681525230408, "time_standard": 8.896339684724808, "time_rowwise": 6.779726594686508, "time_global": 6.662826985120773}
{"repeat": 64, "batch_size": 32768, "dim_out": 1664, "dim_in": 6656, "wm": 4, "switch": true, "standard_fwd": 2.8433389961719513, "standard_gw": 2.861086279153824, "standard_gx": 3.0227042734622955, "rowwise_fwd": 1.4057457447052002, "rowwise_bwd": 1.6565024852752686, "global_fwd": 1.3475008308887482, "global_bwd": 1.6247481107711792, "x_quantize_rowwise": 0.5038045346736908, "g_quantize_rowwise": 0.13130158185958862, "w_quantize_rowwise": 0.03298744559288025, "w_quantize_colwise_transpose": 0.23539364337921143, "w_quantize_global": 0.09393692016601562, "w_quantize_global_transpose": 0.10208785533905029, "cast_x": 0.9952597320079803, "cast_g": 0.25385990738868713, "cast_w": 0.05589798092842102, "time_standard": 8.72712954878807, "time_rowwise": 6.826821714639664, "time_global": 6.664466112852097}
{"repeat": 64, "batch_size": 65536, "dim_out": 6656, "dim_in": 1664, "wm": 4, "switch": false, "standard_fwd": 6.449159234762192, "standard_gw": 6.384443491697311, "standard_gx": 5.543403327465057, "rowwise_fwd": 3.3065229654312134, "rowwise_bwd": 2.6249960064888, "global_fwd": 3.2497718930244446, "global_bwd": 2.6061534881591797, "x_quantize_rowwise": 0.25821104645729065, "g_quantize_rowwise": 0.9981803596019745, "w_quantize_rowwise": 0.030606985092163086, "w_quantize_colwise_transpose": 0.24094432592391968, "w_quantize_global": 0.09358301758766174, "w_quantize_global_transpose": 0.10264664888381958, "cast_x": 0.5018562078475952, "cast_g": 1.9840113818645477, "cast_w": 0.05584210157394409, "time_standard": 18.37700605392456, "time_rowwise": 13.843905180692673, "time_global": 13.692989945411682}
{"repeat": 64, "batch_size": 65536, "dim_out": 1664, "dim_in": 6656, "wm": 4, "switch": true, "standard_fwd": 5.508493632078171, "standard_gw": 5.689781159162521, "standard_gx": 6.020743399858475, "rowwise_fwd": 2.640843391418457, "rowwise_bwd": 3.3075474202632904, "global_fwd": 2.605751156806946, "global_bwd": 3.2674334943294525, "x_quantize_rowwise": 0.9983181953430176, "g_quantize_rowwise": 0.25597214698791504, "w_quantize_rowwise": 0.03277510404586792, "w_quantize_colwise_transpose": 0.23587048053741455, "w_quantize_global": 0.09367987513542175, "w_quantize_global_transpose": 0.10236725211143494, "cast_x": 1.9848868250846863, "cast_g": 0.5010329186916351, "cast_w": 0.055771321058273315, "time_standard": 17.219018191099167, "time_rowwise": 13.161107897758484, "time_global": 13.013303279876709}
{"repeat": 64, "batch_size": 131072, "dim_out": 6656, "dim_in": 1664, "wm": 4, "switch": false, "standard_fwd": 12.975204735994339, "standard_gw": 11.424731463193893, "standard_gx": 11.05477660894394, "rowwise_fwd": 6.623122841119766, "rowwise_bwd": 5.253363400697708, "global_fwd": 6.506938487291336, "global_bwd": 5.211424082517624, "x_quantize_rowwise": 0.5057789385318756, "g_quantize_rowwise": 1.9870363175868988, "w_quantize_rowwise": 0.030517578125, "w_quantize_colwise_transpose": 0.24361908435821533, "w_quantize_global": 0.09384006261825562, "w_quantize_global_transpose": 0.10285153985023499, "cast_x": 0.9967051446437836, "cast_g": 3.9620958268642426, "cast_w": 0.05599111318588257, "time_standard": 35.45471280813217, "time_rowwise": 26.068169623613358, "time_global": 25.83260089159012}
{"repeat": 64, "batch_size": 131072, "dim_out": 1664, "dim_in": 6656, "wm": 4, "switch": true, "standard_fwd": 11.05555146932602, "standard_gw": 11.32136583328247, "standard_gx": 12.035444378852844, "rowwise_fwd": 5.243867635726929, "rowwise_bwd": 6.622854620218277, "global_fwd": 5.209986120462418, "global_bwd": 6.507329642772675, "x_quantize_rowwise": 1.9862838089466095, "g_quantize_rowwise": 0.506080687046051, "w_quantize_rowwise": 0.03318488597869873, "w_quantize_colwise_transpose": 0.23682788014411926, "w_quantize_global": 0.09349361062049866, "w_quantize_global_transpose": 0.1023709774017334, "cast_x": 3.962486982345581, "cast_g": 0.9956248104572296, "cast_w": 0.05572289228439331, "time_standard": 34.412361681461334, "time_rowwise": 25.950465351343155, "time_global": 25.726910680532455}