{"id":691,"date":"2021-02-18T09:52:36","date_gmt":"2021-02-18T01:52:36","guid":{"rendered":"http:\/\/wh-nx3um5eupfhyxrxim5x.my3w.com\/index.php\/books\/"},"modified":"2026-01-06T14:13:20","modified_gmt":"2026-01-06T06:13:20","slug":"publications","status":"publish","type":"page","link":"http:\/\/zhangleuestc.cn\/index.php\/publications\/","title":{"rendered":"Publications"},"content":{"rendered":"<h1>Publications<\/h1>\n<div class=\"teachpress_pub_list\"><form name=\"tppublistform\" method=\"get\"><a name=\"tppubs\" id=\"tppubs\"><\/a><div class=\"teachpress_filter\"><select class=\"default\" name=\"yr\" id=\"yr\" tabindex=\"2\" onchange=\"teachpress_jumpMenu('parent',this, 'http:\/\/zhangleuestc.cn\/index.php\/publications\/?')\">\n                   <option value=\"tgid=&amp;yr=&amp;type=&amp;usr=&amp;auth=#tppubs\">All years<\/option>\n                   <option value = \"tgid=&amp;yr=2026&amp;type=&amp;usr=&amp;auth=#tppubs\" >2026<\/option><option value = \"tgid=&amp;yr=2025&amp;type=&amp;usr=&amp;auth=#tppubs\" >2025<\/option><option value = \"tgid=&amp;yr=2024&amp;type=&amp;usr=&amp;auth=#tppubs\" >2024<\/option><option value = \"tgid=&amp;yr=2023&amp;type=&amp;usr=&amp;auth=#tppubs\" >2023<\/option><option value = \"tgid=&amp;yr=2022&amp;type=&amp;usr=&amp;auth=#tppubs\" >2022<\/option><option value = \"tgid=&amp;yr=2021&amp;type=&amp;usr=&amp;auth=#tppubs\" >2021<\/option><option value = \"tgid=&amp;yr=2020&amp;type=&amp;usr=&amp;auth=#tppubs\" >2020<\/option><option value = \"tgid=&amp;yr=2019&amp;type=&amp;usr=&amp;auth=#tppubs\" >2019<\/option><option value = \"tgid=&amp;yr=2018&amp;type=&amp;usr=&amp;auth=#tppubs\" >2018<\/option><option value = \"tgid=&amp;yr=2017&amp;type=&amp;usr=&amp;auth=#tppubs\" >2017<\/option>\n                <\/select><select class=\"default\" name=\"type\" id=\"type\" tabindex=\"3\" onchange=\"teachpress_jumpMenu('parent',this, 'http:\/\/zhangleuestc.cn\/index.php\/publications\/?')\">\n                   <option value=\"tgid=&amp;yr=&amp;type=&amp;usr=&amp;auth=#tppubs\">All types<\/option>\n                   <option value = \"tgid=&amp;yr=&amp;type=article&amp;usr=&amp;auth=#tppubs\" >Journal Articles<\/option><option value = \"tgid=&amp;yr=&amp;type=conference&amp;usr=&amp;auth=#tppubs\" >Conferences<\/option><option value = \"tgid=&amp;yr=&amp;type=inproceedings&amp;usr=&amp;auth=#tppubs\" >Inproceedings<\/option><option value = \"tgid=&amp;yr=&amp;type=proceedings&amp;usr=&amp;auth=#tppubs\" >Proceedings<\/option>\n                <\/select><select class=\"default\" name=\"auth\" id=\"auth\" tabindex=\"5\" onchange=\"teachpress_jumpMenu('parent',this, 'http:\/\/zhangleuestc.cn\/index.php\/publications\/?')\">\n                   <option value=\"tgid=&amp;yr=&amp;type=&amp;usr=&amp;auth=#tppubs\">All authors<\/option>\n                   <option value = \"tgid=&amp;yr=&amp;type=&amp;usr=&amp;auth=36#tppubs\" >Le Zhang 0001<\/option><option value = \"tgid=&amp;yr=&amp;type=&amp;usr=&amp;auth=24#tppubs\" >Narendra Ahuja<\/option><option value = \"tgid=&amp;yr=&amp;type=&amp;usr=&amp;auth=121#tppubs\" >Aiping Huang, Lijian Li, Le Zhang, Yuzhen Niu, Tiesong Zhao, Chia-Wen Lin<\/option><option value = \"tgid=&amp;yr=&amp;type=&amp;usr=&amp;auth=117#tppubs\" >Ao Li, Le Zhang, Yun Liu, Ce Zhu<\/option><option value = \"tgid=&amp;yr=&amp;type=&amp;usr=&amp;auth=81#tppubs\" >Jia-Wang Bian<\/option><option value = \"tgid=&amp;yr=&amp;type=&amp;usr=&amp;auth=15#tppubs\" >JiaWang Bian<\/option><option value = 
\"tgid=&amp;yr=&amp;type=&amp;usr=&amp;auth=136#tppubs\" >Bing Li, Haotian Duan, Yun Liu, Le Zhang, Wei Cui, Joey Tianyi Zhou<\/option><option value = \"tgid=&amp;yr=&amp;type=&amp;usr=&amp;auth=116#tppubs\" >Bing Li, Wei Cui, Le Zhang, Ce Zhu, Wei Wang, Ivor Tsang, Joey Tianyi Zhou<\/option><option value = \"tgid=&amp;yr=&amp;type=&amp;usr=&amp;auth=125#tppubs\" >Bing Li, Wei Cui, Le Zhang, Qi Yang, Min Wu, Joey Tianyi Zhou<\/option><option value = \"tgid=&amp;yr=&amp;type=&amp;usr=&amp;auth=122#tppubs\" >Boyuan Sun, Yuqi Yang, Le Zhang, Ming-Ming Cheng, Qibin Hou<\/option><option value = \"tgid=&amp;yr=&amp;type=&amp;usr=&amp;auth=27#tppubs\" >Xiaofeng Cao<\/option><option value = \"tgid=&amp;yr=&amp;type=&amp;usr=&amp;auth=48#tppubs\" >Yang Cao<\/option><option value = \"tgid=&amp;yr=&amp;type=&amp;usr=&amp;auth=19#tppubs\" >Zhiguang Cao<\/option><option value = \"tgid=&amp;yr=&amp;type=&amp;usr=&amp;auth=55#tppubs\" >Cen Chen<\/option><option value = \"tgid=&amp;yr=&amp;type=&amp;usr=&amp;auth=42#tppubs\" >Liyi Chen<\/option><option value = \"tgid=&amp;yr=&amp;type=&amp;usr=&amp;auth=9#tppubs\" >Zhenghua Chen<\/option><option value = \"tgid=&amp;yr=&amp;type=&amp;usr=&amp;auth=13#tppubs\" >Ming-Ming Cheng<\/option><option value = \"tgid=&amp;yr=&amp;type=&amp;usr=&amp;auth=61#tppubs\" >Zhongyao Cheng<\/option><option value = \"tgid=&amp;yr=&amp;type=&amp;usr=&amp;auth=126#tppubs\" >Cheng Gong, Yao Chen, Qiuyang Luo, Ye Lu, Tao Li, Yuzhi Zhang, Yufei Sun, Le Zhang<\/option><option value = \"tgid=&amp;yr=&amp;type=&amp;usr=&amp;auth=134#tppubs\" >Cheng Lei, Ao Li, Hu Yao, Ce Zhu, Le Zhang<\/option><option value = \"tgid=&amp;yr=&amp;type=&amp;usr=&amp;auth=139#tppubs\" >Cheng Lei, Jie Fan, Xinran Li, Tianzhu Xiang, Ao Li, Ce Zhu, Le Zhang<\/option><option value = \"tgid=&amp;yr=&amp;type=&amp;usr=&amp;auth=7#tppubs\" >Wei Cui<\/option><option value = \"tgid=&amp;yr=&amp;type=&amp;usr=&amp;auth=65#tppubs\" >Jiawei Du<\/option><option value = \"tgid=&amp;yr=&amp;type=&amp;usr=&amp;auth=14#tppubs\" >Deng-Ping Fan<\/option><option value = \"tgid=&amp;yr=&amp;type=&amp;usr=&amp;auth=64#tppubs\" >Zhiwen Fang<\/option><option value = \"tgid=&amp;yr=&amp;type=&amp;usr=&amp;auth=118#tppubs\" >Fanxing Liu, Cheng Zeng, Le Zhang*, Yingjie Zhou*, Qing Mu, Yanru Zhang, Ling Zhang, Ce Zhu<\/option><option value = \"tgid=&amp;yr=&amp;type=&amp;usr=&amp;auth=84#tppubs\" >Chuan-Sheng Foo<\/option><option value = \"tgid=&amp;yr=&amp;type=&amp;usr=&amp;auth=114#tppubs\" >GangXu, QiBin Hou, Le Zhang, Ming-Ming Cheng<\/option><option value = \"tgid=&amp;yr=&amp;type=&amp;usr=&amp;auth=56#tppubs\" >Kaizhou Gao<\/option><option value = \"tgid=&amp;yr=&amp;type=&amp;usr=&amp;auth=87#tppubs\" >Wang Gao<\/option><option value = \"tgid=&amp;yr=&amp;type=&amp;usr=&amp;auth=11#tppubs\" >Teo Sin Gee<\/option><option value = \"tgid=&amp;yr=&amp;type=&amp;usr=&amp;auth=57#tppubs\" >Hongliang Guo<\/option><option value = \"tgid=&amp;yr=&amp;type=&amp;usr=&amp;auth=111#tppubs\" >Guolei Sun, Yun Liu, Hao Tang, Ajad Chhatkuli, Le Zhang, Luc Van Gool<\/option><option value = \"tgid=&amp;yr=&amp;type=&amp;usr=&amp;auth=137#tppubs\" >Hailong Yan, Ao Li, Xiangtao Zhang, Zhe Liu, Zenglin Shi, Ce Zhu, Le Zhang<\/option><option value = \"tgid=&amp;yr=&amp;type=&amp;usr=&amp;auth=132#tppubs\" >Hao Yu, Xin Yang, Le Zhang, Hanlin Gu, Tianrui Li, Lixin Fan, Qiang Yang<\/option><option value = \"tgid=&amp;yr=&amp;type=&amp;usr=&amp;auth=62#tppubs\" >Steven CH Hoi<\/option><option value = 
\"tgid=&amp;yr=&amp;type=&amp;usr=&amp;auth=46#tppubs\" >Chaoyang Jiang<\/option><option value = \"tgid=&amp;yr=&amp;type=&amp;usr=&amp;auth=33#tppubs\" >Peng-Tao Jiang<\/option><option value = \"tgid=&amp;yr=&amp;type=&amp;usr=&amp;auth=141#tppubs\" >Le Zhang, Ao Li, Qibin Hou, Ce Zhu, Yonina C. Eldar<\/option><option value = \"tgid=&amp;yr=&amp;type=&amp;usr=&amp;auth=124#tppubs\" >Le Zhang, Qibin Hou, Yun Liu, Jia-Wang Bian, Xun Xu, Joey Tianyi Zhou, Ce Zhu<\/option><option value = \"tgid=&amp;yr=&amp;type=&amp;usr=&amp;auth=8#tppubs\" >Bing Li<\/option><option value = \"tgid=&amp;yr=&amp;type=&amp;usr=&amp;auth=18#tppubs\" >Jingwen Li<\/option><option value = \"tgid=&amp;yr=&amp;type=&amp;usr=&amp;auth=39#tppubs\" >Lei Li<\/option><option value = \"tgid=&amp;yr=&amp;type=&amp;usr=&amp;auth=35#tppubs\" >Shi-Jie Li<\/option><option value = \"tgid=&amp;yr=&amp;type=&amp;usr=&amp;auth=88#tppubs\" >Wei Wang Wei Cui Bing Li<\/option><option value = \"tgid=&amp;yr=&amp;type=&amp;usr=&amp;auth=49#tppubs\" >Xuan-Yi Li<\/option><option value = \"tgid=&amp;yr=&amp;type=&amp;usr=&amp;auth=91#tppubs\" >Zhichao Li<\/option><option value = \"tgid=&amp;yr=&amp;type=&amp;usr=&amp;auth=71#tppubs\" >Wen-Yan Lin<\/option><option value = \"tgid=&amp;yr=&amp;type=&amp;usr=&amp;auth=83#tppubs\" >Fayao Liu<\/option><option value = \"tgid=&amp;yr=&amp;type=&amp;usr=&amp;auth=12#tppubs\" >Yun Liu<\/option><option value = \"tgid=&amp;yr=&amp;type=&amp;usr=&amp;auth=45#tppubs\" >Shao-Ping Lu<\/option><option value = \"tgid=&amp;yr=&amp;type=&amp;usr=&amp;auth=17#tppubs\" >Yining Ma<\/option><option value = \"tgid=&amp;yr=&amp;type=&amp;usr=&amp;auth=25#tppubs\" >Pierre Moulin<\/option><option value = \"tgid=&amp;yr=&amp;type=&amp;usr=&amp;auth=131#tppubs\" >Obed Irihose, Le Zhang<\/option><option value = \"tgid=&amp;yr=&amp;type=&amp;usr=&amp;auth=53#tppubs\" >Songyou Peng<\/option><option value = \"tgid=&amp;yr=&amp;type=&amp;usr=&amp;auth=66#tppubs\" >Xi Peng<\/option><option value = \"tgid=&amp;yr=&amp;type=&amp;usr=&amp;auth=34#tppubs\" >Vahan Petrosyan<\/option><option value = \"tgid=&amp;yr=&amp;type=&amp;usr=&amp;auth=144#tppubs\" >Qian Zeng, Le Zhang, Yipeng Liu, Ce Zhu, Fan Zhang<\/option><option value = \"tgid=&amp;yr=&amp;type=&amp;usr=&amp;auth=73#tppubs\" >Ian Reid<\/option><option value = \"tgid=&amp;yr=&amp;type=&amp;usr=&amp;auth=142#tppubs\" >Renjie Lin, Hongzhi He, Yilin Wu, Shide Du, Le Zhang<\/option><option value = \"tgid=&amp;yr=&amp;type=&amp;usr=&amp;auth=140#tppubs\" >Renjie Lin, Jiacheng Li, Shide Du, Shiping Wang, Le Zhang<\/option><option value = \"tgid=&amp;yr=&amp;type=&amp;usr=&amp;auth=145#tppubs\" >Rongxin Liao, Feng Li, Yanyan Wei, Zenglin Shi, Le Zhang, Huihui Bai, Meng Wang<\/option><option value = \"tgid=&amp;yr=&amp;type=&amp;usr=&amp;auth=44#tppubs\" >Dongyu She<\/option><option value = \"tgid=&amp;yr=&amp;type=&amp;usr=&amp;auth=85#tppubs\" >Chunhua Shen<\/option><option value = \"tgid=&amp;yr=&amp;type=&amp;usr=&amp;auth=26#tppubs\" >Zenglin Shi<\/option><option value = \"tgid=&amp;yr=&amp;type=&amp;usr=&amp;auth=138#tppubs\" >Shiping Wang, Yueyang Pi, Yang Huang, Fuhai Chen, Le Zhang<\/option><option value = \"tgid=&amp;yr=&amp;type=&amp;usr=&amp;auth=20#tppubs\" >Wen Song<\/option><option value = \"tgid=&amp;yr=&amp;type=&amp;usr=&amp;auth=23#tppubs\" >Ponnuthurai Nagaratnam Suganthan<\/option><option value = \"tgid=&amp;yr=&amp;type=&amp;usr=&amp;auth=43#tppubs\" >Xiaoxiao Sun<\/option><option value = \"tgid=&amp;yr=&amp;type=&amp;usr=&amp;auth=21#tppubs\" >Jing 
Tang<\/option><option value = \"tgid=&amp;yr=&amp;type=&amp;usr=&amp;auth=16#tppubs\" >Dacheng Tao<\/option><option value = \"tgid=&amp;yr=&amp;type=&amp;usr=&amp;auth=120#tppubs\" >Tian Gao, Cheng-Zhong Xu, Le Zhang, Hui Kong<\/option><option value = \"tgid=&amp;yr=&amp;type=&amp;usr=&amp;auth=22#tppubs\" >Jagannadan Varadarajan<\/option><option value = \"tgid=&amp;yr=&amp;type=&amp;usr=&amp;auth=30#tppubs\" >Chen Wang<\/option><option value = \"tgid=&amp;yr=&amp;type=&amp;usr=&amp;auth=90#tppubs\" >Naiyan Wang<\/option><option value = \"tgid=&amp;yr=&amp;type=&amp;usr=&amp;auth=113#tppubs\" >Wei Cui, Le Zhang, Bing Li, Zhenghua Chen, Min Wu, Xiaoli Li, Jiawen Kang<\/option><option value = \"tgid=&amp;yr=&amp;type=&amp;usr=&amp;auth=119#tppubs\" >Wei Meng, Zhicong Liu, Bing Li, Wei Cui, Joey Tianyi Zhou, Le Zhang<\/option><option value = \"tgid=&amp;yr=&amp;type=&amp;usr=&amp;auth=130#tppubs\" >Weiting Ou, Yipeng Liu, Zhijie Sun, Bing Li, Le Zhang, Ce Zhu<\/option><option value = \"tgid=&amp;yr=&amp;type=&amp;usr=&amp;auth=54#tppubs\" >Stefan Winkler<\/option><option value = \"tgid=&amp;yr=&amp;type=&amp;usr=&amp;auth=10#tppubs\" >Min Wu<\/option><option value = \"tgid=&amp;yr=&amp;type=&amp;usr=&amp;auth=86#tppubs\" >Yu-Huan Wu<\/option><option value = \"tgid=&amp;yr=&amp;type=&amp;usr=&amp;auth=133#tppubs\" >Xiangtao Zhang, Sheng Li, Ao Li, Yipeng Liu, Fan Zhang, Ce Zhu, Le Zhang<\/option><option value = \"tgid=&amp;yr=&amp;type=&amp;usr=&amp;auth=67#tppubs\" >Yang Xiao<\/option><option value = \"tgid=&amp;yr=&amp;type=&amp;usr=&amp;auth=31#tppubs\" >Lihua Xie<\/option><option value = \"tgid=&amp;yr=&amp;type=&amp;usr=&amp;auth=129#tppubs\" >Xinhao Li, Yun Liu, Guolei Sun, Min Wu, Le Zhang, Ce Zhu<\/option><option value = \"tgid=&amp;yr=&amp;type=&amp;usr=&amp;auth=68#tppubs\" >Xun Xu<\/option><option value = \"tgid=&amp;yr=&amp;type=&amp;usr=&amp;auth=143#tppubs\" >Yan Huang, Yongyi Su, Xin Lin, Le Zhang, Xun Xu<\/option><option value = \"tgid=&amp;yr=&amp;type=&amp;usr=&amp;auth=40#tppubs\" >Guang Yang<\/option><option value = \"tgid=&amp;yr=&amp;type=&amp;usr=&amp;auth=41#tppubs\" >Jufeng Yang<\/option><option value = \"tgid=&amp;yr=&amp;type=&amp;usr=&amp;auth=28#tppubs\" >Yangdong Ye<\/option><option value = \"tgid=&amp;yr=&amp;type=&amp;usr=&amp;auth=72#tppubs\" >Sai-Kit Yeung<\/option><option value = \"tgid=&amp;yr=&amp;type=&amp;usr=&amp;auth=135#tppubs\" >Yu-Huan Wu, Shi-Chen Zhang, Yun Liu, Le Zhang, Xin Zhan, Daquan Zhou, Jiashi Feng, Ming-Ming Cheng, Liangli Zhen<\/option><option value = \"tgid=&amp;yr=&amp;type=&amp;usr=&amp;auth=115#tppubs\" >Yu-Huan Wu, Yun Liu, Le Zhang, Ming-Ming Cheng, Bo Ren<\/option><option value = \"tgid=&amp;yr=&amp;type=&amp;usr=&amp;auth=32#tppubs\" >Junsong Yuan<\/option><option value = \"tgid=&amp;yr=&amp;type=&amp;usr=&amp;auth=37#tppubs\" >Guodong Zeng<\/option><option value = \"tgid=&amp;yr=&amp;type=&amp;usr=&amp;auth=60#tppubs\" >Zeng Zeng<\/option><option value = \"tgid=&amp;yr=&amp;type=&amp;usr=&amp;auth=89#tppubs\" >Huangying Zhan<\/option><option value = \"tgid=&amp;yr=&amp;type=&amp;usr=&amp;auth=6#tppubs\" >Le Zhang<\/option><option value = \"tgid=&amp;yr=&amp;type=&amp;usr=&amp;auth=82#tppubs\" >Wanyue Zhang<\/option><option value = \"tgid=&amp;yr=&amp;type=&amp;usr=&amp;auth=92#tppubs\" >Xin-Yu Zhang<\/option><option value = \"tgid=&amp;yr=&amp;type=&amp;usr=&amp;auth=58#tppubs\" >Xuexi Zhang<\/option><option value = \"tgid=&amp;yr=&amp;type=&amp;usr=&amp;auth=47#tppubs\" >Jia-Xing Zhao<\/option><option value = 
\"tgid=&amp;yr=&amp;type=&amp;usr=&amp;auth=128#tppubs\" >Zhendong Liu, Le Zhang, Bing Li, Yingjie Zhou, Zhenghua Chen, Ce Zhu<\/option><option value = \"tgid=&amp;yr=&amp;type=&amp;usr=&amp;auth=29#tppubs\" >Guoyan Zheng<\/option><option value = \"tgid=&amp;yr=&amp;type=&amp;usr=&amp;auth=127#tppubs\" >Zhengyuan Xie, Haiquan Lu, Jia-wen Xiao, Enguang Wang, Le Zhang, Xialei Liu<\/option><option value = \"tgid=&amp;yr=&amp;type=&amp;usr=&amp;auth=123#tppubs\" >Zhiwei Lin, Zhe Liu, Zhongyu Xia, Xinhao Wang, Yongtao Wang, Shengxiang Qi, Yang Dong, Nan Dong, Le Zhang, Ce Zhu<\/option><option value = \"tgid=&amp;yr=&amp;type=&amp;usr=&amp;auth=63#tppubs\" >Joey Tianyi Zhou<\/option><option value = \"tgid=&amp;yr=&amp;type=&amp;usr=&amp;auth=109#tppubs\" >Joey Tianyi Zhou; Le Zhang*; Jiawei Du; Xi Peng; Zhiwen Fang; Zhe Xiao; Hongyuan Zhu;<\/option><option value = \"tgid=&amp;yr=&amp;type=&amp;usr=&amp;auth=110#tppubs\" >Xun Xu; Loong-Fah Cheong; Zhuwen Li; Le Zhang; Ce Zhu;<\/option><option value = \"tgid=&amp;yr=&amp;type=&amp;usr=&amp;auth=38#tppubs\" >Xiahai Zhuang<\/option><option value = \"tgid=&amp;yr=&amp;type=&amp;usr=&amp;auth=59#tppubs\" >Xiaofeng Zou<\/option>\n                <\/select><select class=\"default\" name=\"usr\" id=\"usr\" tabindex=\"6\" onchange=\"teachpress_jumpMenu('parent',this, 'http:\/\/zhangleuestc.cn\/index.php\/publications\/?')\">\n                   <option value=\"tgid=&amp;yr=&amp;type=&amp;usr=&amp;auth=#tppubs\">All users<\/option>\n                   <option value = \"tgid=&amp;yr=&amp;type=&amp;usr=1&amp;auth=#tppubs\" >admin<\/option>\n                <\/select><\/div><\/form><div class=\"tablenav\"><div class=\"tablenav-pages\"><span class=\"displaying-num\">62 entries<\/span> <a class=\"page-numbers button disabled\">&laquo;<\/a> <a class=\"page-numbers button disabled\">&lsaquo;<\/a> 1 of 2 <a href=\"http:\/\/zhangleuestc.cn\/index.php\/publications\/?limit=2&amp;tgid=&amp;yr=&amp;type=&amp;usr=&amp;auth=&amp;tsr=#tppubs\" title=\"next page\" class=\"page-numbers button\">&rsaquo;<\/a> <a href=\"http:\/\/zhangleuestc.cn\/index.php\/publications\/?limit=2&amp;tgid=&amp;yr=&amp;type=&amp;usr=&amp;auth=&amp;tsr=#tppubs\" title=\"last page\" class=\"page-numbers button\">&raquo;<\/a> <\/div><\/div><div class=\"teachpress_publication_list\"><h3 class=\"tp_h3\" id=\"tp_h3_2026\">2026<\/h3><div class=\"tp_publication tp_publication_inproceedings\"><div class=\"tp_pub_info\"><p class=\"tp_pub_author\">Renjie Lin, Hongzhi He, Yilin Wu, Shide Du, Le Zhang<\/p><p class=\"tp_pub_title\">Graph Meets Deep Unfolding: An Interpretable Mutual-benefit Multi-view Learning Network <span class=\"tp_pub_type inproceedings\">Inproceedings<\/span> <\/p><p class=\"tp_pub_additional\"><span class=\"tp_pub_additional_in\">In: <\/span><span class=\"tp_pub_additional_booktitle\">AAAI 2026, <\/span><span class=\"tp_pub_additional_year\">2026<\/span>.<\/p><p class=\"tp_pub_menu\"><span class=\"tp_abstract_link\"><a id=\"tp_abstract_sh_67\" class=\"tp_show\" onclick=\"teachpress_pub_showhide('67','tp_abstract')\" title=\"Show abstract\" style=\"cursor:pointer;\">Abstract<\/a><\/span> | <span class=\"tp_bibtex_link\"><a id=\"tp_bibtex_sh_67\" class=\"tp_show\" onclick=\"teachpress_pub_showhide('67','tp_bibtex')\" title=\"Show BibTeX entry\" style=\"cursor:pointer;\">BibTeX<\/a><\/span> | <span class=\"tp_pub_tags_label\">\u6807\u7b7e: <\/span><span class=\"tp_resource_link\"> | <span class=\"tp_pub_links_label\">Links: <\/span><a class=\"tp_pub_link\" 
href=\"https:\/\/github.com\/rl227\/IMML-Net\" title=\"https:\/\/github.com\/rl227\/IMML-Net\" target=\"_blank\"><i class=\"fab fa-github\"><\/i><\/a><\/span><\/p><div class=\"tp_bibtex\" id=\"tp_bibtex_67\" style=\"display:none;\"><div class=\"tp_bibtex_entry\"><pre>@inproceedings{nokey,<br \/>\r\ntitle = {Graph Meets Deep Unfolding: An Interpretable Mutual-benefit Multi-view Learning Network},<br \/>\r\nauthor = {Renjie Lin, Hongzhi He, Yilin Wu, Shide Du, Le Zhang},<br \/>\r\nurl = {https:\/\/github.com\/rl227\/IMML-Net},<br \/>\r\nyear  = {2026},<br \/>\r\ndate = {2026-01-21},<br \/>\r\nurldate = {2026-01-21},<br \/>\r\nbooktitle = {AAAI 2026},<br \/>\r\nabstract = {Significant efforts have been focused on enhancing the uti\u0002lization of multiple node features and topological structures in multi-view graph learning through explicit model-driven and implicit deep learning-based methodologies. The for\u0002mer excels in embedding prior knowledge, thereby offering theoretical interpretability but is limited in application flex\u0002ibility due to manual parameter selection. In contrast, the<br \/>\r\nlatter leverages automatic differentiation, providing greater flexibility but lacking theoretical interpretability due to their opaque nature. Motivated by these observations, we propose an interpretable deep unfolding network for mutual-benefit multi-view graph learning, aiming to combine the strengths of both approaches. Specifically, we employ the Alternating Direction Method of Multipliers (ADMM) to solve a multi\u0002view graph learning model with sparse and low-rank con\u0002straints. This solution is then integrated into deep unfolding networks to enhance interpretability. Furthermore, we con\u0002vert optimization conditions into implicit losses and utilize automatic differentiation to update parameters, reducing the need for manual tuning and increasing flexibility. This inte\u0002gration optimizes multi-view learning for a graph represen\u0002tation that balances interpretability and flexibility. Empirical evaluations on six diverse datasets demonstrate the effective\u0002ness and superiority of the proposed method over state-of\u0002the-art approaches},<br \/>\r\nkeywords = {},<br \/>\r\npubstate = {published},<br \/>\r\ntppubtype = {inproceedings}<br \/>\r\n}<br \/>\r\n<\/pre><\/div><p class=\"tp_close_menu\"><a class=\"tp_close\" onclick=\"teachpress_pub_showhide('67','tp_bibtex')\">Close<\/a><\/p><\/div><div class=\"tp_abstract\" id=\"tp_abstract_67\" style=\"display:none;\"><div class=\"tp_abstract_entry\">Significant efforts have been focused on enhancing the uti\u0002lization of multiple node features and topological structures in multi-view graph learning through explicit model-driven and implicit deep learning-based methodologies. The for\u0002mer excels in embedding prior knowledge, thereby offering theoretical interpretability but is limited in application flex\u0002ibility due to manual parameter selection. In contrast, the<br \/>\r\nlatter leverages automatic differentiation, providing greater flexibility but lacking theoretical interpretability due to their opaque nature. Motivated by these observations, we propose an interpretable deep unfolding network for mutual-benefit multi-view graph learning, aiming to combine the strengths of both approaches. Specifically, we employ the Alternating Direction Method of Multipliers (ADMM) to solve a multi\u0002view graph learning model with sparse and low-rank con\u0002straints. 
Yan Huang, Yongyi Su, Xin Lin, Le Zhang, Xun Xu
Enhancing Generalization of Depth Estimation Foundation Model via Weakly-Supervised Adaptation with Regularization (Inproceedings)
In: AAAI 2026, 2026.
Abstract: The emergence of foundation models has substantially advanced zero-shot generalization in monocular depth estimation (MDE), as exemplified by the Depth Anything series. However, given access to some data from downstream tasks, a natural question arises: can the performance of these models be further improved? To this end, we propose WeSTAR, a parameter-efficient framework that performs Weakly supervised Self-Training Adaptation with Regularization, designed to enhance the robustness of MDE foundation models in unseen and diverse domains. We first adopt a dense self-training objective as the primary source of structural self-supervision. To further improve robustness, we introduce semantically-aware hierarchical normalization, which exploits instance-level segmentation maps to perform more stable and multi-scale structural normalization. Beyond dense supervision, we introduce a cost-efficient weak supervision in the form of pairwise ordinal depth annotations to further guide the adaptation process, which enforces informative ordinal constraints to mitigate local topological errors. Finally, a weight regularization loss is employed to anchor the LoRA updates, ensuring training stability and preserving the model's generalizable knowledge. Extensive experiments on both realistic and corrupted out-of-distribution datasets under diverse and challenging scenarios demonstrate that WeSTAR consistently improves generalization and achieves state-of-the-art performance across a wide range of benchmarks.
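The pairwise ordinal depth supervision mentioned in the abstract can be illustrated with a standard logistic ranking loss over annotated point pairs; this is a generic sketch under assumed tensor shapes, not necessarily the exact objective used in WeSTAR.

import torch

def ordinal_depth_loss(pred_depth, idx_a, idx_b, order):
    # pred_depth: (N, H, W) predicted depth; idx_a/idx_b: (K, 3) integer (sample, row, col) indices
    # order: (K,) with +1 if point a is annotated as farther than point b, -1 if closer
    da = pred_depth[idx_a[:, 0], idx_a[:, 1], idx_a[:, 2]]
    db = pred_depth[idx_b[:, 0], idx_b[:, 1], idx_b[:, 2]]
    # logistic ranking loss: penalizes pairs whose predicted ordering disagrees with the annotation
    return torch.log1p(torch.exp(-order * (da - db))).mean()

# toy usage
pred = torch.rand(2, 4, 4, requires_grad=True)
ia = torch.tensor([[0, 1, 1], [1, 2, 3]])
ib = torch.tensor([[0, 2, 2], [1, 0, 0]])
labels = torch.tensor([1.0, -1.0])
loss = ordinal_depth_loss(pred, ia, ib, labels)
loss.backward()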
2025

Qian Zeng, Le Zhang, Yipeng Liu, Ce Zhu, Fan Zhang
FunOTTA: On-the-Fly Adaptation on Cross-Domain Fundus Image via Stable Test-time Training (Journal Article)
In: IEEE Transactions on Medical Imaging, 2025.
Links: https://github.com/Casperqian/FunOTTA
Abstract: Fundus images are essential for the early screening and detection of eye diseases. While deep learning models using fundus images have significantly advanced the diagnosis of multiple eye diseases, variations in images from different imaging devices and locations (known as domain shifts) pose challenges for deploying pre-trained models in real-world applications. To address this, we propose a novel Fundus On-the-fly Test-Time Adaptation (FunOTTA) framework that effectively generalizes a fundus image diagnosis model to unseen environments, even under strong domain shifts. FunOTTA stands out for its stable adaptation process by performing dynamic disambiguation in the memory bank while minimizing harmful prior knowledge bias. We also introduce a new training objective during adaptation that enables the classifier to incrementally adapt to target patterns with reliable class conditional estimation and consistency regularization. We compare our method with several state-of-the-art test-time adaptation (TTA) pipelines. Experiments on cross-domain fundus image benchmarks across two diseases demonstrate the superiority of the overall framework and individual components under different backbone networks. Code is available at https://github.com/Casperqian/FunOTTA.
Rongxin Liao, Feng Li, Yanyan Wei, Zenglin Shi, Le Zhang, Huihui Bai, Meng Wang
Prompt to Restore, Restore to Prompt: Cyclic Prompting for Universal Adverse Weather Removal (Journal Article)
In: IEEE Transactions on Image Processing, 2025.
Links: https://github.com/RongxinL/CyclicPrompt
Abstract: Universal adverse weather removal (UAWR) seeks to address various weather degradations within a unified framework. Recent methods are inspired by prompt learning using pre-trained vision-language models (e.g., CLIP), leveraging degradation-aware prompts to facilitate weather-free image restoration, yielding significant improvements. In this work, we propose CyclicPrompt, an innovative cyclic prompt approach designed to enhance the effectiveness, adaptability, and generalizability of UAWR. CyclicPrompt comprises two key components: 1) a composite context prompt that integrates weather-related information and context-aware representations into the network to guide restoration. This prompt differs from previous methods by marrying learnable input-conditional vectors with weather-specific knowledge, thereby improving adaptability across various degradations. 2) An erase-and-paste mechanism that, after the initial guided restoration, substitutes weather-specific knowledge with constrained restoration priors, inducing high-quality weather-free concepts into the composite prompt to further fine-tune the restoration process. Therefore, we can form a cyclic "Prompt-Restore-Prompt" pipeline that adeptly harnesses weather-specific knowledge, textual contexts, and reliable textures. Extensive experiments on synthetic and real-world datasets validate the superior performance of CyclicPrompt. The code is available at https://github.com/RongxinL/CyclicPrompt.
Le Zhang, Ao Li, Qibin Hou, Ce Zhu, Yonina C. Eldar
Deep Learning Empowered Super-Resolution: A Comprehensive Survey and Future Prospects (Journal Article)
In: Proceedings of the IEEE, 2025.
Links: https://github.com/AVC2-UESTC/Holistic-Super-Resolution-Review
Abstract: Super-resolution (SR) has garnered significant attention within the computer vision community, driven by advances in deep learning (DL) techniques and the growing demand for high-quality visual applications. With the expansion of this field, numerous surveys have emerged. Most existing surveys focus on specific domains, lacking a comprehensive overview of this field. Here, we present an in-depth review of diverse SR methods, encompassing single image super-resolution (SISR), video super-resolution (VSR), stereo super-resolution (SSR), and light field super-resolution (LFSR). We extensively cover over 150 SISR methods, nearly 70 VSR approaches, and approximately 30 techniques for SSR and LFSR. We analyze methodologies, datasets, evaluation protocols, empirical results, and complexity. In addition, we provide a taxonomy that organizes methods by backbone structure and intended purpose. We also explore valuable yet under-studied open issues in the field. We believe that this work will serve as a valuable resource and offer guidance to researchers in this domain. To facilitate access to related work, we created a dedicated repository available at https://github.com/AVC2-UESTC/Holistic-Super-Resolution-Review.

Cheng Lei, Jie Fan, Xinran Li, Tianzhu Xiang, Ao Li, Ce Zhu, Le Zhang
Towards Real Zero-Shot Camouflaged Object Segmentation without Camouflaged Annotations (Journal Article)
In: IEEE Transactions on Pattern Analysis and Machine Intelligence, 2025.
Abstract: Camouflaged Object Segmentation (COS) faces significant challenges due to the scarcity of annotated data, where meticulous pixel-level annotation is both labor-intensive and costly, primarily due to the intricate object-background boundaries. Addressing the core question, "Can COS be effectively achieved in a zero-shot manner without manual annotations for any camouflaged object?", we propose an affirmative solution. We analyze the learned attention patterns for camouflaged objects and introduce a robust zero-shot COS framework. Our findings reveal that while transformer models for salient object segmentation (SOS) prioritize global features in their attention mechanisms, camouflaged object segmentation exhibits both global and local attention biases. Based on these findings, we design a framework that adapts to the inherent local pattern bias of COS while incorporating global attention patterns and a broad semantic feature space derived from SOS. This enables efficient zero-shot transfer for COS. Specifically, we incorporate a Masked Image Modeling (MIM)-based image encoder optimized for Parameter-Efficient Fine-Tuning (PEFT), a Multimodal Large Language Model (M-LLM), and a Multi-scale Fine-grained Alignment (MFA) mechanism. The MIM encoder captures essential local features, while the PEFT module learns global and semantic representations from SOS datasets. To further enhance semantic granularity, we leverage the M-LLM to generate caption embeddings conditioned on visual cues, which are meticulously aligned with multi-scale visual features via MFA. This alignment enables precise interpretation of complex semantic contexts. Moreover, we introduce a learnable codebook to represent the M-LLM during inference, significantly reducing computational demands while maintaining performance. Our framework demonstrates its versatility and efficacy through rigorous experimentation, achieving state-of-the-art performance in zero-shot COS with $F_\beta^w$ scores of 72.9% on CAMO and 71.7% on COD10K. By removing the M-LLM during inference, we achieve an inference speed comparable to that of traditional end-to-end models, reaching 18.1 FPS. Additionally, our method excels in polyp segmentation and underwater scene segmentation, outperforming challenging baselines in both zero-shot and supervised settings, thereby highlighting its potential for broad applicability in diverse segmentation tasks.

Shiping Wang, Yueyang Pi, Yang Huang, Fuhai Chen, Le Zhang
Multi-Channel Equilibrium Graph Neural Network for Multi-View Semi-Supervised Learning (Journal Article)
In: IEEE Transactions on Pattern Analysis and Machine Intelligence, 2025.
Abstract: In practical applications, the difficulty of multi-view data annotation poses a challenge for multi-view semi-supervised learning. Although some graph-based approaches have been proposed for this task, they often struggle with capturing long-range information and memory bottlenecks, and usually encounter over-smoothing. To address these issues, this paper proposes an implicit model, named Multi-view Equilibrium Graph Neural Network (MEGNN). Through an equilibrium point iterative process, the proposed MEGNN naturally captures long-range information and effectively reduces memory consumption compared with explicit models. Furthermore, the proposed method deals with the issue of over-smoothing in deep graph convolutional networks via a residual connection and a shrinkage factor. We analyze the effect of the shrinkage factor on the information-capturing capability of the model, and demonstrate that the proposed method does not encounter over-smoothing. Comprehensive experimental results demonstrate that the proposed method outperforms the compared state-of-the-art methods.
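A minimal sketch of the equilibrium (implicit) idea described above: graph propagation is iterated to an approximate fixed point, with a residual connection to the input features and a shrinkage factor damping each step. The propagation operator, shrinkage value, and stopping rule here are assumptions for illustration, not the authors' MEGNN implementation.

import torch

def equilibrium_propagation(A_hat, X, W, gamma=0.8, max_iter=50, tol=1e-5):
    # Iterate Z <- gamma * A_hat @ Z @ W + X to an approximate fixed point.
    # A_hat: (N, N) normalized adjacency; X: (N, d) input features (residual term);
    # W: (d, d) channel-mixing weights; gamma: shrinkage factor in (0, 1) damping the propagation.
    Z = X.clone()
    for _ in range(max_iter):
        Z_next = gamma * (A_hat @ Z @ W) + X     # long-range propagation + residual connection
        if torch.norm(Z_next - Z) < tol:         # equilibrium (fixed point) reached
            return Z_next
        Z = Z_next
    return Z

# toy usage: 5 nodes, 3-dimensional features
A_hat = torch.full((5, 5), 0.1) + 0.5 * torch.eye(5)
X = torch.randn(5, 3)
W = 0.1 * torch.randn(3, 3)
Z_star = equilibrium_propagation(A_hat, X, W)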
Hailong Yan, Ao Li, Xiangtao Zhang, Zhe Liu, Zenglin Shi, Ce Zhu, Le Zhang
MobileIE: An Extremely Lightweight and Effective ConvNet for Real-Time Image Enhancement on Mobile Devices (Inproceedings)
In: ICCV, 2025.
Abstract: Recent advancements in deep neural networks have driven significant progress in image enhancement (IE). However, deploying deep learning models on resource-constrained platforms, such as mobile devices, remains challenging due to high computation and memory demands. To address these challenges and facilitate real-time IE on mobile, we introduce an extremely lightweight Convolutional Neural Network (CNN) framework with around 4K parameters. Our approach integrates re-parameterization with an Incremental Weight Optimization strategy to ensure efficiency. Additionally, we enhance performance with a Feature Self-Transform module and a Hierarchical Dual-Path Attention mechanism, optimized with a Local Variance-Weighted loss. With this efficient framework, we are the first to achieve real-time IE inference at up to 1,100 frames per second (FPS) while delivering competitive image quality, achieving the best trade-off between speed and performance across multiple IE tasks.
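Structural re-parameterization, as used in this line of work, trains parallel branches and then merges them into a single convolution for inference. The sketch below shows the generic idea with a 3x3 and a 1x1 branch; MobileIE's actual branch design and its Incremental Weight Optimization strategy are not reproduced here.

import torch
import torch.nn as nn
import torch.nn.functional as F

# training-time block: parallel 3x3 and 1x1 convolutions (BN branches omitted to keep the sketch short)
conv3 = nn.Conv2d(8, 8, kernel_size=3, padding=1, bias=True)
conv1 = nn.Conv2d(8, 8, kernel_size=1, bias=True)

def forward_train(x):
    return conv3(x) + conv1(x)

# inference-time merge: pad the 1x1 kernel to 3x3 and sum kernels and biases into one convolution
merged = nn.Conv2d(8, 8, kernel_size=3, padding=1, bias=True)
with torch.no_grad():
    k1_as_3x3 = F.pad(conv1.weight, [1, 1, 1, 1])   # place the 1x1 kernel at the 3x3 center
    merged.weight.copy_(conv3.weight + k1_as_3x3)
    merged.bias.copy_(conv3.bias + conv1.bias)

x = torch.randn(1, 8, 16, 16)
assert torch.allclose(forward_train(x), merged(x), atol=1e-5)  # identical outputs, single conv at inference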
To address these challenges and facilitate real-time IE on mobile, we introduce an extremely lightweight Convolutional Neural Network (CNN) framework with around 4K parameters. Our approach integrates re-parameterization with an Incremental Weight Optimization strategy to ensure efficiency. Additionally, we enhance performance with a Feature Self-Transform module and a Hierarchical Dual-Path Attention mechanism, optimized with a Local Variance-Weighted loss. With this efficient framework, we are the first to achieve real-time IE inference at up to 1,100 frames per second (FPS) while delivering competitive image quality, achieving the best trade-off between speed and performance across multiple IE tasks.<\/div><p class=\"tp_close_menu\"><a class=\"tp_close\" onclick=\"teachpress_pub_showhide('62','tp_abstract')\">Close<\/a><\/p><\/div><\/div><\/div><div class=\"tp_publication tp_publication_article\"><div class=\"tp_pub_info\"><p class=\"tp_pub_author\">Yu-Huan Wu, Shi-Chen Zhang, Yun Liu, Le Zhang, Xin Zhan, Daquan Zhou, Jiashi Feng, Ming-Ming Cheng, Liangli Zhen<\/p><p class=\"tp_pub_title\">Low-Resolution Self-Attention for Semantic Segmentation <span class=\"tp_pub_type article\">Journal Article<\/span> <\/p><p class=\"tp_pub_additional\"><span class=\"tp_pub_additional_in\">In: <\/span><span class=\"tp_pub_additional_journal\">IEEE Transactions on Pattern Analysis and Machine Intelligence, <\/span><span class=\"tp_pub_additional_year\">2025<\/span>.<\/p><p class=\"tp_pub_menu\"><span class=\"tp_abstract_link\"><a id=\"tp_abstract_sh_60\" class=\"tp_show\" onclick=\"teachpress_pub_showhide('60','tp_abstract')\" title=\"Show abstract\" style=\"cursor:pointer;\">Abstract<\/a><\/span> | <span class=\"tp_bibtex_link\"><a id=\"tp_bibtex_sh_60\" class=\"tp_show\" onclick=\"teachpress_pub_showhide('60','tp_bibtex')\" title=\"Show BibTeX entry\" style=\"cursor:pointer;\">BibTeX<\/a><\/span> | <span class=\"tp_pub_tags_label\">\u6807\u7b7e: <\/span><span class=\"tp_resource_link\"> | <span class=\"tp_pub_links_label\">Links: <\/span><a class=\"tp_pub_link\" href=\"https:\/\/github.com\/yuhuan-wu\/LRFormer\" title=\"https:\/\/github.com\/yuhuan-wu\/LRFormer\" target=\"_blank\"><i class=\"fab fa-github\"><\/i><\/a><\/span><\/p><div class=\"tp_bibtex\" id=\"tp_bibtex_60\" style=\"display:none;\"><div class=\"tp_bibtex_entry\"><pre>@article{nokey,<br \/>\r\ntitle = {Low-Resolution Self-Attention for Semantic Segmentation},<br \/>\r\nauthor = {Yu-Huan Wu, Shi-Chen Zhang, Yun Liu, Le Zhang, Xin Zhan, Daquan Zhou, Jiashi Feng, Ming-Ming Cheng, Liangli Zhen},<br \/>\r\nurl = {https:\/\/github.com\/yuhuan-wu\/LRFormer},<br \/>\r\nyear  = {2025},<br \/>\r\ndate = {2025-06-05},<br \/>\r\nurldate = {2025-06-05},<br \/>\r\njournal = {IEEE Transactions on Pattern Analysis and Machine Intelligence},<br \/>\r\nabstract = {Semantic segmentation tasks naturally require high-resolution information for pixel-wise segmentation and global context<br \/>\r\ninformation for class prediction. While existing vision transformers demonstrate promising performance, they often utilize high-resolution<br \/>\r\ncontext modeling, resulting in a computational bottleneck. In this work, we challenge conventional wisdom and introduce the Low-<br \/>\r\nResolution Self-Attention (LRSA) mechanism to capture global context at a significantly reduced computational cost, i.e., FLOPs. 
Our approach involves computing self-attention in a fixed low-resolution space, regardless of the input image\u2019s resolution, with additional 3 \u00d7 3 depth-wise convolutions to capture fine details in the high-resolution space. We demonstrate the effectiveness of our LRSA approach by building the LRFormer, a vision transformer with an encoder-decoder structure. Extensive experiments on the ADE20K, COCO-Stuff, and Cityscapes datasets demonstrate that LRFormer outperforms state-of-the-art models. Code is available at https:\/\/github.com\/yuhuan-wu\/LRFormer.},<br \/>\r\nkeywords = {},<br \/>\r\npubstate = {published},<br \/>\r\ntppubtype = {article}<br \/>\r\n}<br \/>\r\n<\/pre><\/div><p class=\"tp_close_menu\"><a class=\"tp_close\" onclick=\"teachpress_pub_showhide('60','tp_bibtex')\">Close<\/a><\/p><\/div><div class=\"tp_abstract\" id=\"tp_abstract_60\" style=\"display:none;\"><div class=\"tp_abstract_entry\">Semantic segmentation tasks naturally require high-resolution information for pixel-wise segmentation and global context information for class prediction. While existing vision transformers demonstrate promising performance, they often utilize high-resolution context modeling, resulting in a computational bottleneck. In this work, we challenge conventional wisdom and introduce the Low-Resolution Self-Attention (LRSA) mechanism to capture global context at a significantly reduced computational cost, i.e., FLOPs. Our approach involves computing self-attention in a fixed low-resolution space, regardless of the input image\u2019s resolution, with additional 3 \u00d7 3 depth-wise convolutions to capture fine details in the high-resolution space. We demonstrate the effectiveness of our LRSA approach by building the LRFormer, a vision transformer with an encoder-decoder structure. Extensive experiments on the ADE20K, COCO-Stuff, and Cityscapes datasets demonstrate that LRFormer outperforms state-of-the-art models. Code is available at https:\/\/github.com\/yuhuan-wu\/LRFormer.<\/div><p class=\"tp_close_menu\"><a class=\"tp_close\" onclick=\"teachpress_pub_showhide('60','tp_abstract')\">Close<\/a><\/p><\/div><\/div><\/div><div class=\"tp_publication tp_publication_article\"><div class=\"tp_pub_info\"><p class=\"tp_pub_author\">Bing Li, Haotian Duan, Yun Liu, Le Zhang, Wei Cui, Joey Tianyi Zhou<\/p><p class=\"tp_pub_title\">STADe: Sensory Temporal Action Detection via Temporal-Spectral Representation Learning <span class=\"tp_pub_type article\">Journal Article<\/span> <\/p><p class=\"tp_pub_additional\"><span class=\"tp_pub_additional_in\">In: <\/span><span class=\"tp_pub_additional_journal\">IEEE Transactions on Pattern Analysis and Machine Intelligence, <\/span><span class=\"tp_pub_additional_year\">2025<\/span>.<\/p><p class=\"tp_pub_menu\"><span class=\"tp_abstract_link\"><a id=\"tp_abstract_sh_61\" class=\"tp_show\" onclick=\"teachpress_pub_showhide('61','tp_abstract')\" title=\"Show abstract\" style=\"cursor:pointer;\">Abstract<\/a><\/span> | <span class=\"tp_bibtex_link\"><a id=\"tp_bibtex_sh_61\" class=\"tp_show\" onclick=\"teachpress_pub_showhide('61','tp_bibtex')\" title=\"Show BibTeX entry\" style=\"cursor:pointer;\">BibTeX<\/a><\/span> | <span class=\"tp_pub_tags_label\">\u6807\u7b7e: <\/span><\/p><div class=\"tp_bibtex\" id=\"tp_bibtex_61\" style=\"display:none;\"><div class=\"tp_bibtex_entry\"><pre>@article{nokey,<br \/>\r\ntitle = {STADe: Sensory Temporal Action Detection via Temporal-Spectral Representation Learning},<br \/>\r\nauthor = {Bing Li, Haotian Duan, Yun Liu, Le Zhang, Wei Cui, Joey Tianyi Zhou},<br \/>\r\nyear  = {2025},<br \/>\r\ndate = {2025-06-05},<br \/>\r\nurldate = {2025-06-05},<br \/>\r\njournal = {IEEE Transactions on Pattern Analysis and Machine Intelligence},<br \/>\r\nabstract = {Temporal action detection (TAD) is a vital challenge in computer vision and the Internet of Things, aiming to detect and identify actions within temporal sequences. While TAD has primarily been associated with video data, its applications can also be extended to sensor data, opening up opportunities for various real-world applications. However, applying existing TAD models to sensory signals presents distinct challenges such as varying sampling rates, intricate pattern structures, and subtle, noise-prone patterns. In response to these challenges, we propose a Sensory Temporal Action Detection (STADe) model. STADe leverages Fourier kernels and adaptive frequency filtering to adaptively capture the nuanced interplay of temporal and frequency features underlying complex patterns. Moreover, STADe embraces adaptability by employing deep fusion at varying resolutions and scales, making it versatile enough to accommodate diverse data characteristics, such as the wide spectrum of sampling rates and action durations encountered in sensory signals. Unlike conventional models with unidirectional category-to-proposal dependencies, STADe adopts a cross-cascade predictor to introduce bidirectional and temporal dependencies within categories. To extensively evaluate STADe and promote future research in sensory TAD, we establish three diverse datasets using various sensors, featuring diverse sensor types, action categories, and sampling rates. Experiments across one public and our three new datasets demonstrate STADe's superior performance over state-of-the-art TAD models in sensory TAD tasks. 
Codes, models, and data will be released.},<br \/>\r\nkeywords = {},<br \/>\r\npubstate = {published},<br \/>\r\ntppubtype = {article}<br \/>\r\n}<br \/>\r\n<\/pre><\/div><p class=\"tp_close_menu\"><a class=\"tp_close\" onclick=\"teachpress_pub_showhide('61','tp_bibtex')\">Close<\/a><\/p><\/div><div class=\"tp_abstract\" id=\"tp_abstract_61\" style=\"display:none;\"><div class=\"tp_abstract_entry\">Temporal action detection (TAD) is a vital challenge in computer vision and the Internet of Things, aiming to detect and identify actions within temporal sequences. While TAD has primarily been associated with video data, its applications can also be extended to sensor data, opening up opportunities for various real-world applications. However, applying existing TAD models to sensory signals presents distinct challenges such as varying sampling rates, intricate pattern structures, and subtle, noise-prone patterns. In response to these challenges, we propose a Sensory Temporal Action Detection (STADe) model. STADe leverages Fourier kernels and adaptive frequency filtering to adaptively capture the nuanced interplay of temporal and frequency features underlying complex patterns. Moreover, STADe embraces adaptability by employing deep fusion at varying resolutions and scales, making it versatile enough to accommodate diverse data characteristics, such as the wide spectrum of sampling rates and action durations encountered in sensory signals. Unlike conventional models with unidirectional category-to-proposal dependencies, STADe adopts a cross-cascade predictor to introduce bidirectional and temporal dependencies within categories. To extensively evaluate STADe and promote future research in sensory TAD, we establish three diverse datasets using various sensors, featuring diverse sensor types, action categories, and sampling rates. Experiments across one public and our three new datasets demonstrate STADe's superior performance over state-of-the-art TAD models in sensory TAD tasks. 
Codes, models, and data will be released.<\/div><p class=\"tp_close_menu\"><a class=\"tp_close\" onclick=\"teachpress_pub_showhide('61','tp_abstract')\">Close<\/a><\/p><\/div><\/div><\/div><div class=\"tp_publication tp_publication_conference\"><div class=\"tp_pub_info\"><p class=\"tp_pub_author\">Renjie Lin, Jiacheng Li, Shide Du, Shiping Wang, Le Zhang<\/p><p class=\"tp_pub_title\">OIMGC-Net: Optimization-inspired Interpretable Multi-view Graph Clustering Network <span class=\"tp_pub_type conference\">Conference<\/span> <\/p><p class=\"tp_pub_additional\"><span class=\"tp_pub_additional_booktitle\">ACM MM, <\/span><span class=\"tp_pub_additional_year\">2025<\/span>.<\/p><p class=\"tp_pub_menu\"><span class=\"tp_abstract_link\"><a id=\"tp_abstract_sh_65\" class=\"tp_show\" onclick=\"teachpress_pub_showhide('65','tp_abstract')\" title=\"Show abstract\" style=\"cursor:pointer;\">Abstract<\/a><\/span> | <span class=\"tp_bibtex_link\"><a id=\"tp_bibtex_sh_65\" class=\"tp_show\" onclick=\"teachpress_pub_showhide('65','tp_bibtex')\" title=\"Show BibTeX entry\" style=\"cursor:pointer;\">BibTeX<\/a><\/span> | <span class=\"tp_pub_tags_label\">\u6807\u7b7e: <\/span><\/p><div class=\"tp_bibtex\" id=\"tp_bibtex_65\" style=\"display:none;\"><div class=\"tp_bibtex_entry\"><pre>@conference{nokey,<br \/>\r\ntitle = {OIMGC-Net: Optimization-inspired Interpretable Multi-view Graph Clustering Network},<br \/>\r\nauthor = {Renjie Lin, Jiacheng Li, Shide Du, Shiping Wang, Le Zhang},<br \/>\r\nyear  = {2025},<br \/>\r\ndate = {2025-05-01},<br \/>\r\nurldate = {2025-05-01},<br \/>\r\nbooktitle = {ACM MM},<br \/>\r\nabstract = {Deep multi-view graph clustering seeks to integrate diverse graph feature sets and uncover consistent information across multiple<br \/>\r\nviews. While extensive prior research has utilized various neural network architectures to address multi-view graph clustering<br \/>\r\nchallenges, these approaches exhibit notable limitations: 1) The \"black-box\" nature of deep learning models, which obscures their<br \/>\r\ninternal mechanisms and impedes interpretability; 2) Insufffcient efforts aim to capture low-dimensional representations through<br \/>\r\ngraphs that reffect intuitive clustering structures and reduce computational cost. To address these limitations, this paper introduces an<br \/>\r\ninterpretable multi-view graph clustering framework constructed with optimization-inspired modules. The proposed approach formulates<br \/>\r\nlow-dimensional clustering representation learning from graph matrices as an optimization problem, deriving an iterative solution rooted in this formulation. By seamlessly bridging this optimization process to a deep network architecture, the model learns a low-dimensional clustering representation for graph-structured data across multiple views while adhering to the iterative optimization principles and reducing computational costs. This transparent network design enhances the interpretability of multi-view clustering, enabling intuitive and human-understandable learning of clustering structures. 
Extensive experimental evaluations validate the proposed framework\u2019s superiority over state-of-the-art methods in multi-view clustering tasks while ensuring interpretability.},<br \/>\r\nkeywords = {},<br \/>\r\npubstate = {published},<br \/>\r\ntppubtype = {conference}<br \/>\r\n}<br \/>\r\n<\/pre><\/div><p class=\"tp_close_menu\"><a class=\"tp_close\" onclick=\"teachpress_pub_showhide('65','tp_bibtex')\">Close<\/a><\/p><\/div><div class=\"tp_abstract\" id=\"tp_abstract_65\" style=\"display:none;\"><div class=\"tp_abstract_entry\">Deep multi-view graph clustering seeks to integrate diverse graph feature sets and uncover consistent information across multiple views. While extensive prior research has utilized various neural network architectures to address multi-view graph clustering challenges, these approaches exhibit notable limitations: 1) The &quot;black-box&quot; nature of deep learning models, which obscures their internal mechanisms and impedes interpretability; 2) Insufficient efforts aim to capture low-dimensional representations through graphs that reflect intuitive clustering structures and reduce computational cost. To address these limitations, this paper introduces an interpretable multi-view graph clustering framework constructed with optimization-inspired modules. The proposed approach formulates low-dimensional clustering representation learning from graph matrices as an optimization problem, deriving an iterative solution rooted in this formulation. By seamlessly bridging this optimization process to a deep network architecture, the model learns a low-dimensional clustering representation for graph-structured data across multiple views while adhering to the iterative optimization principles and reducing computational costs. This transparent network design enhances the interpretability of multi-view clustering, enabling intuitive and human-understandable learning of clustering structures. 
Extensive experimental evaluations validate the proposed framework\u2019s superiority over state-of-the-art methods in multi-view clustering tasks while ensuring interpretability.<\/div><p class=\"tp_close_menu\"><a class=\"tp_close\" onclick=\"teachpress_pub_showhide('65','tp_abstract')\">Close<\/a><\/p><\/div><\/div><\/div><div class=\"tp_publication tp_publication_inproceedings\"><div class=\"tp_pub_info\"><p class=\"tp_pub_author\">Cheng Lei, Ao Li, Hu Yao, Ce Zhu, Le Zhang<\/p><p class=\"tp_pub_title\">Rethinking Token Reduction with Parameter-Efficient Fine-Tuning in ViT for Pixel-Level Tasks <span class=\"tp_pub_type inproceedings\">Inproceedings<\/span> <\/p><p class=\"tp_pub_additional\"><span class=\"tp_pub_additional_in\">In: <\/span><span class=\"tp_pub_additional_booktitle\">CVPR, <\/span><span class=\"tp_pub_additional_year\">2025<\/span>.<\/p><p class=\"tp_pub_menu\"><span class=\"tp_abstract_link\"><a id=\"tp_abstract_sh_58\" class=\"tp_show\" onclick=\"teachpress_pub_showhide('58','tp_abstract')\" title=\"Show abstract\" style=\"cursor:pointer;\">Abstract<\/a><\/span> | <span class=\"tp_bibtex_link\"><a id=\"tp_bibtex_sh_58\" class=\"tp_show\" onclick=\"teachpress_pub_showhide('58','tp_bibtex')\" title=\"Show BibTeX entry\" style=\"cursor:pointer;\">BibTeX<\/a><\/span> | <span class=\"tp_pub_tags_label\">\u6807\u7b7e: <\/span><\/p><div class=\"tp_bibtex\" id=\"tp_bibtex_58\" style=\"display:none;\"><div class=\"tp_bibtex_entry\"><pre>@inproceedings{nokey,<br \/>\r\ntitle = {Rethinking Token Reduction with Parameter-Efficient Fine-Tuning in ViT for Pixel-Level Tasks},<br \/>\r\nauthor = {Cheng Lei, Ao Li, Hu Yao, Ce Zhu, Le Zhang},<br \/>\r\nyear  = {2025},<br \/>\r\ndate = {2025-02-28},<br \/>\r\nurldate = {2025-02-28},<br \/>\r\nbooktitle = {CVPR},<br \/>\r\nabstract = {Parameter-efficient fine-tuning (PEFT) adapts pre-trained models to new tasks by updating only a small subset of parameters, achieving efficiency but still facing significant inference costs driven by input token length. This challenge is even more pronounced in pixel-level tasks, which require longer input sequences compared to image-level tasks. Although token reduction (TR) techniques can help reduce computational demands, they often lead to homogeneous attention patterns that compromise performance in pixel-level scenarios. This study underscores the importance of maintaining attention diversity for these tasks and proposes to enhance attention diversity while ensuring the completeness of token sequences. Our approach effectively reduces the number of tokens processed within transformer blocks, improving computational efficiency without sacrificing performance on several pixel-level tasks. We also demonstrate the superior generalization capability of our proposed method compared to challenging baseline models.},<br \/>\r\nkeywords = {},<br \/>\r\npubstate = {published},<br \/>\r\ntppubtype = {inproceedings}<br \/>\r\n}<br \/>\r\n<\/pre><\/div><p class=\"tp_close_menu\"><a class=\"tp_close\" onclick=\"teachpress_pub_showhide('58','tp_bibtex')\">Close<\/a><\/p><\/div><div class=\"tp_abstract\" id=\"tp_abstract_58\" style=\"display:none;\"><div class=\"tp_abstract_entry\">Parameter-efficient fine-tuning (PEFT) adapts pre-trained models to new tasks by updating only a small subset of parameters, achieving efficiency but still facing significant inference costs driven by input token length. 
This challenge is even more pronounced in pixel-level tasks, which require longer input sequences compared to image-level tasks. Although token reduction (TR) techniques can help reduce computational demands, they often lead to homogeneous attention patterns that compromise performance in pixel-level scenarios. This study underscores the importance of maintaining attention diversity for these tasks and proposes to enhance attention diversity while ensuring the completeness of token sequences. Our approach effectively reduces the number of tokens processed within transformer blocks, improving computational efficiency without sacrificing performance on several pixel-level tasks. We also demonstrate the superior generalization capability of our proposed method compared to challenging baseline models.<\/div><p class=\"tp_close_menu\"><a class=\"tp_close\" onclick=\"teachpress_pub_showhide('58','tp_abstract')\">Close<\/a><\/p><\/div><\/div><\/div><div class=\"tp_publication tp_publication_inproceedings\"><div class=\"tp_pub_info\"><p class=\"tp_pub_author\">Xiangtao Zhang, Sheng Li, Ao Li, Yipeng Liu, Fan Zhang, Ce Zhu, Le Zhang<\/p><p class=\"tp_pub_title\">Subspace Constraint and Contribution Estimation for Heterogeneous Federated Learning <span class=\"tp_pub_type inproceedings\">Inproceedings<\/span> <\/p><p class=\"tp_pub_additional\"><span class=\"tp_pub_additional_in\">In: <\/span><span class=\"tp_pub_additional_booktitle\">CVPR, <\/span><span class=\"tp_pub_additional_year\">2025<\/span>.<\/p><p class=\"tp_pub_menu\"><span class=\"tp_abstract_link\"><a id=\"tp_abstract_sh_59\" class=\"tp_show\" onclick=\"teachpress_pub_showhide('59','tp_abstract')\" title=\"Show abstract\" style=\"cursor:pointer;\">Abstract<\/a><\/span> | <span class=\"tp_bibtex_link\"><a id=\"tp_bibtex_sh_59\" class=\"tp_show\" onclick=\"teachpress_pub_showhide('59','tp_bibtex')\" title=\"Show BibTeX entry\" style=\"cursor:pointer;\">BibTeX<\/a><\/span> | <span class=\"tp_pub_tags_label\">\u6807\u7b7e: <\/span><\/p><div class=\"tp_bibtex\" id=\"tp_bibtex_59\" style=\"display:none;\"><div class=\"tp_bibtex_entry\"><pre>@inproceedings{nokey,<br \/>\r\ntitle = {Subspace Constraint and Contribution Estimation for Heterogeneous Federated Learning},<br \/>\r\nauthor = {Xiangtao Zhang, Sheng Li, Ao Li, Yipeng Liu, Fan Zhang, Ce Zhu, Le Zhang},<br \/>\r\nyear  = {2025},<br \/>\r\ndate = {2025-02-28},<br \/>\r\nurldate = {2025-02-28},<br \/>\r\nbooktitle = {CVPR},<br \/>\r\nabstract = {Heterogeneous Federated Learning (HFL) has received widespread attention due to its adaptability to different models and data. The HFL approach utilizing auxiliary models for knowledge transfer enhances flexibility. However, existing frameworks face the challenges of aggregation bias and local overfitting. To address these issues, we propose FedSCE. It reduces the degree of freedom of update and improves generalization performance by limiting the specific layer of local model update to the local subspace. The subspace is dynamically updated to ensure coverage of the latest model update trajectory. Additionally, FedSCE evaluates client contributions based on the update distance of the auxiliary model in feature space and parameter space, achieving adaptive weighted aggregation. We validate our approach in both feature-skewed and label-skewed scenarios, demonstrating that on the Office10, our method exceeds the best baseline by 3.87%. 
Our source code will be released.},<br \/>\r\nkeywords = {},<br \/>\r\npubstate = {published},<br \/>\r\ntppubtype = {inproceedings}<br \/>\r\n}<br \/>\r\n<\/pre><\/div><p class=\"tp_close_menu\"><a class=\"tp_close\" onclick=\"teachpress_pub_showhide('59','tp_bibtex')\">Close<\/a><\/p><\/div><div class=\"tp_abstract\" id=\"tp_abstract_59\" style=\"display:none;\"><div class=\"tp_abstract_entry\">Heterogeneous Federated Learning (HFL) has received widespread attention due to its adaptability to different models and data. The HFL approach utilizing auxiliary models for knowledge transfer enhances flexibility. However, existing frameworks face the challenges of aggregation bias and local overfitting. To address these issues, we propose FedSCE. It reduces the degree of freedom of update and improves generalization performance by limiting the specific layer of local model update to the local subspace. The subspace is dynamically updated to ensure coverage of the latest model update trajectory. Additionally, FedSCE evaluates client contributions based on the update distance of the auxiliary model in feature space and parameter space, achieving adaptive weighted aggregation. We validate our approach in both feature-skewed and label-skewed scenarios, demonstrating that on the Office10, our method exceeds the best baseline by 3.87%. Our source code will be released.<\/div><p class=\"tp_close_menu\"><a class=\"tp_close\" onclick=\"teachpress_pub_showhide('59','tp_abstract')\">Close<\/a><\/p><\/div><\/div><\/div><div class=\"tp_publication tp_publication_inproceedings\"><div class=\"tp_pub_info\"><p class=\"tp_pub_author\">Hao Yu, Xin Yang, Le Zhang, Hanlin Gu, Tianrui Li, Lixin Fan, Qiang Yang<\/p><p class=\"tp_pub_title\">Handling Spatial-Temporal Data Heterogeneity for Federated Continual Learning via Tail Anchor <span class=\"tp_pub_type inproceedings\">Inproceedings<\/span> <\/p><p class=\"tp_pub_additional\"><span class=\"tp_pub_additional_in\">In: <\/span><span class=\"tp_pub_additional_booktitle\">CVPR, <\/span><span class=\"tp_pub_additional_year\">2025<\/span>.<\/p><p class=\"tp_pub_menu\"><span class=\"tp_abstract_link\"><a id=\"tp_abstract_sh_57\" class=\"tp_show\" onclick=\"teachpress_pub_showhide('57','tp_abstract')\" title=\"Show abstract\" style=\"cursor:pointer;\">Abstract<\/a><\/span> | <span class=\"tp_bibtex_link\"><a id=\"tp_bibtex_sh_57\" class=\"tp_show\" onclick=\"teachpress_pub_showhide('57','tp_bibtex')\" title=\"Show BibTeX entry\" style=\"cursor:pointer;\">BibTeX<\/a><\/span> | <span class=\"tp_pub_tags_label\">\u6807\u7b7e: <\/span><\/p><div class=\"tp_bibtex\" id=\"tp_bibtex_57\" style=\"display:none;\"><div class=\"tp_bibtex_entry\"><pre>@inproceedings{nokey,<br \/>\r\ntitle = {Handling Spatial-Temporal Data Heterogeneity for Federated Continual Learning via Tail Anchor},<br \/>\r\nauthor = {Hao Yu, Xin Yang, Le Zhang, Hanlin Gu, Tianrui Li, Lixin Fan, Qiang Yang},<br \/>\r\nyear  = {2025},<br \/>\r\ndate = {2025-02-02},<br \/>\r\nurldate = {2025-02-02},<br \/>\r\nbooktitle = {CVPR},<br \/>\r\njournal = {CVPR},<br \/>\r\nabstract = {Federated continual learning (FCL) allows each client to continually update its knowledge from task streams, enhancing the applicability of federated learning in real-world scenarios. However, FCL needs to address not only spatial data heterogeneity between clients but also temporal data heterogeneity between tasks. 
In this paper, empirical experiments demonstrate that such input-level heterogeneity significantly affects the model's internal parameters and outputs, leading to severe spatial-temporal catastrophic forgetting of previous and local knowledge. To this end, we propose Federated Tail Anchor (FedTA) to mix trainable Tail Anchor with the frozen output features to adjust their position in the feature space, thereby overcoming parameter-forgetting and output-forgetting. Moreover, three novel components are also included in FedTA:  Input Enhancement for improving the performance of pre-trained models on downstream tasks;  Selective Input Knowledge Fusion for fusion of heterogeneous local knowledge on the server side; and Best Global Prototype Selection for finding the best anchor point for each class in the feature space. Extensive experiments demonstrate that FedTA not only outperforms existing FCL methods but also effectively preserves the relative positions of features, remaining unaffected by spatial and temporal changes},<br \/>\r\nkeywords = {},<br \/>\r\npubstate = {published},<br \/>\r\ntppubtype = {inproceedings}<br \/>\r\n}<br \/>\r\n<\/pre><\/div><p class=\"tp_close_menu\"><a class=\"tp_close\" onclick=\"teachpress_pub_showhide('57','tp_bibtex')\">Close<\/a><\/p><\/div><div class=\"tp_abstract\" id=\"tp_abstract_57\" style=\"display:none;\"><div class=\"tp_abstract_entry\">Federated continual learning (FCL) allows each client to continually update its knowledge from task streams, enhancing the applicability of federated learning in real-world scenarios. However, FCL needs to address not only spatial data heterogeneity between clients but also temporal data heterogeneity between tasks. In this paper, empirical experiments demonstrate that such input-level heterogeneity significantly affects the model's internal parameters and outputs, leading to severe spatial-temporal catastrophic forgetting of previous and local knowledge. To this end, we propose Federated Tail Anchor (FedTA) to mix trainable Tail Anchor with the frozen output features to adjust their position in the feature space, thereby overcoming parameter-forgetting and output-forgetting. Moreover, three novel components are also included in FedTA:  Input Enhancement for improving the performance of pre-trained models on downstream tasks;  Selective Input Knowledge Fusion for fusion of heterogeneous local knowledge on the server side; and Best Global Prototype Selection for finding the best anchor point for each class in the feature space. 
Extensive experiments demonstrate that FedTA not only outperforms existing FCL methods but also effectively preserves the relative positions of features, remaining unaffected by spatial and temporal changes<\/div><p class=\"tp_close_menu\"><a class=\"tp_close\" onclick=\"teachpress_pub_showhide('57','tp_abstract')\">Close<\/a><\/p><\/div><\/div><\/div><div class=\"tp_publication tp_publication_conference\"><div class=\"tp_pub_info\"><p class=\"tp_pub_author\">Zhendong Liu, Le Zhang, Bing Li, Yingjie Zhou, Zhenghua Chen, Ce Zhu<\/p><p class=\"tp_pub_title\">WiFi CSI Based Temporal Activity Detection Via Dual Pyramid Network <span class=\"tp_pub_type conference\">Conference<\/span> <\/p><p class=\"tp_pub_additional\"><span class=\"tp_pub_additional_booktitle\">AAAI, <\/span><span class=\"tp_pub_additional_year\">2025<\/span>.<\/p><p class=\"tp_pub_menu\"><span class=\"tp_abstract_link\"><a id=\"tp_abstract_sh_52\" class=\"tp_show\" onclick=\"teachpress_pub_showhide('52','tp_abstract')\" title=\"Show abstract\" style=\"cursor:pointer;\">Abstract<\/a><\/span> | <span class=\"tp_bibtex_link\"><a id=\"tp_bibtex_sh_52\" class=\"tp_show\" onclick=\"teachpress_pub_showhide('52','tp_bibtex')\" title=\"Show BibTeX entry\" style=\"cursor:pointer;\">BibTeX<\/a><\/span> | <span class=\"tp_pub_tags_label\">\u6807\u7b7e: <\/span><span class=\"tp_resource_link\"> | <span class=\"tp_pub_links_label\">Links: <\/span><a class=\"tp_pub_link\" href=\"https:\/\/github.com\/AVC2-UESTC\/WiFiTAD\" title=\"https:\/\/github.com\/AVC2-UESTC\/WiFiTAD\" target=\"_blank\"><i class=\"fab fa-github\"><\/i><\/a><\/span><\/p><div class=\"tp_bibtex\" id=\"tp_bibtex_52\" style=\"display:none;\"><div class=\"tp_bibtex_entry\"><pre>@conference{nokey,<br \/>\r\ntitle = {WiFi CSI Based Temporal Activity Detection Via Dual Pyramid Network},<br \/>\r\nauthor = {Zhendong Liu, Le Zhang, Bing Li, Yingjie Zhou, Zhenghua Chen, Ce Zhu},<br \/>\r\nurl = {https:\/\/github.com\/AVC2-UESTC\/WiFiTAD},<br \/>\r\nyear  = {2025},<br \/>\r\ndate = {2025-01-05},<br \/>\r\nurldate = {2025-01-05},<br \/>\r\nbooktitle = {AAAI},<br \/>\r\njournal = {AAAI},<br \/>\r\nabstract = {We address the challenge of WiFi-based temporal activity detection and propose an efficient Dual Pyramid Network that integrates Temporal Signal Semantic Encoders and Local Sensitive Response Encoders. The Temporal Signal Semantic Encoder splits feature learning into high and low-frequency components, using a novel Signed Mask-Attention mechanism to emphasize important areas and downplay unimportant ones, with the features fused using ContraNorm. The Local Sensitive Response Encoder captures fluctuations without learning. These feature pyramids are then combined using a new cross-attention fusion mechanism. We also introduce a dataset with over 2,114 activity segments across 553 WiFi CSI samples, each lasting around 85 seconds. Extensive experiments show our method outperforms challenging baselines. },<br \/>\r\nkeywords = {},<br \/>\r\npubstate = {published},<br \/>\r\ntppubtype = {conference}<br \/>\r\n}<br \/>\r\n<\/pre><\/div><p class=\"tp_close_menu\"><a class=\"tp_close\" onclick=\"teachpress_pub_showhide('52','tp_bibtex')\">Close<\/a><\/p><\/div><div class=\"tp_abstract\" id=\"tp_abstract_52\" style=\"display:none;\"><div class=\"tp_abstract_entry\">We address the challenge of WiFi-based temporal activity detection and propose an efficient Dual Pyramid Network that integrates Temporal Signal Semantic Encoders and Local Sensitive Response Encoders. 
The Temporal Signal Semantic Encoder splits feature learning into high and low-frequency components, using a novel Signed Mask-Attention mechanism to emphasize important areas and downplay unimportant ones, with the features fused using ContraNorm. The Local Sensitive Response Encoder captures fluctuations without learning. These feature pyramids are then combined using a new cross-attention fusion mechanism. We also introduce a dataset with over 2,114 activity segments across 553 WiFi CSI samples, each lasting around 85 seconds. Extensive experiments show our method outperforms challenging baselines. <\/div><p class=\"tp_close_menu\"><a class=\"tp_close\" onclick=\"teachpress_pub_showhide('52','tp_abstract')\">Close<\/a><\/p><\/div><\/div><\/div><div class=\"tp_publication tp_publication_article\"><div class=\"tp_pub_info\"><p class=\"tp_pub_author\">Ao Li, Le Zhang, Yun Liu, Ce Zhu<\/p><p class=\"tp_pub_title\">Exploring Frequency-Inspired Optimization in Transformer for Efficient Single Image Super-Resolution <span class=\"tp_pub_type article\">Journal Article<\/span> <\/p><p class=\"tp_pub_additional\"><span class=\"tp_pub_additional_in\">In: <\/span><span class=\"tp_pub_additional_journal\">IEEE Transactions on Pattern Analysis and Machine Intelligence, <\/span><span class=\"tp_pub_additional_year\">2025<\/span>.<\/p><p class=\"tp_pub_menu\"><span class=\"tp_abstract_link\"><a id=\"tp_abstract_sh_53\" class=\"tp_show\" onclick=\"teachpress_pub_showhide('53','tp_abstract')\" title=\"Show abstract\" style=\"cursor:pointer;\">Abstract<\/a><\/span> | <span class=\"tp_bibtex_link\"><a id=\"tp_bibtex_sh_53\" class=\"tp_show\" onclick=\"teachpress_pub_showhide('53','tp_bibtex')\" title=\"Show BibTeX entry\" style=\"cursor:pointer;\">BibTeX<\/a><\/span> | <span class=\"tp_pub_tags_label\">\u6807\u7b7e: <\/span><span class=\"tp_resource_link\"> | <span class=\"tp_pub_links_label\">Links: <\/span><a class=\"tp_pub_link\" href=\"https:\/\/github.com\/AVC2-UESTC\/Frequency-Inspired-Optimization-for-EfficientSR\" title=\"https:\/\/github.com\/AVC2-UESTC\/Frequency-Inspired-Optimization-for-EfficientSR\" target=\"_blank\"><i class=\"fab fa-github\"><\/i><\/a><\/span><\/p><div class=\"tp_bibtex\" id=\"tp_bibtex_53\" style=\"display:none;\"><div class=\"tp_bibtex_entry\"><pre>@article{nokey,<br \/>\r\ntitle = {Exploring Frequency-Inspired Optimization in Transformer for Efficient Single Image Super-Resolution},<br \/>\r\nauthor = {Ao Li, Le Zhang, Yun Liu, Ce Zhu},<br \/>\r\nurl = {https:\/\/github.com\/AVC2-UESTC\/Frequency-Inspired-Optimization-for-EfficientSR},<br \/>\r\nyear  = {2025},<br \/>\r\ndate = {2025-01-05},<br \/>\r\nurldate = {2025-01-05},<br \/>\r\njournal = {IEEE Transactions on Pattern Analysis and Machine Intelligence},<br \/>\r\nabstract = {Transformer-based methods have exhibited remarkable potential in single image super-resolution (SISR) by effectively extracting long-range dependencies. However, most of the current research in this area has prioritized the design of transformer blocks to capture global information, while overlooking the importance of incorporating high-frequency priors, which we believe could be beneficial. In our study, we conducted a series of experiments and found that transformer structures are more adept at capturing low-frequency information, but have limited capacity in constructing high-frequency representations when compared to their convolutional counterparts. 
Our proposed solution, the cross-refinement adaptive feature modulation transformer (CRAFT), integrates the strengths of both convolutional and transformer structures. It comprises three key components: the high-frequency enhancement residual block (HFERB) for extracting high-frequency information, the shift rectangle window attention block (SRWAB) for capturing global information, and the hybrid fusion block (HFB) for refining the global representation. To tackle the inherent intricacies of transformer structures, we introduce a frequency-guided post-training quantization (PTQ) method aimed at enhancing CRAFT's efficiency. These strategies incorporate adaptive dual clipping and boundary refinement. To further amplify the versatility of our proposed approach, we extend our PTQ strategy to function as a general quantization method for transformer-based SISR techniques. Our experimental findings showcase CRAFT's superiority over current state-of-the-art methods, both in full-precision and quantization scenarios. These results underscore the efficacy and universality of our PTQ strategy.  Code available: https:\/\/github.com\/AVC2-UESTC\/Frequency-Inspired-Optimization-for-EfficientSR},<br \/>\r\nkeywords = {},<br \/>\r\npubstate = {published},<br \/>\r\ntppubtype = {article}<br \/>\r\n}<br \/>\r\n<\/pre><\/div><p class=\"tp_close_menu\"><a class=\"tp_close\" onclick=\"teachpress_pub_showhide('53','tp_bibtex')\">Close<\/a><\/p><\/div><div class=\"tp_abstract\" id=\"tp_abstract_53\" style=\"display:none;\"><div class=\"tp_abstract_entry\">Transformer-based methods have exhibited remarkable potential in single image super-resolution (SISR) by effectively extracting long-range dependencies. However, most of the current research in this area has prioritized the design of transformer blocks to capture global information, while overlooking the importance of incorporating high-frequency priors, which we believe could be beneficial. In our study, we conducted a series of experiments and found that transformer structures are more adept at capturing low-frequency information, but have limited capacity in constructing high-frequency representations when compared to their convolutional counterparts. Our proposed solution, the cross-refinement adaptive feature modulation transformer (CRAFT), integrates the strengths of both convolutional and transformer structures. It comprises three key components: the high-frequency enhancement residual block (HFERB) for extracting high-frequency information, the shift rectangle window attention block (SRWAB) for capturing global information, and the hybrid fusion block (HFB) for refining the global representation. To tackle the inherent intricacies of transformer structures, we introduce a frequency-guided post-training quantization (PTQ) method aimed at enhancing CRAFT's efficiency. These strategies incorporate adaptive dual clipping and boundary refinement. To further amplify the versatility of our proposed approach, we extend our PTQ strategy to function as a general quantization method for transformer-based SISR techniques. Our experimental findings showcase CRAFT's superiority over current state-of-the-art methods, both in full-precision and quantization scenarios. These results underscore the efficacy and universality of our PTQ strategy.  
Code available: https:\/\/github.com\/AVC2-UESTC\/Frequency-Inspired-Optimization-for-EfficientSR<\/div><p class=\"tp_close_menu\"><a class=\"tp_close\" onclick=\"teachpress_pub_showhide('53','tp_abstract')\">Close<\/a><\/p><\/div><\/div><\/div><div class=\"tp_publication tp_publication_article\"><div class=\"tp_pub_info\"><p class=\"tp_pub_author\">Xinhao Li, Yun Liu, Guolei Sun, Min Wu, Le Zhang, Ce Zhu\r\n<\/p><p class=\"tp_pub_title\">Towards Open-Vocabulary Video Semantic Segmentation <span class=\"tp_pub_type article\">Journal Article<\/span> <\/p><p class=\"tp_pub_additional\"><span class=\"tp_pub_additional_in\">In: <\/span><span class=\"tp_pub_additional_journal\">IEEE Transactions on Multimedia, <\/span><span class=\"tp_pub_additional_year\">2025<\/span>.<\/p><p class=\"tp_pub_menu\"><span class=\"tp_abstract_link\"><a id=\"tp_abstract_sh_54\" class=\"tp_show\" onclick=\"teachpress_pub_showhide('54','tp_abstract')\" title=\"Show abstract\" style=\"cursor:pointer;\">Abstract<\/a><\/span> | <span class=\"tp_bibtex_link\"><a id=\"tp_bibtex_sh_54\" class=\"tp_show\" onclick=\"teachpress_pub_showhide('54','tp_bibtex')\" title=\"Show BibTeX entry\" style=\"cursor:pointer;\">BibTeX<\/a><\/span> | <span class=\"tp_pub_tags_label\">\u6807\u7b7e: <\/span><span class=\"tp_resource_link\"> | <span class=\"tp_pub_links_label\">Links: <\/span><a class=\"tp_pub_link\" href=\"https:\/\/github.com\/AVC2-UESTC\/OV2VSS\" title=\"https:\/\/github.com\/AVC2-UESTC\/OV2VSS\" target=\"_blank\"><i class=\"fab fa-github\"><\/i><\/a><\/span><\/p><div class=\"tp_bibtex\" id=\"tp_bibtex_54\" style=\"display:none;\"><div class=\"tp_bibtex_entry\"><pre>@article{nokey,<br \/>\r\ntitle = {Towards Open-Vocabulary Video Semantic Segmentation},<br \/>\r\nauthor = {Xinhao Li, Yun Liu, Guolei Sun, Min Wu, Le Zhang, Ce Zhu<br \/>\r\n},<br \/>\r\nurl = {https:\/\/github.com\/AVC2-UESTC\/OV2VSS},<br \/>\r\nyear  = {2025},<br \/>\r\ndate = {2025-01-05},<br \/>\r\nurldate = {2025-01-05},<br \/>\r\njournal = {IEEE Transactions on Multimedia},<br \/>\r\nabstract = {Semantic segmentation in videos has been a focal point of recent research. However, existing models encounter challenges when faced with unfamiliar categories. To address this, we introduce the Open Vocabulary Video Semantic Segmentation (OV-VSS) task, designed to accurately segment every pixel across a wide range of open-vocabulary categories, including those that are novel or previously unexplored. To enhance OV-VSS performance, we propose a robust baseline, OV2VSS, which integrates a spatial-temporal fusion module, allowing the model to utilize temporal relationships across consecutive frames. Additionally, we incorporate a random frame enhancement module, broadening the model's understanding of semantic context throughout the entire video sequence. Our approach also includes video text encoding, which strengthens the model's capability to interpret textual information within the video context. Comprehensive evaluations on benchmark datasets such as VSPW and Cityscapes highlight OV-VSS's zero-shot generalization capabilities, especially in handling novel categories. The results validate OV2VSS's effectiveness, demonstrating improved performance in semantic segmentation tasks across diverse video datasets. 
Code available: https:\/\/github.com\/AVC2-UESTC\/OV2VSS},<br \/>\r\nkeywords = {},<br \/>\r\npubstate = {published},<br \/>\r\ntppubtype = {article}<br \/>\r\n}<br \/>\r\n<\/pre><\/div><p class=\"tp_close_menu\"><a class=\"tp_close\" onclick=\"teachpress_pub_showhide('54','tp_bibtex')\">Close<\/a><\/p><\/div><div class=\"tp_abstract\" id=\"tp_abstract_54\" style=\"display:none;\"><div class=\"tp_abstract_entry\">Semantic segmentation in videos has been a focal point of recent research. However, existing models encounter challenges when faced with unfamiliar categories. To address this, we introduce the Open Vocabulary Video Semantic Segmentation (OV-VSS) task, designed to accurately segment every pixel across a wide range of open-vocabulary categories, including those that are novel or previously unexplored. To enhance OV-VSS performance, we propose a robust baseline, OV2VSS, which integrates a spatial-temporal fusion module, allowing the model to utilize temporal relationships across consecutive frames. Additionally, we incorporate a random frame enhancement module, broadening the model's understanding of semantic context throughout the entire video sequence. Our approach also includes video text encoding, which strengthens the model's capability to interpret textual information within the video context. Comprehensive evaluations on benchmark datasets such as VSPW and Cityscapes highlight OV-VSS's zero-shot generalization capabilities, especially in handling novel categories. The results validate OV2VSS's effectiveness, demonstrating improved performance in semantic segmentation tasks across diverse video datasets. Code available: https:\/\/github.com\/AVC2-UESTC\/OV2VSS<\/div><p class=\"tp_close_menu\"><a class=\"tp_close\" onclick=\"teachpress_pub_showhide('54','tp_abstract')\">Close<\/a><\/p><\/div><\/div><\/div><div class=\"tp_publication tp_publication_conference\"><div class=\"tp_pub_info\"><p class=\"tp_pub_author\">Weiting Ou, Yipeng Liu, Zhijie Sun, Bing Li, Le Zhang, Ce Zhu<\/p><p class=\"tp_pub_title\">Codar: Complex-valued Neural Network for Crossing-Floor Intrusion Detection via WiFi <span class=\"tp_pub_type conference\">Conference<\/span> <\/p><p class=\"tp_pub_additional\"><span class=\"tp_pub_additional_booktitle\">IEEE International Conference on Acoustics, Speech and Signal Processing, <\/span><span class=\"tp_pub_additional_year\">2025<\/span>.<\/p><p class=\"tp_pub_menu\"><span class=\"tp_abstract_link\"><a id=\"tp_abstract_sh_55\" class=\"tp_show\" onclick=\"teachpress_pub_showhide('55','tp_abstract')\" title=\"Show abstract\" style=\"cursor:pointer;\">Abstract<\/a><\/span> | <span class=\"tp_bibtex_link\"><a id=\"tp_bibtex_sh_55\" class=\"tp_show\" onclick=\"teachpress_pub_showhide('55','tp_bibtex')\" title=\"Show BibTeX entry\" style=\"cursor:pointer;\">BibTeX<\/a><\/span> | <span class=\"tp_pub_tags_label\">\u6807\u7b7e: <\/span><span class=\"tp_resource_link\"> | <span class=\"tp_pub_links_label\">Links: <\/span><a class=\"tp_pub_link\" href=\"https:\/\/github.com\/ouweiting\/Codar\" title=\"https:\/\/github.com\/ouweiting\/Codar\" target=\"_blank\"><i class=\"fab fa-github\"><\/i><\/a><\/span><\/p><div class=\"tp_bibtex\" id=\"tp_bibtex_55\" style=\"display:none;\"><div class=\"tp_bibtex_entry\"><pre>@conference{nokey,<br \/>\r\ntitle = {Codar: Complex-valued Neural Network for Crossing-Floor Intrusion Detection via WiFi},<br \/>\r\nauthor = {Weiting Ou, Yipeng Liu, Zhijie Sun, Bing Li, Le Zhang, Ce Zhu},<br \/>\r\nurl = 
{https:\/\/github.com\/ouweiting\/Codar},<br \/>\r\nyear  = {2025},<br \/>\r\ndate = {2025-01-04},<br \/>\r\nurldate = {2025-01-04},<br \/>\r\nbooktitle = {IEEE International Conference on Acoustics, Speech and Signal Processing},<br \/>\r\nabstract = {WiFi systems offer enormous potential for device-free human intrusion detection. Current methods often require routers to be deployed in multiple adjacent rooms on the same floor, which is redundant and costly. To solve this, we introduce the first work on intrusion detection in the crossing-floor scenario via WiFi. Routers on different floors are utilized without major modifications to the existing router layout. Many previous works require a high sample rate and ignore the phase information. In this paper, we propose Codar, a complex-valued LSTM-CNN neural network. The LSTM effectively captures temporal dependencies at a low sample rate in harsh propagation environments. Moreover, amplitude and phase features are explored jointly by complex-valued operations. Experimental results demonstrate Codar achieves 95%, 94.5%, and 99% accuracy for intrusion detection, user identification, and intruded floor identification, surpassing competitive methods. The code and dataset are available at https:\/\/github.com\/ouweiting\/Codar.},<br \/>\r\nkeywords = {},<br \/>\r\npubstate = {published},<br \/>\r\ntppubtype = {conference}<br \/>\r\n}<br \/>\r\n<\/pre><\/div><p class=\"tp_close_menu\"><a class=\"tp_close\" onclick=\"teachpress_pub_showhide('55','tp_bibtex')\">Close<\/a><\/p><\/div><div class=\"tp_abstract\" id=\"tp_abstract_55\" style=\"display:none;\"><div class=\"tp_abstract_entry\">WiFi systems offer enormous potential for device-free human intrusion detection. Current methods often require routers to be deployed in multiple adjacent rooms on the same floor, which is redundant and costly. To solve this, we introduce the first work on intrusion detection in the crossing-floor scenario via WiFi. Routers on different floors are utilized without major modifications to the existing router layout. Many previous works require a high sample rate and ignore the phase information. In this paper, we propose Codar, a complex-valued LSTM-CNN neural network. The LSTM effectively captures temporal dependencies at a low sample rate in harsh propagation environments. Moreover, amplitude and phase features are explored jointly by complex-valued operations. Experimental results demonstrate Codar achieves 95%, 94.5%, and 99% accuracy for intrusion detection, user identification, and intruded floor identification, surpassing competitive methods. 
The code and dataset are available at https:\/\/github.com\/ouweiting\/Codar.<\/div><p class=\"tp_close_menu\"><a class=\"tp_close\" onclick=\"teachpress_pub_showhide('55','tp_abstract')\">Close<\/a><\/p><\/div><\/div><\/div><div class=\"tp_publication tp_publication_conference\"><div class=\"tp_pub_info\"><p class=\"tp_pub_author\">Obed Irihose, Le Zhang<\/p><p class=\"tp_pub_title\">ExVC: Leveraging Mixture of Experts Models for Efficient Zero-shot Voice Conversion <span class=\"tp_pub_type conference\">Conference<\/span> <\/p><p class=\"tp_pub_additional\"><span class=\"tp_pub_additional_booktitle\">IEEE International Conference on Acoustics, Speech and Signal Processing, <\/span><span class=\"tp_pub_additional_year\">2025<\/span>.<\/p><p class=\"tp_pub_menu\"><span class=\"tp_abstract_link\"><a id=\"tp_abstract_sh_56\" class=\"tp_show\" onclick=\"teachpress_pub_showhide('56','tp_abstract')\" title=\"Show abstract\" style=\"cursor:pointer;\">Abstract<\/a><\/span> | <span class=\"tp_bibtex_link\"><a id=\"tp_bibtex_sh_56\" class=\"tp_show\" onclick=\"teachpress_pub_showhide('56','tp_bibtex')\" title=\"Show BibTeX entry\" style=\"cursor:pointer;\">BibTeX<\/a><\/span> | <span class=\"tp_pub_tags_label\">\u6807\u7b7e: <\/span><\/p><div class=\"tp_bibtex\" id=\"tp_bibtex_56\" style=\"display:none;\"><div class=\"tp_bibtex_entry\"><pre>@conference{nokey,<br \/>\r\ntitle = {ExVC: Leveraging Mixture of Experts Models for Efficient Zero-shot Voice Conversion},<br \/>\r\nauthor = {Obed Irihose, Le Zhang},<br \/>\r\nyear  = {2025},<br \/>\r\ndate = {2025-01-04},<br \/>\r\nurldate = {2025-01-04},<br \/>\r\nbooktitle = {IEEE International Conference on Acoustics, Speech and Signal Processing},<br \/>\r\nabstract = {Zero-shot voice conversion (VC) aims to alter the speaker identity in a voice to resemble that of the target speaker using only a short reference speech. While existing methods have achieved notable success in generating intelligible speech, balancing the trade-off between quality and similarity of the converted voice remains a challenge, especially when using a short target reference. This paper proposes ExVC, a zero-shot VC model that leverages the mixture of experts (MoE) layers and Conformer modules to enhance the expressiveness and overall performance. Additionally, to efficiently condition the model on speaker embedding, we employ feature-wise linear modulation (FiLM), which modulates the network based on the input speaker embedding, thereby improving the ability to adapt to various unseen speakers. Objective and subjective evaluations demonstrate that the proposed model outperforms the baseline models in terms of naturalness and quality. Audio samples are provided at: https:\/\/tksavy.github.io\/exvc\/.},<br \/>\r\nkeywords = {},<br \/>\r\npubstate = {published},<br \/>\r\ntppubtype = {conference}<br \/>\r\n}<br \/>\r\n<\/pre><\/div><p class=\"tp_close_menu\"><a class=\"tp_close\" onclick=\"teachpress_pub_showhide('56','tp_bibtex')\">Close<\/a><\/p><\/div><div class=\"tp_abstract\" id=\"tp_abstract_56\" style=\"display:none;\"><div class=\"tp_abstract_entry\">Zero-shot voice conversion (VC) aims to alter the speaker identity in a voice to resemble that of the target speaker using only a short reference speech. While existing methods have achieved notable success in generating intelligible speech, balancing the trade-off between quality and similarity of the converted voice remains a challenge, especially when using a short target reference. 
This paper proposes ExVC, a zero-shot VC model that leverages the mixture of experts (MoE) layers and Conformer modules to enhance the expressiveness and overall performance. Additionally, to efficiently condition the model on speaker embedding, we employ feature-wise linear modulation (FiLM), which modulates the network based on the input speaker embedding, thereby improving the ability to adapt to various unseen speakers. Objective and subjective evaluations demonstrate that the proposed model outperforms the baseline models in terms of naturalness and quality. Audio samples are provided at: https:\/\/tksavy.github.io\/exvc\/.<\/div><p class=\"tp_close_menu\"><a class=\"tp_close\" onclick=\"teachpress_pub_showhide('56','tp_abstract')\">Close<\/a><\/p><\/div><\/div><\/div><h3 class=\"tp_h3\" id=\"tp_h3_2024\">2024<\/h3><div class=\"tp_publication tp_publication_conference\"><div class=\"tp_pub_info\"><p class=\"tp_pub_author\">Cheng Gong, Yao Chen, Qiuyang Luo, Ye Lu, Tao Li, Yuzhi Zhang, Yufei Sun, Le Zhang<\/p><p class=\"tp_pub_title\">Deep Feature Surgery: Towards Accurate and Efficient Multi-Exit Networks <span class=\"tp_pub_type conference\">Conference<\/span> <\/p><p class=\"tp_pub_additional\"><span class=\"tp_pub_additional_booktitle\">ECCV, <\/span><span class=\"tp_pub_additional_year\">2024<\/span>.<\/p><p class=\"tp_pub_menu\"><span class=\"tp_abstract_link\"><a id=\"tp_abstract_sh_50\" class=\"tp_show\" onclick=\"teachpress_pub_showhide('50','tp_abstract')\" title=\"Show abstract\" style=\"cursor:pointer;\">Abstract<\/a><\/span> | <span class=\"tp_bibtex_link\"><a id=\"tp_bibtex_sh_50\" class=\"tp_show\" onclick=\"teachpress_pub_showhide('50','tp_bibtex')\" title=\"Show BibTeX entry\" style=\"cursor:pointer;\">BibTeX<\/a><\/span> | <span class=\"tp_pub_tags_label\">\u6807\u7b7e: <\/span><\/p><div class=\"tp_bibtex\" id=\"tp_bibtex_50\" style=\"display:none;\"><div class=\"tp_bibtex_entry\"><pre>@conference{nokey,<br \/>\r\ntitle = {Deep Feature Surgery: Towards Accurate and Efficient Multi-Exit Networks},<br \/>\r\nauthor = {Cheng Gong, Yao Chen, Qiuyang Luo, Ye Lu, Tao Li, Yuzhi Zhang, Yufei Sun, Le Zhang},<br \/>\r\nyear  = {2024},<br \/>\r\ndate = {2024-11-13},<br \/>\r\nurldate = {2024-11-13},<br \/>\r\nbooktitle = {ECCV},<br \/>\r\nabstract = {Multi-exit network is a promising architecture for efficient model inference by sharing backbone networks and weights among multiple exits. However, the gradient conflict of the shared weights results in sub-optimal accuracy. This paper introduces Deep Feature Surgery (methodname), which consists of feature partitioning and feature referencing approaches to resolve gradient conflict issues during the training of multi-exit networks. The feature partitioning separates shared features along the depth axis among all exits to alleviate gradient conflict while simultaneously promoting joint optimization for each exit. Subsequently, feature referencing enhances multi-scale features for distinct exits across varying depths to improve the model accuracy. Furthermore, methodname~reduces the training operations with the reduced complexity of backpropagation. Experimental results on Cifar100 and ImageNet datasets exhibit that methodname~provides up to a textbf{50.00%} reduction in training time and attains up to a textbf{6.94%} enhancement in accuracy when contrasted with baseline methods across diverse models and tasks. 
Budgeted batch classification evaluation on MSDNet demonstrates that DFS uses about  fewer average FLOPs per image to achieve the same classification accuracy as baseline methods on Cifar100.},<br \/>\r\nkeywords = {},<br \/>\r\npubstate = {published},<br \/>\r\ntppubtype = {conference}<br \/>\r\n}<br \/>\r\n<\/pre><\/div><p class=\"tp_close_menu\"><a class=\"tp_close\" onclick=\"teachpress_pub_showhide('50','tp_bibtex')\">Close<\/a><\/p><\/div><div class=\"tp_abstract\" id=\"tp_abstract_50\" style=\"display:none;\"><div class=\"tp_abstract_entry\">Multi-exit network is a promising architecture for efficient model inference by sharing backbone networks and weights among multiple exits. However, the gradient conflict of the shared weights results in sub-optimal accuracy. This paper introduces Deep Feature Surgery (DFS), which consists of feature partitioning and feature referencing approaches to resolve gradient conflict issues during the training of multi-exit networks. The feature partitioning separates shared features along the depth axis among all exits to alleviate gradient conflict while simultaneously promoting joint optimization for each exit. Subsequently, feature referencing enhances multi-scale features for distinct exits across varying depths to improve the model accuracy. Furthermore, DFS reduces the training operations with the reduced complexity of backpropagation. Experimental results on Cifar100 and ImageNet datasets exhibit that DFS provides up to a 50.00% reduction in training time and attains up to a 6.94% enhancement in accuracy when contrasted with baseline methods across diverse models and tasks. Budgeted batch classification evaluation on MSDNet demonstrates that DFS uses about  fewer average FLOPs per image to achieve the same classification accuracy as baseline methods on Cifar100.<\/div><p class=\"tp_close_menu\"><a class=\"tp_close\" onclick=\"teachpress_pub_showhide('50','tp_abstract')\">Close<\/a><\/p><\/div><\/div><\/div><div class=\"tp_publication tp_publication_conference\"><div class=\"tp_pub_info\"><p class=\"tp_pub_author\">Zhengyuan Xie, Haiquan Lu, Jia-wen Xiao, Enguang Wang, Le Zhang, Xialei Liu<\/p><p class=\"tp_pub_title\">Early Preparation Pays Off: New Classifier Pre-tuning for Class Incremental Semantic Segmentation <span class=\"tp_pub_type conference\">Conference<\/span> <\/p><p class=\"tp_pub_additional\"><span class=\"tp_pub_additional_booktitle\">ECCV, <\/span><span class=\"tp_pub_additional_year\">2024<\/span>.<\/p><p class=\"tp_pub_menu\"><span class=\"tp_abstract_link\"><a id=\"tp_abstract_sh_51\" class=\"tp_show\" onclick=\"teachpress_pub_showhide('51','tp_abstract')\" title=\"Show abstract\" style=\"cursor:pointer;\">Abstract<\/a><\/span> | <span class=\"tp_bibtex_link\"><a id=\"tp_bibtex_sh_51\" class=\"tp_show\" onclick=\"teachpress_pub_showhide('51','tp_bibtex')\" title=\"Show BibTeX entry\" style=\"cursor:pointer;\">BibTeX<\/a><\/span> | <span class=\"tp_pub_tags_label\">\u6807\u7b7e: <\/span><span class=\"tp_resource_link\"> | <span class=\"tp_pub_links_label\">Links: <\/span><a class=\"tp_pub_link\" href=\"https:\/\/github.com\/zhengyuan-xie\/ECCV24_NeST\" title=\"https:\/\/github.com\/zhengyuan-xie\/ECCV24_NeST\" target=\"_blank\"><i class=\"fab fa-github\"><\/i><\/a><\/span><\/p><div class=\"tp_bibtex\" id=\"tp_bibtex_51\" style=\"display:none;\"><div class=\"tp_bibtex_entry\"><pre>@conference{nokey,<br \/>\r\ntitle = {Early Preparation Pays Off: New Classifier Pre-tuning for Class Incremental 
Semantic Segmentation},<br \/>\r\nauthor = {Zhengyuan Xie, Haiquan Lu, Jia-wen Xiao, Enguang Wang, Le Zhang, Xialei Liu},<br \/>\r\nurl = { https:\/\/github.com\/zhengyuan-xie\/ECCV24_NeST },<br \/>\r\nyear  = {2024},<br \/>\r\ndate = {2024-11-13},<br \/>\r\nurldate = {2024-11-13},<br \/>\r\nbooktitle = {ECCV},<br \/>\r\nabstract = {Class incremental semantic segmentation aims to preserve old knowledge while learning new tasks, however, it is impeded by catastrophic forgetting and background shift issues. Prior works indicate the pivotal importance of initializing new classifiers and mainly focus on transferring knowledge from the background classifier or preparing classifiers for future classes, neglecting the flexibility and variance of new classifiers. In this paper, we propose a new classifier pre-tuning (NeST) method applied before the formal training process, learning a transformation from old classifiers to generate new classifiers for initialization rather than directly tuning the parameters of new classifiers. Our method can make new classifiers align with the backbone and adapt to the new data, preventing drastic changes in the feature extractor when learning new classes. Besides, we design a strategy considering the cross-task class similarity to initialize matrices used in the transformation, helping achieve the stability-plasticity trade-off. Experiments on Pascal VOC 2012 and ADE20K datasets show that the proposed strategy can significantly improve the performance of previous methods. The code is available at https:\/\/github.com\/zhengyuan-xie\/ECCV24_NeST .},<br \/>\r\nkeywords = {},<br \/>\r\npubstate = {published},<br \/>\r\ntppubtype = {conference}<br \/>\r\n}<br \/>\r\n<\/pre><\/div><p class=\"tp_close_menu\"><a class=\"tp_close\" onclick=\"teachpress_pub_showhide('51','tp_bibtex')\">Close<\/a><\/p><\/div><div class=\"tp_abstract\" id=\"tp_abstract_51\" style=\"display:none;\"><div class=\"tp_abstract_entry\">Class incremental semantic segmentation aims to preserve old knowledge while learning new tasks, however, it is impeded by catastrophic forgetting and background shift issues. Prior works indicate the pivotal importance of initializing new classifiers and mainly focus on transferring knowledge from the background classifier or preparing classifiers for future classes, neglecting the flexibility and variance of new classifiers. In this paper, we propose a new classifier pre-tuning (NeST) method applied before the formal training process, learning a transformation from old classifiers to generate new classifiers for initialization rather than directly tuning the parameters of new classifiers. Our method can make new classifiers align with the backbone and adapt to the new data, preventing drastic changes in the feature extractor when learning new classes. Besides, we design a strategy considering the cross-task class similarity to initialize matrices used in the transformation, helping achieve the stability-plasticity trade-off. Experiments on Pascal VOC 2012 and ADE20K datasets show that the proposed strategy can significantly improve the performance of previous methods. 
The code is available at https:\/\/github.com\/zhengyuan-xie\/ECCV24_NeST .<\/div><p class=\"tp_close_menu\"><a class=\"tp_close\" onclick=\"teachpress_pub_showhide('51','tp_abstract')\">Close<\/a><\/p><\/div><\/div><\/div><div class=\"tp_publication tp_publication_article\"><div class=\"tp_pub_info\"><p class=\"tp_pub_author\">Bing Li, Wei Cui, Le Zhang, Qi Yang, Min Wu, Joey Tianyi Zhou<\/p><p class=\"tp_pub_title\">Democratizing Federated WiFi-based Human Activity Recognition Using Hypothesis Transfer <span class=\"tp_pub_type article\">Journal Article<\/span> <\/p><p class=\"tp_pub_additional\"><span class=\"tp_pub_additional_in\">In: <\/span><span class=\"tp_pub_additional_journal\">IEEE Transactions on Mobile Computing, <\/span><span class=\"tp_pub_additional_year\">2024<\/span>.<\/p><p class=\"tp_pub_menu\"><span class=\"tp_abstract_link\"><a id=\"tp_abstract_sh_49\" class=\"tp_show\" onclick=\"teachpress_pub_showhide('49','tp_abstract')\" title=\"Show abstract\" style=\"cursor:pointer;\">Abstract<\/a><\/span> | <span class=\"tp_bibtex_link\"><a id=\"tp_bibtex_sh_49\" class=\"tp_show\" onclick=\"teachpress_pub_showhide('49','tp_bibtex')\" title=\"Show BibTeX entry\" style=\"cursor:pointer;\">BibTeX<\/a><\/span> | <span class=\"tp_pub_tags_label\">\u6807\u7b7e: <\/span><\/p><div class=\"tp_bibtex\" id=\"tp_bibtex_49\" style=\"display:none;\"><div class=\"tp_bibtex_entry\"><pre>@article{nokey,<br \/>\r\ntitle = {Democratizing Federated WiFi-based Human Activity Recognition Using Hypothesis Transfer},<br \/>\r\nauthor = {Bing Li, Wei Cui, Le Zhang, Qi Yang, Min Wu, Joey Tianyi Zhou},<br \/>\r\nyear  = {2024},<br \/>\r\ndate = {2024-11-12},<br \/>\r\nurldate = {2024-11-12},<br \/>\r\njournal = {IEEE Transactions on Mobile Computing},<br \/>\r\nabstract = {Human activity recognition (HAR) is a crucial task in IoT systems with applications ranging from surveillance and intruder detection to home automation and more. Recently, non-invasive HAR utilizing WiFi signals has gained considerable attention due to advancements in ubiquitous WiFi technologies. However, recent studies have revealed significant privacy risks associated with WiFi signals, raising concerns about bio-information leakage. To address these concerns, the decentralized paradigm, particularly federated learning (FL), has emerged as a promising approach for training HAR models while preserving data privacy. Nevertheless, FL models may struggle in end-user environments due to substantial domain discrepancies between the source training data and the target end-user environment. This discrepancy arises from the sensitivity of WiFi signals to environmental changes, resulting in notable domain shifts. As a consequence, FL-based HAR approaches often face challenges when deployed in real-world WiFi environments. Albeit there are pioneer attempts on federated domain adaptation, they typically require non-trivial communication and computation cost, which is prohibitively expensive especially considering edge-based hardware equipment of end-user environment. In this paper, we propose a model to democratize the WiFi-based HAR system by enhancing recognition accuracy in unannotated end-user environments while prioritizing data privacy. Our model leverages the hypothesis transfer and a lightweight hypothesis ensemble to mitigate negative transfer. We prove a tighter theoretical upper bound compared to existing multi-source federated domain adaptation models. 
Extensive experiments shows our model improves the average accuracy by approximately 10 absolute percentage points in both cross-person and cross-environment settings comparing several state-of-the-art baselines.},<br \/>\r\nkeywords = {},<br \/>\r\npubstate = {published},<br \/>\r\ntppubtype = {article}<br \/>\r\n}<br \/>\r\n<\/pre><\/div><p class=\"tp_close_menu\"><a class=\"tp_close\" onclick=\"teachpress_pub_showhide('49','tp_bibtex')\">Close<\/a><\/p><\/div><div class=\"tp_abstract\" id=\"tp_abstract_49\" style=\"display:none;\"><div class=\"tp_abstract_entry\">Human activity recognition (HAR) is a crucial task in IoT systems with applications ranging from surveillance and intruder detection to home automation and more. Recently, non-invasive HAR utilizing WiFi signals has gained considerable attention due to advancements in ubiquitous WiFi technologies. However, recent studies have revealed significant privacy risks associated with WiFi signals, raising concerns about bio-information leakage. To address these concerns, the decentralized paradigm, particularly federated learning (FL), has emerged as a promising approach for training HAR models while preserving data privacy. Nevertheless, FL models may struggle in end-user environments due to substantial domain discrepancies between the source training data and the target end-user environment. This discrepancy arises from the sensitivity of WiFi signals to environmental changes, resulting in notable domain shifts. As a consequence, FL-based HAR approaches often face challenges when deployed in real-world WiFi environments. Albeit there are pioneer attempts on federated domain adaptation, they typically require non-trivial communication and computation cost, which is prohibitively expensive especially considering edge-based hardware equipment of end-user environment. In this paper, we propose a model to democratize the WiFi-based HAR system by enhancing recognition accuracy in unannotated end-user environments while prioritizing data privacy. Our model leverages the hypothesis transfer and a lightweight hypothesis ensemble to mitigate negative transfer. We prove a tighter theoretical upper bound compared to existing multi-source federated domain adaptation models. 
Extensive experiments show our model improves the average accuracy by approximately 10 absolute percentage points in both cross-person and cross-environment settings, compared with several state-of-the-art baselines.<\/div><p class=\"tp_close_menu\"><a class=\"tp_close\" onclick=\"teachpress_pub_showhide('49','tp_abstract')\">Close<\/a><\/p><\/div><\/div><\/div><div class=\"tp_publication tp_publication_article\"><div class=\"tp_pub_info\"><p class=\"tp_pub_author\">Le Zhang, Qibin Hou, Yun Liu, Jia-Wang Bian, Xun Xu, Joey Tianyi Zhou, Ce Zhu<\/p><p class=\"tp_pub_title\">Deep Negative Correlation Classification <span class=\"tp_pub_type article\">Journal Article<\/span> <\/p><p class=\"tp_pub_additional\"><span class=\"tp_pub_additional_in\">In: <\/span><span class=\"tp_pub_additional_journal\">Machine Learning, <\/span><span class=\"tp_pub_additional_year\">2024<\/span>.<\/p><p class=\"tp_pub_menu\"><span class=\"tp_abstract_link\"><a id=\"tp_abstract_sh_48\" class=\"tp_show\" onclick=\"teachpress_pub_showhide('48','tp_abstract')\" title=\"Show abstract\" style=\"cursor:pointer;\">Abstract<\/a><\/span> | <span class=\"tp_bibtex_link\"><a id=\"tp_bibtex_sh_48\" class=\"tp_show\" onclick=\"teachpress_pub_showhide('48','tp_bibtex')\" title=\"Show BibTeX entry\" style=\"cursor:pointer;\">BibTeX<\/a><\/span> | <span class=\"tp_pub_tags_label\">\u6807\u7b7e: <\/span><\/p><div class=\"tp_bibtex\" id=\"tp_bibtex_48\" style=\"display:none;\"><div class=\"tp_bibtex_entry\"><pre>@article{nokey,<br \/>\r\ntitle = {Deep Negative Correlation Classification},<br \/>\r\nauthor = {Le Zhang, Qibin Hou, Yun Liu, Jia-Wang Bian, Xun Xu, Joey Tianyi Zhou, Ce Zhu},<br \/>\r\nyear  = {2024},<br \/>\r\ndate = {2024-11-11},<br \/>\r\nurldate = {2024-11-11},<br \/>\r\njournal = {Machine Learning},<br \/>\r\nabstract = {Ensemble learning serves as a straightforward way to improve the performance of almost any machine learning algorithm. Existing deep ensemble methods usually na\u00efvely train many different models and then aggregate their predictions. This is not optimal in our view from two aspects: i) Na\u00efvely training multiple models adds much more computational burden, especially in the deep learning era; ii) Purely optimizing each base model without considering their interactions limits the diversity of ensemble and performance gains. We tackle these issues by proposing deep negative correlation classification (DNCC), in which the accuracy and diversity trade-off is systematically controlled by decomposing the loss function seamlessly into individual accuracy and the \u201ccorrelation\u201d between individual models and the ensemble. DNCC yields a deep classification ensemble where the individual estimator is both accurate and \u201cnegatively correlated\u201d. Thanks to the optimized diversities, DNCC works well even when utilizing a shared network backbone, which significantly improves its efficiency when compared with most existing ensemble systems. 
Extensive experiments on multiple benchmark datasets and network structures demonstrate the superiority of the proposed method.},<br \/>\r\nkeywords = {},<br \/>\r\npubstate = {published},<br \/>\r\ntppubtype = {article}<br \/>\r\n}<br \/>\r\n<\/pre><\/div><p class=\"tp_close_menu\"><a class=\"tp_close\" onclick=\"teachpress_pub_showhide('48','tp_bibtex')\">Close<\/a><\/p><\/div><div class=\"tp_abstract\" id=\"tp_abstract_48\" style=\"display:none;\"><div class=\"tp_abstract_entry\">Ensemble learning serves as a straightforward way to improve the performance of almost any machine learning algorithm. Existing deep ensemble methods usually na\u00a8\u0131vely train many different models and then aggregate their predictions. This is not optimal in our view from two aspects: i) Na\u00a8\u0131vely training multiple models adds much more computational burden, especially in the deep learning era; ii) Purely optimizing each base model without considering their interactions limits the diversity of ensemble and performance gains. We tackle these issues by proposing deep negative correlation classification (DNCC), in which the accuracy and diversity trade-off is systematically controlled by decomposing the loss function seamlessly into individual accuracy and the \u201ccorrelation\u201d between individual models and the ensemble. DNCC yields a deep classification ensemble where the individual estimator is both accurate and \u201cnegatively correlated\u201d. Thanks to the optimized diversities, DNCC works well even when utilizing a shared network backbone, which significantly improves its efficiency when compared with most existing ensemble systems. Extensive experiments on multiple benchmark datasets and network structures demonstrate the superiority of the proposed method.<\/div><p class=\"tp_close_menu\"><a class=\"tp_close\" onclick=\"teachpress_pub_showhide('48','tp_abstract')\">Close<\/a><\/p><\/div><\/div><\/div><div class=\"tp_publication tp_publication_inproceedings\"><div class=\"tp_pub_info\"><p class=\"tp_pub_author\">Boyuan Sun, Yuqi Yang, Le Zhang, Ming-Ming Cheng, Qibin Hou<\/p><p class=\"tp_pub_title\">CorrMatch: Label Propagation via Correlation Matching for Semi-Supervised Semantic Segmentation <span class=\"tp_pub_type inproceedings\">Inproceedings<\/span> <\/p><p class=\"tp_pub_additional\"><span class=\"tp_pub_additional_in\">In: <\/span><span class=\"tp_pub_additional_booktitle\">CVPR, <\/span><span class=\"tp_pub_additional_year\">2024<\/span>.<\/p><p class=\"tp_pub_menu\"><span class=\"tp_abstract_link\"><a id=\"tp_abstract_sh_46\" class=\"tp_show\" onclick=\"teachpress_pub_showhide('46','tp_abstract')\" title=\"Show abstract\" style=\"cursor:pointer;\">Abstract<\/a><\/span> | <span class=\"tp_bibtex_link\"><a id=\"tp_bibtex_sh_46\" class=\"tp_show\" onclick=\"teachpress_pub_showhide('46','tp_bibtex')\" title=\"Show BibTeX entry\" style=\"cursor:pointer;\">BibTeX<\/a><\/span> | <span class=\"tp_pub_tags_label\">\u6807\u7b7e: <\/span><\/p><div class=\"tp_bibtex\" id=\"tp_bibtex_46\" style=\"display:none;\"><div class=\"tp_bibtex_entry\"><pre>@inproceedings{nokey,<br \/>\r\ntitle = {CorrMatch: Label Propagation via Correlation Matching for Semi-Supervised Semantic Segmentation},<br \/>\r\nauthor = {Boyuan Sun, Yuqi Yang, Le Zhang, Ming-Ming Cheng, Qibin Hou},<br \/>\r\nyear  = {2024},<br \/>\r\ndate = {2024-02-01},<br \/>\r\nurldate = {2024-02-01},<br \/>\r\nbooktitle = {CVPR},<br \/>\r\nabstract = {In this paper, we present a simple but performant semi-supervised semantic 
segmentation approach, termed CorrMatch. Our goal is to mine more high-quality regions from the unlabeled images to leverage the unlabeled data more efficiently via consistency regularization. The key contributions of our CorrMatch are two novel and complementary strategies. First, we introduce an adaptive threshold updating strategy with a relaxed initialization to expand the high-quality regions. Furthermore, we propose to propagate high-confidence predictions through measuring the pairwise similarities between pixels. Despite its simplicity, we show that CorrMatch achieves great performance on popular semi-supervised semantic segmentation benchmarks. Taking the DeepLabV3+ framework with ResNet-101 backbone as our segmentation model, we receive a 76%+ mIoU score on the Pascal VOC 2012 segmentation benchmark with only 92 annotated images provided. We also achieve a consistent improvement over previous semi-supervised semantic segmentation models. Code will be made publicly available.},<br \/>\r\nkeywords = {},<br \/>\r\npubstate = {published},<br \/>\r\ntppubtype = {inproceedings}<br \/>\r\n}<br \/>\r\n<\/pre><\/div><p class=\"tp_close_menu\"><a class=\"tp_close\" onclick=\"teachpress_pub_showhide('46','tp_bibtex')\">Close<\/a><\/p><\/div><div class=\"tp_abstract\" id=\"tp_abstract_46\" style=\"display:none;\"><div class=\"tp_abstract_entry\">In this paper, we present a simple but performant semi-supervised semantic segmentation approach, termed CorrMatch. Our goal is to mine more high-quality regions from the unlabeled images to leverage the unlabeled data more efficiently via consistency regularization. The key contributions of our CorrMatch are two novel and complementary strategies. First, we introduce an adaptive threshold updating strategy with a relaxed initialization to expand the high-quality regions. Furthermore, we propose to propagate high-confidence predictions through measuring the pairwise similarities between pixels. Despite its simplicity, we show that CorrMatch achieves great performance on popular semi-supervised semantic segmentation benchmarks. Taking the DeepLabV3+ framework with ResNet-101 backbone as our segmentation model, we receive a 76%+ mIoU score on the Pascal VOC 2012 segmentation benchmark with only 92 annotated images provided. We also achieve a consistent improvement over previous semi-supervised semantic segmentation models. 
Code will be made publicly available.<\/div><p class=\"tp_close_menu\"><a class=\"tp_close\" onclick=\"teachpress_pub_showhide('46','tp_abstract')\">Close<\/a><\/p><\/div><\/div><\/div><div class=\"tp_publication tp_publication_inproceedings\"><div class=\"tp_pub_info\"><p class=\"tp_pub_author\">Zhiwei Lin, Zhe Liu, Zhongyu Xia, Xinhao Wang, Yongtao Wang, Shengxiang Qi, Yang Dong, Nan Dong, Le Zhang, Ce Zhu<\/p><p class=\"tp_pub_title\">RCBEVDet: Radar-camera Fusion in Bird\u2019s Eye View for 3D Object Detection <span class=\"tp_pub_type inproceedings\">Inproceedings<\/span> <\/p><p class=\"tp_pub_additional\"><span class=\"tp_pub_additional_in\">In: <\/span><span class=\"tp_pub_additional_booktitle\">CVPR, <\/span><span class=\"tp_pub_additional_year\">2024<\/span>.<\/p><p class=\"tp_pub_menu\"><span class=\"tp_abstract_link\"><a id=\"tp_abstract_sh_47\" class=\"tp_show\" onclick=\"teachpress_pub_showhide('47','tp_abstract')\" title=\"Show abstract\" style=\"cursor:pointer;\">Abstract<\/a><\/span> | <span class=\"tp_bibtex_link\"><a id=\"tp_bibtex_sh_47\" class=\"tp_show\" onclick=\"teachpress_pub_showhide('47','tp_bibtex')\" title=\"Show BibTeX entry\" style=\"cursor:pointer;\">BibTeX<\/a><\/span> | <span class=\"tp_pub_tags_label\">\u6807\u7b7e: <\/span><\/p><div class=\"tp_bibtex\" id=\"tp_bibtex_47\" style=\"display:none;\"><div class=\"tp_bibtex_entry\"><pre>@inproceedings{nokey,<br \/>\r\ntitle = {RCBEVDet: Radar-camera Fusion in Bird\u2019s Eye View for 3D Object Detection},<br \/>\r\nauthor = {Zhiwei Lin, Zhe Liu, Zhongyu Xia, Xinhao Wang, Yongtao Wang, Shengxiang Qi, Yang Dong, Nan Dong, Le Zhang, Ce Zhu},<br \/>\r\nyear  = {2024},<br \/>\r\ndate = {2024-02-01},<br \/>\r\nurldate = {2024-02-01},<br \/>\r\nbooktitle = {CVPR},<br \/>\r\nabstract = {Three-dimensional object detection is one of the key tasks in autonomous driving. To reduce costs in practice, low-cost multi-view cameras for 3D object detection are proposed to replace the expansive LiDAR sensors. However, relying solely on cameras is difficult to achieve highly accurate and robust 3D object detection. An effective solution to this issue is combining multi-view cameras with the economical millimeter-wave radar sensor to achieve more reliable multi-modal 3D object detection. In this paper, we<br \/>\r\nintroduce RCBEVDet, a radar-camera fusion 3D object detection method in the bird\u2019s eye view (BEV). Specifically, we first design RadarBEVNet for radar BEV feature extraction. RadarBEVNet consists of a dual-stream radar backbone and a Radar Cross-Section (RCS) aware BEV encoder.  In the dual-stream radar backbone, a point-based encoder and a transformer-based encoder are proposed to extract radar features, with an injection and extraction module to facilitate communication between the two encoders. The RCS-aware BEV encoder takes RCS as the object size prior to scattering the point feature in BEV. Besides, we present the Cross-Attention Multi-layer Fusion module to automatically align the multi-modal BEV feature from radar and camera with the deformable attention mechanism, and then fuse the feature with channel and spatial fusion layers. Experimental results show that RCBEVDet achieves new state-of-the-art radar-camera fusion results on nuScenes and view-of-delft (VoD) 3D object detection benchmarks. 
Furthermore, RCBEVDet achieves better 3D detection results than all real-time camera-only and radar-camera 3D object detectors with a faster inference speed at 21\u223c28 FPS.<br \/>\r\n},<br \/>\r\nkeywords = {},<br \/>\r\npubstate = {published},<br \/>\r\ntppubtype = {inproceedings}<br \/>\r\n}<br \/>\r\n<\/pre><\/div><p class=\"tp_close_menu\"><a class=\"tp_close\" onclick=\"teachpress_pub_showhide('47','tp_bibtex')\">Close<\/a><\/p><\/div><div class=\"tp_abstract\" id=\"tp_abstract_47\" style=\"display:none;\"><div class=\"tp_abstract_entry\">Three-dimensional object detection is one of the key tasks in autonomous driving. To reduce costs in practice, low-cost multi-view cameras for 3D object detection are proposed to replace the expensive LiDAR sensors. However, it is difficult to achieve highly accurate and robust 3D object detection by relying solely on cameras. An effective solution to this issue is combining multi-view cameras with the economical millimeter-wave radar sensor to achieve more reliable multi-modal 3D object detection. In this paper, we introduce RCBEVDet, a radar-camera fusion 3D object detection method in the bird\u2019s eye view (BEV). Specifically, we first design RadarBEVNet for radar BEV feature extraction. RadarBEVNet consists of a dual-stream radar backbone and a Radar Cross-Section (RCS) aware BEV encoder.  In the dual-stream radar backbone, a point-based encoder and a transformer-based encoder are proposed to extract radar features, with an injection and extraction module to facilitate communication between the two encoders. The RCS-aware BEV encoder takes RCS as the object size prior to scattering the point feature in BEV. Besides, we present the Cross-Attention Multi-layer Fusion module to automatically align the multi-modal BEV feature from radar and camera with the deformable attention mechanism, and then fuse the feature with channel and spatial fusion layers. Experimental results show that RCBEVDet achieves new state-of-the-art radar-camera fusion results on nuScenes and view-of-delft (VoD) 3D object detection benchmarks. 
Furthermore, RCBEVDet achieves better 3D detection results than all real-time camera-only and radar-camera 3D object detectors with a faster inference speed at 21\u223c28 FPS.<br \/>\r\n<\/div><p class=\"tp_close_menu\"><a class=\"tp_close\" onclick=\"teachpress_pub_showhide('47','tp_abstract')\">Close<\/a><\/p><\/div><\/div><\/div><div class=\"tp_publication tp_publication_article\"><div class=\"tp_pub_info\"><p class=\"tp_pub_author\">Tian Gao, Cheng-Zhong Xu, Le Zhang, Hui Kong<\/p><p class=\"tp_pub_title\">GSB: Group superposition binarization for vision transformer with limited training samples <span class=\"tp_pub_type article\">Journal Article<\/span> <\/p><p class=\"tp_pub_additional\"><span class=\"tp_pub_additional_in\">In: <\/span><span class=\"tp_pub_additional_journal\">Neural Networks, <\/span><span class=\"tp_pub_additional_year\">2024<\/span>.<\/p><p class=\"tp_pub_menu\"><span class=\"tp_abstract_link\"><a id=\"tp_abstract_sh_44\" class=\"tp_show\" onclick=\"teachpress_pub_showhide('44','tp_abstract')\" title=\"Show abstract\" style=\"cursor:pointer;\">Abstract<\/a><\/span> | <span class=\"tp_bibtex_link\"><a id=\"tp_bibtex_sh_44\" class=\"tp_show\" onclick=\"teachpress_pub_showhide('44','tp_bibtex')\" title=\"Show BibTeX entry\" style=\"cursor:pointer;\">BibTeX<\/a><\/span> | <span class=\"tp_pub_tags_label\">\u6807\u7b7e: <\/span><span class=\"tp_resource_link\"> | <span class=\"tp_pub_links_label\">Links: <\/span><a class=\"tp_pub_link\" href=\"https:\/\/github.com\/IMRL\/GSB-Vision-Transformer\" title=\"https:\/\/github.com\/IMRL\/GSB-Vision-Transformer\" target=\"_blank\"><i class=\"fab fa-github\"><\/i><\/a><\/span><\/p><div class=\"tp_bibtex\" id=\"tp_bibtex_44\" style=\"display:none;\"><div class=\"tp_bibtex_entry\"><pre>@article{nokey,<br \/>\r\ntitle = {GSB: Group superposition binarization for vision transformer with limited training samples},<br \/>\r\nauthor = {Tian Gao, Cheng-Zhong Xu, Le Zhang, Hui Kong},<br \/>\r\nurl = {https:\/\/github.com\/IMRL\/GSB-Vision-Transformer},<br \/>\r\nyear  = {2024},<br \/>\r\ndate = {2024-01-01},<br \/>\r\nurldate = {2024-01-01},<br \/>\r\njournal = {Neural Networks},<br \/>\r\nabstract = {Vision Transformer (ViT) has performed remarkably in various computer vision tasks. Nonetheless, affected by the massive amount of parameters, ViT usually suffers from serious overfitting problems with a relatively limited number of training samples. In addition, ViT generally demands heavy computing resources, which limit its deployment on resource-constrained devices. As a type of model-compression method, model binarization is potentially a good choice to solve the above problems. Compared with the full-precision one, the model with the binarization method replaces complex tensor multiplication with simple bit-wise binary operations and represents full-precision model parameters and activations with only 1-bit ones, which potentially solves the problem of model size and computational complexity, respectively. In this paper, we investigate a binarized ViT model. Empirically, we observe that the existing binarization technology designed for Convolutional Neural Networks (CNN) cannot migrate well to a ViT\u2019s binarization task. We also find that the decline of the accuracy of the binary ViT model is mainly due to the information loss of the Attention module and the Value vector. Therefore, we propose a novel model binarization technique, called Group Superposition Binarization (GSB), to deal with these issues. 
Furthermore, in order to further improve the performance of the binarization model, we have investigated the gradient calculation procedure in the binarization process and derived more proper gradient calculation equations for GSB to reduce the influence of gradient mismatch. Then, the knowledge distillation technique is introduced to alleviate the performance degradation caused by model binarization. Analytically, model binarization can limit<br \/>\r\nthe parameter\u2019s search space during parameter updates while training a model. Therefore, the binarization process can actually play an implicit regularization role and help solve the problem of overfitting in the case of insufficient training data. Experiments on three datasets with limited numbers of training samples demonstrate that the proposed GSB model achieves state-of-the-art performance among the binary quantization schemes and exceeds its full-precision counterpart on some indicators. Code and<br \/>\r\nmodels are available at: https:\/\/github.com\/IMRL\/GSB-Vision-Transformer.},<br \/>\r\nkeywords = {},<br \/>\r\npubstate = {published},<br \/>\r\ntppubtype = {article}<br \/>\r\n}<br \/>\r\n<\/pre><\/div><p class=\"tp_close_menu\"><a class=\"tp_close\" onclick=\"teachpress_pub_showhide('44','tp_bibtex')\">Close<\/a><\/p><\/div><div class=\"tp_abstract\" id=\"tp_abstract_44\" style=\"display:none;\"><div class=\"tp_abstract_entry\">Vision Transformer (ViT) has performed remarkably in various computer vision tasks. Nonetheless, affected by the massive amount of parameters, ViT usually suffers from serious overfitting problems with a relatively limited number of training samples. In addition, ViT generally demands heavy computing resources, which limit its deployment on resource-constrained devices. As a type of model-compression method, model binarization is potentially a good choice to solve the above problems. Compared with the full-precision one, the model with the binarization method replaces complex tensor multiplication with simple bit-wise binary operations and represents full-precision model parameters and activations with only 1-bit ones, which potentially solves the problem of model size and computational complexity, respectively. In this paper, we investigate a binarized ViT model. Empirically, we observe that the existing binarization technology designed for Convolutional Neural Networks (CNN) cannot migrate well to a ViT\u2019s binarization task. We also find that the decline of the accuracy of the binary ViT model is mainly due to the information loss of the Attention module and the Value vector. Therefore, we propose a novel model binarization technique, called Group Superposition Binarization (GSB), to deal with these issues. Furthermore, in order to further improve the performance of the binarization model, we have investigated the gradient calculation procedure in the binarization process and derived more proper gradient calculation equations for GSB to reduce the influence of gradient mismatch. Then, the knowledge distillation technique is introduced to alleviate the performance degradation caused by model binarization. Analytically, model binarization can limit<br \/>\r\nthe parameter\u2019s search space during parameter updates while training a model. Therefore, the binarization process can actually play an implicit regularization role and help solve the problem of overfitting in the case of insufficient training data. 
Experiments on three datasets with limited numbers of training samples demonstrate that the proposed GSB model achieves state-of-the-art performance among the binary quantization schemes and exceeds its full-precision counterpart on some indicators. Code and<br \/>\r\nmodels are available at: https:\/\/github.com\/IMRL\/GSB-Vision-Transformer.<\/div><p class=\"tp_close_menu\"><a class=\"tp_close\" onclick=\"teachpress_pub_showhide('44','tp_abstract')\">Close<\/a><\/p><\/div><\/div><\/div><div class=\"tp_publication tp_publication_article\"><div class=\"tp_pub_info\"><p class=\"tp_pub_author\">Aiping Huang, Lijian Li, Le Zhang, Yuzhen Niu, Tiesong Zhao, Chia-Wen Lin<\/p><p class=\"tp_pub_title\">Multi-View Graph Embedding Learning for Image Co-Segmentation and Co-Localization <span class=\"tp_pub_type article\">Journal Article<\/span> <\/p><p class=\"tp_pub_additional\"><span class=\"tp_pub_additional_in\">In: <\/span><span class=\"tp_pub_additional_journal\">IEEE Transactions on Circuits and Systems for Video Technology, <\/span><span class=\"tp_pub_additional_year\">2024<\/span>.<\/p><p class=\"tp_pub_menu\"><span class=\"tp_abstract_link\"><a id=\"tp_abstract_sh_45\" class=\"tp_show\" onclick=\"teachpress_pub_showhide('45','tp_abstract')\" title=\"Show abstract\" style=\"cursor:pointer;\">Abstract<\/a><\/span> | <span class=\"tp_bibtex_link\"><a id=\"tp_bibtex_sh_45\" class=\"tp_show\" onclick=\"teachpress_pub_showhide('45','tp_bibtex')\" title=\"Show BibTeX entry\" style=\"cursor:pointer;\">BibTeX<\/a><\/span> | <span class=\"tp_pub_tags_label\">\u6807\u7b7e: <\/span><\/p><div class=\"tp_bibtex\" id=\"tp_bibtex_45\" style=\"display:none;\"><div class=\"tp_bibtex_entry\"><pre>@article{nokey,<br \/>\r\ntitle = {Multi-View Graph Embedding Learning for Image Co-Segmentation and Co-Localization},<br \/>\r\nauthor = {Aiping Huang, Lijian Li, Le Zhang, Yuzhen Niu, Tiesong Zhao, Chia-Wen Lin},<br \/>\r\nyear  = {2024},<br \/>\r\ndate = {2024-01-01},<br \/>\r\nurldate = {2024-01-01},<br \/>\r\njournal = {IEEE Transactions on Circuits and Systems for Video Technology},<br \/>\r\nabstract = {Image co-segmentation and co-localization exploit inter-image information to identify and extract foreground objects with a batch mode. However, they remain challenging when confronted with large object variations or complex backgrounds. This paper proposes a multi-view graph embedding (MV-Gem) learning scheme which integrates diversity, robustness and discernibility of object features to alleviate this phenomenon. To encourage the diversity, the deep co-information containing both low-layer general representations and high-layer semantic information is generated to form a multi-view feature pool for comprehensive co-object description. To enhance the robustness, a multi-view adaptive weighted learning is formulated to fuse the deep co-information for feature complementation. To ensure the discernibility, the graph embedding and sparse constraint are embedded into the fusion formulation for feature selection. The former aims to inherit important structures from multiple views, and the latter further selects important features to restrain irrelevant backgrounds. With these techniques, MV-Gem gradually recovers all co-objects through optimization iterations. 
Extensive experimental results on real-world datasets demonstrate that MV-Gem is capable of locating and delineating co-objects in an image group.},<br \/>\r\nkeywords = {},<br \/>\r\npubstate = {published},<br \/>\r\ntppubtype = {article}<br \/>\r\n}<br \/>\r\n<\/pre><\/div><p class=\"tp_close_menu\"><a class=\"tp_close\" onclick=\"teachpress_pub_showhide('45','tp_bibtex')\">Close<\/a><\/p><\/div><div class=\"tp_abstract\" id=\"tp_abstract_45\" style=\"display:none;\"><div class=\"tp_abstract_entry\">Image co-segmentation and co-localization exploit inter-image information to identify and extract foreground objects with a batch mode. However, they remain challenging when confronted with large object variations or complex backgrounds. This paper proposes a multi-view graph embedding (MV-Gem) learning scheme which integrates diversity, robustness and discernibility of object features to alleviate this phenomenon. To encourage the diversity, the deep co-information containing both low-layer general representations and high-layer semantic information is generated to form a multi-view feature pool for comprehensive co-object description. To enhance the robustness, a multi-view adaptive weighted learning is formulated to fuse the deep co-information for feature complementation. To ensure the discernibility, the graph embedding and sparse constraint are embedded into the fusion formulation for feature selection. The former aims to inherit important structures from multiple views, and the latter further selects important features to restrain irrelevant backgrounds. With these techniques, MV-Gem gradually recovers all co-objects through optimization iterations. Extensive experimental results on real-world datasets demonstrate that MV-Gem is capable of locating and delineating co-objects in an image group.<\/div><p class=\"tp_close_menu\"><a class=\"tp_close\" onclick=\"teachpress_pub_showhide('45','tp_abstract')\">Close<\/a><\/p><\/div><\/div><\/div><h3 class=\"tp_h3\" id=\"tp_h3_2023\">2023<\/h3><div class=\"tp_publication tp_publication_article\"><div class=\"tp_pub_info\"><p class=\"tp_pub_author\">Wei Meng, Zhicong Liu, Bing Li, Wei Cui, Joey Tianyi Zhou, Le Zhang<\/p><p class=\"tp_pub_title\">GrapHAR: A Lightweight Human Activity Recognition Model by Exploring the Sub-carrier Correlations <span class=\"tp_pub_type article\">Journal Article<\/span> <\/p><p class=\"tp_pub_additional\"><span class=\"tp_pub_additional_in\">In: <\/span><span class=\"tp_pub_additional_journal\">IEEE Transactions on Wireless Communications, <\/span><span class=\"tp_pub_additional_year\">2023<\/span>.<\/p><p class=\"tp_pub_menu\"><span class=\"tp_abstract_link\"><a id=\"tp_abstract_sh_43\" class=\"tp_show\" onclick=\"teachpress_pub_showhide('43','tp_abstract')\" title=\"Show abstract\" style=\"cursor:pointer;\">Abstract<\/a><\/span> | <span class=\"tp_bibtex_link\"><a id=\"tp_bibtex_sh_43\" class=\"tp_show\" onclick=\"teachpress_pub_showhide('43','tp_bibtex')\" title=\"Show BibTeX entry\" style=\"cursor:pointer;\">BibTeX<\/a><\/span> | <span class=\"tp_pub_tags_label\">\u6807\u7b7e: <\/span><\/p><div class=\"tp_bibtex\" id=\"tp_bibtex_43\" style=\"display:none;\"><div class=\"tp_bibtex_entry\"><pre>@article{nokey,<br \/>\r\ntitle = {GrapHAR: A Lightweight Human Activity Recognition Model by Exploring the Sub-carrier Correlations},<br \/>\r\nauthor = {Wei Meng, Zhicong Liu, Bing Li, Wei Cui, Joey Tianyi Zhou, Le Zhang},<br \/>\r\nyear  = {2023},<br \/>\r\ndate = {2023-08-08},<br \/>\r\njournal = {IEEE Transactions on 
Wireless Communications},<br \/>\r\nabstract = {Human activity recognition (HAR) is an important task due to its far-reaching applications, such as surveillance, healthcare systems, and human-computer interaction. Recently, Channel State Information (CSI)-based HAR has attracted increasing attention in the research community due to its ubiquitous availability, good user privacy, and fewer constraints on working conditions. Most of the existing methods for CSI-based HAR use various deep learning models, such as Convolutional Neural Networks (CNNs), Long Short-Term Memory (LSTM), and Transformers, to distinguish activities based on their temporal patterns. Despite their remarkable effectiveness, these methods solely focus on temporal patterns while ignoring the correlations among sub-carriers. This limitation prevents them from achieving further performance improvement. Moreover, recent works often involve advanced yet massive and inefficient neural architectures, like Transformers, to obtain satisfactory recognition accuracy. The performance gain is traded off with a steep increase in model complexity, which leads to low efficacy and high training\/inference costs outsides the small time window. To address these issues, we propose a lightweight CSI-based HAR model. Our model makes the first effort to explore the graphical correlations of CSI sub-carriers, working in conjunction with a temporal causal convolution module. The high efficacy design enables our model to be highly effective without requiring excessive model complexity. Extensive experiments conducted on four real-world datasets demonstrate that our model outperforms state-of-the-art methods, including a strong Transformer-based baseline. It achieves an average improvement of 8 percentage points in recognition accuracy, with only 10% of the parameters compared to the Transformer-based method (4.95M vs. 49.24M). Additionally, our model is significantly faster, with empirical training and execution times at least 2.07 times faster than the baseline.},<br \/>\r\nkeywords = {},<br \/>\r\npubstate = {published},<br \/>\r\ntppubtype = {article}<br \/>\r\n}<br \/>\r\n<\/pre><\/div><p class=\"tp_close_menu\"><a class=\"tp_close\" onclick=\"teachpress_pub_showhide('43','tp_bibtex')\">Close<\/a><\/p><\/div><div class=\"tp_abstract\" id=\"tp_abstract_43\" style=\"display:none;\"><div class=\"tp_abstract_entry\">Human activity recognition (HAR) is an important task due to its far-reaching applications, such as surveillance, healthcare systems, and human-computer interaction. Recently, Channel State Information (CSI)-based HAR has attracted increasing attention in the research community due to its ubiquitous availability, good user privacy, and fewer constraints on working conditions. Most of the existing methods for CSI-based HAR use various deep learning models, such as Convolutional Neural Networks (CNNs), Long Short-Term Memory (LSTM), and Transformers, to distinguish activities based on their temporal patterns. Despite their remarkable effectiveness, these methods solely focus on temporal patterns while ignoring the correlations among sub-carriers. This limitation prevents them from achieving further performance improvement. Moreover, recent works often involve advanced yet massive and inefficient neural architectures, like Transformers, to obtain satisfactory recognition accuracy. 
The performance gain is traded off with a steep increase in model complexity, which leads to low efficacy and high training\/inference costs outsides the small time window. To address these issues, we propose a lightweight CSI-based HAR model. Our model makes the first effort to explore the graphical correlations of CSI sub-carriers, working in conjunction with a temporal causal convolution module. The high efficacy design enables our model to be highly effective without requiring excessive model complexity. Extensive experiments conducted on four real-world datasets demonstrate that our model outperforms state-of-the-art methods, including a strong Transformer-based baseline. It achieves an average improvement of 8 percentage points in recognition accuracy, with only 10% of the parameters compared to the Transformer-based method (4.95M vs. 49.24M). Additionally, our model is significantly faster, with empirical training and execution times at least 2.07 times faster than the baseline.<\/div><p class=\"tp_close_menu\"><a class=\"tp_close\" onclick=\"teachpress_pub_showhide('43','tp_abstract')\">Close<\/a><\/p><\/div><\/div><\/div><div class=\"tp_publication tp_publication_article\"><div class=\"tp_pub_info\"><p class=\"tp_pub_author\">Bing Li, Wei Cui, Le Zhang, Ce Zhu, Wei Wang, Ivor Tsang, Joey Tianyi Zhou<\/p><p class=\"tp_pub_title\">DifFormer: Multi-Resolutional Differencing Transformer With Dynamic Ranging for Time Series Analysis <span class=\"tp_pub_type article\">Journal Article<\/span> <\/p><p class=\"tp_pub_additional\"><span class=\"tp_pub_additional_in\">In: <\/span><span class=\"tp_pub_additional_journal\">IEEE Transactions on Pattern Analysis and Machine Intelligence, <\/span><span class=\"tp_pub_additional_year\">2023<\/span>.<\/p><p class=\"tp_pub_menu\"><span class=\"tp_abstract_link\"><a id=\"tp_abstract_sh_40\" class=\"tp_show\" onclick=\"teachpress_pub_showhide('40','tp_abstract')\" title=\"Show abstract\" style=\"cursor:pointer;\">Abstract<\/a><\/span> | <span class=\"tp_bibtex_link\"><a id=\"tp_bibtex_sh_40\" class=\"tp_show\" onclick=\"teachpress_pub_showhide('40','tp_bibtex')\" title=\"Show BibTeX entry\" style=\"cursor:pointer;\">BibTeX<\/a><\/span> | <span class=\"tp_pub_tags_label\">\u6807\u7b7e: <\/span><\/p><div class=\"tp_bibtex\" id=\"tp_bibtex_40\" style=\"display:none;\"><div class=\"tp_bibtex_entry\"><pre>@article{nokey,<br \/>\r\ntitle = {DifFormer: Multi-Resolutional Differencing Transformer With Dynamic Ranging for Time Series Analysis},<br \/>\r\nauthor = {Bing Li, Wei Cui, Le Zhang, Ce Zhu, Wei Wang, Ivor Tsang, Joey Tianyi Zhou},<br \/>\r\nyear  = {2023},<br \/>\r\ndate = {2023-07-17},<br \/>\r\nurldate = {2023-07-17},<br \/>\r\njournal = {IEEE Transactions on Pattern Analysis and Machine Intelligence},<br \/>\r\nabstract = {Time series analysis is essential to many far-reaching applications of data science and statistics including economic and financial forecasting, surveillance, and automated business processing. Though being greatly successful of Transformer in computer vision and natural language processing, the potential of employing it as the general backbone in analyzing the ubiquitous times series data has not been fully released yet. Prior Transformer variants on time series highly rely on task-dependent designs and pre-assumed ``pattern biases'', revealing its insufficiency in representing nuanced seasonal, cyclic, and outlier patterns which are highly prevalent in time series. 
As a consequence, they cannot generalize well to different time series analysis tasks. To tackle the challenges, we propose DifFormer, an effective and efficient Transformer architecture that can serve as a workhorse for a variety of time-series analysis tasks. DifFormer incorporates a novel multi-resolutional differencing mechanism, which is able to progressively and adaptively make nuanced yet meaningful changes prominent; meanwhile, the periodic or cyclic patterns can be dynamically captured with flexible lagging and dynamic ranging operations. Extensive experiments demonstrate DifFormer significantly outperforms state-of-the-art models on three essential time-series analysis tasks, including classification, regression, and forecasting. In addition to its superior performances, DifFormer also excels in efficiency -- a linear time\/memory complexity with empirically lower time consumption.},<br \/>\r\nkeywords = {},<br \/>\r\npubstate = {published},<br \/>\r\ntppubtype = {article}<br \/>\r\n}<br \/>\r\n<\/pre><\/div><p class=\"tp_close_menu\"><a class=\"tp_close\" onclick=\"teachpress_pub_showhide('40','tp_bibtex')\">Close<\/a><\/p><\/div><div class=\"tp_abstract\" id=\"tp_abstract_40\" style=\"display:none;\"><div class=\"tp_abstract_entry\">Time series analysis is essential to many far-reaching applications of data science and statistics including economic and financial forecasting, surveillance, and automated business processing. Though the Transformer has been greatly successful in computer vision and natural language processing, its potential as the general backbone for analyzing the ubiquitous time series data has not been fully realized yet. Prior Transformer variants on time series highly rely on task-dependent designs and pre-assumed \u201cpattern biases\u201d, revealing their insufficiency in representing nuanced seasonal, cyclic, and outlier patterns which are highly prevalent in time series. As a consequence, they cannot generalize well to different time series analysis tasks. To tackle the challenges, we propose DifFormer, an effective and efficient Transformer architecture that can serve as a workhorse for a variety of time-series analysis tasks. DifFormer incorporates a novel multi-resolutional differencing mechanism, which is able to progressively and adaptively make nuanced yet meaningful changes prominent; meanwhile, the periodic or cyclic patterns can be dynamically captured with flexible lagging and dynamic ranging operations. Extensive experiments demonstrate DifFormer significantly outperforms state-of-the-art models on three essential time-series analysis tasks, including classification, regression, and forecasting. 
In addition to its superior performances, DifFormer also excels in efficiency -- a linear time\/memory complexity with empirically lower time consumption.<\/div><p class=\"tp_close_menu\"><a class=\"tp_close\" onclick=\"teachpress_pub_showhide('40','tp_abstract')\">Close<\/a><\/p><\/div><\/div><\/div><div class=\"tp_publication tp_publication_inproceedings\"><div class=\"tp_pub_info\"><p class=\"tp_pub_author\">Ao Li, Le Zhang, Yun Liu, Ce Zhu<\/p><p class=\"tp_pub_title\">Feature Modulation Transformer: Cross-Refinement of Global Representation via High-Frequency Prior for Image Super-Resolution <span class=\"tp_pub_type inproceedings\">Inproceedings<\/span> <\/p><p class=\"tp_pub_additional\"><span class=\"tp_pub_additional_in\">In: <\/span><span class=\"tp_pub_additional_booktitle\">ICCV, <\/span><span class=\"tp_pub_additional_year\">2023<\/span>.<\/p><p class=\"tp_pub_menu\"><span class=\"tp_abstract_link\"><a id=\"tp_abstract_sh_41\" class=\"tp_show\" onclick=\"teachpress_pub_showhide('41','tp_abstract')\" title=\"Show abstract\" style=\"cursor:pointer;\">Abstract<\/a><\/span> | <span class=\"tp_bibtex_link\"><a id=\"tp_bibtex_sh_41\" class=\"tp_show\" onclick=\"teachpress_pub_showhide('41','tp_bibtex')\" title=\"Show BibTeX entry\" style=\"cursor:pointer;\">BibTeX<\/a><\/span> | <span class=\"tp_pub_tags_label\">\u6807\u7b7e: <\/span><\/p><div class=\"tp_bibtex\" id=\"tp_bibtex_41\" style=\"display:none;\"><div class=\"tp_bibtex_entry\"><pre>@inproceedings{nokey,<br \/>\r\ntitle = {Feature Modulation Transformer: Cross-Refinement of Global Representation via High-Frequency Prior for Image Super-Resolution},<br \/>\r\nauthor = {Ao Li, Le Zhang, Yun Liu, Ce Zhu},<br \/>\r\nyear  = {2023},<br \/>\r\ndate = {2023-07-17},<br \/>\r\nurldate = {2023-07-17},<br \/>\r\nbooktitle = {ICCV},<br \/>\r\nabstract = {Transformer-based methods have exhibited remarkable potential in Single Image Super-Resolution (SISR) by effectively extracting long-range dependencies. However, most of the current research in this area has prioritized the design of transformer blocks to capture global information, while overlooking the importance of incorporating high-frequency priors, which we believe could be beneficial. In our study, we conducted a series of experiments and found that transformer structures are more adept at capturing low-frequency information, but have limited capacity in constructing high-frequency representations when compared to their convolutional counterparts. Our proposed solution, the Cross-Refinement Adaptive Feature Modulation Transformer (CRAFT), integrates the strengths of both convolutional and transformer structures. It comprises three key components: the High-Frequency Enhancement Residual Block (HFERB) for extracting high-frequency information, the Shift Rectangle Window Attention Block (SRWAB) for capturing global information, and the Hybrid Fusion Block (HFB) for refining the global representation. 
Our experiments on multiple datasets demonstrate that CRAFT outperforms state-of-the-art methods by up to 0.29dB while using fewer parameters.},<br \/>\r\nkeywords = {},<br \/>\r\npubstate = {published},<br \/>\r\ntppubtype = {inproceedings}<br \/>\r\n}<br \/>\r\n<\/pre><\/div><p class=\"tp_close_menu\"><a class=\"tp_close\" onclick=\"teachpress_pub_showhide('41','tp_bibtex')\">Close<\/a><\/p><\/div><div class=\"tp_abstract\" id=\"tp_abstract_41\" style=\"display:none;\"><div class=\"tp_abstract_entry\">Transformer-based methods have exhibited remarkable potential in Single Image Super-Resolution (SISR) by effectively extracting long-range dependencies. However, most of the current research in this area has prioritized the design of transformer blocks to capture global information, while overlooking the importance of incorporating high-frequency priors, which we believe could be beneficial. In our study, we conducted a series of experiments and found that transformer structures are more adept at capturing low-frequency information, but have limited capacity in constructing high-frequency representations when compared to their convolutional counterparts. Our proposed solution, the Cross-Refinement Adaptive Feature Modulation Transformer (CRAFT), integrates the strengths of both convolutional and transformer structures. It comprises three key components: the High-Frequency Enhancement Residual Block (HFERB) for extracting high-frequency information, the Shift Rectangle Window Attention Block (SRWAB) for capturing global information, and the Hybrid Fusion Block (HFB) for refining the global representation. Our experiments on multiple datasets demonstrate that CRAFT outperforms state-of-the-art methods by up to 0.29dB while using fewer parameters.<\/div><p class=\"tp_close_menu\"><a class=\"tp_close\" onclick=\"teachpress_pub_showhide('41','tp_abstract')\">Close<\/a><\/p><\/div><\/div><\/div><h3 class=\"tp_h3\" id=\"tp_h3_2022\">2022<\/h3><div class=\"tp_publication tp_publication_inproceedings\"><div class=\"tp_pub_info\"><p class=\"tp_pub_author\">Fanxing Liu, Cheng Zeng, Le Zhang*, Yingjie Zhou*, Qing Mu, Yanru Zhang, Ling Zhang, Ce Zhu<\/p><p class=\"tp_pub_title\">FedTADBench: Federated Time-series Anomaly Detection Benchmark <span class=\"tp_pub_type inproceedings\">Inproceedings<\/span> <\/p><p class=\"tp_pub_additional\"><span class=\"tp_pub_additional_in\">In: <\/span><span class=\"tp_pub_additional_booktitle\">IEEE HPCC \uff08Best Paper Award), <\/span><span class=\"tp_pub_additional_year\">2022<\/span>.<\/p><p class=\"tp_pub_menu\"><span class=\"tp_abstract_link\"><a id=\"tp_abstract_sh_42\" class=\"tp_show\" onclick=\"teachpress_pub_showhide('42','tp_abstract')\" title=\"Show abstract\" style=\"cursor:pointer;\">Abstract<\/a><\/span> | <span class=\"tp_bibtex_link\"><a id=\"tp_bibtex_sh_42\" class=\"tp_show\" onclick=\"teachpress_pub_showhide('42','tp_bibtex')\" title=\"Show BibTeX entry\" style=\"cursor:pointer;\">BibTeX<\/a><\/span> | <span class=\"tp_pub_tags_label\">\u6807\u7b7e: <\/span><span class=\"tp_resource_link\"> | <span class=\"tp_pub_links_label\">Links: <\/span><a class=\"tp_pub_link\" href=\"https:\/\/github.com\/fanxingliu2020\/FedTADBench\" title=\"https:\/\/github.com\/fanxingliu2020\/FedTADBench\" target=\"_blank\"><i class=\"fab fa-github\"><\/i><\/a><\/span><\/p><div class=\"tp_bibtex\" id=\"tp_bibtex_42\" style=\"display:none;\"><div class=\"tp_bibtex_entry\"><pre>@inproceedings{nokey,<br \/>\r\ntitle = {FedTADBench: 
Federated Time-series Anomaly Detection Benchmark},<br \/>\r\nauthor = {Fanxing Liu, Cheng Zeng, Le Zhang*, Yingjie Zhou*, Qing Mu, Yanru Zhang, Ling Zhang, Ce Zhu},<br \/>\r\nurl = {https:\/\/github.com\/fanxingliu2020\/FedTADBench},<br \/>\r\nyear  = {2022},<br \/>\r\ndate = {2022-12-15},<br \/>\r\nurldate = {2022-12-15},<br \/>\r\nbooktitle = {IEEE HPCC \uff08Best Paper Award)},<br \/>\r\nabstract = {Time series anomaly detection strives to uncover potential abnormal behaviors and patterns from temporal data, and has fundamental significance in diverse application scenarios. Constructing an effective detection model usually requires<br \/>\r\nadequate training data stored in a centralized manner, however, this requirement sometimes could not be satisfied in realistic<br \/>\r\nscenarios. As a prevailing approach to address the above problem, federated learning has demonstrated its power to cooperate with the distributed data available while protecting the privacy of data providers. However, it is still unclear that how existing time series anomaly detection algorithms perform with decentralized data storage and privacy protection through federated learning.<br \/>\r\nTo study this, we conduct a federated time series anomaly<br \/>\r\ndetection benchmark, named FedTADBench, which involves five<br \/>\r\nrepresentative time series anomaly detection algorithms and four<br \/>\r\npopular federated learning methods. We would like to answer<br \/>\r\nthe following questions: (1)How is the performance of time series<br \/>\r\nanomaly detection algorithms when meeting federated learning?<br \/>\r\n(2) Which federated learning method is the most appropriate<br \/>\r\none for time series anomaly detection? (3) How do federated<br \/>\r\ntime series anomaly detection approaches perform on different<br \/>\r\npartitions of data in clients? Numbers of results as well as corresponding analysis are provided from extensive experiments with various settings. The source code of our benchmark is publicly<br \/>\r\navailable at https:\/\/github.com\/fanxingliu2020\/FedTADBench},<br \/>\r\nkeywords = {},<br \/>\r\npubstate = {published},<br \/>\r\ntppubtype = {inproceedings}<br \/>\r\n}<br \/>\r\n<\/pre><\/div><p class=\"tp_close_menu\"><a class=\"tp_close\" onclick=\"teachpress_pub_showhide('42','tp_bibtex')\">Close<\/a><\/p><\/div><div class=\"tp_abstract\" id=\"tp_abstract_42\" style=\"display:none;\"><div class=\"tp_abstract_entry\">Time series anomaly detection strives to uncover potential abnormal behaviors and patterns from temporal data, and has fundamental significance in diverse application scenarios. Constructing an effective detection model usually requires<br \/>\r\nadequate training data stored in a centralized manner, however, this requirement sometimes could not be satisfied in realistic<br \/>\r\nscenarios. As a prevailing approach to address the above problem, federated learning has demonstrated its power to cooperate with the distributed data available while protecting the privacy of data providers. However, it is still unclear that how existing time series anomaly detection algorithms perform with decentralized data storage and privacy protection through federated learning.<br \/>\r\nTo study this, we conduct a federated time series anomaly<br \/>\r\ndetection benchmark, named FedTADBench, which involves five<br \/>\r\nrepresentative time series anomaly detection algorithms and four<br \/>\r\npopular federated learning methods. 
<div class=\"tp_publication tp_publication_inproceedings\"><div class=\"tp_pub_info\">
<p class=\"tp_pub_author\">Guolei Sun, Yun Liu, Hao Tang, Ajad Chhatkuli, Le Zhang, Luc Van Gool<\/p>
<p class=\"tp_pub_title\">Mining Relations among Cross-Frame Affinities for Video Semantic Segmentation <span class=\"tp_pub_type inproceedings\">Inproceedings<\/span><\/p>
<p class=\"tp_pub_additional\">In: ECCV 2022.<\/p>
<p class=\"tp_pub_menu\">Links: <a class=\"tp_pub_link\" href=\"https:\/\/github.com\/GuoleiSun\/VSS-MRCFA\" target=\"_blank\">https:\/\/github.com\/GuoleiSun\/VSS-MRCFA<\/a><\/p>
<div class=\"tp_abstract\"><div class=\"tp_abstract_entry\">The essence of video semantic segmentation (VSS) is how to leverage temporal information for prediction. Previous efforts are mainly devoted to developing new techniques to calculate cross-frame affinities such as optical flow and attention. Instead, this paper contributes from a different angle by mining relations among cross-frame affinities, upon which better temporal information aggregation can be achieved. We explore relations among affinities in two aspects: single-scale intrinsic correlations and multi-scale relations. Inspired by traditional feature processing, we propose Single-scale Affinity Refinement (SAR) and Multi-scale Affinity Aggregation (MAA). To make it feasible to execute MAA, we propose a Selective Token Masking (STM) strategy to select a subset of consistent reference tokens for different scales when calculating affinities, which also improves the efficiency of our method. Finally, the cross-frame affinities strengthened by SAR and MAA are adopted for adaptively aggregating temporal information. Our experiments demonstrate that the proposed method performs favorably against state-of-the-art VSS methods. The code is publicly available at https:\/\/github.com\/GuoleiSun\/VSS-MRCFA.<\/div><\/div>
<\/div><\/div>
<div class=\"tp_publication tp_publication_inproceedings\"><div class=\"tp_pub_info\">
<p class=\"tp_pub_author\">Gang Xu, Qibin Hou, Le Zhang, Ming-Ming Cheng<\/p>
<p class=\"tp_pub_title\">FMNet: Frequency-Aware Modulation Network for SDR-to-HDR Translation <span class=\"tp_pub_type inproceedings\">Inproceedings<\/span><\/p>
<p class=\"tp_pub_additional\">In: ACM MM, 2022.<\/p>
<p class=\"tp_pub_menu\">Links: <a class=\"tp_pub_link\" href=\"https:\/\/github.com\/MCG-NKU\/FMNet\" target=\"_blank\">https:\/\/github.com\/MCG-NKU\/FMNet<\/a><\/p>
<div class=\"tp_abstract\"><div class=\"tp_abstract_entry\">High-dynamic-range (HDR) media resources, which preserve high contrast and more detail in shadow and highlight areas on television, are becoming increasingly popular for modern display technology compared with the widely available standard-dynamic-range (SDR) media resources. However, due to the exorbitant price of HDR cameras, researchers have attempted to develop SDR-to-HDR techniques that convert the abundant SDR media resources to HDR versions for cost saving. Recent SDR-to-HDR methods mostly apply an image-adaptive modulation scheme to dynamically modulate the local contrast. However, these methods often fail to properly capture low-frequency cues, resulting in artifacts in the low-frequency regions and low visual quality. Motivated by the Discrete Cosine Transform (DCT), in this paper we propose a Frequency-aware Modulation Network (FMNet) to enhance the contrast in a frequency-adaptive way for SDR-to-HDR translation. Specifically, we design a frequency-aware modulation block that can dynamically modulate the features according to their frequency-domain responses. This allows us to reduce the structural distortions and artifacts in the translated low-frequency regions and to reconstruct high-quality HDR content in the translated results. Experimental results on the HDRTV1K dataset show that our FMNet outperforms previous methods and that the perceptual quality of the generated HDR images is largely improved. Our code is available at https:\/\/github.com\/MCG-NKU\/FMNet.<\/div><\/div>
<\/div><\/div>
<div class=\"tp_publication tp_publication_article\"><div class=\"tp_pub_info\">
<p class=\"tp_pub_author\">Yu-Huan Wu, Yun Liu, Le Zhang, Ming-Ming Cheng, Bo Ren<\/p>
<p class=\"tp_pub_title\">EDN: Salient object detection via extremely-downsampled network <span class=\"tp_pub_type article\">Journal Article<\/span><\/p>
<p class=\"tp_pub_additional\">In: IEEE TIP, 2022.<\/p>
<p class=\"tp_pub_menu\">Links: <a class=\"tp_pub_link\" href=\"https:\/\/github.com\/yuhuan-wu\/EDN\" target=\"_blank\">https:\/\/github.com\/yuhuan-wu\/EDN<\/a><\/p>
<div class=\"tp_abstract\"><div class=\"tp_abstract_entry\">Recent progress on salient object detection (SOD) mainly benefits from multi-scale learning, where high-level and low-level features collaborate in locating salient objects and discovering fine details, respectively. However, most efforts are devoted to low-level feature learning by fusing multi-scale features or enhancing boundary representations. High-level features, although long proven effective for many other tasks, have been barely studied for SOD. In this paper, we tap into this gap and show that enhancing high-level features is essential for SOD as well. To this end, we introduce an Extremely-Downsampled Network (EDN), which employs an extreme downsampling technique to effectively learn a global view of the whole image, leading to accurate salient object localization. To accomplish better multi-level feature fusion, we construct the Scale-Correlated Pyramid Convolution (SCPC) to build an elegant decoder for recovering object details from the above extreme downsampling. Extensive experiments demonstrate that EDN achieves state-of-the-art performance at real-time speed. Our efficient EDN-Lite also achieves competitive performance at a speed of 316 fps. Hence, this work is expected to spark some new thinking in SOD. Code is available at https:\/\/github.com\/yuhuan-wu\/EDN<\/div><\/div>
<\/div><\/div>
<div class=\"tp_publication tp_publication_article\"><div class=\"tp_pub_info\">
<p class=\"tp_pub_author\">Wei Cui, Le Zhang, Bing Li, Zhenghua Chen, Min Wu, Xiaoli Li, Jiawen Kang<\/p>
<p class=\"tp_pub_title\">Semi-Supervised Deep Adversarial Forest for Cross-Environment Localization <span class=\"tp_pub_type article\">Journal Article<\/span><\/p>
<p class=\"tp_pub_additional\">In: IEEE Transactions on Vehicular Technology, 2022.<\/p>
<div class=\"tp_abstract\"><div class=\"tp_abstract_entry\">Extracting channel state information (CSI) from WiFi signals has proven highly effective for locating humans in a device-free manner. However, existing localization\/positioning systems are mainly trained and deployed in a fixed environment, and thus they are likely to suffer substantial performance declines when migrating to new environments. In this paper, we address the fundamental problem of WiFi-based cross-environment indoor localization using a semi-supervised approach, in which we only have access to the annotations of the source environment while the data in the target environments are unannotated. This problem is of high practical value in enabling a well-trained system to scale to new environments without tedious human annotation. To this end, a deep neural forest is introduced which unifies ensemble learning with the representation learning capability of deep neural networks in an end-to-end trainable fashion. On top of that, an adversarial training strategy is further employed to learn environment-invariant feature representations for more robust localization. Extensive experiments on real-world datasets demonstrate the superiority of the proposed methods over state-of-the-art baselines. Compared with the best-performing baseline, our model excels with an average 12.7% relative improvement across all six evaluation settings.<\/div><\/div>
<\/div><\/div>
<h3 class=\"tp_h3\" id=\"tp_h3_2021\">2021<\/h3>
<div class=\"tp_publication tp_publication_article\"><div class=\"tp_pub_info\">
<p class=\"tp_pub_author\">Yun Liu; Ming-Ming Cheng; Deng-Ping Fan; Le Zhang; JiaWang Bian; Dacheng Tao<\/p>
<p class=\"tp_pub_title\">Semantic edge detection with diverse deep supervision <span class=\"tp_pub_type article\">Journal Article<\/span><\/p>
<p class=\"tp_pub_additional\">In: IJCV, 2021.<\/p>
<\/div><\/div>
<div class=\"tp_publication tp_publication_article\"><div class=\"tp_pub_info\">
<p class=\"tp_pub_author\">Xun Xu; Loong-Fah Cheong; Zhuwen Li; Le Zhang; Ce Zhu<\/p>
<p class=\"tp_pub_title\">Learning Clustering for Motion Segmentation <span class=\"tp_pub_type article\">Journal Article<\/span><\/p>
<p class=\"tp_pub_additional\">In: IEEE TCSVT, 2021.<\/p>
<p class=\"tp_pub_menu\">Links: <a class=\"tp_pub_link\" href=\"https:\/\/alex-xun-xu.github.io\/Doc\/Publication\/2021\/XuEtAl_TCSVT21.pdf\" target=\"_blank\">PDF<\/a> | <a class=\"tp_pub_link\" href=\"https:\/\/dx.doi.org\/10.1109\/TCSVT.2021.3069094\" target=\"_blank\">DOI: 10.1109\/TCSVT.2021.3069094<\/a><\/p>
<div class=\"tp_abstract\"><div class=\"tp_abstract_entry\">Subspace clustering has been extensively studied from the hypothesis-and-test, algebraic, and spectral clustering-based perspectives. Most approaches assume that only a single type\/class of subspace is present. Generalizations to multiple types are non-trivial, plagued by challenges such as the choice of types and numbers of models, sampling imbalance, and parameter tuning. In many real-world problems, data may not lie perfectly on a linear subspace, and hand-designed linear subspace models may not fit these situations. In this work, we formulate the multi-type subspace clustering problem as one of learning non-linear subspace filters via deep multi-layer perceptrons (MLPs). The responses to the learnt subspace filters serve as a feature embedding that is clustering-friendly, i.e., points of the same cluster will be embedded closer together through the network. For inference, we apply K-means to the network output to cluster the data. Experiments are carried out on synthetic data and real-world motion segmentation problems, producing state-of-the-art results.<\/div><\/div>
<\/div><\/div>
<div class=\"tp_publication tp_publication_article\"><div class=\"tp_pub_info\">
<p class=\"tp_pub_author\">Le Zhang; Wei Cui; Bing Li; Zhenghua Chen; Min Wu; Teo Sin Gee<\/p>
<p class=\"tp_pub_title\">Privacy-Preserving Cross-Environment Human Activity Recognition <span class=\"tp_pub_type article\">Journal Article<\/span><\/p>
<p class=\"tp_pub_additional\">In: IEEE TCybernetics, 2021.<\/p>
<\/div><\/div>
<div class=\"tp_publication tp_publication_inproceedings\"><div class=\"tp_pub_info\">
<p class=\"tp_pub_author\">Yining Ma; Jingwen Li; Zhiguang Cao; Wen Song; Le Zhang; Zhenghua Chen; Jing Tang<\/p>
<p class=\"tp_pub_title\">Learning to Iteratively Solve Routing Problems with Dual-Aspect Collaborative Transformer <span class=\"tp_pub_type inproceedings\">Inproceedings<\/span><\/p>
<p class=\"tp_pub_additional\">In: NeurIPS, 2021.<\/p>
<p class=\"tp_pub_menu\">Links: <a class=\"tp_pub_link\" href=\"https:\/\/github.com\/yining043\/VRP-DACT\" target=\"_blank\">https:\/\/github.com\/yining043\/VRP-DACT<\/a><\/p>
<div class=\"tp_abstract\"><div class=\"tp_abstract_entry\">Recently, Transformer has become a prevailing deep architecture for solving vehicle routing problems (VRPs). However, the original Transformer is less effective in learning improvement models because its positional encoding (PE) method is not suitable for representing VRP solutions. This paper presents a novel Dual-Aspect Collaborative Transformer (DACT) to learn embeddings for the node and positional features separately, instead of fusing them together as done in the original PE, so as to avoid potential noise and incompatible attention scores. Moreover, the positional features are embedded through a novel cyclic positional encoding (CPE) method to capture the circularity and symmetry of VRP solutions. We train DACT using Proximal Policy Optimization, and design a curriculum learning strategy for better sample efficiency. We apply DACT to solve the traveling salesman problem (TSP) and the capacitated vehicle routing problem (CVRP). Results show that DACT outperforms existing Transformer-based improvement models, and exhibits a better capability of generalizing across different problem sizes. Code is available at https:\/\/github.com\/yining043\/VRP-DACT<\/div><\/div>
<\/div><\/div>
<div class=\"tp_publication tp_publication_article\"><div class=\"tp_pub_info\">
<p class=\"tp_pub_author\">Le Zhang; Zenglin Shi; Ming-Ming Cheng; Yun Liu; Jia-Wang Bian; Joey Tianyi Zhou; Guoyan Zheng; Zeng Zeng<\/p>
<p class=\"tp_pub_title\">Nonlinear Regression via Deep Negative Correlation Learning <span class=\"tp_pub_type article\">Journal Article<\/span><\/p>
<p class=\"tp_pub_additional\">In: IEEE TPAMI, 43 (3), pp. 982-998, 2021.<\/p>
<p class=\"tp_pub_menu\">Links: <a class=\"tp_pub_link\" href=\"https:\/\/mmcheng.net\/dncl\/\" target=\"_blank\">https:\/\/mmcheng.net\/dncl\/<\/a> | <a class=\"tp_pub_link\" href=\"https:\/\/dx.doi.org\/10.1109\/TPAMI.2019.2943860\" target=\"_blank\">DOI: 10.1109\/TPAMI.2019.2943860<\/a><\/p>
<div class=\"tp_abstract\"><div class=\"tp_abstract_entry\">Nonlinear regression has been extensively employed in many computer vision problems (e.g., crowd counting, age estimation, affective computing). Under the umbrella of deep learning, two common solutions exist: i) transforming nonlinear regression into a robust loss function that is jointly optimizable with the deep convolutional network, and ii) utilizing an ensemble of deep networks. Although some improved performance is achieved, the former may be lacking due to the intrinsic limitation of choosing a single hypothesis, and the latter may suffer from much larger computational complexity. To cope with these issues, we propose to regress in an efficient “divide and conquer” manner. The core of our approach is a generalization of negative correlation learning that has been shown, both theoretically and empirically, to work well for non-deep regression problems. Without extra parameters, the proposed method controls the bias-variance-covariance trade-off systematically and usually yields a deep regression ensemble in which each base model is both “accurate” and “diversified”. Moreover, we show that each sub-problem in the proposed method has lower Rademacher complexity and is thus easier to optimize. Extensive experiments on several diverse and challenging tasks, including crowd counting, personality analysis, age estimation, and image super-resolution, demonstrate the superiority over challenging baselines as well as the versatility of the proposed method. The source code and trained models are available on our project page: https:\/\/mmcheng.net\/dncl\/.<\/div><\/div>
<\/div><\/div>
<div class=\"tp_publication tp_publication_proceedings\"><div class=\"tp_pub_info\">
<p class=\"tp_pub_author\">Wanyue Zhang; Xun Xu; Fayao Liu; Le Zhang; Chuan-Sheng Foo<\/p>
<p class=\"tp_pub_title\">On Automatic Data Augmentation for 3D Point Cloud Classification <span class=\"tp_pub_type proceedings\">Proceeding<\/span><\/p>
<p class=\"tp_pub_additional\">BMVC, 2021.<\/p>
<\/div><\/div>
<div class=\"tp_publication tp_publication_article\"><div class=\"tp_pub_info\">
<p class=\"tp_pub_author\">Yu-Huan Wu; Yun Liu; Le Zhang; Wang Gao; Ming-Ming Cheng<\/p>
<p class=\"tp_pub_title\">Regularized Densely-Connected Pyramid Network for Salient Instance Segmentation <span class=\"tp_pub_type article\">Journal Article<\/span><\/p>
<p class=\"tp_pub_additional\">In: IEEE TIP, 30, pp. 3897-3907, 2021.<\/p>
<p class=\"tp_pub_menu\">Links: <a class=\"tp_pub_link\" href=\"https:\/\/github.com\/yuhuan-wu\/RDPNet\" target=\"_blank\">https:\/\/github.com\/yuhuan-wu\/RDPNet<\/a> | <a class=\"tp_pub_link\" href=\"https:\/\/dx.doi.org\/10.1109\/TIP.2021.3065822\" target=\"_blank\">DOI: 10.1109\/TIP.2021.3065822<\/a><\/p>
<div class=\"tp_abstract\"><div class=\"tp_abstract_entry\">Much of the recent effort on salient object detection (SOD) has been devoted to producing accurate saliency maps without being aware of their instance labels. To this end, we propose a new pipeline for end-to-end salient instance segmentation (SIS) that predicts a class-agnostic mask for each detected salient instance. To better use the rich feature hierarchies in deep networks and enhance the side predictions, we propose regularized dense connections, which attentively promote informative features and suppress non-informative ones from all feature pyramids. A novel multi-level RoIAlign-based decoder is introduced to adaptively aggregate multi-level features for better mask predictions. Such strategies can be well encapsulated into the Mask R-CNN pipeline. Extensive experiments on popular benchmarks demonstrate that our design significantly outperforms existing state-of-the-art competitors by 6.3% (58.6% vs. 52.3%) in terms of the AP metric. The code is available at https:\/\/github.com\/yuhuan-wu\/RDPNet.<\/div><\/div>
<\/div><\/div>
<div class=\"tp_publication tp_publication_inproceedings\"><div class=\"tp_pub_info\">
<p class=\"tp_pub_author\">Wei Wang; Wei Cui; Bing Li; Min Wu<\/p>
<p class=\"tp_pub_title\">Two-Stream Convolution Augmented Transformer for Human Activity Recognition <span class=\"tp_pub_type inproceedings\">Inproceedings<\/span><\/p>
<p class=\"tp_pub_additional\">In: AAAI, 2021.<\/p>
<p class=\"tp_pub_menu\">Links: <a class=\"tp_pub_link\" href=\"https:\/\/github.com\/windofshadow\/THAT\" target=\"_blank\">https:\/\/github.com\/windofshadow\/THAT<\/a><\/p>
<div class=\"tp_abstract\"><div class=\"tp_abstract_entry\">Recognition of human activities is an important task due to its far-reaching applications such as healthcare systems, context-aware applications, and security monitoring. Recently, WiFi-based human activity recognition (HAR) has become ubiquitous thanks to its non-invasiveness. Existing WiFi-based HAR methods regard WiFi signals as a temporal sequence of channel state information (CSI), and employ deep sequential models (e.g., RNN, LSTM) to automatically capture channel-over-time features. Although remarkably effective, they suffer from two major drawbacks. First, the granularity of a single temporal point is too elementary to represent meaningful CSI patterns. Second, the time-over-channel features are also important, and can serve as a natural form of data augmentation. To address these drawbacks, we propose a novel Two-stream Convolution Augmented Human Activity Transformer (THAT) model. Our model utilizes a two-stream structure to capture both time-over-channel and channel-over-time features, and uses a multi-scale convolution augmented transformer to capture range-based patterns. Extensive experiments on four real-world experimental datasets demonstrate that our model outperforms state-of-the-art models in terms of both effectiveness and efficiency.<\/div><\/div>
<\/div><\/div>
<div class=\"tp_publication tp_publication_article\"><div class=\"tp_pub_info\">
<p class=\"tp_pub_author\">Jia-Wang Bian; Huangying Zhan; Naiyan Wang; Zhichao Li; Le Zhang; Chunhua Shen; Ming-Ming Cheng; Ian Reid<\/p>
<p class=\"tp_pub_title\">Unsupervised Scale-consistent Depth Learning from Video <span class=\"tp_pub_type article\">Journal Article<\/span><\/p>
<p class=\"tp_pub_additional\">In: IJCV, 2021.<\/p>
<p class=\"tp_pub_menu\">Links: <a class=\"tp_pub_link\" href=\"https:\/\/github.com\/JiawangBian\/SC-SfMLearner-Release\" target=\"_blank\">https:\/\/github.com\/JiawangBian\/SC-SfMLearner-Release<\/a><\/p>
<div class=\"tp_abstract\"><div class=\"tp_abstract_entry\">We propose a monocular depth estimation method, SC-Depth, which requires only unlabelled videos for training and enables scale-consistent prediction at inference time. Our contributions include: (i) we propose a geometry consistency loss, which penalizes the inconsistency of predicted depths between adjacent views; (ii) we propose a self-discovered mask to automatically localize moving objects that violate the underlying static-scene assumption and cause noisy signals during training; (iii) we demonstrate the efficacy of each component with a detailed ablation study and show high-quality depth estimation results on both the KITTI and NYUv2 datasets. Moreover, thanks to the capability of scale-consistent prediction, we show that our monocular-trained deep networks are readily integrated into the ORB-SLAM2 system for more robust and accurate tracking. The proposed hybrid Pseudo-RGBD SLAM shows compelling results on KITTI, and it generalizes well to the KAIST dataset without additional training. Finally, we provide several demos for qualitative evaluation. The source code is released on GitHub.<\/div><\/div>
<\/div><\/div>
<div class=\"tp_publication tp_publication_article\"><div class=\"tp_pub_info\">
<p class=\"tp_pub_author\">Yun Liu; Xin-Yu Zhang; Jia-Wang Bian; Le Zhang; Ming-Ming Cheng<\/p>
<p class=\"tp_pub_title\">SAMNet: Stereoscopically Attentive Multi-Scale Network for Lightweight Salient Object Detection <span class=\"tp_pub_type article\">Journal Article<\/span><\/p>
<p class=\"tp_pub_additional\">In: IEEE TIP, 30, pp. 3804-3814, 2021.<\/p>
<p class=\"tp_pub_menu\">Links: <a class=\"tp_pub_link\" href=\"https:\/\/mmcheng.net\/SAMNet\/\" target=\"_blank\">https:\/\/mmcheng.net\/SAMNet\/<\/a><\/p>
<div class=\"tp_abstract\"><div class=\"tp_abstract_entry\">Recent progress on salient object detection (SOD) mostly benefits from the explosive development of Convolutional Neural Networks (CNNs). However, much of the improvement comes with larger network size and heavier computation overhead, which, in our view, is not mobile-friendly and thus difficult to deploy in practice. To promote more practical SOD systems, we introduce a novel Stereoscopically Attentive Multi-scale (SAM) module, which adopts a stereoscopic attention mechanism to adaptively fuse features of various scales. Embarking on this module, we propose an extremely lightweight network, namely SAMNet, for SOD. Extensive experiments on popular benchmarks demonstrate that the proposed SAMNet yields comparable accuracy with state-of-the-art methods while running at a GPU speed of 343 fps and a CPU speed of 5 fps for 336×336 inputs with only 1.33M parameters. Therefore, SAMNet paves a new path towards SOD. The source code is available on the project page https:\/\/mmcheng.net\/SAMNet\/<\/div><\/div>
<\/div><\/div>
<div class=\"tp_publication tp_publication_article\"><div class=\"tp_pub_info\">
<p class=\"tp_pub_author\">Joey Tianyi Zhou; Le Zhang*; Jiawei Du; Xi Peng; Zhiwen Fang; Zhe Xiao; Hongyuan Zhu<\/p>
<p class=\"tp_pub_title\">Locality-Aware Crowd Counting <span class=\"tp_pub_type article\">Journal Article<\/span><\/p>
<p class=\"tp_pub_additional\">In: IEEE TPAMI, 2021.<\/p>
<\/div><\/div>
<h3 class=\"tp_h3\" id=\"tp_h3_2020\">2020<\/h3>
<div class=\"tp_publication tp_publication_article\"><div class=\"tp_pub_info\">
<p class=\"tp_pub_author\">Le Zhang; Zhenghua Chen; Wei Cui; Bing Li; Cen Chen; Zhiguang Cao; Kaizhou Gao<\/p>
<p class=\"tp_pub_title\">WiFi-based indoor robot positioning using deep fuzzy forests <span class=\"tp_pub_type article\">Journal Article<\/span><\/p>
<p class=\"tp_pub_additional\">In: IEEE Internet of Things Journal, 7 (11), pp. 10773-10781, 2020.<\/p>
<\/div><\/div>
@article{zhang2020wifi,
  title     = {WiFi-Based Indoor Robot Positioning Using Deep Fuzzy Forests},
  author    = {Le Zhang and Zhenghua Chen and Wei Cui and Bing Li and Cen Chen and Zhiguang Cao and Kaizhou Gao},
  journal   = {IEEE Internet of Things Journal},
  volume    = {7},
  number    = {11},
  pages     = {10773--10781},
  year      = {2020},
  publisher = {IEEE}
}
@article{cao2020using,
  title     = {Using Reinforcement Learning to Minimize the Probability of Delay Occurrence in Transportation},
  author    = {Zhiguang Cao and Hongliang Guo and Wen Song and Kaizhou Gao and Zhenghua Chen and Le Zhang and Xuexi Zhang},
  journal   = {IEEE Transactions on Vehicular Technology},
  volume    = {69},
  number    = {3},
  pages     = {2424--2436},
  year      = {2020},
  publisher = {IEEE}
}

@article{chen2020exploring,
  title     = {Exploring Structural Knowledge for Automated Visual Inspection of Moving Trains},
  author    = {Cen Chen and Xiaofeng Zou and Zeng Zeng and Zhongyao Cheng and Le Zhang and Steven C. H. Hoi},
  journal   = {IEEE Transactions on Cybernetics},
  year      = {2020},
  publisher = {IEEE}
}
onclick=\"teachpress_pub_showhide('21','tp_abstract')\" title=\"Show abstract\" style=\"cursor:pointer;\">Abstract<\/a><\/span> | <span class=\"tp_bibtex_link\"><a id=\"tp_bibtex_sh_21\" class=\"tp_show\" onclick=\"teachpress_pub_showhide('21','tp_bibtex')\" title=\"Show BibTeX entry\" style=\"cursor:pointer;\">BibTeX<\/a><\/span> | <span class=\"tp_pub_tags_label\">\u6807\u7b7e: <\/span><span class=\"tp_resource_link\"> | <span class=\"tp_pub_links_label\">Links: <\/span><a class=\"tp_pub_link\" href=\"https:\/\/github.com\/JiawangBian\/GMS-Feature-Matcher\" title=\"https:\/\/github.com\/JiawangBian\/GMS-Feature-Matcher\" target=\"_blank\"><i class=\"fab fa-github\"><\/i><\/a><\/span><\/p><div class=\"tp_bibtex\" id=\"tp_bibtex_21\" style=\"display:none;\"><div class=\"tp_bibtex_entry\"><pre>@article{Bian2020gms,<br \/>\r\ntitle = {GMS: Grid-based Motion Statistics for Fast, Ultra-Robust Feature Correspondence},<br \/>\r\nauthor = {JiaWang Bian and Wen-Yan Lin and Yun Liu and Le Zhang and Sai-Kit Yeung and Ming-Ming Cheng and Ian Reid},<br \/>\r\nurl = {https:\/\/github.com\/JiawangBian\/GMS-Feature-Matcher},<br \/>\r\nyear  = {2020},<br \/>\r\ndate = {2020-01-01},<br \/>\r\nurldate = {2020-01-01},<br \/>\r\njournal = {IJCV},<br \/>\r\nabstract = {Feature matching aims at generating correspondences across images, which is widely used in many computer vision tasks. Although considerable progress has been made on feature descriptors and fast matching for initial correspondence hypotheses, selecting good ones from them is still challenging and critical to the overall performance. More importantly, existing methods often take a long computational time, limiting their use in real-time applications. This paper attempts to separate true correspondences from false ones at high speed. We term the proposed method (GMS) grid-based motion Statistics, which incorporates the smoothness constraint into a statistic framework for separation and uses a grid-based implementation for fast calculation. GMS is robust to various challenging image changes, involving in viewpoint, scale, and rotation. It is also fast, e.g., take only 1 or 2 ms in a single CPU thread, even when 50K correspondences are processed. This has important implications for real-time applications. What\u2019s more, we show that incorporating GMS into the classic feature matching and epipolar geometry estimation pipeline can significantly boost the overall performance. Finally, we integrate GMS into the well-known ORB-SLAM system for monocular initialization, resulting in a significant improvement.},<br \/>\r\nkeywords = {},<br \/>\r\npubstate = {published},<br \/>\r\ntppubtype = {article}<br \/>\r\n}<br \/>\r\n<\/pre><\/div><p class=\"tp_close_menu\"><a class=\"tp_close\" onclick=\"teachpress_pub_showhide('21','tp_bibtex')\">Close<\/a><\/p><\/div><div class=\"tp_abstract\" id=\"tp_abstract_21\" style=\"display:none;\"><div class=\"tp_abstract_entry\">Feature matching aims at generating correspondences across images, which is widely used in many computer vision tasks. Although considerable progress has been made on feature descriptors and fast matching for initial correspondence hypotheses, selecting good ones from them is still challenging and critical to the overall performance. More importantly, existing methods often take a long computational time, limiting their use in real-time applications. This paper attempts to separate true correspondences from false ones at high speed. 
62 entries in total; this page shows 1 of 2.