publications (ja)
2025
- ASJEnd-to-End Neural Speaker Diarization Using State Space Model MambaNaohiro Tawara, Alexis Plaquet, Marc Delcroix, Shota Horiguchi, Atsushi Ando, and Shoko ArakiIn The 2025 Spring Meeting of the Acoustical Society of Japan, Mar 2025
@inproceedings{tawara2025joutai,
  author    = {Tawara, Naohiro and Plaquet, Alexis and Delcroix, Marc and Horiguchi, Shota and Ando, Atsushi and Araki, Shoko},
  title     = {End-to-End Neural Speaker Diarization Using State Space Model {Mamba}},
  booktitle = {The 2025 Spring Meeting of the Acoustical Society of Japan},
  year      = {2025},
  month     = mar,
  date      = {19},
  location  = {Saitama},
}
- ASJMulti-Channel Speaker Counting for End-to-End Neural Speaker DiarizationNaohiro Tawara, Atsushi Ando, Shota Horiguchi, and Marc DelcroixIn The 2025 Spring Meeting of the Acoustical Society of Japan, Mar 2025
@inproceedings{tawara2025end,
  author    = {Tawara, Naohiro and Ando, Atsushi and Horiguchi, Shota and Delcroix, Marc},
  title     = {Multi-Channel Speaker Counting for End-to-End Neural Speaker Diarization},
  booktitle = {The 2025 Spring Meeting of the Acoustical Society of Japan},
  year      = {2025},
  month     = mar,
  date      = {17},
  location  = {Saitama},
}
- SPAn Analysis of Speaker Representation for Target-Speaker Speech ProcessingTakanori Ashihara, Takafumi Moriya, Shota Horiguchi, Junyi Peng, Tsubasa Ochiai, Marc Delcroix, Kohei Matsuura, and Hiroshi SatoIn IPSJ Special Interest Group on Spoken Language Processing (IPSJ-SLP), Mar 2025
Target-speaker (TS) speech processing tasks, including TS automatic speech recognition (TS-ASR), target speech extraction (TSE), and personal voice activity detection (p-VAD), extract desired speaker information in multi-speaker scenarios. While prior work emphasizes task-specific training schemes or architectures, the embedding network for TS cues remains underexplored across tasks. This study investigates the optimal speaker embedding for TS tasks by comparing pre-trained speaker encoders (self-supervised or speaker recognition models) with one-hot ideal embeddings derived from speaker identity. Additionally, embeddings are subsequently optimized using gradient-based approaches. Our analysis reveals that speaker verification performance is somewhat unrelated to TS task performances, the one-hot vector outperforms enrollment-based ones, and the optimal embedding depends on the input mixture.
@inproceedings{ashihara2025mokuteki,
  author    = {Ashihara, Takanori and Moriya, Takafumi and Horiguchi, Shota and Peng, Junyi and Ochiai, Tsubasa and Delcroix, Marc and Matsuura, Kohei and Sato, Hiroshi},
  title     = {An Analysis of Speaker Representation for Target-Speaker Speech Processing},
  booktitle = {IPSJ Special Interest Group on Spoken Language Processing (IPSJ-SLP)},
  year      = {2025},
  month     = mar,
  date      = {4},
  location  = {Okinawa},
}
- IPSJ-SLPSpeaking Style Captioning Using Speech Factor ConditioningAtsushi Ando, Takafumi Moriya, Shota Horiguchi, and Ryo MasumuraIn IPSJ Special Interest Group on Spoken Language Processing (IPSJ-SLP), Mar 2025
This work presents a novel speaking-style captioning method that generates diverse descriptions while accurately including speaking-style information such as gender, pitch, and volume. Conventional methods rely on original captions, which contain not only speaking-style-related terms but also syntactic words, making it difficult to learn speaking-style characteristics from speech and often resulting in incorrect captions. To address this problem, the proposed method introduces factor-conditioned captioning (FCC), which first outputs a factor phrase representing speaking-style information and then generates a caption to ensure the model explicitly learns speaking-style factors. Additionally, we propose greedy-then-sampling (GtS) decoding, which first predicts speaking-style factors deterministically to guarantee semantic accuracy and then generates a caption based on factor-conditioned sampling to ensure diversity. Experiments show that the proposed method generates more diverse captions while improving style prediction performance compared to conventional methods.
@inproceedings{ando2025mokuteki,
  author    = {Ando, Atsushi and Moriya, Takafumi and Horiguchi, Shota and Masumura, Ryo},
  title     = {Speaking Style Captioning Using Speech Factor Conditioning},
  booktitle = {IPSJ Special Interest Group on Spoken Language Processing (IPSJ-SLP)},
  year      = {2025},
  month     = mar,
  date      = {4},
  location  = {Okinawa},
}
- SPSpeech-Activity-Guided Speaker Embedding ExtractionShota Horiguchi, Takafumi Moriya, Atsushi Ando, Takanori Ashihara, Hiroshi Sato, Naohiro Tawara, and Marc DelcroixIn IEICE Technical Committee on Speech (SP), Mar 2025
This paper proposes a guided speaker embedding extraction system, which extracts speaker embeddings of the target speaker using speech activities of target and interference speakers as clues. Several methods for long-form overlapped multi-speaker audio processing are typically two-staged: i) segment-level processing and ii) inter-segment speaker matching. Speaker embeddings are often used for the latter purpose. Typical speaker embedding extraction approaches only use single-speaker intervals to avoid corrupting the embeddings with speech from interference speakers. However, this often makes speaker embeddings impossible to extract because sufficiently long non-overlapping intervals are not always available. In this paper, we propose using speaker activities as clues to extract the embedding of the speaker-of-interest directly from overlapping speech. Specifically, we concatenate the activity of target and non-target speakers to acoustic features before being fed to the model. We also condition the attention weights used for pooling so that the attention weights of the intervals in which the target speaker is inactive are zero. The effectiveness of the proposed method is demonstrated in speaker verification and diarization.
@inproceedings{horiguchi2025hatsuwa,
  author    = {Horiguchi, Shota and Moriya, Takafumi and Ando, Atsushi and Ashihara, Takanori and Sato, Hiroshi and Tawara, Naohiro and Delcroix, Marc},
  title     = {Speech-Activity-Guided Speaker Embedding Extraction},
  booktitle = {IEICE Technical Committee on Speech (SP)},
  year      = {2025},
  month     = mar,
  date      = {2},
  location  = {Okinawa},
}
2024
- ASJImproving the Naturalness of Simulated Conversations for End-to-End Neural DiarizationNatsuo Yamashita, Shota Horiguchi, and Takeshi HommaIn The 2024 Autumn Meeting of the Acoustical Society of Japan, Sep 2024
@inproceedings{yamashita2024endtoend,
  author    = {Yamashita, Natsuo and Horiguchi, Shota and Homma, Takeshi},
  title     = {Improving the Naturalness of Simulated Conversations for End-to-End Neural Diarization},
  booktitle = {The 2024 Autumn Meeting of the Acoustical Society of Japan},
  year      = {2024},
  month     = sep,
  date      = {4},
  pages     = {1091--1094},
  location  = {Osaka},
}
- ASJSpeaker Embedding Extraction from Multi-Speaker RecordingsShota Horiguchi, Atsushi Ando, Takafumi Moriya, Takanori Ashihara, Hiroshi Sato, Naohiro Tawara, and Marc DelcroixIn The 2024 Autumn Meeting of the Acoustical Society of Japan, Sep 2024
@inproceedings{horiguchi2024fukusuu,
  author    = {Horiguchi, Shota and Ando, Atsushi and Moriya, Takafumi and Ashihara, Takanori and Sato, Hiroshi and Tawara, Naohiro and Delcroix, Marc},
  title     = {Speaker Embedding Extraction from Multi-Speaker Recordings},
  booktitle = {The 2024 Autumn Meeting of the Acoustical Society of Japan},
  year      = {2024},
  month     = sep,
  date      = {4},
  pages     = {1087--1090},
  location  = {Osaka},
}
2023
- IBISStreaming Active Learning for Regression Problems Using Regression via ClassificationShota Horiguchi, Kota Dohi, and Yohei KawaguchiIn The 26th Information-Based Induction Sciences Workshop, Oct 2023
@inproceedings{horiguchi2023regression,
  author    = {Horiguchi, Shota and Dohi, Kota and Kawaguchi, Yohei},
  title     = {Streaming Active Learning for Regression Problems Using Regression via Classification},
  booktitle = {The 26th Information-Based Induction Sciences Workshop},
  year      = {2023},
  month     = oct,
  date      = {30},
  location  = {Fukuoka},
}
- ASJRisk Assessment of Spoofing Attacks Using Self-Supervised Learning Models on Speaker Verification SystemsAoi Ito and Shota HoriguchiIn The 2023 Autumn Meeting of the Acoustical Society of Japan, Sep 2023🏆 ASJ Best Student Presentation Award
@inproceedings{ito2023jiko,
  author    = {Ito, Aoi and Horiguchi, Shota},
  title     = {Risk Assessment of Spoofing Attacks Using Self-Supervised Learning Models on Speaker Verification Systems},
  booktitle = {The 2023 Autumn Meeting of the Acoustical Society of Japan},
  year      = {2023},
  month     = sep,
  date      = {29},
  pages     = {1151--1154},
  location  = {Aichi},
}
- ASJBlock-Online Speaker Diarization for Unlimited Numbers of SpeakersShota Horiguchi, Shinji Watanabe, Paola Garcia, Yuki Takashima, and Yohei KawaguchiIn The 2023 Autumn Meeting of the Acoustical Society of Japan, Sep 2023
@inproceedings{horiguchi2023washasu,
  author    = {Horiguchi, Shota and Watanabe, Shinji and Garcia, Paola and Takashima, Yuki and Kawaguchi, Yohei},
  title     = {Block-Online Speaker Diarization for Unlimited Numbers of Speakers},
  booktitle = {The 2023 Autumn Meeting of the Acoustical Society of Japan},
  year      = {2023},
  month     = sep,
  date      = {28},
  pages     = {1239--1242},
  location  = {Aichi},
}
- ASJMutual Learning of Single- and Multi-Channel Speaker Diarization ModelsShota Horiguchi, Yuki Takashima, Shinji Watanabe, and Paola GarciaIn The 2023 Spring Meeting of the Acoustical Society of Japan, Mar 2023
@inproceedings{horiguchi2023mutual,
  author    = {Horiguchi, Shota and Takashima, Yuki and Watanabe, Shinji and Garcia, Paola},
  title     = {Mutual Learning of Single- and Multi-Channel Speaker Diarization Models},
  booktitle = {The 2023 Spring Meeting of the Acoustical Society of Japan},
  year      = {2023},
  month     = mar,
  date      = {17},
  pages     = {745--748},
  location  = {Online},
}
2022
- ASJPreventing Catastrophic Forgetting by Partial Fine-Tuning for Continual Learning of End-to-End ASRYuki Takashima, Shota Horiguchi, Shinji Watanabe, Paola Garcia, and Yohei KawaguchiIn The 2022 Autumn Meeting of the Acoustical Society of Japan, Sep 2022
@inproceedings{takashima2022endtoend,
  author    = {Takashima, Yuki and Horiguchi, Shota and Watanabe, Shinji and Garcia, Paola and Kawaguchi, Yohei},
  title     = {Preventing Catastrophic Forgetting by Partial Fine-Tuning for Continual Learning of End-to-End {ASR}},
  booktitle = {The 2022 Autumn Meeting of the Acoustical Society of Japan},
  year      = {2022},
  month     = sep,
  date      = {16},
  pages     = {1309--1312},
  location  = {Hokkaido},
}
- ASJMulti-Channel Neural Diarization Using Distributed MicrophonesShota Horiguchi, Yuki Takashima, Paola Garcia, Shinji Watanabe, and Yohei KawaguchiIn The 2022 Autumn Meeting of the Acoustical Society of Japan, Sep 2022
@inproceedings{horiguchi2022multichannel,
  author    = {Horiguchi, Shota and Takashima, Yuki and Garcia, Paola and Watanabe, Shinji and Kawaguchi, Yohei},
  title     = {Multi-Channel Neural Diarization Using Distributed Microphones},
  booktitle = {The 2022 Autumn Meeting of the Acoustical Society of Japan},
  year      = {2022},
  month     = sep,
  date      = {15},
  pages     = {1261--1264},
  location  = {Hokkaido},
}
- ASJEnvironmental Sound Extraction Using OnomatopoeiaYuki Okamoto, Shota Horiguchi, Masaaki Yamamoto, Keisuke Imoto, and Yohei KawaguchiIn The 2022 Spring Meeting of the Acoustical Society of Japan, Mar 2022
@inproceedings{okamoto2022onomatopoe,
  author    = {Okamoto, Yuki and Horiguchi, Shota and Yamamoto, Masaaki and Imoto, Keisuke and Kawaguchi, Yohei},
  title     = {Environmental Sound Extraction Using Onomatopoeia},
  booktitle = {The 2022 Spring Meeting of the Acoustical Society of Japan},
  year      = {2022},
  month     = mar,
  date      = {10},
  pages     = {247--250},
  location  = {Online},
}
- ASJSemi-Supervised Adaptation with Pseudo-Labeling for End-to-End Neural DiarizationYuki Takashima, Yusuke Fujita, Shota Horiguchi, Shinji Watanabe, Paola Garcia, and Kenji NagamatsuIn The 2022 Spring Meeting of the Acoustical Society of Japan, Mar 2022
@inproceedings{takashima2022pseudolabel,
  author    = {Takashima, Yuki and Fujita, Yusuke and Horiguchi, Shota and Watanabe, Shinji and Garcia, Paola and Nagamatsu, Kenji},
  title     = {Semi-Supervised Adaptation with Pseudo-Labeling for End-to-End Neural Diarization},
  booktitle = {The 2022 Spring Meeting of the Acoustical Society of Japan},
  year      = {2022},
  month     = mar,
  date      = {9},
  pages     = {919--922},
  location  = {Online},
}
- ASJNeural Diarization for Unlimited Number of Speakers Using Global and Local AttractorsShota Horiguchi, Shinji Watanabe, Paola Garcia, Yawen Xue, Yuki Takashima, and Yohei KawaguchiIn The 2022 Spring Meeting of the Acoustical Society of Japan, Mar 2022
@inproceedings{horiguchi2022global,
  author    = {Horiguchi, Shota and Watanabe, Shinji and Garcia, Paola and Xue, Yawen and Takashima, Yuki and Kawaguchi, Yohei},
  title     = {Neural Diarization for Unlimited Number of Speakers Using Global and Local Attractors},
  booktitle = {The 2022 Spring Meeting of the Acoustical Society of Japan},
  year      = {2022},
  month     = mar,
  date      = {9},
  pages     = {915--918},
  location  = {Online},
}
2017
- MEA Comparative Evaluation of Deep Features—Classifier-Based Learning vs. Distance Metric LearningShota Horiguchi, Daiki Ikami, and Kiyoharu AizawaIn ITE Technical Group on Media Engineering (ME), Feb 2017🏆 ITE Outstanding Research Presentation Award
@inproceedings{horiguchi2017deepfeature,
  author    = {Horiguchi, Shota and Ikami, Daiki and Aizawa, Kiyoharu},
  title     = {A Comparative Evaluation of Deep Features---Classifier-Based Learning vs. Distance Metric Learning},
  booktitle = {ITE Technical Group on Media Engineering (ME)},
  year      = {2017},
  month     = feb,
  date      = {21},
  pages     = {197--202},
  location  = {Hokkaido},
}
- PRMUIncremental Personalization of Image ClassifiersShota Horiguchi, Sosuke Amano, Kiyoharu Aizawa, and Makoto OgawaIn IEICE Technical Committee on Pattern Recognition and Media Understanding (PRMU), Feb 2017
@inproceedings{horiguchi2017personalization,
  author    = {Horiguchi, Shota and Amano, Sosuke and Aizawa, Kiyoharu and Ogawa, Makoto},
  title     = {Incremental Personalization of Image Classifiers},
  booktitle = {IEICE Technical Committee on Pattern Recognition and Media Understanding (PRMU)},
  year      = {2017},
  month     = feb,
  date      = {19},
  pages     = {149--154},
  location  = {Hokkaido},
}
2016
- MIRULarge Scale Meal Image Recognition via Personalized ClassifiersShota Horiguchi, Sosuke Amano, Kiyoharu Aizawa, and Makoto OgawaIn The 19th Meeting on Image Recognition and Understanding (MIRU), Aug 2016
@inproceedings{horiguchi2016personalization,
  author    = {Horiguchi, Shota and Amano, Sosuke and Aizawa, Kiyoharu and Ogawa, Makoto},
  title     = {Large Scale Meal Image Recognition via Personalized Classifiers},
  booktitle = {The 19th Meeting on Image Recognition and Understanding (MIRU)},
  year      = {2016},
  month     = aug,
  date      = {3},
  location  = {Osaka},
}
2015
- ITEA Discussion of Novelty Detection in Image RecognitionMichihiro Mizuno, Akito Takeki, Shota Horiguchi, Toshihiko Yamasaki, and Kiyoharu AizawaIn The ITE Winter Annual Convention, Dec 2015🏆 ITE Outstanding Student Presentation Award
We present a new method of novelty detection in image recognition based on convolutional neural network (CNN). We use Sigmoid Layer as the last layer of a CNN instead of Softmax Layer. As a result, we discovered that a CNN with Sigmoid Layer can detect novelties in an easy dataset better than that with Softmax Layer, but worse in a difficult dataset.
@inproceedings{mizuno2015unknown,
  author    = {Mizuno, Michihiro and Takeki, Akito and Horiguchi, Shota and Yamasaki, Toshihiko and Aizawa, Kiyoharu},
  title     = {A Discussion of Novelty Detection in Image Recognition},
  booktitle = {The ITE Winter Annual Convention},
  year      = {2015},
  month     = dec,
  date      = {21},
  location  = {Tokyo},
}
- DEA Study on Hierarchical Food ClassificationHokuto Kagaya, Shota Horiguchi, Sosuke Amano, and Kiyoharu AizawaIn IEICE Technical Committee on Data Engineering (DE), Sep 2015
Automatic food recognition or classification is a very challenging task. One of the reasons is that the number of food items is enormous, so we cannot easily choose a single label for each food image. To solve this problem, we have studied hierarchical food classification. In this paper, we investigate the benefits of introducing hierarchy to classification and try to build a hierarchy from the words of food names automatically. As a result, we observed the difference between data used for training.
@inproceedings{kagaya2015hierarchical,
  author    = {Kagaya, Hokuto and Horiguchi, Shota and Amano, Sosuke and Aizawa, Kiyoharu},
  title     = {A Study on Hierarchical Food Classification},
  booktitle = {IEICE Technical Committee on Data Engineering (DE)},
  year      = {2015},
  month     = sep,
  date      = {25},
  pages     = {59--64},
  location  = {Kanagawa},
}
- MIRUSelective Removal of Object Window Hypothesis Using GrabCutShota Horiguchi, Kiyoharu Aizawa, and Makoto OgawaIn The 18th Meeting on Image Recognition and Understanding (MIRU), Jul 2015
@inproceedings{horiguchi2015grabcut,
  author    = {Horiguchi, Shota and Aizawa, Kiyoharu and Ogawa, Makoto},
  title     = {Selective Removal of Object Window Hypothesis Using {GrabCut}},
  booktitle = {The 18th Meeting on Image Recognition and Understanding (MIRU)},
  year      = {2015},
  month     = jul,
  date      = {29},
  location  = {Osaka},
}
- PRMULog-Normal Distribution of Objects’ Size in Images and Its Applications to Object Detection—Comparing General Images and Food ImagesShota Horiguchi, Kiyoharu Aizawa, and Makoto OgawaIn IEICE Technical Committee on Pattern Recognition and Media Understanding (PRMU), Mar 2015
When detecting objects in images by classifying many location hypotheses, it is necessary to define the aspect ratio and scale of the detection window in advance. In this paper, we construct a model of the size distribution of objects in images and reveal that the size of foods in images taken for dietary recording follows a log-normal distribution. We apply this characteristic to define the parameters of selective search, a method of generating object hypotheses, which results in a high Mean Average Best Overlap despite a small number of object hypotheses when using the FoodLog image dataset.
@inproceedings{horiguchi2015lognormal,
  author    = {Horiguchi, Shota and Aizawa, Kiyoharu and Ogawa, Makoto},
  title     = {Log-Normal Distribution of Objects' Size in Images and Its Applications to Object Detection---Comparing General Images and Food Images},
  booktitle = {IEICE Technical Committee on Pattern Recognition and Media Understanding (PRMU)},
  year      = {2015},
  month     = mar,
  date      = {19},
  pages     = {135--140},
  location  = {Kanagawa},
}