CSC212 Splitting text into words while stripping punctuation marks

From dftwiki3
Revision as of 07:51, 2 November 2014 by Thiebaut (talk | contribs) (Output)
Jump to: navigation, search

--D. Thiebaut (talk) 07:49, 2 November 2014 (EST)




This is a quick introduction illustrating how to use a regular expression to split text into words in Java.


Source Code


/**
 * Small demonstration of String.replaceAll() and String.split() with regular
 * expressions. Each step prints a label followed by the transformed text, and
 * the last step splits a punctuated sentence into lowercase words.
 */
public class RegExpTester {

	/**
	 * Runs every demonstration step in order and prints the results to
	 * standard output.
	 *
	 * @param args command-line arguments (not used)
	 */
	public static void main(String[] args) {

		String text = "The\tquick red fox jumped over\nthe lazy brown sleeping dog.";
		System.out.println( "===> text = " + text );
		
		// replace a single letter
		String newText = text.replaceAll( "i",  "I" );
		System.out.println( "\n\nreplace i with I: " + newText );

		// replace a literal string (case sensitive: "The" is left alone)
		newText = text.replaceAll( "the",  "a" );
		System.out.println( "\n\nreplace \"the\" with \"a\":\n" + newText );

		// replace "the" or "The": [Tt] matches either case of the first letter
		newText = text.replaceAll( "[Tt]he", "a" );
		System.out.println( "\n\nreplace \"[Tt]he\" with \"a\":\n" + newText );

		// replace every character that is not a lowercase vowel by #
		// (this includes spaces, tabs and newlines)
		newText = text.replaceAll( "[^aeiouy]", "#" );
		System.out.println( "\n\nreplace non-vowels with #:\n" + newText );

		// replace every character that is neither a lowercase vowel nor a space by #
		newText = text.replaceAll( "[^aeiouy ]", "#" );
		System.out.println( "\n\nreplace non-vowels and spaces with #:\n" + newText );

		// replace each white-space character (tab, newline, ...) with a simple space
		newText = text.replaceAll( "\\s", " " );
		System.out.println( "\n\nreplace white space with simple space:\n" + newText );

		// replace white space with simple space, then non-vowels with #
		newText = text.replaceAll( "\\s", " " ).replaceAll( "[^aeiouy ]", "#" );
		System.out.println( "\n\nreplace white space with simple space, and non-vowels with #:\n" + newText );

		// update text: same sentence, now with runs of several white-space characters
		text = "The\tquick     red   fox jumped over\n\n\tthe     lazy  brown sleeping dog.";
		System.out.println( "\n\n----------------------------------------------------" );
		System.out.println( "===> text = " + text );
		System.out.println( "----------------------------------------------------\n\n" );

		// "\\s" replaces white-space characters one by one, so runs keep their length
		newText = text.replaceAll( "\\s", " " ).replaceAll( "[^aeiouy ]", "#" );
		System.out.println( "\n\nreplace white space with simple space, and non-vowels with #:\n" + newText );

		// "\\s+" matches a whole run of white space, so each run collapses to one space
		newText = text.replaceAll( "\\s+", " " ).replaceAll( "[^aeiouy ]", "#" );
		System.out.println( "\n\nreplace white space with simple space, and non-vowels with #:\n" + newText );

		// update text: sentence with punctuation and mixed case
		text = "Man!  It's a bird!    No; wait; Ah, it's a plane...  No!?!   It's... it's...   Superman!";
		System.out.println( "\n\n----------------------------------------------------" );
		System.out.println( "===> text = " + text );
		System.out.println( "----------------------------------------------------\n\n" );

		// Lowercase with Locale.ROOT rather than the default locale: under a
		// Turkish default locale "I" lowercases to a dotless i, which the
		// [^a-zA-Z\\s] filter below would then delete ("It's" -> "ts").
		String lowerText = text.toLowerCase( java.util.Locale.ROOT );

		// remove every character that is neither an ASCII letter nor white space
		newText = lowerText.replaceAll( "[^a-zA-Z\\s]", "" );
		System.out.println( "\n\nreplace non alphanumeric characters:\n" + newText );

		// collapse runs of white space, then remove non-alphabetic characters
		newText = lowerText.replaceAll( "\\s+", " " ).replaceAll( "[^a-zA-Z\\s]", "" );
		System.out.println( "\n\nreplace non alphanumeric characters and remove extra white spaces:\n" + newText );
		
		// strip the punctuation, then split on runs of white space to get the words
		newText = lowerText.replaceAll( "[^a-zA-Z\\s]", "" );
		System.out.println( "\nOutput list of single words in text:" );
		String[] words = newText.split( "\\s+" );
		for ( int i=0; i<words.length; i++ ) {
			System.out.println( "words[" + i + "] = " + words[i] );
		}
	}

}


Output


/**
 * Walks through a series of regular-expression replacements on sample
 * sentences and finishes by listing the individual lowercase words of a
 * punctuated sentence. Everything is printed to standard output.
 */
public class RegExpTester {

	/**
	 * Lowercases a string independently of the JVM's default locale.
	 * The default-locale String.toLowerCase() maps "I" to a dotless i under a
	 * Turkish locale; that character is not in a-z and would be removed by the
	 * letter filter used in main(), turning "It's" into "ts".
	 */
	private static String lowerCase( String s ) {
		return s.toLowerCase( java.util.Locale.ROOT );
	}

	/**
	 * Prints the result of each replacement and split step.
	 *
	 * @param args command-line arguments (not used)
	 */
	public static void main(String[] args) {

		String text = "The\tquick red fox jumped over\nthe lazy brown sleeping dog.";
		System.out.println( "===> text = " + text );
		
		// replace letters
		String newText = text.replaceAll( "i",  "I" );
		System.out.println( "\n\nreplace i with I: " + newText );

		// replace strings (only the lowercase occurrence matches)
		newText = text.replaceAll( "the",  "a" );
		System.out.println( "\n\nreplace \"the\" with \"a\":\n" + newText );

		// replace both "the" and "The" by using a character class for the first letter
		newText = text.replaceAll( "[Tt]he", "a" );
		System.out.println( "\n\nreplace \"[Tt]he\" with \"a\":\n" + newText );

		// replace anything that is not a lowercase vowel by # (white space included)
		newText = text.replaceAll( "[^aeiouy]", "#" );
		System.out.println( "\n\nreplace non-vowels with #:\n" + newText );

		// same, but spaces are kept as well
		newText = text.replaceAll( "[^aeiouy ]", "#" );
		System.out.println( "\n\nreplace non-vowels and spaces with #:\n" + newText );

		// replace white space with simple space
		newText = text.replaceAll( "\\s", " " );
		System.out.println( "\n\nreplace white space with simple space:\n" + newText );

		// replace white space with simple space, then non-vowels with #
		newText = text.replaceAll( "\\s", " " ).replaceAll( "[^aeiouy ]", "#" );
		System.out.println( "\n\nreplace white space with simple space, and non-vowels with #:\n" + newText );

		// update text: the words are now separated by runs of white space
		text = "The\tquick     red   fox jumped over\n\n\tthe     lazy  brown sleeping dog.";
		System.out.println( "\n\n----------------------------------------------------" );
		System.out.println( "===> text = " + text );
		System.out.println( "----------------------------------------------------\n\n" );

		// one-for-one replacement: every white-space character becomes one space
		newText = text.replaceAll( "\\s", " " ).replaceAll( "[^aeiouy ]", "#" );
		System.out.println( "\n\nreplace white space with simple space, and non-vowels with #:\n" + newText );

		// "\\s+" replaces each whole run of white space with a single space
		newText = text.replaceAll( "\\s+", " " ).replaceAll( "[^aeiouy ]", "#" );
		System.out.println( "\n\nreplace white space with simple space, and non-vowels with #:\n" + newText );

		// update text: punctuation and capital letters
		text = "Man!  It's a bird!    No; wait; Ah, it's a plane...  No!?!   It's... it's...   Superman!";
		System.out.println( "\n\n----------------------------------------------------" );
		System.out.println( "===> text = " + text );
		System.out.println( "----------------------------------------------------\n\n" );

		// remove non-alphabetic characters from text (white space is kept)
		newText = lowerCase( text ).replaceAll( "[^a-zA-Z\\s]", "" );
		System.out.println( "\n\nreplace non alphanumeric characters:\n" + newText );

		// remove non-alphabetic chars and remove extra white spaces
		newText = lowerCase( text ).replaceAll( "\\s+", " " ).replaceAll( "[^a-zA-Z\\s]", "" );
		System.out.println( "\n\nreplace non alphanumeric characters and remove extra white spaces:\n" + newText );
		
		// strip punctuation again, then split on runs of white space to get the words
		newText = lowerCase( text ).replaceAll( "[^a-zA-Z\\s]", "" );
		System.out.println( "\nOutput list of single words in text:" );
		String[] words = newText.split( "\\s+" );
		for ( int i=0; i<words.length; i++ ) {
			System.out.println( "words[" + i + "] = " + words[i] );
		}
	}

}